Merge tag '5.4-rc-smb3-fixes' of git://git.samba.org/sfrench/cifs-2.6
[linux-2.6-block.git] / mm / memcontrol.c
index f3c15bb07cce4be6dc9eb6143da2625828c56c4a..c313c49074cad4916e7aa6b74968c548690a451d 100644 (file)
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
        }
 }
 
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-       return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -2270,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *memcg;
+               bool flush = false;
 
+               rcu_read_lock();
                memcg = stock->cached;
-               if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
-                       continue;
-               if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
-                       css_put(&memcg->css);
-                       continue;
-               }
-               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+               if (memcg && stock->nr_pages &&
+                   mem_cgroup_is_descendant(memcg, root_memcg))
+                       flush = true;
+               rcu_read_unlock();
+
+               if (flush &&
+                   !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
                        if (cpu == curcpu)
                                drain_local_stock(&stock->work);
                        else
                                schedule_work_on(cpu, &stock->work);
                }
-               css_put(&memcg->css);
        }
        put_cpu();
        mutex_unlock(&percpu_charge_mutex);
@@ -2358,12 +2353,68 @@ static void high_work_func(struct work_struct *work)
        reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
 
+/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ *   overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the proposed
+ *   penalty in order to reduce it to a reasonable number of jiffies, and to
+ *   produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ *  +-------+------------------------+
+ *  | usage | time to allocate in ms |
+ *  +-------+------------------------+
+ *  | 100M  |                      0 |
+ *  | 101M  |                      6 |
+ *  | 102M  |                     25 |
+ *  | 103M  |                     57 |
+ *  | 104M  |                    102 |
+ *  | 105M  |                    159 |
+ *  | 106M  |                    230 |
+ *  | 107M  |                    313 |
+ *  | 108M  |                    409 |
+ *  | 109M  |                    518 |
+ *  | 110M  |                    639 |
+ *  | 111M  |                    774 |
+ *  | 112M  |                    921 |
+ *  | 113M  |                   1081 |
+ *  | 114M  |                   1254 |
+ *  | 115M  |                   1439 |
+ *  | 116M  |                   1638 |
+ *  | 117M  |                   1849 |
+ *  | 118M  |                   2000 |
+ *  | 119M  |                   2000 |
+ *  | 120M  |                   2000 |
+ *  +-------+------------------------+
+ */
+#define MEMCG_DELAY_PRECISION_SHIFT 20
+#define MEMCG_DELAY_SCALING_SHIFT 14
+
 /*
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
  */
 void mem_cgroup_handle_over_high(void)
 {
+       unsigned long usage, high, clamped_high;
+       unsigned long pflags;
+       unsigned long penalty_jiffies, overage;
        unsigned int nr_pages = current->memcg_nr_pages_over_high;
        struct mem_cgroup *memcg;
 
@@ -2372,8 +2423,75 @@ void mem_cgroup_handle_over_high(void)
 
        memcg = get_mem_cgroup_from_mm(current->mm);
        reclaim_high(memcg, nr_pages, GFP_KERNEL);
-       css_put(&memcg->css);
        current->memcg_nr_pages_over_high = 0;
+
+       /*
+        * memory.high is breached and reclaim is unable to keep up. Throttle
+        * allocators proactively to slow down excessive growth.
+        *
+        * We use overage compared to memory.high to calculate the number of
+        * jiffies to sleep (penalty_jiffies). Ideally this value should be
+        * fairly lenient on small overages, and increasingly harsh when the
+        * memcg in question makes it clear that it has no intention of stopping
+        * its crazy behaviour, so we exponentially increase the delay based on
+        * overage amount.
+        */
+
+       usage = page_counter_read(&memcg->memory);
+       high = READ_ONCE(memcg->high);
+
+       if (usage <= high)
+               goto out;
+
+       /*
+        * Prevent division by 0 in the overage calculation by acting as if it
+        * were a threshold of 1 page.
+        */
+       clamped_high = max(high, 1UL);
+
+       overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+                         clamped_high);
+
+       penalty_jiffies = ((u64)overage * overage * HZ)
+               >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+       /*
+        * Factor in the task's own contribution to the overage, such that four
+        * N-sized allocations are throttled approximately the same as one
+        * 4N-sized allocation.
+        *
+        * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+        * larger the current charge batch is than that.
+        */
+       penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+       /*
+        * Clamp the max delay per usermode return so as to still keep the
+        * application moving forwards and also permit diagnostics, albeit
+        * extremely slowly.
+        */
+       penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+       /*
+        * Don't sleep if the amount of jiffies this memcg owes us is so low
+        * that it's not even worth doing, in an attempt to be nice to those who
+        * go only a small amount over their memory.high value and maybe haven't
+        * been aggressively reclaimed enough yet.
+        */
+       if (penalty_jiffies <= HZ / 100)
+               goto out;
+
+       /*
+        * If we exit early, we're guaranteed to die (since
+        * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+        * need to account for any ill-begotten jiffies to pay them off later.
+        */
+       psi_memstall_enter(&pflags);
+       schedule_timeout_killable(penalty_jiffies);
+       psi_memstall_leave(&pflags);
+
+out:
+       css_put(&memcg->css);
 }
 
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
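
As a sanity check on the delay table above, the following standalone userspace sketch (not part of this patch) reproduces the curve from the same arithmetic. It assumes CONFIG_HZ=1000, 4 KiB pages, and an allocation batch equal to MEMCG_CHARGE_BATCH, so the nr_pages scaling in mem_cgroup_handle_over_high() is a no-op here.

#include <stdio.h>
#include <stdint.h>

#define MEMCG_DELAY_PRECISION_SHIFT	20
#define MEMCG_DELAY_SCALING_SHIFT	14
#define HZ				1000UL	/* assumes CONFIG_HZ=1000 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES	(2UL * HZ)

int main(void)
{
	/* memory.high of 100M, expressed in 4 KiB pages */
	uint64_t high = 100 * 256;

	for (unsigned int mb = 100; mb <= 120; mb++) {
		uint64_t usage = (uint64_t)mb * 256;
		uint64_t penalty_jiffies = 0;

		if (usage > high) {
			/* mirrors the div_u64() in mem_cgroup_handle_over_high() */
			uint64_t overage = ((usage - high)
					    << MEMCG_DELAY_PRECISION_SHIFT) / high;

			penalty_jiffies = (overage * overage * HZ)
				>> (MEMCG_DELAY_PRECISION_SHIFT +
				    MEMCG_DELAY_SCALING_SHIFT);
			if (penalty_jiffies > MEMCG_MAX_HIGH_DELAY_JIFFIES)
				penalty_jiffies = MEMCG_MAX_HIGH_DELAY_JIFFIES;
		}
		/* at HZ=1000 one jiffy is one millisecond */
		printf("| %uM | %4llu |\n", mb,
		       (unsigned long long)penalty_jiffies);
	}
	return 0;
}
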
@@ -2825,6 +2943,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+               /*
+                * Enforce __GFP_NOFAIL allocation because callers are not
+                * prepared to see failures and likely do not have any failure
+                * handling code.
+                */
+               if (gfp & __GFP_NOFAIL) {
+                       page_counter_charge(&memcg->kmem, nr_pages);
+                       return 0;
+               }
                cancel_charge(memcg, nr_pages);
                return -ENOMEM;
        }
@@ -3512,6 +3640,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                        ret = mem_cgroup_resize_max(memcg, nr_pages, true);
                        break;
                case _KMEM:
+                       pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+                                    "Please report your usecase to linux-mm@kvack.org if you "
+                                    "depend on this functionality.\n");
                        ret = memcg_update_kmem_max(memcg, nr_pages);
                        break;
                case _TCP:
@@ -4805,11 +4936,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
        }
 }
 
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
-       mem_cgroup_id_get_many(memcg, 1);
-}
-
 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
 {
        mem_cgroup_id_put_many(memcg, 1);
@@ -4954,6 +5080,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
                memcg->cgwb_frn[i].done =
                        __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+       INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+       memcg->deferred_split_queue.split_queue_len = 0;
 #endif
        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
        return memcg;
@@ -5333,6 +5464,14 @@ static int mem_cgroup_move_account(struct page *page,
                __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
        }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (compound && !list_empty(page_deferred_list(page))) {
+               spin_lock(&from->deferred_split_queue.split_queue_lock);
+               list_del_init(page_deferred_list(page));
+               from->deferred_split_queue.split_queue_len--;
+               spin_unlock(&from->deferred_split_queue.split_queue_lock);
+       }
+#endif
        /*
         * It is safe to change page->mem_cgroup here because the page
         * is referenced, charged, and isolated - we can't race with
@@ -5341,6 +5480,17 @@ static int mem_cgroup_move_account(struct page *page,
 
        /* caller should have done css_get */
        page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (compound && list_empty(page_deferred_list(page))) {
+               spin_lock(&to->deferred_split_queue.split_queue_lock);
+               list_add_tail(page_deferred_list(page),
+                             &to->deferred_split_queue.split_queue);
+               to->deferred_split_queue.split_queue_len++;
+               spin_unlock(&to->deferred_split_queue.split_queue_lock);
+       }
+#endif
+
        spin_unlock_irqrestore(&from->move_lock, flags);
 
        ret = 0;
@@ -6511,7 +6661,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                unsigned int nr_pages = 1;
 
                if (PageTransHuge(page)) {
-                       nr_pages <<= compound_order(page);
+                       nr_pages = compound_nr(page);
                        ug->nr_huge += nr_pages;
                }
                if (PageAnon(page))
@@ -6523,7 +6673,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                }
                ug->pgpgout++;
        } else {
-               ug->nr_kmem += 1 << compound_order(page);
+               ug->nr_kmem += compound_nr(page);
                __ClearPageKmemcg(page);
        }