Merge tag 'slab-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka...
index 24f702afd4584b36dade17e4a6def6a392bbcb51..4954999183d58ecba9d7e86f44025131934308f2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -624,11 +624,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
        nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
        s->cpu_partial_slabs = nr_slabs;
 }
+
+static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
+{
+       return s->cpu_partial_slabs;
+}
 #else
 static inline void
 slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
 {
 }
+
+static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
+{
+       return 0;
+}
 #endif /* CONFIG_SLUB_CPU_PARTIAL */
 
 /*
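
A minimal userspace sketch (not kernel code) of the accessor-plus-stub pattern the hunk
above introduces: callers read the per-cache cpu-partial limit through one helper, and a
!CONFIG_SLUB_CPU_PARTIAL build simply sees 0, so the call sites need no #ifdef. All
_sketch names and the HAVE_CPU_PARTIAL switch are invented for illustration.

    #include <stdio.h>

    struct kmem_cache_sketch {              /* stand-in for struct kmem_cache */
        unsigned int cpu_partial_slabs;
    };

    #ifdef HAVE_CPU_PARTIAL                 /* stand-in for CONFIG_SLUB_CPU_PARTIAL */
    static inline unsigned int get_cpu_partial(struct kmem_cache_sketch *s)
    {
        return s->cpu_partial_slabs;
    }
    #else
    static inline unsigned int get_cpu_partial(struct kmem_cache_sketch *s)
    {
        return 0;                           /* feature compiled out: limit is zero */
    }
    #endif

    int main(void)
    {
        struct kmem_cache_sketch s = { .cpu_partial_slabs = 8 };

        /* one unconditional call site instead of an #ifdef block */
        if (get_cpu_partial(&s) == 0)
            puts("no per-cpu partial slabs: stop after the first slab");
        else
            printf("may stash up to %u extra slabs\n", get_cpu_partial(&s) / 2);
        return 0;
    }
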
@@ -2609,19 +2619,18 @@ static struct slab *get_partial_node(struct kmem_cache *s,
                if (!partial) {
                        partial = slab;
                        stat(s, ALLOC_FROM_PARTIAL);
+
+                       if (slub_get_cpu_partial(s) == 0) {
+                               break;
+                       }
                } else {
                        put_cpu_partial(s, slab, 0);
                        stat(s, CPU_PARTIAL_NODE);
-                       partial_slabs++;
-               }
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-               if (!kmem_cache_has_cpu_partial(s)
-                       || partial_slabs > s->cpu_partial_slabs / 2)
-                       break;
-#else
-               break;
-#endif
 
+                       if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
+                               break;
+                       }
+               }
        }
        spin_unlock_irqrestore(&n->list_lock, flags);
        return partial;
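
A userspace sketch of the refill loop shape in the hunk above: the first usable slab goes
back to the caller, later ones are stashed as per-cpu partial slabs, and the scan stops
either immediately (limit 0, cpu partials disabled) or once more than limit/2 extra slabs
have been taken. The partial list is reduced to a plain counter; refill_sketch and its
arguments are invented for illustration.

    #include <stdio.h>

    /* Returns how many extra slabs would be stashed on the cpu partial list. */
    static int refill_sketch(int slabs_on_list, unsigned int cpu_partial_limit)
    {
        int have_one = 0, stashed = 0;

        for (int i = 0; i < slabs_on_list; i++) {
            if (!have_one) {
                have_one = 1;                   /* first slab is handed to the caller */
                if (cpu_partial_limit == 0)
                    break;                      /* nothing to stash, stop scanning */
            } else {
                stashed++;                      /* would call put_cpu_partial() here */
                if (stashed > (int)(cpu_partial_limit / 2))
                    break;                      /* stashed enough for this CPU */
            }
        }
        return stashed;
    }

    int main(void)
    {
        printf("%d\n", refill_sketch(100, 0));  /* 0: cpu partials disabled */
        printf("%d\n", refill_sketch(100, 8));  /* 5: stops once 8/2 is exceeded */
        return 0;
    }
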
@@ -2704,7 +2713,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node,
                searchnode = numa_mem_id();
 
        slab = get_partial_node(s, get_node(s, searchnode), pc);
-       if (slab || node != NUMA_NO_NODE)
+       if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
                return slab;
 
        return get_any_partial(s, pc);
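
A small sketch of the changed fallback decision in get_partial(): after this hunk, a
preferred node alone no longer prevents falling back to other nodes' partial lists; only
an explicit __GFP_THISNODE does. The constants and names below are illustrative
stand-ins, not the kernel's.

    #include <stdbool.h>
    #include <stdio.h>

    #define NODE_NONE      (-1)     /* stand-in for NUMA_NO_NODE */
    #define FLAG_THISNODE  0x1u     /* stand-in for __GFP_THISNODE */

    static bool try_other_nodes(bool got_slab, int node, unsigned int flags)
    {
        if (got_slab)
            return false;           /* the preferred node already satisfied us */
        if (node != NODE_NONE && (flags & FLAG_THISNODE))
            return false;           /* caller pinned the allocation to this node */
        return true;                /* otherwise scan other nodes' partial lists */
    }

    int main(void)
    {
        printf("%d\n", try_other_nodes(false, 1, 0));              /* 1: fall back */
        printf("%d\n", try_other_nodes(false, 1, FLAG_THISNODE));  /* 0: stay on node 1 */
        return 0;
    }
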
@@ -2802,7 +2811,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
        struct slab new;
        struct slab old;
 
-       if (slab->freelist) {
+       if (READ_ONCE(slab->freelist)) {
                stat(s, DEACTIVATE_REMOTE_FREES);
                tail = DEACTIVATE_TO_TAIL;
        }
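
The READ_ONCE() added above marks a lockless peek at a field that remote frees may update
concurrently. Below is a userspace approximation of the idea with a simplified stand-in
macro (the kernel's READ_ONCE() does more than this); struct and function names are
invented for illustration.

    #include <stdio.h>

    /* simplified stand-in: force a single, non-refetched load (GCC/Clang) */
    #define READ_ONCE_SKETCH(x)  (*(volatile __typeof__(x) *)&(x))

    struct slab_sketch {
        void *freelist;              /* may be rewritten by another CPU's free */
    };

    static int has_remote_frees(struct slab_sketch *slab)
    {
        /*
         * One marked load: without it the compiler may legally reload
         * slab->freelist and base the decision on two different values.
         */
        return READ_ONCE_SKETCH(slab->freelist) != NULL;
    }

    int main(void)
    {
        struct slab_sketch s = { .freelist = &s };

        printf("%d\n", has_remote_frees(&s));    /* 1 */
        return 0;
    }
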
@@ -3234,6 +3243,43 @@ static unsigned long count_partial(struct kmem_cache_node *n,
 #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
 
 #ifdef CONFIG_SLUB_DEBUG
+#define MAX_PARTIAL_TO_SCAN 10000
+
+static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
+{
+       unsigned long flags;
+       unsigned long x = 0;
+       struct slab *slab;
+
+       spin_lock_irqsave(&n->list_lock, flags);
+       if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
+               list_for_each_entry(slab, &n->partial, slab_list)
+                       x += slab->objects - slab->inuse;
+       } else {
+               /*
+                * For a long list, approximate the total count of free objects in
+                * it while respecting the limit on the number of slabs to scan.
+                * Scan from both the list's head and tail for better accuracy.
+                */
+               unsigned long scanned = 0;
+
+               list_for_each_entry(slab, &n->partial, slab_list) {
+                       x += slab->objects - slab->inuse;
+                       if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
+                               break;
+               }
+               list_for_each_entry_reverse(slab, &n->partial, slab_list) {
+                       x += slab->objects - slab->inuse;
+                       if (++scanned == MAX_PARTIAL_TO_SCAN)
+                               break;
+               }
+               x = mult_frac(x, n->nr_partial, scanned);
+               x = min(x, node_nr_objs(n));
+       }
+       spin_unlock_irqrestore(&n->list_lock, flags);
+       return x;
+}
+
 static noinline void
 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
 {
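
The new helper above caps the scan at MAX_PARTIAL_TO_SCAN slabs and extrapolates: free
objects seen, scaled by list length over slabs scanned, clamped to the node's object
total. A standalone sketch of that arithmetic with made-up numbers and a mult_frac()-like
helper (all names below are invented for illustration):

    #include <stdio.h>

    static unsigned long mult_frac_sketch(unsigned long x, unsigned long num,
                                          unsigned long den)
    {
        /* same intent as the kernel's mult_frac(): x * num / den with less
         * risk of intermediate overflow */
        unsigned long q = x / den, r = x % den;

        return q * num + r * num / den;
    }

    int main(void)
    {
        unsigned long scanned    = 10000;     /* MAX_PARTIAL_TO_SCAN slabs looked at */
        unsigned long free_seen  = 120000;    /* free objects found on those slabs */
        unsigned long nr_partial = 250000;    /* slabs actually on the partial list */
        unsigned long node_objs  = 4000000;   /* upper bound: all objects on the node */
        unsigned long estimate;

        estimate = mult_frac_sketch(free_seen, nr_partial, scanned);
        if (estimate > node_objs)
            estimate = node_objs;
        printf("~%lu free objects\n", estimate);   /* ~3000000 */
        return 0;
    }
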
@@ -3260,7 +3306,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
                unsigned long nr_objs;
                unsigned long nr_free;
 
-               nr_free  = count_partial(n, count_free);
+               nr_free  = count_partial_free_approx(n);
                nr_slabs = node_nr_slabs(n);
                nr_objs  = node_nr_objs(n);
 
@@ -3380,6 +3426,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
        struct slab *slab;
        unsigned long flags;
        struct partial_context pc;
+       bool try_thisnode = true;
 
        stat(s, ALLOC_SLOWPATH);
 
@@ -3506,6 +3553,21 @@ new_slab:
 new_objects:
 
        pc.flags = gfpflags;
+       /*
+        * When a preferred node is indicated but no __GFP_THISNODE is passed:
+        *
+        * 1) try to get a partial slab from the target node only by having
+        *    __GFP_THISNODE in pc.flags for get_partial()
+        * 2) if 1) failed, try to allocate a new slab from the target node with
+        *    GFP_NOWAIT | __GFP_THISNODE opportunistically
+        * 3) if 2) failed, retry with the original gfpflags, which allows
+        *    get_partial() to try partial lists of other nodes before potentially
+        *    allocating a new slab from other nodes
+        */
+       if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
+                    && try_thisnode))
+               pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+
        pc.orig_size = orig_size;
        slab = get_partial(s, node, &pc);
        if (slab) {
@@ -3527,10 +3589,15 @@ new_objects:
        }
 
        slub_put_cpu_ptr(s->cpu_slab);
-       slab = new_slab(s, gfpflags, node);
+       slab = new_slab(s, pc.flags, node);
        c = slub_get_cpu_ptr(s->cpu_slab);
 
        if (unlikely(!slab)) {
+               if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
+                   && try_thisnode) {
+                       try_thisnode = false;
+                       goto new_objects;
+               }
                slab_out_of_memory(s, gfpflags, node);
                return NULL;
        }
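
Putting the two hunks above together: with a preferred node but no __GFP_THISNODE, the
slow path first tries that node only (its partial list, then a cheap
GFP_NOWAIT | __GFP_THISNODE slab allocation) and only on failure retries with the
caller's original flags so other nodes may be used. A condensed userspace sketch of that
control flow; every helper below is an invented stand-in, and steps 1 and 2 are folded
into one call.

    #include <stdbool.h>
    #include <stdio.h>

    enum { NODE_NONE = -1 };

    static bool alloc_on_node_only(int node) { (void)node; return false; } /* pretend it fails */
    static bool alloc_anywhere(void)         { return true; }

    static bool slab_alloc_sketch(int node, bool gfp_thisnode)
    {
        bool try_thisnode = true;

    retry:
        if (node != NODE_NONE && !gfp_thisnode && try_thisnode) {
            /* steps 1+2: opportunistic, non-blocking, this node only */
            if (alloc_on_node_only(node))
                return true;
            try_thisnode = false;
            goto retry;                /* step 3: fall back to the original flags */
        }
        return gfp_thisnode ? alloc_on_node_only(node) : alloc_anywhere();
    }

    int main(void)
    {
        printf("%d\n", slab_alloc_sketch(1, false));  /* 1: succeeded via fallback */
        printf("%d\n", slab_alloc_sketch(1, true));   /* 0: hard-pinned, node alloc fails */
        return 0;
    }
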
@@ -4232,7 +4299,7 @@ redo:
        c = raw_cpu_ptr(s->cpu_slab);
        tid = READ_ONCE(c->tid);
 
-       /* Same with comment on barrier() in slab_alloc_node() */
+       /* Same with comment on barrier() in __slab_alloc_node() */
        barrier();
 
        if (unlikely(slab != c->slab)) {
@@ -4853,7 +4920,6 @@ static void early_kmem_cache_node_alloc(int node)
        BUG_ON(!n);
 #ifdef CONFIG_SLUB_DEBUG
        init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
-       init_tracking(kmem_cache_node, n);
 #endif
        n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
        slab->freelist = get_freepointer(kmem_cache_node, n);
@@ -5066,9 +5132,7 @@ static int calculate_sizes(struct kmem_cache *s)
        if ((int)order < 0)
                return 0;
 
-       s->allocflags = 0;
-       if (order)
-               s->allocflags |= __GFP_COMP;
+       s->allocflags = __GFP_COMP;
 
        if (s->flags & SLAB_CACHE_DMA)
                s->allocflags |= GFP_DMA;
@@ -6042,7 +6106,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
                                else if (flags & SO_OBJECTS)
                                        WARN_ON_ONCE(1);
                                else
-                                       x = slab->slabs;
+                                       x = data_race(slab->slabs);
                                total += x;
                                nodes[node] += x;
                        }
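
The plain reads replaced in this and the following two hunks race with updates from other
CPUs, but the values only feed statistics and sysfs output, so the races are benign.
data_race() documents that intent and keeps KCSAN from flagging the access; the userspace
stand-in below is a no-op that only carries the documentation value, and the struct is an
invented stand-in.

    #include <stdio.h>

    #define data_race_sketch(expr)  (expr)   /* kernel version also tells KCSAN "intended" */

    struct slab_sketch {
        int slabs;            /* updated concurrently on another CPU */
    };

    int main(void)
    {
        struct slab_sketch s = { .slabs = 3 };

        /* a stale or torn value is acceptable for a sysfs counter */
        int snapshot = data_race_sketch(s.slabs);

        printf("slabs=%d\n", snapshot);
        return 0;
    }
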
@@ -6247,7 +6311,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
                slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
 
                if (slab)
-                       slabs += slab->slabs;
+                       slabs += data_race(slab->slabs);
        }
 #endif
 
@@ -6261,7 +6325,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 
                slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
                if (slab) {
-                       slabs = READ_ONCE(slab->slabs);
+                       slabs = data_race(slab->slabs);
                        objects = (slabs * oo_objects(s->oo)) / 2;
                        len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
                                             cpu, objects, slabs);
@@ -7095,7 +7159,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
        for_each_kmem_cache_node(s, node, n) {
                nr_slabs += node_nr_slabs(n);
                nr_objs += node_nr_objs(n);
-               nr_free += count_partial(n, count_free);
+               nr_free += count_partial_free_approx(n);
        }
 
        sinfo->active_objs = nr_objs - nr_free;
@@ -7105,14 +7169,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
        sinfo->objects_per_slab = oo_objects(s->oo);
        sinfo->cache_order = oo_order(s->oo);
 }
-
-void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
-{
-}
-
-ssize_t slabinfo_write(struct file *file, const char __user *buffer,
-                      size_t count, loff_t *ppos)
-{
-       return -EIO;
-}
 #endif /* CONFIG_SLUB_DEBUG */