lib/nodemask: optimize node_random for nodemask with single NUMA node
[linux-block.git] / mm/memory-tiers.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

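/*
 * A memory_dev_type with abstract distance A lands in the tier whose
 * adistance_start is round_down(A, MEMTIER_CHUNK_SIZE), so each tier owns
 * one MEMTIER_CHUNK_SIZE wide slice of the abstract distance space.  The
 * memory_tiers list is kept sorted by adistance_start, with smaller values
 * (faster memory) towards the head of the list.
 */
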
struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;
#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibiling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			list_add(&memtype->tier_sibiling, &memtier->memory_types);
			return memtier;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);
	list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
	return new_memtier;
}

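/*
 * Illustration of the insertion logic above, assuming a MEMTIER_CHUNK_SIZE
 * of 128 purely for the sake of the example: with tiers already starting at
 * 0 and 128, a memtype with adistance 200 rounds down to 128 and joins the
 * existing second tier, while a memtype with adistance 300 rounds down to
 * 256 and gets a new tier appended at the tail, keeping the list ordered by
 * adistance_start.
 */
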
static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

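/*
 * top_tier_adistance is computed in establish_demotion_targets() as the end
 * of the abstract distance chunk of the slowest memory tier that still
 * contains a CPU node, so every tier at that distance or below is treated
 * as top tier here.  In Example 3 above both the HBM and DRAM tiers are top
 * tier while the PMEM tier is not, which is what permits promotion out of
 * PMEM but not out of DRAM or HBM.
 */
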
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu()
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also use round-robin to select the target node,
	 * but that would require another variable in node_demotion[]
	 * to record the last selected target, which may cause cache
	 * ping-pong as that field keeps changing. Per-cpu data would
	 * avoid the caching issue but is more complicated. So
	 * selecting the target node randomly seems better for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}

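/*
 * Sketch of how a caller on the reclaim/demotion side might use this
 * (illustrative only; everything except next_demotion_node() and
 * node_get_allowed_targets() is an assumption, not taken from this file):
 *
 *	int target = next_demotion_node(folio_nid(folio));
 *
 *	if (target == NUMA_NO_NODE)
 *		reclaim the folio, there is no lower tier to demote to;
 *	else
 *		migrate the folio towards @target, optionally falling back
 *		to the mask reported by node_get_allowed_targets().
 */
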
static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory tier
		 * nodelist to the skip list so that we find the best node
		 * from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of the memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this
			 * memtier is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * the demotion page allocation to a set of nodes that is closer to
	 * the above selected preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from the lower_tier nodes;
		 * this will remove all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}
}

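/*
 * Worked example (Example 1 above): with memory_tiers0 = {0,1} and
 * memory_tiers1 = {2,3}, lower_tier starts as {0,1,2,3}; stripping the
 * first tier leaves {2,3} as that tier's lower_tier_mask, and stripping
 * the second tier leaves an empty mask for the bottom tier.  Demotion from
 * nodes 0/1 may therefore fall back to nodes 2/3, while nodes 2/3 have
 * nowhere lower to demote to.
 */
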
#else
static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take the memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	__init_node_memory_type(node, default_dram_type);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(memtier);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for the
	 * RCU read section to finish using synchronize_rcu. This
	 * also enables us to free the destroyed memory tier with
	 * kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibiling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibiling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void destroy_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(destroy_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices from this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		node_memory_types[node].memtype = NULL;
		kref_put(&memtype->kref, release_memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

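/*
 * Sketch of how a memory device driver is expected to use the exported
 * interface (illustrative; the adistance value and the error handling are
 * assumptions, not taken from this file):
 *
 *	struct memory_dev_type *memtype;
 *
 *	memtype = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE);
 *	if (IS_ERR(memtype))
 *		return PTR_ERR(memtype);
 *	init_node_memory_type(node, memtype);
 *	... online the device memory on @node ...
 *
 *	and on removal:
 *	clear_node_memory_type(node, memtype);
 *	destroy_memory_type(memtype);
 */
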
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int node;
	struct memory_tier *memtier;

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to the
	 * default memory tier, or to another tier if we already have memory
	 * types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Continue with the memtiers we were able to set up.
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
					  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}

static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
	       numa_demotion_enabled_store);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

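/*
 * Once numa_init_sysfs() below registers this group under the "numa"
 * kobject (a child of mm_kobj), the knob is reachable from user space as
 * /sys/kernel/mm/numa/demotion_enabled, e.g.:
 *
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 *	cat /sys/kernel/mm/numa/demotion_enabled
 */
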
static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif /* CONFIG_MIGRATION */