// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;
#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibiling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			list_add(&memtype->tier_sibiling, &memtier->memory_types);
			return memtier;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);
	list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
	return new_memtier;
}
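
/*
 * Worked example of the tier placement above (the numbers are
 * illustrative only; the real chunk size comes from MEMTIER_CHUNK_SIZE
 * in <linux/memory-tiers.h>): with a chunk size of 128, a memtype with
 * adistance 300 is rounded down to 256, so it lands in the tier
 * covering 256..384. A second memtype with adistance 320 also rounds
 * down to 256 and shares that existing tier instead of creating a
 * new one.
 */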

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}
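
/*
 * Usage sketch (hedged; modeled on how reclaim-driven demotion in
 * mm/vmscan.c consumes this): the caller fetches the allowed
 * lower-tier nodes once and hands them to the migration machinery as
 * fallback targets.
 *
 *	nodemask_t allowed_mask;
 *
 *	node_get_allowed_targets(NODE_DATA(nid), &allowed_mask);
 *	// migrate with 'allowed_mask' as the permitted fallback set
 */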

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to look up the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * Round-robin selection would be an alternative, but it needs an
	 * extra variable to record the last selected target node, and
	 * updating that variable may cause cache ping-pong. Per-CPU data
	 * could avoid the caching issue, but is more complicated. So
	 * random selection seems better for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
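
/*
 * Usage sketch (hedged; modeled on the reclaim demotion path): walk
 * the demotion chain from a node until it terminates.
 *
 *	int nid = pgdat->node_id;
 *
 *	nid = next_demotion_node(nid);
 *	if (nid == NUMA_NO_NODE)
 *		return;		// terminal node, nothing below us
 *	// otherwise allocate the demotion target page on 'nid'
 */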

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, so it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a
		 * skip list. Add all memory nodes except the selected
		 * memory tier's nodelist to the skip list, so that we
		 * find the best node from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask.
		 * We randomly select between nodes in the preferred mask
		 * when allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the source memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of that tier has CPUs.
	 * Once we detect such a memory tier, we consider it the
	 * top tier, from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * Abstract distance below the max value of this
			 * memtier is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
					     MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the
	 * node mask from all memory tiers below it. This lets demotion
	 * page allocation fall back to a set of nodes that is closer
	 * than the preferred node selected above.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from lower_tier nodes.
		 * This removes all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}
}
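
/*
 * Worked example, reusing Example 1 from the node_demotion[] comment
 * above: tier0 = {0,1} (DRAM) and tier1 = {2,3} (PMEM). The final pass
 * of establish_demotion_targets() walks the tier list and leaves
 * tier0->lower_tier_mask = {2,3} and tier1->lower_tier_mask = <empty>.
 * top_tier_adistance is set from tier0, because its nodes have CPUs,
 * so nodes 0-1 are toptier and promotion above them is never
 * attempted.
 */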

#else
static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take a memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */
	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	__init_node_memory_type(node, default_dram_type);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have RCU access to this memory tier.
	 */
	kfree(memtier);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for the
	 * RCU read section to finish using synchronize_rcu. This
	 * also enables us to free the destroyed memory tier with
	 * kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibiling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibiling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void destroy_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(destroy_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the devices attached to this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		node_memory_types[node].memtype = NULL;
		kref_put(&memtype->kref, release_memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
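
/*
 * Usage sketch for the exported API above (hedged; modeled on how a
 * hotplugged-memory driver such as dax/kmem registers a slower memory
 * type -- the adistance value below is illustrative, not a kernel
 * constant):
 *
 *	static struct memory_dev_type *slow_type;
 *
 *	// probe: describe the device's memory and bind it to its node
 *	slow_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM * 5);
 *	if (IS_ERR(slow_type))
 *		return PTR_ERR(slow_type);
 *	init_node_memory_type(nid, slow_type);
 *
 *	// remove: drop the node binding and the type reference
 *	clear_node_memory_type(nid, slow_type);
 *	destroy_memory_type(slow_type);
 */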

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int node;
	struct memory_tier *memtier;

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller
	 * adistance than the default DRAM tier.
	 */
	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to the
	 * default memory tier, or to a tier if we already have memory
	 * types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Continue with the memtiers we were able to set up.
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
					  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}

static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}
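
/*
 * The knob defined below lands at /sys/kernel/mm/numa/demotion_enabled
 * (numa_kobj is created under mm_kobj). Example from userspace:
 *
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 *	cat /sys/kernel/mm/numa/demotion_enabled
 */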

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
	       numa_demotion_enabled_store);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif