Commit | Line | Data |
---|---|---|
2bc64a20 AK |
1 | /* |
2 | * | |
3 | * Copyright IBM Corporation, 2012 | |
4 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | |
5 | * | |
faced7e0 GS |
6 | * Cgroup v2 |
7 | * Copyright (C) 2019 Red Hat, Inc. | |
8 | * Author: Giuseppe Scrivano <gscrivan@redhat.com> | |
9 | * | |
2bc64a20 AK |
10 | * This program is free software; you can redistribute it and/or modify it |
11 | * under the terms of version 2.1 of the GNU Lesser General Public License | |
12 | * as published by the Free Software Foundation. | |
13 | * | |
14 | * This program is distributed in the hope that it would be useful, but | |
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
17 | * | |
18 | */ | |
19 | ||
20 | #include <linux/cgroup.h> | |
71f87bee | 21 | #include <linux/page_counter.h> |
2bc64a20 AK |
22 | #include <linux/slab.h> |
23 | #include <linux/hugetlb.h> | |
24 | #include <linux/hugetlb_cgroup.h> | |
25 | ||
abb8206c AK |
26 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
27 | #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) | |
28 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | |
29 | ||
47179fe0 XJ |
30 | /* Use t->m[0] to encode the offset */ |
31 | #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) | |
32 | #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) | |
33 | #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) | |
34 | ||
35 | #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) | |
36 | #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) | |
37 | ||
2bc64a20 | 38 | static struct hugetlb_cgroup *root_h_cgroup __read_mostly; |
47179fe0 XJ |
39 | static struct cftype *dfl_files; |
40 | static struct cftype *legacy_files; | |
2bc64a20 | 41 | |
cdc2fcfe | 42 | static inline struct page_counter * |
1adc4d41 MA |
43 | __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, |
44 | bool rsvd) | |
cdc2fcfe MA |
45 | { |
46 | if (rsvd) | |
47 | return &h_cg->rsvd_hugepage[idx]; | |
48 | return &h_cg->hugepage[idx]; | |
49 | } | |
50 | ||
1adc4d41 MA |
51 | static inline struct page_counter * |
52 | hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) | |
53 | { | |
54 | return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); | |
55 | } | |
56 | ||
57 | static inline struct page_counter * | |
58 | hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) | |
59 | { | |
60 | return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); | |
61 | } | |
62 | ||
2bc64a20 AK |
63 | static inline |
64 | struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) | |
65 | { | |
a7c6d554 | 66 | return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; |
2bc64a20 AK |
67 | } |
68 | ||
2bc64a20 AK |
69 | static inline |
70 | struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) | |
71 | { | |
073219e9 | 72 | return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); |
2bc64a20 AK |
73 | } |
74 | ||
75 | static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) | |
76 | { | |
77 | return (h_cg == root_h_cgroup); | |
78 | } | |
79 | ||
3f798518 TH |
80 | static inline struct hugetlb_cgroup * |
81 | parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) | |
2bc64a20 | 82 | { |
5c9d535b | 83 | return hugetlb_cgroup_from_css(h_cg->css.parent); |
2bc64a20 AK |
84 | } |
85 | ||
3f798518 | 86 | static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) |
2bc64a20 | 87 | { |
c37213c5 | 88 | struct hstate *h; |
2bc64a20 | 89 | |
c37213c5 | 90 | for_each_hstate(h) { |
1adc4d41 | 91 | if (page_counter_read( |
c37213c5 | 92 | hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) |
2bc64a20 AK |
93 | return true; |
94 | } | |
95 | return false; | |
96 | } | |
97 | ||
297880f4 DR |
98 | static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, |
99 | struct hugetlb_cgroup *parent_h_cgroup) | |
100 | { | |
101 | int idx; | |
102 | ||
103 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { | |
0e2759af SB |
104 | struct page_counter *fault, *fault_parent = NULL; |
105 | struct page_counter *rsvd, *rsvd_parent = NULL; | |
297880f4 | 106 | unsigned long limit; |
297880f4 | 107 | |
1adc4d41 MA |
108 | if (parent_h_cgroup) { |
109 | fault_parent = hugetlb_cgroup_counter_from_cgroup( | |
110 | parent_h_cgroup, idx); | |
111 | rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( | |
112 | parent_h_cgroup, idx); | |
113 | } | |
0e2759af SB |
114 | fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx); |
115 | rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx); | |
116 | ||
117 | page_counter_init(fault, fault_parent, false); | |
118 | page_counter_init(rsvd, rsvd_parent, false); | |
119 | ||
120 | if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) { | |
121 | fault->track_failcnt = true; | |
122 | rsvd->track_failcnt = true; | |
123 | } | |
297880f4 DR |
124 | |
125 | limit = round_down(PAGE_COUNTER_MAX, | |
8938494c | 126 | pages_per_huge_page(&hstates[idx])); |
1adc4d41 | 127 | |
0e2759af SB |
128 | VM_BUG_ON(page_counter_set_max(fault, limit)); |
129 | VM_BUG_ON(page_counter_set_max(rsvd, limit)); | |
297880f4 DR |
130 | } |
131 | } | |
132 | ||
f4776199 MA |
133 | static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) |
134 | { | |
135 | int node; | |
136 | ||
137 | for_each_node(node) | |
138 | kfree(h_cgroup->nodeinfo[node]); | |
139 | kfree(h_cgroup); | |
140 | } | |
141 | ||
eb95419b TH |
142 | static struct cgroup_subsys_state * |
143 | hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |
2bc64a20 | 144 | { |
eb95419b TH |
145 | struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); |
146 | struct hugetlb_cgroup *h_cgroup; | |
f4776199 MA |
147 | int node; |
148 | ||
149 | h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), | |
150 | GFP_KERNEL); | |
2bc64a20 | 151 | |
2bc64a20 AK |
152 | if (!h_cgroup) |
153 | return ERR_PTR(-ENOMEM); | |
154 | ||
297880f4 | 155 | if (!parent_h_cgroup) |
2bc64a20 | 156 | root_h_cgroup = h_cgroup; |
297880f4 | 157 | |
f4776199 MA |
158 | /* |
159 | * TODO: this routine can waste much memory for nodes which will | |
160 | * never be onlined. It's better to use memory hotplug callback | |
161 | * function. | |
162 | */ | |
163 | for_each_node(node) { | |
99249387 | 164 | /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ |
f4776199 | 165 | int node_to_alloc = |
99249387 | 166 | node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; |
f4776199 MA |
167 | h_cgroup->nodeinfo[node] = |
168 | kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), | |
169 | GFP_KERNEL, node_to_alloc); | |
170 | if (!h_cgroup->nodeinfo[node]) | |
171 | goto fail_alloc_nodeinfo; | |
172 | } | |
173 | ||
297880f4 | 174 | hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); |
2bc64a20 | 175 | return &h_cgroup->css; |
f4776199 MA |
176 | |
177 | fail_alloc_nodeinfo: | |
178 | hugetlb_cgroup_free(h_cgroup); | |
179 | return ERR_PTR(-ENOMEM); | |
2bc64a20 AK |
180 | } |
181 | ||
/* css_free callback: release the hugetlb cgroup that embeds @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cgroup = hugetlb_cgroup_from_css(css);

	hugetlb_cgroup_free(h_cgroup);
}
186 | ||
da1def55 AK |
187 | /* |
188 | * Should be called with hugetlb_lock held. | |
189 | * Since we are holding hugetlb_lock, pages cannot get moved from | |
190 | * active list or uncharged from the cgroup, So no need to get | |
191 | * page reference and test for page active here. This function | |
192 | * cannot fail. | |
193 | */ | |
3f798518 | 194 | static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, |
3f982b9b | 195 | struct folio *folio) |
da1def55 | 196 | { |
71f87bee JW |
197 | unsigned int nr_pages; |
198 | struct page_counter *counter; | |
3f982b9b | 199 | struct hugetlb_cgroup *hcg; |
3f798518 | 200 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); |
da1def55 | 201 | |
3f982b9b | 202 | hcg = hugetlb_cgroup_from_folio(folio); |
da1def55 AK |
203 | /* |
204 | * We can have pages in active list without any cgroup | |
205 | * ie, hugepage with less than 3 pages. We can safely | |
206 | * ignore those pages. | |
207 | */ | |
3f982b9b | 208 | if (!hcg || hcg != h_cg) |
da1def55 AK |
209 | goto out; |
210 | ||
3f982b9b | 211 | nr_pages = folio_nr_pages(folio); |
da1def55 AK |
212 | if (!parent) { |
213 | parent = root_h_cgroup; | |
214 | /* root has no limit */ | |
71f87bee | 215 | page_counter_charge(&parent->hugepage[idx], nr_pages); |
da1def55 AK |
216 | } |
217 | counter = &h_cg->hugepage[idx]; | |
71f87bee JW |
218 | /* Take the pages off the local counter */ |
219 | page_counter_cancel(counter, nr_pages); | |
da1def55 | 220 | |
de656ed3 | 221 | set_hugetlb_cgroup(folio, parent); |
da1def55 AK |
222 | out: |
223 | return; | |
224 | } | |
225 | ||
226 | /* | |
227 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | |
228 | * the parent cgroup. | |
229 | */ | |
eb95419b | 230 | static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) |
2bc64a20 | 231 | { |
eb95419b | 232 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); |
da1def55 | 233 | struct hstate *h; |
3f982b9b | 234 | struct folio *folio; |
da1def55 AK |
235 | |
236 | do { | |
da1def55 | 237 | for_each_hstate(h) { |
db71ef79 | 238 | spin_lock_irq(&hugetlb_lock); |
3f982b9b DH |
239 | list_for_each_entry(folio, &h->hugepage_activelist, lru) |
240 | hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); | |
da1def55 | 241 | |
db71ef79 | 242 | spin_unlock_irq(&hugetlb_lock); |
da1def55 AK |
243 | } |
244 | cond_resched(); | |
3f798518 | 245 | } while (hugetlb_cgroup_have_usage(h_cg)); |
2bc64a20 AK |
246 | } |
247 | ||
faced7e0 GS |
248 | static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, |
249 | enum hugetlb_memory_event event) | |
250 | { | |
251 | atomic_long_inc(&hugetlb->events_local[idx][event]); | |
252 | cgroup_file_notify(&hugetlb->events_local_file[idx]); | |
253 | ||
254 | do { | |
255 | atomic_long_inc(&hugetlb->events[idx][event]); | |
256 | cgroup_file_notify(&hugetlb->events_file[idx]); | |
257 | } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && | |
258 | !hugetlb_cgroup_is_root(hugetlb)); | |
259 | } | |
260 | ||
1adc4d41 MA |
261 | static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, |
262 | struct hugetlb_cgroup **ptr, | |
263 | bool rsvd) | |
6d76dcf4 AK |
264 | { |
265 | int ret = 0; | |
71f87bee | 266 | struct page_counter *counter; |
6d76dcf4 | 267 | struct hugetlb_cgroup *h_cg = NULL; |
6d76dcf4 AK |
268 | |
269 | if (hugetlb_cgroup_disabled()) | |
270 | goto done; | |
6d76dcf4 AK |
271 | again: |
272 | rcu_read_lock(); | |
273 | h_cg = hugetlb_cgroup_from_task(current); | |
0362f326 | 274 | if (!css_tryget(&h_cg->css)) { |
6d76dcf4 AK |
275 | rcu_read_unlock(); |
276 | goto again; | |
277 | } | |
278 | rcu_read_unlock(); | |
279 | ||
1adc4d41 MA |
280 | if (!page_counter_try_charge( |
281 | __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), | |
282 | nr_pages, &counter)) { | |
6071ca52 | 283 | ret = -ENOMEM; |
726b7bbe | 284 | hugetlb_event(h_cg, idx, HUGETLB_MAX); |
1adc4d41 MA |
285 | css_put(&h_cg->css); |
286 | goto done; | |
faced7e0 | 287 | } |
1adc4d41 MA |
288 | /* Reservations take a reference to the css because they do not get |
289 | * reparented. | |
290 | */ | |
291 | if (!rsvd) | |
292 | css_put(&h_cg->css); | |
6d76dcf4 AK |
293 | done: |
294 | *ptr = h_cg; | |
295 | return ret; | |
296 | } | |
297 | ||
1adc4d41 MA |
298 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, |
299 | struct hugetlb_cgroup **ptr) | |
300 | { | |
301 | return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); | |
302 | } | |
303 | ||
304 | int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, | |
305 | struct hugetlb_cgroup **ptr) | |
306 | { | |
307 | return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); | |
308 | } | |
309 | ||
94ae8ba7 | 310 | /* Should be called with hugetlb_lock held */ |
1adc4d41 MA |
311 | static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, |
312 | struct hugetlb_cgroup *h_cg, | |
541b7c7b | 313 | struct folio *folio, bool rsvd) |
6d76dcf4 AK |
314 | { |
315 | if (hugetlb_cgroup_disabled() || !h_cg) | |
316 | return; | |
3ccae1dc | 317 | lockdep_assert_held(&hugetlb_lock); |
541b7c7b | 318 | __set_hugetlb_cgroup(folio, h_cg, rsvd); |
f4776199 MA |
319 | if (!rsvd) { |
320 | unsigned long usage = | |
541b7c7b | 321 | h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; |
f4776199 MA |
322 | /* |
323 | * This write is not atomic due to fetching usage and writing | |
324 | * to it, but that's fine because we call this with | |
325 | * hugetlb_lock held anyway. | |
326 | */ | |
541b7c7b | 327 | WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], |
f4776199 MA |
328 | usage + nr_pages); |
329 | } | |
6d76dcf4 AK |
330 | } |
331 | ||
1adc4d41 MA |
332 | void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, |
333 | struct hugetlb_cgroup *h_cg, | |
ff7d853b | 334 | struct folio *folio) |
1adc4d41 | 335 | { |
541b7c7b | 336 | __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); |
1adc4d41 MA |
337 | } |
338 | ||
339 | void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, | |
340 | struct hugetlb_cgroup *h_cg, | |
ff7d853b | 341 | struct folio *folio) |
1adc4d41 | 342 | { |
541b7c7b | 343 | __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); |
1adc4d41 MA |
344 | } |
345 | ||
6d76dcf4 AK |
346 | /* |
347 | * Should be called with hugetlb_lock held | |
348 | */ | |
d4ab0316 SK |
349 | static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, |
350 | struct folio *folio, bool rsvd) | |
6d76dcf4 AK |
351 | { |
352 | struct hugetlb_cgroup *h_cg; | |
6d76dcf4 AK |
353 | |
354 | if (hugetlb_cgroup_disabled()) | |
355 | return; | |
7ea8574e | 356 | lockdep_assert_held(&hugetlb_lock); |
f074732d | 357 | h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); |
6d76dcf4 AK |
358 | if (unlikely(!h_cg)) |
359 | return; | |
f074732d | 360 | __set_hugetlb_cgroup(folio, NULL, rsvd); |
1adc4d41 MA |
361 | |
362 | page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, | |
363 | rsvd), | |
364 | nr_pages); | |
365 | ||
366 | if (rsvd) | |
367 | css_put(&h_cg->css); | |
f4776199 MA |
368 | else { |
369 | unsigned long usage = | |
d4ab0316 | 370 | h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; |
f4776199 MA |
371 | /* |
372 | * This write is not atomic due to fetching usage and writing | |
373 | * to it, but that's fine because we call this with | |
374 | * hugetlb_lock held anyway. | |
375 | */ | |
d4ab0316 | 376 | WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], |
f4776199 MA |
377 | usage - nr_pages); |
378 | } | |
6d76dcf4 AK |
379 | } |
380 | ||
d4ab0316 SK |
381 | void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, |
382 | struct folio *folio) | |
1adc4d41 | 383 | { |
d4ab0316 | 384 | __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); |
1adc4d41 MA |
385 | } |
386 | ||
d4ab0316 SK |
387 | void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, |
388 | struct folio *folio) | |
1adc4d41 | 389 | { |
d4ab0316 | 390 | __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); |
1adc4d41 MA |
391 | } |
392 | ||
393 | static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | |
394 | struct hugetlb_cgroup *h_cg, | |
395 | bool rsvd) | |
6d76dcf4 | 396 | { |
6d76dcf4 AK |
397 | if (hugetlb_cgroup_disabled() || !h_cg) |
398 | return; | |
399 | ||
1adc4d41 MA |
400 | page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, |
401 | rsvd), | |
402 | nr_pages); | |
403 | ||
404 | if (rsvd) | |
405 | css_put(&h_cg->css); | |
406 | } | |
407 | ||
408 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | |
409 | struct hugetlb_cgroup *h_cg) | |
410 | { | |
411 | __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); | |
412 | } | |
413 | ||
414 | void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, | |
415 | struct hugetlb_cgroup *h_cg) | |
416 | { | |
417 | __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); | |
418 | } | |
419 | ||
e9fe92ae MA |
420 | void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, |
421 | unsigned long end) | |
1adc4d41 | 422 | { |
e9fe92ae MA |
423 | if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || |
424 | !resv->css) | |
1adc4d41 MA |
425 | return; |
426 | ||
e9fe92ae MA |
427 | page_counter_uncharge(resv->reservation_counter, |
428 | (end - start) * resv->pages_per_hpage); | |
429 | css_put(resv->css); | |
6d76dcf4 AK |
430 | } |
431 | ||
075a61d0 MA |
432 | void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, |
433 | struct file_region *rg, | |
d85aecf2 ML |
434 | unsigned long nr_pages, |
435 | bool region_del) | |
075a61d0 MA |
436 | { |
437 | if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) | |
438 | return; | |
439 | ||
862f7f65 | 440 | if (rg->reservation_counter && resv->pages_per_hpage && |
075a61d0 MA |
441 | !resv->reservation_counter) { |
442 | page_counter_uncharge(rg->reservation_counter, | |
443 | nr_pages * resv->pages_per_hpage); | |
d85aecf2 ML |
444 | /* |
445 | * Only do css_put(rg->css) when we delete the entire region | |
446 | * because one file_region must hold exactly one css reference. | |
447 | */ | |
448 | if (region_del) | |
449 | css_put(rg->css); | |
075a61d0 MA |
450 | } |
451 | } | |
452 | ||
/*
 * Resource attributes stored in the low 16 bits of cftype->private.
 * Each attribute exists for the usage counter and, as RES_RSVD_*, for the
 * reservation counter.
 */
enum {
	RES_USAGE,		/* current counter value */
	RES_RSVD_USAGE,
	RES_LIMIT,		/* counter maximum */
	RES_RSVD_LIMIT,
	RES_MAX_USAGE,		/* counter watermark */
	RES_RSVD_MAX_USAGE,
	RES_FAILCNT,		/* counter failcnt */
	RES_RSVD_FAILCNT,
};
463 | ||
f4776199 MA |
464 | static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) |
465 | { | |
466 | int nid; | |
467 | struct cftype *cft = seq_cft(seq); | |
468 | int idx = MEMFILE_IDX(cft->private); | |
520de595 | 469 | bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); |
f4776199 MA |
470 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); |
471 | struct cgroup_subsys_state *css; | |
472 | unsigned long usage; | |
473 | ||
474 | if (legacy) { | |
475 | /* Add up usage across all nodes for the non-hierarchical total. */ | |
476 | usage = 0; | |
477 | for_each_node_state(nid, N_MEMORY) | |
478 | usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); | |
479 | seq_printf(seq, "total=%lu", usage * PAGE_SIZE); | |
480 | ||
481 | /* Simply print the per-node usage for the non-hierarchical total. */ | |
482 | for_each_node_state(nid, N_MEMORY) | |
483 | seq_printf(seq, " N%d=%lu", nid, | |
484 | READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * | |
485 | PAGE_SIZE); | |
486 | seq_putc(seq, '\n'); | |
487 | } | |
488 | ||
489 | /* | |
490 | * The hierarchical total is pretty much the value recorded by the | |
491 | * counter, so use that. | |
492 | */ | |
493 | seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", | |
494 | page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); | |
495 | ||
496 | /* | |
497 | * For each node, transverse the css tree to obtain the hierarchical | |
498 | * node usage. | |
499 | */ | |
500 | for_each_node_state(nid, N_MEMORY) { | |
501 | usage = 0; | |
502 | rcu_read_lock(); | |
503 | css_for_each_descendant_pre(css, &h_cg->css) { | |
504 | usage += READ_ONCE(hugetlb_cgroup_from_css(css) | |
505 | ->nodeinfo[nid] | |
506 | ->usage[idx]); | |
507 | } | |
508 | rcu_read_unlock(); | |
509 | seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); | |
510 | } | |
511 | ||
512 | seq_putc(seq, '\n'); | |
513 | ||
514 | return 0; | |
515 | } | |
516 | ||
716f479d TH |
517 | static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, |
518 | struct cftype *cft) | |
abb8206c | 519 | { |
71f87bee | 520 | struct page_counter *counter; |
cdc2fcfe | 521 | struct page_counter *rsvd_counter; |
182446d0 | 522 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); |
abb8206c | 523 | |
71f87bee | 524 | counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; |
cdc2fcfe | 525 | rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; |
abb8206c | 526 | |
71f87bee JW |
527 | switch (MEMFILE_ATTR(cft->private)) { |
528 | case RES_USAGE: | |
529 | return (u64)page_counter_read(counter) * PAGE_SIZE; | |
cdc2fcfe MA |
530 | case RES_RSVD_USAGE: |
531 | return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; | |
71f87bee | 532 | case RES_LIMIT: |
bbec2e15 | 533 | return (u64)counter->max * PAGE_SIZE; |
cdc2fcfe MA |
534 | case RES_RSVD_LIMIT: |
535 | return (u64)rsvd_counter->max * PAGE_SIZE; | |
71f87bee JW |
536 | case RES_MAX_USAGE: |
537 | return (u64)counter->watermark * PAGE_SIZE; | |
cdc2fcfe MA |
538 | case RES_RSVD_MAX_USAGE: |
539 | return (u64)rsvd_counter->watermark * PAGE_SIZE; | |
71f87bee JW |
540 | case RES_FAILCNT: |
541 | return counter->failcnt; | |
cdc2fcfe MA |
542 | case RES_RSVD_FAILCNT: |
543 | return rsvd_counter->failcnt; | |
71f87bee JW |
544 | default: |
545 | BUG(); | |
546 | } | |
abb8206c AK |
547 | } |
548 | ||
faced7e0 GS |
549 | static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) |
550 | { | |
551 | int idx; | |
552 | u64 val; | |
553 | struct cftype *cft = seq_cft(seq); | |
554 | unsigned long limit; | |
555 | struct page_counter *counter; | |
556 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); | |
557 | ||
558 | idx = MEMFILE_IDX(cft->private); | |
559 | counter = &h_cg->hugepage[idx]; | |
560 | ||
561 | limit = round_down(PAGE_COUNTER_MAX, | |
8938494c | 562 | pages_per_huge_page(&hstates[idx])); |
faced7e0 GS |
563 | |
564 | switch (MEMFILE_ATTR(cft->private)) { | |
cdc2fcfe MA |
565 | case RES_RSVD_USAGE: |
566 | counter = &h_cg->rsvd_hugepage[idx]; | |
e4a9bc58 | 567 | fallthrough; |
faced7e0 GS |
568 | case RES_USAGE: |
569 | val = (u64)page_counter_read(counter); | |
570 | seq_printf(seq, "%llu\n", val * PAGE_SIZE); | |
571 | break; | |
cdc2fcfe MA |
572 | case RES_RSVD_LIMIT: |
573 | counter = &h_cg->rsvd_hugepage[idx]; | |
e4a9bc58 | 574 | fallthrough; |
faced7e0 GS |
575 | case RES_LIMIT: |
576 | val = (u64)counter->max; | |
577 | if (val == limit) | |
578 | seq_puts(seq, "max\n"); | |
579 | else | |
580 | seq_printf(seq, "%llu\n", val * PAGE_SIZE); | |
581 | break; | |
582 | default: | |
583 | BUG(); | |
584 | } | |
585 | ||
586 | return 0; | |
587 | } | |
588 | ||
71f87bee JW |
589 | static DEFINE_MUTEX(hugetlb_limit_mutex); |
590 | ||
451af504 | 591 | static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, |
faced7e0 GS |
592 | char *buf, size_t nbytes, loff_t off, |
593 | const char *max) | |
abb8206c | 594 | { |
71f87bee JW |
595 | int ret, idx; |
596 | unsigned long nr_pages; | |
451af504 | 597 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); |
cdc2fcfe | 598 | bool rsvd = false; |
abb8206c | 599 | |
71f87bee JW |
600 | if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ |
601 | return -EINVAL; | |
602 | ||
451af504 | 603 | buf = strstrip(buf); |
faced7e0 | 604 | ret = page_counter_memparse(buf, max, &nr_pages); |
71f87bee JW |
605 | if (ret) |
606 | return ret; | |
607 | ||
451af504 | 608 | idx = MEMFILE_IDX(of_cft(of)->private); |
8938494c | 609 | nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); |
abb8206c | 610 | |
71f87bee | 611 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
cdc2fcfe MA |
612 | case RES_RSVD_LIMIT: |
613 | rsvd = true; | |
e4a9bc58 | 614 | fallthrough; |
abb8206c | 615 | case RES_LIMIT: |
71f87bee | 616 | mutex_lock(&hugetlb_limit_mutex); |
cdc2fcfe | 617 | ret = page_counter_set_max( |
1adc4d41 | 618 | __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), |
cdc2fcfe | 619 | nr_pages); |
71f87bee | 620 | mutex_unlock(&hugetlb_limit_mutex); |
abb8206c AK |
621 | break; |
622 | default: | |
623 | ret = -EINVAL; | |
624 | break; | |
625 | } | |
451af504 | 626 | return ret ?: nbytes; |
abb8206c AK |
627 | } |
628 | ||
faced7e0 GS |
629 | static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, |
630 | char *buf, size_t nbytes, loff_t off) | |
631 | { | |
632 | return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); | |
633 | } | |
634 | ||
635 | static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, | |
636 | char *buf, size_t nbytes, loff_t off) | |
637 | { | |
638 | return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); | |
639 | } | |
640 | ||
6770c64e TH |
641 | static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, |
642 | char *buf, size_t nbytes, loff_t off) | |
abb8206c | 643 | { |
71f87bee | 644 | int ret = 0; |
cdc2fcfe | 645 | struct page_counter *counter, *rsvd_counter; |
6770c64e | 646 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); |
abb8206c | 647 | |
71f87bee | 648 | counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; |
cdc2fcfe | 649 | rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; |
abb8206c | 650 | |
71f87bee | 651 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
abb8206c | 652 | case RES_MAX_USAGE: |
71f87bee | 653 | page_counter_reset_watermark(counter); |
abb8206c | 654 | break; |
cdc2fcfe MA |
655 | case RES_RSVD_MAX_USAGE: |
656 | page_counter_reset_watermark(rsvd_counter); | |
657 | break; | |
abb8206c | 658 | case RES_FAILCNT: |
71f87bee | 659 | counter->failcnt = 0; |
abb8206c | 660 | break; |
cdc2fcfe MA |
661 | case RES_RSVD_FAILCNT: |
662 | rsvd_counter->failcnt = 0; | |
663 | break; | |
abb8206c AK |
664 | default: |
665 | ret = -EINVAL; | |
666 | break; | |
667 | } | |
6770c64e | 668 | return ret ?: nbytes; |
abb8206c AK |
669 | } |
670 | ||
671 | static char *mem_fmt(char *buf, int size, unsigned long hsize) | |
672 | { | |
abfb09e2 ML |
673 | if (hsize >= SZ_1G) |
674 | snprintf(buf, size, "%luGB", hsize / SZ_1G); | |
675 | else if (hsize >= SZ_1M) | |
676 | snprintf(buf, size, "%luMB", hsize / SZ_1M); | |
abb8206c | 677 | else |
abfb09e2 | 678 | snprintf(buf, size, "%luKB", hsize / SZ_1K); |
abb8206c AK |
679 | return buf; |
680 | } | |
681 | ||
faced7e0 GS |
682 | static int __hugetlb_events_show(struct seq_file *seq, bool local) |
683 | { | |
684 | int idx; | |
685 | long max; | |
686 | struct cftype *cft = seq_cft(seq); | |
687 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); | |
688 | ||
689 | idx = MEMFILE_IDX(cft->private); | |
690 | ||
691 | if (local) | |
692 | max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); | |
693 | else | |
694 | max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); | |
695 | ||
696 | seq_printf(seq, "max %lu\n", max); | |
697 | ||
698 | return 0; | |
699 | } | |
700 | ||
701 | static int hugetlb_events_show(struct seq_file *seq, void *v) | |
702 | { | |
703 | return __hugetlb_events_show(seq, false); | |
704 | } | |
705 | ||
706 | static int hugetlb_events_local_show(struct seq_file *seq, void *v) | |
707 | { | |
708 | return __hugetlb_events_show(seq, true); | |
709 | } | |
710 | ||
47179fe0 XJ |
711 | static struct cftype hugetlb_dfl_tmpl[] = { |
712 | { | |
713 | .name = "max", | |
714 | .private = RES_LIMIT, | |
715 | .seq_show = hugetlb_cgroup_read_u64_max, | |
716 | .write = hugetlb_cgroup_write_dfl, | |
717 | .flags = CFTYPE_NOT_ON_ROOT, | |
718 | }, | |
719 | { | |
720 | .name = "rsvd.max", | |
721 | .private = RES_RSVD_LIMIT, | |
722 | .seq_show = hugetlb_cgroup_read_u64_max, | |
723 | .write = hugetlb_cgroup_write_dfl, | |
724 | .flags = CFTYPE_NOT_ON_ROOT, | |
725 | }, | |
726 | { | |
727 | .name = "current", | |
728 | .private = RES_USAGE, | |
729 | .seq_show = hugetlb_cgroup_read_u64_max, | |
730 | .flags = CFTYPE_NOT_ON_ROOT, | |
731 | }, | |
732 | { | |
733 | .name = "rsvd.current", | |
734 | .private = RES_RSVD_USAGE, | |
735 | .seq_show = hugetlb_cgroup_read_u64_max, | |
736 | .flags = CFTYPE_NOT_ON_ROOT, | |
737 | }, | |
738 | { | |
739 | .name = "events", | |
740 | .seq_show = hugetlb_events_show, | |
741 | .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), | |
742 | .flags = CFTYPE_NOT_ON_ROOT, | |
743 | }, | |
744 | { | |
745 | .name = "events.local", | |
746 | .seq_show = hugetlb_events_local_show, | |
747 | .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), | |
748 | .flags = CFTYPE_NOT_ON_ROOT, | |
749 | }, | |
750 | { | |
751 | .name = "numa_stat", | |
752 | .seq_show = hugetlb_cgroup_read_numa_stat, | |
753 | .flags = CFTYPE_NOT_ON_ROOT, | |
754 | }, | |
755 | /* don't need terminator here */ | |
756 | }; | |
757 | ||
758 | static struct cftype hugetlb_legacy_tmpl[] = { | |
759 | { | |
760 | .name = "limit_in_bytes", | |
761 | .private = RES_LIMIT, | |
762 | .read_u64 = hugetlb_cgroup_read_u64, | |
763 | .write = hugetlb_cgroup_write_legacy, | |
764 | }, | |
765 | { | |
766 | .name = "rsvd.limit_in_bytes", | |
767 | .private = RES_RSVD_LIMIT, | |
768 | .read_u64 = hugetlb_cgroup_read_u64, | |
769 | .write = hugetlb_cgroup_write_legacy, | |
770 | }, | |
771 | { | |
772 | .name = "usage_in_bytes", | |
773 | .private = RES_USAGE, | |
774 | .read_u64 = hugetlb_cgroup_read_u64, | |
775 | }, | |
776 | { | |
777 | .name = "rsvd.usage_in_bytes", | |
778 | .private = RES_RSVD_USAGE, | |
779 | .read_u64 = hugetlb_cgroup_read_u64, | |
780 | }, | |
781 | { | |
782 | .name = "max_usage_in_bytes", | |
783 | .private = RES_MAX_USAGE, | |
784 | .write = hugetlb_cgroup_reset, | |
785 | .read_u64 = hugetlb_cgroup_read_u64, | |
786 | }, | |
787 | { | |
788 | .name = "rsvd.max_usage_in_bytes", | |
789 | .private = RES_RSVD_MAX_USAGE, | |
790 | .write = hugetlb_cgroup_reset, | |
791 | .read_u64 = hugetlb_cgroup_read_u64, | |
792 | }, | |
793 | { | |
794 | .name = "failcnt", | |
795 | .private = RES_FAILCNT, | |
796 | .write = hugetlb_cgroup_reset, | |
797 | .read_u64 = hugetlb_cgroup_read_u64, | |
798 | }, | |
799 | { | |
800 | .name = "rsvd.failcnt", | |
801 | .private = RES_RSVD_FAILCNT, | |
802 | .write = hugetlb_cgroup_reset, | |
803 | .read_u64 = hugetlb_cgroup_read_u64, | |
804 | }, | |
805 | { | |
806 | .name = "numa_stat", | |
807 | .seq_show = hugetlb_cgroup_read_numa_stat, | |
808 | }, | |
809 | /* don't need terminator here */ | |
810 | }; | |
811 | ||
812 | static void __init | |
813 | hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, | |
814 | struct cftype *tmpl, int tmpl_size) | |
815 | { | |
816 | char buf[32]; | |
817 | int i, idx = hstate_index(h); | |
818 | ||
819 | /* format the size */ | |
820 | mem_fmt(buf, sizeof(buf), huge_page_size(h)); | |
821 | ||
822 | for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { | |
823 | *cft = *tmpl; | |
824 | /* rebuild the name */ | |
825 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); | |
826 | /* rebuild the private */ | |
827 | cft->private = MEMFILE_PRIVATE(idx, tmpl->private); | |
828 | /* rebuild the file_offset */ | |
829 | if (tmpl->file_offset) { | |
830 | unsigned int offset = tmpl->file_offset; | |
831 | ||
832 | cft->file_offset = MEMFILE_OFFSET0(offset) + | |
833 | MEMFILE_FIELD_SIZE(offset) * idx; | |
834 | } | |
835 | ||
836 | lockdep_register_key(&cft->lockdep_key); | |
837 | } | |
838 | } | |
839 | ||
b79d715c | 840 | static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) |
abb8206c | 841 | { |
b79d715c | 842 | int idx = hstate_index(h); |
abb8206c | 843 | |
47179fe0 XJ |
844 | hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, |
845 | hugetlb_dfl_tmpl, DFL_TMPL_SIZE); | |
faced7e0 GS |
846 | } |
847 | ||
b79d715c | 848 | static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) |
faced7e0 | 849 | { |
b79d715c | 850 | int idx = hstate_index(h); |
faced7e0 | 851 | |
47179fe0 XJ |
852 | hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, |
853 | hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); | |
faced7e0 GS |
854 | } |
855 | ||
b79d715c | 856 | static void __init __hugetlb_cgroup_file_init(struct hstate *h) |
faced7e0 | 857 | { |
b79d715c XJ |
858 | __hugetlb_cgroup_file_dfl_init(h); |
859 | __hugetlb_cgroup_file_legacy_init(h); | |
7179e7bf JW |
860 | } |
861 | ||
47179fe0 XJ |
862 | static void __init __hugetlb_cgroup_file_pre_init(void) |
863 | { | |
864 | int cft_count; | |
865 | ||
866 | cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ | |
867 | dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); | |
868 | BUG_ON(!dfl_files); | |
869 | cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ | |
870 | legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); | |
871 | BUG_ON(!legacy_files); | |
872 | } | |
873 | ||
b79d715c XJ |
874 | static void __init __hugetlb_cgroup_file_post_init(void) |
875 | { | |
876 | WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, | |
877 | dfl_files)); | |
878 | WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, | |
879 | legacy_files)); | |
880 | } | |
881 | ||
7179e7bf JW |
882 | void __init hugetlb_cgroup_file_init(void) |
883 | { | |
884 | struct hstate *h; | |
885 | ||
47179fe0 | 886 | __hugetlb_cgroup_file_pre_init(); |
59838b25 | 887 | for_each_hstate(h) |
b79d715c XJ |
888 | __hugetlb_cgroup_file_init(h); |
889 | __hugetlb_cgroup_file_post_init(); | |
abb8206c AK |
890 | } |
891 | ||
75754681 AK |
892 | /* |
893 | * hugetlb_lock will make sure a parallel cgroup rmdir won't happen | |
894 | * when we migrate hugepages | |
895 | */ | |
29f39430 | 896 | void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) |
8e6ac7fa AK |
897 | { |
898 | struct hugetlb_cgroup *h_cg; | |
1adc4d41 | 899 | struct hugetlb_cgroup *h_cg_rsvd; |
29f39430 | 900 | struct hstate *h = folio_hstate(old_folio); |
8e6ac7fa AK |
901 | |
902 | if (hugetlb_cgroup_disabled()) | |
903 | return; | |
904 | ||
db71ef79 | 905 | spin_lock_irq(&hugetlb_lock); |
f074732d SK |
906 | h_cg = hugetlb_cgroup_from_folio(old_folio); |
907 | h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); | |
de656ed3 SK |
908 | set_hugetlb_cgroup(old_folio, NULL); |
909 | set_hugetlb_cgroup_rsvd(old_folio, NULL); | |
8e6ac7fa AK |
910 | |
911 | /* move the h_cg details to new cgroup */ | |
de656ed3 SK |
912 | set_hugetlb_cgroup(new_folio, h_cg); |
913 | set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); | |
29f39430 | 914 | list_move(&new_folio->lru, &h->hugepage_activelist); |
db71ef79 | 915 | spin_unlock_irq(&hugetlb_lock); |
8e6ac7fa AK |
916 | } |
917 | ||
faced7e0 GS |
918 | static struct cftype hugetlb_files[] = { |
919 | {} /* terminate */ | |
920 | }; | |
921 | ||
073219e9 | 922 | struct cgroup_subsys hugetlb_cgrp_subsys = { |
92fb9748 TH |
923 | .css_alloc = hugetlb_cgroup_css_alloc, |
924 | .css_offline = hugetlb_cgroup_css_offline, | |
925 | .css_free = hugetlb_cgroup_css_free, | |
faced7e0 GS |
926 | .dfl_cftypes = hugetlb_files, |
927 | .legacy_cftypes = hugetlb_files, | |
2bc64a20 | 928 | }; |