Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
3e32cb2e JW |
2 | /* |
3 | * Lockless hierarchical page accounting & limiting | |
4 | * | |
5 | * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner | |
6 | */ | |
7 | ||
8 | #include <linux/page_counter.h> | |
9 | #include <linux/atomic.h> | |
10 | #include <linux/kernel.h> | |
11 | #include <linux/string.h> | |
12 | #include <linux/sched.h> | |
13 | #include <linux/bug.h> | |
14 | #include <asm/page.h> | |
15 | ||
bf8d5d52 RG |
16 | static void propagate_protected_usage(struct page_counter *c, |
17 | unsigned long usage) | |
23067153 | 18 | { |
bf8d5d52 | 19 | unsigned long protected, old_protected; |
c3d53200 | 20 | unsigned long low, min; |
23067153 RG |
21 | long delta; |
22 | ||
23 | if (!c->parent) | |
24 | return; | |
25 | ||
c3d53200 CD |
26 | min = READ_ONCE(c->min); |
27 | if (min || atomic_long_read(&c->min_usage)) { | |
28 | protected = min(usage, min); | |
bf8d5d52 RG |
29 | old_protected = atomic_long_xchg(&c->min_usage, protected); |
30 | delta = protected - old_protected; | |
31 | if (delta) | |
32 | atomic_long_add(delta, &c->parent->children_min_usage); | |
33 | } | |
23067153 | 34 | |
f86b810c CD |
35 | low = READ_ONCE(c->low); |
36 | if (low || atomic_long_read(&c->low_usage)) { | |
37 | protected = min(usage, low); | |
bf8d5d52 RG |
38 | old_protected = atomic_long_xchg(&c->low_usage, protected); |
39 | delta = protected - old_protected; | |
40 | if (delta) | |
41 | atomic_long_add(delta, &c->parent->children_low_usage); | |
42 | } | |
23067153 RG |
43 | } |
44 | ||
3e32cb2e JW |
45 | /** |
46 | * page_counter_cancel - take pages out of the local counter | |
47 | * @counter: counter | |
48 | * @nr_pages: number of pages to cancel | |
3e32cb2e | 49 | */ |
64f21993 | 50 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) |
3e32cb2e JW |
51 | { |
52 | long new; | |
53 | ||
bbec2e15 | 54 | new = atomic_long_sub_return(nr_pages, &counter->usage); |
3e32cb2e | 55 | /* More uncharges than charges? */ |
9317d0ff JW |
56 | if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", |
57 | new, nr_pages)) { | |
58 | new = 0; | |
59 | atomic_long_set(&counter->usage, new); | |
60 | } | |
61 | propagate_protected_usage(counter, new); | |
3e32cb2e JW |
62 | } |
63 | ||
64 | /** | |
65 | * page_counter_charge - hierarchically charge pages | |
66 | * @counter: counter | |
67 | * @nr_pages: number of pages to charge | |
68 | * | |
69 | * NOTE: This does not consider any configured counter limits. | |
70 | */ | |
71 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) | |
72 | { | |
73 | struct page_counter *c; | |
74 | ||
75 | for (c = counter; c; c = c->parent) { | |
76 | long new; | |
77 | ||
bbec2e15 | 78 | new = atomic_long_add_return(nr_pages, &c->usage); |
a6f23d14 | 79 | propagate_protected_usage(c, new); |
3e32cb2e JW |
80 | /* |
81 | * This is indeed racy, but we can live with some | |
82 | * inaccuracy in the watermark. | |
83 | */ | |
6e4bd50f QC |
84 | if (new > READ_ONCE(c->watermark)) |
85 | WRITE_ONCE(c->watermark, new); | |
3e32cb2e JW |
86 | } |
87 | } | |
88 | ||
89 | /** | |
90 | * page_counter_try_charge - try to hierarchically charge pages | |
91 | * @counter: counter | |
92 | * @nr_pages: number of pages to charge | |
93 | * @fail: points first counter to hit its limit, if any | |
94 | * | |
6071ca52 JW |
95 | * Returns %true on success, or %false and @fail if the counter or one |
96 | * of its ancestors has hit its configured limit. | |
3e32cb2e | 97 | */ |
6071ca52 JW |
98 | bool page_counter_try_charge(struct page_counter *counter, |
99 | unsigned long nr_pages, | |
100 | struct page_counter **fail) | |
3e32cb2e JW |
101 | { |
102 | struct page_counter *c; | |
103 | ||
104 | for (c = counter; c; c = c->parent) { | |
105 | long new; | |
106 | /* | |
107 | * Charge speculatively to avoid an expensive CAS. If | |
108 | * a bigger charge fails, it might falsely lock out a | |
109 | * racing smaller charge and send it into reclaim | |
110 | * early, but the error is limited to the difference | |
111 | * between the two sizes, which is less than 2M/4M in | |
112 | * case of a THP locking out a regular page charge. | |
113 | * | |
114 | * The atomic_long_add_return() implies a full memory | |
115 | * barrier between incrementing the count and reading | |
d437024e | 116 | * the limit. When racing with page_counter_set_max(), |
3e32cb2e JW |
117 | * we either see the new limit or the setter sees the |
118 | * counter has changed and retries. | |
119 | */ | |
bbec2e15 RG |
120 | new = atomic_long_add_return(nr_pages, &c->usage); |
121 | if (new > c->max) { | |
122 | atomic_long_sub(nr_pages, &c->usage); | |
3e32cb2e JW |
123 | /* |
124 | * This is racy, but we can live with some | |
6e4bd50f QC |
125 | * inaccuracy in the failcnt which is only used |
126 | * to report stats. | |
3e32cb2e | 127 | */ |
6e4bd50f | 128 | data_race(c->failcnt++); |
3e32cb2e JW |
129 | *fail = c; |
130 | goto failed; | |
131 | } | |
a6f23d14 | 132 | propagate_protected_usage(c, new); |
3e32cb2e JW |
133 | /* |
134 | * Just like with failcnt, we can live with some | |
135 | * inaccuracy in the watermark. | |
136 | */ | |
6e4bd50f QC |
137 | if (new > READ_ONCE(c->watermark)) |
138 | WRITE_ONCE(c->watermark, new); | |
3e32cb2e | 139 | } |
6071ca52 | 140 | return true; |
3e32cb2e JW |
141 | |
142 | failed: | |
143 | for (c = counter; c != *fail; c = c->parent) | |
144 | page_counter_cancel(c, nr_pages); | |
145 | ||
6071ca52 | 146 | return false; |
3e32cb2e JW |
147 | } |
148 | ||
149 | /** | |
150 | * page_counter_uncharge - hierarchically uncharge pages | |
151 | * @counter: counter | |
152 | * @nr_pages: number of pages to uncharge | |
3e32cb2e | 153 | */ |
64f21993 | 154 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) |
3e32cb2e JW |
155 | { |
156 | struct page_counter *c; | |
3e32cb2e | 157 | |
64f21993 JW |
158 | for (c = counter; c; c = c->parent) |
159 | page_counter_cancel(c, nr_pages); | |
3e32cb2e JW |
160 | } |
161 | ||
162 | /** | |
bbec2e15 | 163 | * page_counter_set_max - set the maximum number of pages allowed |
3e32cb2e | 164 | * @counter: counter |
bbec2e15 | 165 | * @nr_pages: limit to set |
3e32cb2e JW |
166 | * |
167 | * Returns 0 on success, -EBUSY if the current number of pages on the | |
168 | * counter already exceeds the specified limit. | |
169 | * | |
170 | * The caller must serialize invocations on the same counter. | |
171 | */ | |
bbec2e15 | 172 | int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) |
3e32cb2e JW |
173 | { |
174 | for (;;) { | |
175 | unsigned long old; | |
bbec2e15 | 176 | long usage; |
3e32cb2e JW |
177 | |
178 | /* | |
179 | * Update the limit while making sure that it's not | |
180 | * below the concurrently-changing counter value. | |
181 | * | |
182 | * The xchg implies two full memory barriers before | |
183 | * and after, so the read-swap-read is ordered and | |
184 | * ensures coherency with page_counter_try_charge(): | |
185 | * that function modifies the count before checking | |
186 | * the limit, so if it sees the old limit, we see the | |
187 | * modified counter and retry. | |
188 | */ | |
13064781 | 189 | usage = page_counter_read(counter); |
3e32cb2e | 190 | |
bbec2e15 | 191 | if (usage > nr_pages) |
3e32cb2e JW |
192 | return -EBUSY; |
193 | ||
bbec2e15 | 194 | old = xchg(&counter->max, nr_pages); |
3e32cb2e | 195 | |
13064781 | 196 | if (page_counter_read(counter) <= usage) |
3e32cb2e JW |
197 | return 0; |
198 | ||
bbec2e15 | 199 | counter->max = old; |
3e32cb2e JW |
200 | cond_resched(); |
201 | } | |
202 | } | |
203 | ||
bf8d5d52 RG |
204 | /** |
205 | * page_counter_set_min - set the amount of protected memory | |
206 | * @counter: counter | |
207 | * @nr_pages: value to set | |
208 | * | |
209 | * The caller must serialize invocations on the same counter. | |
210 | */ | |
211 | void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) | |
212 | { | |
213 | struct page_counter *c; | |
214 | ||
c3d53200 | 215 | WRITE_ONCE(counter->min, nr_pages); |
bf8d5d52 RG |
216 | |
217 | for (c = counter; c; c = c->parent) | |
218 | propagate_protected_usage(c, atomic_long_read(&c->usage)); | |
219 | } | |
220 | ||
23067153 RG |
221 | /** |
222 | * page_counter_set_low - set the amount of protected memory | |
223 | * @counter: counter | |
224 | * @nr_pages: value to set | |
225 | * | |
226 | * The caller must serialize invocations on the same counter. | |
227 | */ | |
228 | void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) | |
229 | { | |
230 | struct page_counter *c; | |
231 | ||
f86b810c | 232 | WRITE_ONCE(counter->low, nr_pages); |
23067153 RG |
233 | |
234 | for (c = counter; c; c = c->parent) | |
bf8d5d52 | 235 | propagate_protected_usage(c, atomic_long_read(&c->usage)); |
23067153 RG |
236 | } |
237 | ||
3e32cb2e JW |
238 | /** |
239 | * page_counter_memparse - memparse() for page counter limits | |
240 | * @buf: string to parse | |
650c5e56 | 241 | * @max: string meaning maximum possible value |
3e32cb2e JW |
242 | * @nr_pages: returns the result in number of pages |
243 | * | |
244 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | |
245 | * limited to %PAGE_COUNTER_MAX. | |
246 | */ | |
650c5e56 JW |
247 | int page_counter_memparse(const char *buf, const char *max, |
248 | unsigned long *nr_pages) | |
3e32cb2e | 249 | { |
3e32cb2e JW |
250 | char *end; |
251 | u64 bytes; | |
252 | ||
650c5e56 | 253 | if (!strcmp(buf, max)) { |
3e32cb2e JW |
254 | *nr_pages = PAGE_COUNTER_MAX; |
255 | return 0; | |
256 | } | |
257 | ||
258 | bytes = memparse(buf, &end); | |
259 | if (*end != '\0') | |
260 | return -EINVAL; | |
261 | ||
262 | *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); | |
263 | ||
264 | return 0; | |
265 | } |