Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
3e32cb2e JW |
2 | /* |
3 | * Lockless hierarchical page accounting & limiting | |
4 | * | |
5 | * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner | |
6 | */ | |
7 | ||
8 | #include <linux/page_counter.h> | |
9 | #include <linux/atomic.h> | |
10 | #include <linux/kernel.h> | |
11 | #include <linux/string.h> | |
12 | #include <linux/sched.h> | |
13 | #include <linux/bug.h> | |
14 | #include <asm/page.h> | |
15 | ||
23067153 RG |
16 | static void propagate_low_usage(struct page_counter *c, unsigned long usage) |
17 | { | |
18 | unsigned long low_usage, old; | |
19 | long delta; | |
20 | ||
21 | if (!c->parent) | |
22 | return; | |
23 | ||
24 | if (!c->low && !atomic_long_read(&c->low_usage)) | |
25 | return; | |
26 | ||
27 | if (usage <= c->low) | |
28 | low_usage = usage; | |
29 | else | |
30 | low_usage = 0; | |
31 | ||
32 | old = atomic_long_xchg(&c->low_usage, low_usage); | |
33 | delta = low_usage - old; | |
34 | if (delta) | |
35 | atomic_long_add(delta, &c->parent->children_low_usage); | |
36 | } | |
37 | ||
3e32cb2e JW |
38 | /** |
39 | * page_counter_cancel - take pages out of the local counter | |
40 | * @counter: counter | |
41 | * @nr_pages: number of pages to cancel | |
3e32cb2e | 42 | */ |
64f21993 | 43 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) |
3e32cb2e JW |
44 | { |
45 | long new; | |
46 | ||
bbec2e15 | 47 | new = atomic_long_sub_return(nr_pages, &counter->usage); |
23067153 | 48 | propagate_low_usage(counter, new); |
3e32cb2e JW |
49 | /* More uncharges than charges? */ |
50 | WARN_ON_ONCE(new < 0); | |
3e32cb2e JW |
51 | } |
52 | ||
53 | /** | |
54 | * page_counter_charge - hierarchically charge pages | |
55 | * @counter: counter | |
56 | * @nr_pages: number of pages to charge | |
57 | * | |
58 | * NOTE: This does not consider any configured counter limits. | |
59 | */ | |
60 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) | |
61 | { | |
62 | struct page_counter *c; | |
63 | ||
64 | for (c = counter; c; c = c->parent) { | |
65 | long new; | |
66 | ||
bbec2e15 | 67 | new = atomic_long_add_return(nr_pages, &c->usage); |
23067153 | 68 | propagate_low_usage(counter, new); |
3e32cb2e JW |
69 | /* |
70 | * This is indeed racy, but we can live with some | |
71 | * inaccuracy in the watermark. | |
72 | */ | |
73 | if (new > c->watermark) | |
74 | c->watermark = new; | |
75 | } | |
76 | } | |
77 | ||
78 | /** | |
79 | * page_counter_try_charge - try to hierarchically charge pages | |
80 | * @counter: counter | |
81 | * @nr_pages: number of pages to charge | |
82 | * @fail: points first counter to hit its limit, if any | |
83 | * | |
6071ca52 JW |
84 | * Returns %true on success, or %false and @fail if the counter or one |
85 | * of its ancestors has hit its configured limit. | |
3e32cb2e | 86 | */ |
6071ca52 JW |
87 | bool page_counter_try_charge(struct page_counter *counter, |
88 | unsigned long nr_pages, | |
89 | struct page_counter **fail) | |
3e32cb2e JW |
90 | { |
91 | struct page_counter *c; | |
92 | ||
93 | for (c = counter; c; c = c->parent) { | |
94 | long new; | |
95 | /* | |
96 | * Charge speculatively to avoid an expensive CAS. If | |
97 | * a bigger charge fails, it might falsely lock out a | |
98 | * racing smaller charge and send it into reclaim | |
99 | * early, but the error is limited to the difference | |
100 | * between the two sizes, which is less than 2M/4M in | |
101 | * case of a THP locking out a regular page charge. | |
102 | * | |
103 | * The atomic_long_add_return() implies a full memory | |
104 | * barrier between incrementing the count and reading | |
105 | * the limit. When racing with page_counter_limit(), | |
106 | * we either see the new limit or the setter sees the | |
107 | * counter has changed and retries. | |
108 | */ | |
bbec2e15 RG |
109 | new = atomic_long_add_return(nr_pages, &c->usage); |
110 | if (new > c->max) { | |
111 | atomic_long_sub(nr_pages, &c->usage); | |
23067153 | 112 | propagate_low_usage(counter, new); |
3e32cb2e JW |
113 | /* |
114 | * This is racy, but we can live with some | |
115 | * inaccuracy in the failcnt. | |
116 | */ | |
117 | c->failcnt++; | |
118 | *fail = c; | |
119 | goto failed; | |
120 | } | |
23067153 | 121 | propagate_low_usage(counter, new); |
3e32cb2e JW |
122 | /* |
123 | * Just like with failcnt, we can live with some | |
124 | * inaccuracy in the watermark. | |
125 | */ | |
126 | if (new > c->watermark) | |
127 | c->watermark = new; | |
128 | } | |
6071ca52 | 129 | return true; |
3e32cb2e JW |
130 | |
131 | failed: | |
132 | for (c = counter; c != *fail; c = c->parent) | |
133 | page_counter_cancel(c, nr_pages); | |
134 | ||
6071ca52 | 135 | return false; |
3e32cb2e JW |
136 | } |
137 | ||
138 | /** | |
139 | * page_counter_uncharge - hierarchically uncharge pages | |
140 | * @counter: counter | |
141 | * @nr_pages: number of pages to uncharge | |
3e32cb2e | 142 | */ |
64f21993 | 143 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) |
3e32cb2e JW |
144 | { |
145 | struct page_counter *c; | |
3e32cb2e | 146 | |
64f21993 JW |
147 | for (c = counter; c; c = c->parent) |
148 | page_counter_cancel(c, nr_pages); | |
3e32cb2e JW |
149 | } |
150 | ||
/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
	for (;;) {
		unsigned long old;
		long usage;

		/*
		 * Update the limit while making sure that it's not
		 * below the concurrently-changing counter value.
		 *
		 * The xchg implies two full memory barriers before
		 * and after, so the read-swap-read is ordered and
		 * ensures coherency with page_counter_try_charge():
		 * that function modifies the count before checking
		 * the limit, so if it sees the old limit, we see the
		 * modified counter and retry.
		 */
		usage = atomic_long_read(&counter->usage);

		/* Already over the requested limit: refuse outright. */
		if (usage > nr_pages)
			return -EBUSY;

		old = xchg(&counter->max, nr_pages);

		/* No concurrent charge slipped in: the new limit stands. */
		if (atomic_long_read(&counter->usage) <= usage)
			return 0;

		/* A racing charge got past the check; roll back and retry. */
		counter->max = old;
		cond_resched();
	}
}
23067153 RG |
193 | /** |
194 | * page_counter_set_low - set the amount of protected memory | |
195 | * @counter: counter | |
196 | * @nr_pages: value to set | |
197 | * | |
198 | * The caller must serialize invocations on the same counter. | |
199 | */ | |
200 | void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) | |
201 | { | |
202 | struct page_counter *c; | |
203 | ||
204 | counter->low = nr_pages; | |
205 | ||
206 | for (c = counter; c; c = c->parent) | |
207 | propagate_low_usage(c, atomic_long_read(&c->usage)); | |
208 | } | |
209 | ||
3e32cb2e JW |
210 | /** |
211 | * page_counter_memparse - memparse() for page counter limits | |
212 | * @buf: string to parse | |
650c5e56 | 213 | * @max: string meaning maximum possible value |
3e32cb2e JW |
214 | * @nr_pages: returns the result in number of pages |
215 | * | |
216 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | |
217 | * limited to %PAGE_COUNTER_MAX. | |
218 | */ | |
650c5e56 JW |
219 | int page_counter_memparse(const char *buf, const char *max, |
220 | unsigned long *nr_pages) | |
3e32cb2e | 221 | { |
3e32cb2e JW |
222 | char *end; |
223 | u64 bytes; | |
224 | ||
650c5e56 | 225 | if (!strcmp(buf, max)) { |
3e32cb2e JW |
226 | *nr_pages = PAGE_COUNTER_MAX; |
227 | return 0; | |
228 | } | |
229 | ||
230 | bytes = memparse(buf, &end); | |
231 | if (*end != '\0') | |
232 | return -EINVAL; | |
233 | ||
234 | *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); | |
235 | ||
236 | return 0; | |
237 | } |