Commit | Line | Data |
---|---|---|
d1a4c0b3 GC |
1 | #include <net/tcp.h> |
2 | #include <net/tcp_memcontrol.h> | |
3 | #include <net/sock.h> | |
3dc43e3e GC |
4 | #include <net/ip.h> |
5 | #include <linux/nsproxy.h> | |
d1a4c0b3 GC |
6 | #include <linux/memcontrol.h> |
7 | #include <linux/module.h> | |
8 | ||
9 | static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) | |
10 | { | |
11 | return container_of(cg_proto, struct tcp_memcontrol, cg_proto); | |
12 | } | |
13 | ||
14 | static void memcg_tcp_enter_memory_pressure(struct sock *sk) | |
15 | { | |
c48e074c | 16 | if (sk->sk_cgrp->memory_pressure) |
d1a4c0b3 GC |
17 | *sk->sk_cgrp->memory_pressure = 1; |
18 | } | |
19 | EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); | |
20 | ||
1d62e436 | 21 | int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
d1a4c0b3 GC |
22 | { |
23 | /* | |
24 | * The root cgroup does not use res_counters, but rather, | |
25 | * rely on the data already collected by the network | |
26 | * subsystem | |
27 | */ | |
28 | struct res_counter *res_parent = NULL; | |
29 | struct cg_proto *cg_proto, *parent_cg; | |
30 | struct tcp_memcontrol *tcp; | |
d1a4c0b3 | 31 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
3dc43e3e | 32 | struct net *net = current->nsproxy->net_ns; |
d1a4c0b3 GC |
33 | |
34 | cg_proto = tcp_prot.proto_cgroup(memcg); | |
35 | if (!cg_proto) | |
6bc10349 | 36 | return 0; |
d1a4c0b3 GC |
37 | |
38 | tcp = tcp_from_cgproto(cg_proto); | |
39 | ||
3dc43e3e GC |
40 | tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; |
41 | tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; | |
42 | tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; | |
d1a4c0b3 GC |
43 | tcp->tcp_memory_pressure = 0; |
44 | ||
45 | parent_cg = tcp_prot.proto_cgroup(parent); | |
46 | if (parent_cg) | |
47 | res_parent = parent_cg->memory_allocated; | |
48 | ||
49 | res_counter_init(&tcp->tcp_memory_allocated, res_parent); | |
50 | percpu_counter_init(&tcp->tcp_sockets_allocated, 0); | |
51 | ||
52 | cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure; | |
53 | cg_proto->memory_pressure = &tcp->tcp_memory_pressure; | |
54 | cg_proto->sysctl_mem = tcp->tcp_prot_mem; | |
55 | cg_proto->memory_allocated = &tcp->tcp_memory_allocated; | |
56 | cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; | |
57 | cg_proto->memcg = memcg; | |
58 | ||
6bc10349 | 59 | return 0; |
d1a4c0b3 GC |
60 | } |
61 | EXPORT_SYMBOL(tcp_init_cgroup); | |
62 | ||
1d62e436 | 63 | void tcp_destroy_cgroup(struct mem_cgroup *memcg) |
d1a4c0b3 | 64 | { |
d1a4c0b3 GC |
65 | struct cg_proto *cg_proto; |
66 | struct tcp_memcontrol *tcp; | |
3aaabe23 | 67 | u64 val; |
d1a4c0b3 GC |
68 | |
69 | cg_proto = tcp_prot.proto_cgroup(memcg); | |
70 | if (!cg_proto) | |
71 | return; | |
72 | ||
73 | tcp = tcp_from_cgproto(cg_proto); | |
74 | percpu_counter_destroy(&tcp->tcp_sockets_allocated); | |
3aaabe23 | 75 | |
1398eee0 | 76 | val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); |
d1a4c0b3 GC |
77 | } |
78 | EXPORT_SYMBOL(tcp_destroy_cgroup); | |
3aaabe23 GC |
79 | |
80 | static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |
81 | { | |
82 | struct net *net = current->nsproxy->net_ns; | |
83 | struct tcp_memcontrol *tcp; | |
84 | struct cg_proto *cg_proto; | |
85 | u64 old_lim; | |
86 | int i; | |
87 | int ret; | |
88 | ||
89 | cg_proto = tcp_prot.proto_cgroup(memcg); | |
90 | if (!cg_proto) | |
91 | return -EINVAL; | |
92 | ||
93 | if (val > RESOURCE_MAX) | |
94 | val = RESOURCE_MAX; | |
95 | ||
96 | tcp = tcp_from_cgproto(cg_proto); | |
97 | ||
98 | old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); | |
99 | ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); | |
100 | if (ret) | |
101 | return ret; | |
102 | ||
103 | for (i = 0; i < 3; i++) | |
104 | tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, | |
105 | net->ipv4.sysctl_tcp_mem[i]); | |
106 | ||
3f134619 GC |
107 | if (val == RESOURCE_MAX) |
108 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); | |
109 | else if (val != RESOURCE_MAX) { | |
110 | /* | |
111 | * The active bit needs to be written after the static_key | |
112 | * update. This is what guarantees that the socket activation | |
113 | * function is the last one to run. See sock_update_memcg() for | |
114 | * details, and note that we don't mark any socket as belonging | |
115 | * to this memcg until that flag is up. | |
116 | * | |
117 | * We need to do this, because static_keys will span multiple | |
118 | * sites, but we can't control their order. If we mark a socket | |
119 | * as accounted, but the accounting functions are not patched in | |
120 | * yet, we'll lose accounting. | |
121 | * | |
122 | * We never race with the readers in sock_update_memcg(), | |
123 | * because when this value change, the code to process it is not | |
124 | * patched in yet. | |
125 | * | |
126 | * The activated bit is used to guarantee that no two writers | |
127 | * will do the update in the same memcg. Without that, we can't | |
128 | * properly shutdown the static key. | |
129 | */ | |
130 | if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) | |
131 | static_key_slow_inc(&memcg_socket_limit_enabled); | |
132 | set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); | |
133 | } | |
3aaabe23 GC |
134 | |
135 | return 0; | |
136 | } | |
137 | ||
138 | static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, | |
139 | const char *buffer) | |
140 | { | |
141 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | |
142 | unsigned long long val; | |
143 | int ret = 0; | |
144 | ||
145 | switch (cft->private) { | |
146 | case RES_LIMIT: | |
147 | /* see memcontrol.c */ | |
148 | ret = res_counter_memparse_write_strategy(buffer, &val); | |
149 | if (ret) | |
150 | break; | |
151 | ret = tcp_update_limit(memcg, val); | |
152 | break; | |
153 | default: | |
154 | ret = -EINVAL; | |
155 | break; | |
156 | } | |
157 | return ret; | |
158 | } | |
159 | ||
160 | static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) | |
161 | { | |
162 | struct tcp_memcontrol *tcp; | |
163 | struct cg_proto *cg_proto; | |
164 | ||
165 | cg_proto = tcp_prot.proto_cgroup(memcg); | |
166 | if (!cg_proto) | |
167 | return default_val; | |
168 | ||
169 | tcp = tcp_from_cgproto(cg_proto); | |
170 | return res_counter_read_u64(&tcp->tcp_memory_allocated, type); | |
171 | } | |
172 | ||
5a6dd343 GC |
173 | static u64 tcp_read_usage(struct mem_cgroup *memcg) |
174 | { | |
175 | struct tcp_memcontrol *tcp; | |
176 | struct cg_proto *cg_proto; | |
177 | ||
178 | cg_proto = tcp_prot.proto_cgroup(memcg); | |
179 | if (!cg_proto) | |
180 | return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; | |
181 | ||
182 | tcp = tcp_from_cgproto(cg_proto); | |
183 | return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); | |
184 | } | |
185 | ||
3aaabe23 GC |
186 | static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) |
187 | { | |
188 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | |
189 | u64 val; | |
190 | ||
191 | switch (cft->private) { | |
192 | case RES_LIMIT: | |
193 | val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); | |
194 | break; | |
5a6dd343 GC |
195 | case RES_USAGE: |
196 | val = tcp_read_usage(memcg); | |
197 | break; | |
ffea59e5 | 198 | case RES_FAILCNT: |
0850f0f5 GC |
199 | case RES_MAX_USAGE: |
200 | val = tcp_read_stat(memcg, cft->private, 0); | |
ffea59e5 | 201 | break; |
3aaabe23 GC |
202 | default: |
203 | BUG(); | |
204 | } | |
205 | return val; | |
206 | } | |
207 | ||
ffea59e5 GC |
208 | static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) |
209 | { | |
210 | struct mem_cgroup *memcg; | |
211 | struct tcp_memcontrol *tcp; | |
212 | struct cg_proto *cg_proto; | |
213 | ||
214 | memcg = mem_cgroup_from_cont(cont); | |
215 | cg_proto = tcp_prot.proto_cgroup(memcg); | |
216 | if (!cg_proto) | |
217 | return 0; | |
218 | tcp = tcp_from_cgproto(cg_proto); | |
219 | ||
220 | switch (event) { | |
0850f0f5 GC |
221 | case RES_MAX_USAGE: |
222 | res_counter_reset_max(&tcp->tcp_memory_allocated); | |
223 | break; | |
ffea59e5 GC |
224 | case RES_FAILCNT: |
225 | res_counter_reset_failcnt(&tcp->tcp_memory_allocated); | |
226 | break; | |
227 | } | |
228 | ||
229 | return 0; | |
230 | } | |
231 | ||
3aaabe23 GC |
232 | unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) |
233 | { | |
234 | struct tcp_memcontrol *tcp; | |
235 | struct cg_proto *cg_proto; | |
236 | ||
237 | cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); | |
238 | if (!cg_proto) | |
239 | return 0; | |
240 | ||
241 | tcp = tcp_from_cgproto(cg_proto); | |
242 | return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); | |
243 | } | |
244 | ||
245 | void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) | |
246 | { | |
247 | struct tcp_memcontrol *tcp; | |
248 | struct cg_proto *cg_proto; | |
249 | ||
250 | cg_proto = tcp_prot.proto_cgroup(memcg); | |
251 | if (!cg_proto) | |
252 | return; | |
253 | ||
254 | tcp = tcp_from_cgproto(cg_proto); | |
255 | ||
256 | tcp->tcp_prot_mem[idx] = val; | |
257 | } | |
676f7c8f TH |
258 | |
259 | static struct cftype tcp_files[] = { | |
260 | { | |
261 | .name = "kmem.tcp.limit_in_bytes", | |
262 | .write_string = tcp_cgroup_write, | |
263 | .read_u64 = tcp_cgroup_read, | |
264 | .private = RES_LIMIT, | |
265 | }, | |
266 | { | |
267 | .name = "kmem.tcp.usage_in_bytes", | |
268 | .read_u64 = tcp_cgroup_read, | |
269 | .private = RES_USAGE, | |
270 | }, | |
271 | { | |
272 | .name = "kmem.tcp.failcnt", | |
273 | .private = RES_FAILCNT, | |
274 | .trigger = tcp_cgroup_reset, | |
275 | .read_u64 = tcp_cgroup_read, | |
276 | }, | |
277 | { | |
278 | .name = "kmem.tcp.max_usage_in_bytes", | |
279 | .private = RES_MAX_USAGE, | |
280 | .trigger = tcp_cgroup_reset, | |
281 | .read_u64 = tcp_cgroup_read, | |
282 | }, | |
6bc10349 | 283 | { } /* terminate */ |
676f7c8f | 284 | }; |
6bc10349 TH |
285 | |
286 | static int __init tcp_memcontrol_init(void) | |
287 | { | |
288 | WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); | |
289 | return 0; | |
290 | } | |
291 | __initcall(tcp_memcontrol_init); |