Commit | Line | Data |
---|---|---|
5dc4c4b7 MKL |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Copyright (c) 2018 Facebook | |
4 | */ | |
5 | #include <linux/bpf.h> | |
6 | #include <linux/err.h> | |
7 | #include <linux/sock_diag.h> | |
8 | #include <net/sock_reuseport.h> | |
9 | ||
10 | struct reuseport_array { | |
11 | struct bpf_map map; | |
12 | struct sock __rcu *ptrs[]; | |
13 | }; | |
14 | ||
/* Reinterpret a generic map handle as the reuseport_array it belongs to. */
static struct reuseport_array *reuseport_array(struct bpf_map *map)
{
	struct reuseport_array *array = (struct reuseport_array *)map;

	return array;
}
19 | ||
20 | /* The caller must hold the reuseport_lock */ | |
21 | void bpf_sk_reuseport_detach(struct sock *sk) | |
22 | { | |
23 | struct sock __rcu **socks; | |
24 | ||
25 | write_lock_bh(&sk->sk_callback_lock); | |
26 | socks = sk->sk_user_data; | |
27 | if (socks) { | |
28 | WRITE_ONCE(sk->sk_user_data, NULL); | |
29 | /* | |
30 | * Do not move this NULL assignment outside of | |
31 | * sk->sk_callback_lock because there is | |
32 | * a race with reuseport_array_free() | |
33 | * which does not hold the reuseport_lock. | |
34 | */ | |
35 | RCU_INIT_POINTER(*socks, NULL); | |
36 | } | |
37 | write_unlock_bh(&sk->sk_callback_lock); | |
38 | } | |
39 | ||
40 | static int reuseport_array_alloc_check(union bpf_attr *attr) | |
41 | { | |
42 | if (attr->value_size != sizeof(u32) && | |
43 | attr->value_size != sizeof(u64)) | |
44 | return -EINVAL; | |
45 | ||
46 | return array_map_alloc_check(attr); | |
47 | } | |
48 | ||
49 | static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) | |
50 | { | |
51 | struct reuseport_array *array = reuseport_array(map); | |
52 | u32 index = *(u32 *)key; | |
53 | ||
54 | if (unlikely(index >= array->map.max_entries)) | |
55 | return NULL; | |
56 | ||
57 | return rcu_dereference(array->ptrs[index]); | |
58 | } | |
59 | ||
60 | /* Called from syscall only */ | |
61 | static int reuseport_array_delete_elem(struct bpf_map *map, void *key) | |
62 | { | |
63 | struct reuseport_array *array = reuseport_array(map); | |
64 | u32 index = *(u32 *)key; | |
65 | struct sock *sk; | |
66 | int err; | |
67 | ||
68 | if (index >= map->max_entries) | |
69 | return -E2BIG; | |
70 | ||
71 | if (!rcu_access_pointer(array->ptrs[index])) | |
72 | return -ENOENT; | |
73 | ||
74 | spin_lock_bh(&reuseport_lock); | |
75 | ||
76 | sk = rcu_dereference_protected(array->ptrs[index], | |
77 | lockdep_is_held(&reuseport_lock)); | |
78 | if (sk) { | |
79 | write_lock_bh(&sk->sk_callback_lock); | |
80 | WRITE_ONCE(sk->sk_user_data, NULL); | |
81 | RCU_INIT_POINTER(array->ptrs[index], NULL); | |
82 | write_unlock_bh(&sk->sk_callback_lock); | |
83 | err = 0; | |
84 | } else { | |
85 | err = -ENOENT; | |
86 | } | |
87 | ||
88 | spin_unlock_bh(&reuseport_lock); | |
89 | ||
90 | return err; | |
91 | } | |
92 | ||
93 | static void reuseport_array_free(struct bpf_map *map) | |
94 | { | |
95 | struct reuseport_array *array = reuseport_array(map); | |
96 | struct sock *sk; | |
97 | u32 i; | |
98 | ||
99 | synchronize_rcu(); | |
100 | ||
101 | /* | |
102 | * ops->map_*_elem() will not be able to access this | |
103 | * array now. Hence, this function only races with | |
104 | * bpf_sk_reuseport_detach() which was triggerred by | |
105 | * close() or disconnect(). | |
106 | * | |
107 | * This function and bpf_sk_reuseport_detach() are | |
108 | * both removing sk from "array". Who removes it | |
109 | * first does not matter. | |
110 | * | |
111 | * The only concern here is bpf_sk_reuseport_detach() | |
112 | * may access "array" which is being freed here. | |
113 | * bpf_sk_reuseport_detach() access this "array" | |
114 | * through sk->sk_user_data _and_ with sk->sk_callback_lock | |
115 | * held which is enough because this "array" is not freed | |
116 | * until all sk->sk_user_data has stopped referencing this "array". | |
117 | * | |
118 | * Hence, due to the above, taking "reuseport_lock" is not | |
119 | * needed here. | |
120 | */ | |
121 | ||
122 | /* | |
123 | * Since reuseport_lock is not taken, sk is accessed under | |
124 | * rcu_read_lock() | |
125 | */ | |
126 | rcu_read_lock(); | |
127 | for (i = 0; i < map->max_entries; i++) { | |
128 | sk = rcu_dereference(array->ptrs[i]); | |
129 | if (sk) { | |
130 | write_lock_bh(&sk->sk_callback_lock); | |
131 | /* | |
132 | * No need for WRITE_ONCE(). At this point, | |
133 | * no one is reading it without taking the | |
134 | * sk->sk_callback_lock. | |
135 | */ | |
136 | sk->sk_user_data = NULL; | |
137 | write_unlock_bh(&sk->sk_callback_lock); | |
138 | RCU_INIT_POINTER(array->ptrs[i], NULL); | |
139 | } | |
140 | } | |
141 | rcu_read_unlock(); | |
142 | ||
143 | /* | |
144 | * Once reaching here, all sk->sk_user_data is not | |
145 | * referenceing this "array". "array" can be freed now. | |
146 | */ | |
147 | bpf_map_area_free(array); | |
148 | } | |
149 | ||
150 | static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) | |
151 | { | |
152 | int err, numa_node = bpf_map_attr_numa_node(attr); | |
153 | struct reuseport_array *array; | |
b936ca64 | 154 | struct bpf_map_memory mem; |
c85d6913 | 155 | u64 array_size; |
5dc4c4b7 MKL |
156 | |
157 | if (!capable(CAP_SYS_ADMIN)) | |
158 | return ERR_PTR(-EPERM); | |
159 | ||
160 | array_size = sizeof(*array); | |
161 | array_size += (u64)attr->max_entries * sizeof(struct sock *); | |
162 | ||
c85d6913 | 163 | err = bpf_map_charge_init(&mem, array_size); |
5dc4c4b7 MKL |
164 | if (err) |
165 | return ERR_PTR(err); | |
166 | ||
167 | /* allocate all map elements and zero-initialize them */ | |
168 | array = bpf_map_area_alloc(array_size, numa_node); | |
b936ca64 RG |
169 | if (!array) { |
170 | bpf_map_charge_finish(&mem); | |
5dc4c4b7 | 171 | return ERR_PTR(-ENOMEM); |
b936ca64 | 172 | } |
5dc4c4b7 MKL |
173 | |
174 | /* copy mandatory map attributes */ | |
175 | bpf_map_init_from_attr(&array->map, attr); | |
b936ca64 | 176 | bpf_map_charge_move(&array->map.memory, &mem); |
5dc4c4b7 MKL |
177 | |
178 | return &array->map; | |
179 | } | |
180 | ||
181 | int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, | |
182 | void *value) | |
183 | { | |
184 | struct sock *sk; | |
185 | int err; | |
186 | ||
187 | if (map->value_size != sizeof(u64)) | |
188 | return -ENOSPC; | |
189 | ||
190 | rcu_read_lock(); | |
191 | sk = reuseport_array_lookup_elem(map, key); | |
192 | if (sk) { | |
193 | *(u64 *)value = sock_gen_cookie(sk); | |
194 | err = 0; | |
195 | } else { | |
196 | err = -ENOENT; | |
197 | } | |
198 | rcu_read_unlock(); | |
199 | ||
200 | return err; | |
201 | } | |
202 | ||
203 | static int | |
204 | reuseport_array_update_check(const struct reuseport_array *array, | |
205 | const struct sock *nsk, | |
206 | const struct sock *osk, | |
207 | const struct sock_reuseport *nsk_reuse, | |
208 | u32 map_flags) | |
209 | { | |
210 | if (osk && map_flags == BPF_NOEXIST) | |
211 | return -EEXIST; | |
212 | ||
213 | if (!osk && map_flags == BPF_EXIST) | |
214 | return -ENOENT; | |
215 | ||
216 | if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) | |
217 | return -ENOTSUPP; | |
218 | ||
219 | if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) | |
220 | return -ENOTSUPP; | |
221 | ||
222 | if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) | |
223 | return -ENOTSUPP; | |
224 | ||
225 | /* | |
226 | * sk must be hashed (i.e. listening in the TCP case or binded | |
227 | * in the UDP case) and | |
228 | * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). | |
229 | * | |
230 | * Also, sk will be used in bpf helper that is protected by | |
231 | * rcu_read_lock(). | |
232 | */ | |
233 | if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) | |
234 | return -EINVAL; | |
235 | ||
236 | /* READ_ONCE because the sk->sk_callback_lock may not be held here */ | |
237 | if (READ_ONCE(nsk->sk_user_data)) | |
238 | return -EBUSY; | |
239 | ||
240 | return 0; | |
241 | } | |
242 | ||
243 | /* | |
244 | * Called from syscall only. | |
245 | * The "nsk" in the fd refcnt. | |
246 | * The "osk" and "reuse" are protected by reuseport_lock. | |
247 | */ | |
248 | int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, | |
249 | void *value, u64 map_flags) | |
250 | { | |
251 | struct reuseport_array *array = reuseport_array(map); | |
252 | struct sock *free_osk = NULL, *osk, *nsk; | |
253 | struct sock_reuseport *reuse; | |
254 | u32 index = *(u32 *)key; | |
255 | struct socket *socket; | |
256 | int err, fd; | |
257 | ||
258 | if (map_flags > BPF_EXIST) | |
259 | return -EINVAL; | |
260 | ||
261 | if (index >= map->max_entries) | |
262 | return -E2BIG; | |
263 | ||
264 | if (map->value_size == sizeof(u64)) { | |
265 | u64 fd64 = *(u64 *)value; | |
266 | ||
267 | if (fd64 > S32_MAX) | |
268 | return -EINVAL; | |
269 | fd = fd64; | |
270 | } else { | |
271 | fd = *(int *)value; | |
272 | } | |
273 | ||
274 | socket = sockfd_lookup(fd, &err); | |
275 | if (!socket) | |
276 | return err; | |
277 | ||
278 | nsk = socket->sk; | |
279 | if (!nsk) { | |
280 | err = -EINVAL; | |
281 | goto put_file; | |
282 | } | |
283 | ||
284 | /* Quick checks before taking reuseport_lock */ | |
285 | err = reuseport_array_update_check(array, nsk, | |
286 | rcu_access_pointer(array->ptrs[index]), | |
287 | rcu_access_pointer(nsk->sk_reuseport_cb), | |
288 | map_flags); | |
289 | if (err) | |
290 | goto put_file; | |
291 | ||
292 | spin_lock_bh(&reuseport_lock); | |
293 | /* | |
294 | * Some of the checks only need reuseport_lock | |
295 | * but it is done under sk_callback_lock also | |
296 | * for simplicity reason. | |
297 | */ | |
298 | write_lock_bh(&nsk->sk_callback_lock); | |
299 | ||
300 | osk = rcu_dereference_protected(array->ptrs[index], | |
301 | lockdep_is_held(&reuseport_lock)); | |
302 | reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, | |
303 | lockdep_is_held(&reuseport_lock)); | |
304 | err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); | |
305 | if (err) | |
306 | goto put_file_unlock; | |
307 | ||
308 | /* Ensure reuse->reuseport_id is set */ | |
309 | err = reuseport_get_id(reuse); | |
310 | if (err < 0) | |
311 | goto put_file_unlock; | |
312 | ||
313 | WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); | |
314 | rcu_assign_pointer(array->ptrs[index], nsk); | |
315 | free_osk = osk; | |
316 | err = 0; | |
317 | ||
318 | put_file_unlock: | |
319 | write_unlock_bh(&nsk->sk_callback_lock); | |
320 | ||
321 | if (free_osk) { | |
322 | write_lock_bh(&free_osk->sk_callback_lock); | |
323 | WRITE_ONCE(free_osk->sk_user_data, NULL); | |
324 | write_unlock_bh(&free_osk->sk_callback_lock); | |
325 | } | |
326 | ||
327 | spin_unlock_bh(&reuseport_lock); | |
328 | put_file: | |
329 | fput(socket->file); | |
330 | return err; | |
331 | } | |
332 | ||
333 | /* Called from syscall */ | |
334 | static int reuseport_array_get_next_key(struct bpf_map *map, void *key, | |
335 | void *next_key) | |
336 | { | |
337 | struct reuseport_array *array = reuseport_array(map); | |
338 | u32 index = key ? *(u32 *)key : U32_MAX; | |
339 | u32 *next = (u32 *)next_key; | |
340 | ||
341 | if (index >= array->map.max_entries) { | |
342 | *next = 0; | |
343 | return 0; | |
344 | } | |
345 | ||
346 | if (index == array->map.max_entries - 1) | |
347 | return -ENOENT; | |
348 | ||
349 | *next = index + 1; | |
350 | return 0; | |
351 | } | |
352 | ||
353 | const struct bpf_map_ops reuseport_array_ops = { | |
354 | .map_alloc_check = reuseport_array_alloc_check, | |
355 | .map_alloc = reuseport_array_alloc, | |
356 | .map_free = reuseport_array_free, | |
357 | .map_lookup_elem = reuseport_array_lookup_elem, | |
358 | .map_get_next_key = reuseport_array_get_next_key, | |
359 | .map_delete_elem = reuseport_array_delete_elem, | |
360 | }; |