bpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT.
net/core/sock_reuseport.c
// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port. This allows a decision to be made after finding
 * the first socket. An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */

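/* Illustrative userspace-side sketch (not built as part of this file) of how
 * a reuseport group comes into being: every socket that enables SO_REUSEPORT
 * before bind()ing to the same address/port ends up in one group, managed
 * below via reuseport_alloc()/reuseport_add_sock(). The helper name and the
 * usual userspace headers (<sys/socket.h>, <netinet/in.h>, <unistd.h>) are
 * assumptions of the sketch.
 *
 *	int make_listener(const struct sockaddr_in *addr)
 *	{
 *		int one = 1;
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) ||
 *		    bind(fd, (const struct sockaddr *)addr, sizeof(*addr)) ||
 *		    listen(fd, SOMAXCONN)) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 *
 * Calling make_listener() N times with the same address yields N sockets in
 * one reuseport group; incoming connections are then spread across them by
 * reuseport_select_sock() below.
 */
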
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany);

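/* Layout of reuse->socks[]: live (listening) sockets are packed at the front
 * in [0, num_socks), while close()d/shutdown()ed TCP listeners kept around
 * for request/child migration are packed at the tail in
 * [max_socks - num_closed_socks, max_socks). reuseport_sock_index() searches
 * one of the two sections depending on @closed.
 */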
static int reuseport_sock_index(struct sock *sk,
				const struct sock_reuseport *reuse,
				bool closed)
{
	int left, right;

	if (!closed) {
		left = 0;
		right = reuse->num_socks;
	} else {
		left = reuse->max_socks - reuse->num_closed_socks;
		right = reuse->max_socks;
	}

	for (; left < right; left++)
		if (reuse->socks[left] == sk)
			return left;
	return -1;
}

static void __reuseport_add_sock(struct sock *sk,
				 struct sock_reuseport *reuse)
{
	reuse->socks[reuse->num_socks] = sk;
	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
	smp_wmb();
	reuse->num_socks++;
}

static bool __reuseport_detach_sock(struct sock *sk,
				    struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, false);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
	reuse->num_socks--;

	return true;
}

static void __reuseport_add_closed_sock(struct sock *sk,
					struct sock_reuseport *reuse)
{
	reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
}

static bool __reuseport_detach_closed_sock(struct sock *sk,
					   struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, true);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);

	return true;
}

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
	unsigned int size = sizeof(struct sock_reuseport) +
			    sizeof(struct sock *) * max_socks;
	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

	if (!reuse)
		return NULL;

	reuse->max_socks = max_socks;

	RCU_INIT_POINTER(reuse->prog, NULL);
	return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
	struct sock_reuseport *reuse;
	int id, ret = 0;

	/* bh lock used since this function may be called both from the
	 * receive path in softirq context (where it can precede the hlist
	 * lock) and from setsockopt() in process context.
	 */
	spin_lock_bh(&reuseport_lock);

	/* Allocation attempts can occur concurrently via the setsockopt path
	 * and the bind/hash path. Nothing to do when we lose the race.
	 */
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (reuse) {
		if (reuse->num_closed_socks) {
			/* sk was shutdown()ed before */
			ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
			goto out;
		}

		/* Only set reuse->bind_inany if bind_inany is true.
		 * Otherwise, we would overwrite the reuse->bind_inany
		 * that was already set by the bind/hash path.
		 */
		if (bind_inany)
			reuse->bind_inany = bind_inany;
		goto out;
	}

	reuse = __reuseport_alloc(INIT_SOCKS);
	if (!reuse) {
		ret = -ENOMEM;
		goto out;
	}

	id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
	if (id < 0) {
		kfree(reuse);
		ret = id;
		goto out;
	}

	reuse->reuseport_id = id;
	reuse->bind_inany = bind_inany;
	reuse->socks[0] = sk;
	reuse->num_socks = 1;
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
	spin_unlock_bh(&reuseport_lock);

	return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

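/* Double the socks[] array. The group size is capped at U16_MAX entries;
 * once the cap is reached, room is made by dropping one closed socket if
 * any exists, otherwise the grow attempt fails.
 */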
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
	struct sock_reuseport *more_reuse;
	u32 more_socks_size, i;

	more_socks_size = reuse->max_socks * 2U;
	if (more_socks_size > U16_MAX) {
		if (reuse->num_closed_socks) {
			/* Make room by removing a closed sk.
			 * The child has already been migrated.
			 * Only reqsk left at this point.
			 */
			struct sock *sk;

			sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
			RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
			__reuseport_detach_closed_sock(sk, reuse);

			return reuse;
		}

		return NULL;
	}

	more_reuse = __reuseport_alloc(more_socks_size);
	if (!more_reuse)
		return NULL;

	more_reuse->num_socks = reuse->num_socks;
	more_reuse->num_closed_socks = reuse->num_closed_socks;
	more_reuse->prog = reuse->prog;
	more_reuse->reuseport_id = reuse->reuseport_id;
	more_reuse->bind_inany = reuse->bind_inany;
	more_reuse->has_conns = reuse->has_conns;

	memcpy(more_reuse->socks, reuse->socks,
	       reuse->num_socks * sizeof(struct sock *));
	memcpy(more_reuse->socks +
	       (more_reuse->max_socks - more_reuse->num_closed_socks),
	       reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
	       reuse->num_closed_socks * sizeof(struct sock *));
	more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

	for (i = 0; i < reuse->max_socks; ++i)
		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
				   more_reuse);

	/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
	 * that reuse and more_reuse can temporarily share a reference
	 * to prog.
	 */
	kfree_rcu(reuse, rcu);
	return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
	struct sock_reuseport *reuse;

	reuse = container_of(head, struct sock_reuseport, rcu);
	sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
	ida_free(&reuseport_ida, reuse->reuseport_id);
	kfree(reuse);
}

/**
 * reuseport_add_sock - Add a socket to the reuseport group of another.
 * @sk: New socket to add to the group.
 * @sk2: Socket belonging to the existing reuseport group.
 * @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 * May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
	struct sock_reuseport *old_reuse, *reuse;

	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
		int err = reuseport_alloc(sk2, bind_inany);

		if (err)
			return err;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					      lockdep_is_held(&reuseport_lock));
	if (old_reuse && old_reuse->num_closed_socks) {
		/* sk was shutdown()ed before */
		int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);

		spin_unlock_bh(&reuseport_lock);
		return err;
	}

	if (old_reuse && old_reuse->num_socks != 1) {
		spin_unlock_bh(&reuseport_lock);
		return -EBUSY;
	}

	if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
		reuse = reuseport_grow(reuse);
		if (!reuse) {
			spin_unlock_bh(&reuseport_lock);
			return -ENOMEM;
		}
	}

	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	spin_unlock_bh(&reuseport_lock);

	if (old_reuse)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
	return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);

static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany)
{
	if (old_reuse == reuse) {
		/* If sk was in the same reuseport group, just pop sk out of
		 * the closed section and push sk into the listening section.
		 */
		__reuseport_detach_closed_sock(sk, old_reuse);
		__reuseport_add_sock(sk, old_reuse);
		return 0;
	}

	if (!reuse) {
		/* In bind()/listen() path, we cannot carry over the eBPF prog
		 * for the shutdown()ed socket. In setsockopt() path, we should
		 * not change the eBPF prog of listening sockets by attaching a
		 * prog to the shutdown()ed socket. Thus, we will allocate a new
		 * reuseport group and detach sk from the old group.
		 */
		int id;

		reuse = __reuseport_alloc(INIT_SOCKS);
		if (!reuse)
			return -ENOMEM;

		id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
		if (id < 0) {
			kfree(reuse);
			return id;
		}

		reuse->reuseport_id = id;
		reuse->bind_inany = bind_inany;
	} else {
		/* Move sk from the old group to the new one if
		 * - all the other listeners in the old group were close()d or
		 *   shutdown()ed, and then sk2 has listen()ed on the same port
		 * OR
		 * - sk listen()ed without bind() (or with autobind), was
		 *   shutdown()ed, and then listen()s on another port which
		 *   sk2 listen()s on.
		 */
		if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
			reuse = reuseport_grow(reuse);
			if (!reuse)
				return -ENOMEM;
		}
	}

	__reuseport_detach_closed_sock(sk, old_reuse);
	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);

	return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
	struct sock_reuseport *reuse;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuseport_grow() has detached a closed sk */
	if (!reuse)
		goto out;

	/* Notify the bpf side. The sk may be added to a sockarray
	 * map. If so, sockarray logic will remove it from the map.
	 *
	 * Other bpf map types that work with reuseport, like sockmap,
	 * don't need an explicit callback from here. They override sk
	 * unhash/close ops to remove the sk from the map before we
	 * get to this point.
	 */
	bpf_sk_reuseport_detach(sk);

	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

	if (!__reuseport_detach_closed_sock(sk, reuse))
		__reuseport_detach_sock(sk, reuse);

	if (reuse->num_socks + reuse->num_closed_socks == 0)
		call_rcu(&reuse->rcu, reuseport_free_rcu);

out:
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

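/* Called when a TCP listener stops listening (close()/shutdown()). With the
 * net.ipv4.tcp_migrate_req sysctl enabled, the socket is kept in the group's
 * closed section so that its pending requests and children can later be
 * handed to another listener via reuseport_migrate_sock(); otherwise it is
 * detached from the group right away.
 */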
void reuseport_stop_listen_sock(struct sock *sk)
{
	if (sk->sk_protocol == IPPROTO_TCP) {
		struct sock_reuseport *reuse;

		spin_lock_bh(&reuseport_lock);

		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
						  lockdep_is_held(&reuseport_lock));

		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) {
			/* Migration capable, move sk from the listening section
			 * to the closed section.
			 */
			bpf_sk_reuseport_detach(sk);

			__reuseport_detach_sock(sk, reuse);
			__reuseport_add_closed_sock(sk, reuse);

			spin_unlock_bh(&reuseport_lock);
			return;
		}

		spin_unlock_bh(&reuseport_lock);
	}

	/* Not capable of migration; detach immediately */
	reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);

static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
				   struct bpf_prog *prog, struct sk_buff *skb,
				   int hdr_len)
{
	struct sk_buff *nskb = NULL;
	u32 index;

	if (skb_shared(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return NULL;
		skb = nskb;
	}

	/* temporarily advance data past protocol header */
	if (!pskb_pull(skb, hdr_len)) {
		kfree_skb(nskb);
		return NULL;
	}
	index = bpf_prog_run_save_cb(prog, skb);
	__skb_push(skb, hdr_len);

	consume_skb(nskb);

	if (index >= socks)
		return NULL;

	return reuse->socks[index];
}
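
/* Illustrative userspace-side sketch (not built as part of this file) of the
 * classic-BPF path handled by run_bpf_filter() above: a cBPF program attached
 * with SO_ATTACH_REUSEPORT_CBPF returns an index into the group's socket
 * array. The example below steers by the current CPU; the helper name is an
 * assumption and it expects <linux/filter.h> plus one group socket per CPU.
 *
 *	static int attach_cpu_balancer(int fd)
 *	{
 *		struct sock_filter code[] = {
 *			// A = current CPU number (ancillary load)
 *			{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
 *			// return A as the socket index
 *			{ BPF_RET | BPF_A, 0, 0, 0 },
 *		};
 *		struct sock_fprog prog = {
 *			.len = sizeof(code) / sizeof(code[0]),
 *			.filter = code,
 *		};
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
 *				  &prog, sizeof(prog));
 *	}
 *
 * If the program returns an out-of-range index, run_bpf_filter() returns NULL
 * and reuseport_select_sock() falls back to hash-based selection.
 */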
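
/* Hash-based fallback: reciprocal_scale() maps @hash into [0, num_socks) and
 * the loop probes forward from there, skipping sockets already in
 * TCP_ESTABLISHED (e.g. connected UDP sockets), giving up after a full pass.
 */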
437static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
438 u32 hash, u16 num_socks)
439{
440 int i, j;
441
442 i = j = reciprocal_scale(hash, num_socks);
443 while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
444 i++;
445 if (i >= num_socks)
446 i = 0;
447 if (i == j)
448 return NULL;
449 }
450
451 return reuse->socks[i];
452}

/**
 * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 * @sk: First socket in the group.
 * @hash: When no BPF filter is available, use this hash to select.
 * @skb: skb to run through BPF filter.
 * @hdr_len: BPF filter expects skb data pointer at payload data. If
 * the skb does not yet point at the payload, this parameter represents
 * how far the pointer needs to advance to reach the payload.
 * Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
				   u32 hash,
				   struct sk_buff *skb,
				   int hdr_len)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *prog;
	struct sock *sk2 = NULL;
	u16 socks;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);

	/* if memory allocation failed or add call is not yet complete */
	if (!reuse)
		goto out;

	prog = rcu_dereference(reuse->prog);
	socks = READ_ONCE(reuse->num_socks);
	if (likely(socks)) {
		/* paired with smp_wmb() in __reuseport_add_sock() */
		smp_rmb();

		if (!prog || !skb)
			goto select_by_hash;

		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
		else
			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2)
			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
	}

out:
	rcu_read_unlock();
	return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);
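
/* Typical caller, sketched (the exact call sites live in the protocol lookup
 * paths and are an assumption here): when a lookup lands on a socket with
 * SO_REUSEPORT set, it does roughly
 *
 *	sk2 = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr));
 *
 * where @hash is the flow hash and @hdr_len the transport header length, so
 * that an attached classic BPF program sees skb data at the payload.
 */
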
/**
 * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
 * @sk: close()ed or shutdown()ed socket in the group.
 * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 * NEW_SYN_RECV request socket during 3WHS.
 * @skb: skb to run through BPF filter.
 * Returns a socket (with sk_refcnt +1) that should accept the child socket
 * (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb)
{
	struct sock_reuseport *reuse;
	struct sock *nsk = NULL;
	u16 socks;
	u32 hash;

	rcu_read_lock();

	reuse = rcu_dereference(sk->sk_reuseport_cb);
	if (!reuse)
		goto out;

	socks = READ_ONCE(reuse->num_socks);
	if (unlikely(!socks))
		goto out;

	/* paired with smp_wmb() in __reuseport_add_sock() */
	smp_rmb();

	hash = migrating_sk->sk_hash;
	if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

	if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
		nsk = NULL;

out:
	rcu_read_unlock();
	return nsk;
}
EXPORT_SYMBOL(reuseport_migrate_sock);

int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	if (sk_unhashed(sk)) {
		int err;

		if (!sk->sk_reuseport)
			return -EINVAL;

		err = reuseport_alloc(sk, false);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_prog = rcu_dereference_protected(reuse->prog,
					     lockdep_is_held(&reuseport_lock));
	rcu_assign_pointer(reuse->prog, prog);
	spin_unlock_bh(&reuseport_lock);

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
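
/* Illustrative BPF-side sketch (not built as part of this file) of what
 * reuseport_attach_prog() wires up when userspace attaches an eBPF program
 * with SO_ATTACH_REUSEPORT_EBPF. A BPF_PROG_TYPE_SK_REUSEPORT program picks
 * the target socket out of a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY via
 * bpf_sk_select_reuseport(); per the commit this file belongs to, it may
 * also call bpf_get_socket_cookie() on its context. Assumes libbpf-style
 * SEC() annotations, <linux/bpf.h>/<bpf/bpf_helpers.h>, and a map named
 * reuseport_array; the index policy below is illustrative only.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
 *		__uint(max_entries, 16);
 *		__type(key, __u32);
 *		__type(value, __u64);
 *	} reuseport_array SEC(".maps");
 *
 *	SEC("sk_reuseport")
 *	int select_sock(struct sk_reuseport_md *reuse_md)
 *	{
 *		__u64 cookie = bpf_get_socket_cookie(reuse_md);
 *		__u32 index = (__u32)cookie % 16;
 *
 *		if (bpf_sk_select_reuseport(reuse_md, &reuseport_array,
 *					    &index, 0))
 *			return SK_DROP;
 *		return SK_PASS;
 *	}
 *
 * The program is detached again with SO_DETACH_REUSEPORT_BPF, which lands in
 * reuseport_detach_prog() below.
 */
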
int reuseport_detach_prog(struct sock *sk)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	old_prog = NULL;
	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuse must be checked after acquiring the reuseport_lock
	 * because reuseport_grow() can detach a closed sk.
	 */
	if (!reuse) {
		spin_unlock_bh(&reuseport_lock);
		return sk->sk_reuseport ? -ENOENT : -EINVAL;
	}

	if (sk_unhashed(sk) && reuse->num_closed_socks) {
		spin_unlock_bh(&reuseport_lock);
		return -ENOENT;
	}

	old_prog = rcu_replace_pointer(reuse->prog, old_prog,
				       lockdep_is_held(&reuseport_lock));
	spin_unlock_bh(&reuseport_lock);

	if (!old_prog)
		return -ENOENT;

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);