// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/hotdata.h>
#include <net/ip.h>
#include <net/rps.h>
#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
#include <net/tcp.h>

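/* Hash the connection 4-tuple (laddr, lport, faddr, fport) into the ehash
 * table. The boot-time random inet_ehash_secret plus the per-netns
 * net_hash_mix() salt keep bucket placement hard to predict from outside.
 */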
u32 inet_ehashfn(const struct net *net, const __be32 laddr,
		 const __u16 lport, const __be32 faddr,
		 const __be16 fport)
{
	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return lport + __inet_ehashfn(laddr, 0, faddr, fport,
				      inet_ehash_secret + net_hash_mix(net));
}
EXPORT_SYMBOL_GPL(inet_ehashfn);

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev = l3mdev;
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->bhash2);
		hlist_add_head_rcu(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->bhash2)) {
		hlist_del_rcu(&tb->node);
		kfree_rcu(tb, rcu);
	}
}

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
			    unsigned short port, int l3mdev)
{
	return net_eq(ib_net(tb), net) && tb->port == port &&
		tb->l3mdev == l3mdev;
}

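/* A bhash bucket (inet_bind_bucket) is keyed by port (and l3mdev) only,
 * while a bhash2 bucket (inet_bind2_bucket) also records the bound local
 * address; every bhash2 bucket hangs off the bhash bucket for its port.
 */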
static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
				   struct net *net,
				   struct inet_bind_hashbucket *head,
				   struct inet_bind_bucket *tb,
				   const struct sock *sk)
{
	write_pnet(&tb2->ib_net, net);
	tb2->l3mdev = tb->l3mdev;
	tb2->port = tb->port;
#if IS_ENABLED(CONFIG_IPV6)
	BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
	if (sk->sk_family == AF_INET6) {
		tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
		tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
	} else {
		tb2->addr_type = IPV6_ADDR_MAPPED;
		ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
	}
#else
	tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif
	INIT_HLIST_HEAD(&tb2->owners);
	hlist_add_head(&tb2->node, &head->chain);
	hlist_add_head(&tb2->bhash_node, &tb->bhash2);
}

struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
						   struct net *net,
						   struct inet_bind_hashbucket *head,
						   struct inet_bind_bucket *tb,
						   const struct sock *sk)
{
	struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb2)
		inet_bind2_bucket_init(tb2, net, head, tb, sk);

	return tb2;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		__hlist_del(&tb->bhash_node);
		kmem_cache_free(cachep, tb);
	}
}

static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
					 const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);

	if (tb2->addr_type != IPV6_ADDR_MAPPED)
		return false;
#endif
	return tb2->rcv_saddr == sk->sk_rcv_saddr;
}

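/* Record both bind buckets in the socket and link it onto the bhash2
 * owners list. Callers hold the corresponding bind hashbucket locks.
 */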
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port)
{
	inet_sk(sk)->inet_num = port;
	inet_csk(sk)->icsk_bind_hash = tb;
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	int bhash;

	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
	head = &hashinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;

	spin_lock(&head2->lock);
	if (inet_csk(sk)->icsk_bind2_hash) {
		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

		__sk_del_bind_node(sk);
		inet_csk(sk)->icsk_bind2_hash = NULL;
		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
	}
	spin_unlock(&head2->lock);

	inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

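/* Make an accepted child socket share the listener's local port by taking
 * a reference on the listener's bind bucket(s), looking them up or
 * creating them when tproxy redirected the flow to a different port.
 */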
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = tcp_get_hashinfo(sk);
	unsigned short port = inet_sk(child)->inet_num;
	struct inet_bind_hashbucket *head, *head2;
	bool created_inet_bind_bucket = false;
	struct net *net = sock_net(sk);
	bool update_fastreuse = false;
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	int bhash, l3mdev;

	bhash = inet_bhashfn(net, port, table->bhash_size);
	head = &table->bhash[bhash];
	head2 = inet_bhashfn_portaddr(table, child, net, port);

	spin_lock(&head->lock);
	spin_lock(&head2->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	tb2 = inet_csk(sk)->icsk_bind2_hash;
	if (unlikely(!tb || !tb2)) {
		spin_unlock(&head2->lock);
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev))
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     net, head, port, l3mdev);
			if (!tb) {
				spin_unlock(&head2->lock);
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
			created_inet_bind_bucket = true;
		}
		update_fastreuse = true;

		goto bhash2_find;
	} else if (!inet_bind2_bucket_addr_match(tb2, child)) {
		l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
		if (!tb2) {
			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
						       net, head2, tb, child);
			if (!tb2)
				goto error;
		}
	}
	if (update_fastreuse)
		inet_csk_update_fastreuse(tb, child);
	inet_bind_hash(child, tb, tb2, port);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	return 0;

error:
	if (created_inet_bind_bucket)
		inet_bind_bucket_destroy(tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

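/* Score a candidate listener for an incoming packet. A socket bound to a
 * specific device outranks an unbound one, a true AF_INET socket gets a
 * point over a v4-mapped AF_INET6 one, and a socket whose last incoming
 * CPU matches the current CPU gets another. A negative score means no
 * match; note that rcv_saddr must equal daddr exactly, so wildcard binds
 * only match in the caller's INADDR_ANY pass.
 */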
static inline int compute_score(struct sock *sk, const struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
	    !ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score = sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}

/**
 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
 * @net: network namespace.
 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
 *         the selected sock or an error.
 */
struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
				   struct sk_buff *skb, int doff,
				   __be32 saddr, __be16 sport,
				   __be32 daddr, unsigned short hnum,
				   inet_ehashfn_t *ehashfn)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
					net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}
EXPORT_SYMBOL_GPL(inet_lookup_reuseport);

/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(const struct net *net,
				       struct inet_listen_hashbucket *ilb2,
				       struct sk_buff *skb, int doff,
				       const __be32 saddr, __be16 sport,
				       const __be32 daddr, const unsigned short hnum,
				       const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = inet_lookup_reuseport(net, sk, skb, doff,
						       saddr, sport, daddr, hnum, inet_ehashfn);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

struct sock *inet_lookup_run_sk_lookup(const struct net *net,
				       int protocol,
				       struct sk_buff *skb, int doff,
				       __be32 saddr, __be16 sport,
				       __be32 daddr, u16 hnum, const int dif,
				       inet_ehashfn_t *ehashfn)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
					 ehashfn);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

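/* Listener lookup runs in two passes: first the lhash2 bucket hashed on
 * the exact destination address, then the INADDR_ANY bucket for wildcard
 * binds. A BPF sk_lookup program, if attached, may short-circuit both.
 */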
struct sock *__inet_lookup_listener(const struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
	    hashinfo == net->ipv4.tcp_death_row.hashinfo) {
		result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
						   saddr, sport, daddr, hnum, dif,
						   inet_ehashfn);
		if (result)
			goto done;
	}

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

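/* Lockless established-socket lookup, called under rcu_read_lock(). A hit
 * is revalidated with inet_match() after refcount_inc_not_zero(), and the
 * nulls value closing the chain is compared against the slot so that a
 * socket moved to another chain mid-walk restarts the search.
 *
 * A typical caller looks roughly like this (sketch only; real callers go
 * through wrappers such as __inet_lookup_skb()):
 *
 *	sk = __inet_lookup_established(net, hashinfo, saddr, th->source,
 *				       daddr, ntohs(th->dest), dif, sdif);
 */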
struct sock *__inet_lookup_established(const struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!inet_match(net, sk, acookie,
						 ports, dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

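/* Verify that the chosen 4-tuple is not already used by an established
 * socket; a TIME_WAIT occupant may be recycled when tcp_twsk_unique()
 * allows it. On success the socket is inserted into ehash under the
 * bucket lock, so no concurrent port selection can race with it.
 */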
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp,
				    bool rcu_lookup,
				    u32 hash)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	struct inet_timewait_sock *tw = NULL;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	spinlock_t *lock;

	if (rcu_lookup) {
		sk_nulls_for_each(sk2, node, &head->chain) {
			if (sk2->sk_hash != hash ||
			    !inet_match(net, sk2, acookie, ports, dif, sdif))
				continue;
			if (sk2->sk_state == TCP_TIME_WAIT)
				break;
			return -EADDRNOTAVAIL;
		}
		return 0;
	}

	lock = inet_ehash_lockp(hinfo, hash);
	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (sk->sk_protocol == IPPROTO_TCP &&
				    tcp_twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static u64 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
				    struct hlist_nulls_head *list)
{
	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
	const int sdif = sk->sk_bound_dev_if;
	const int dif = sk->sk_bound_dev_if;
	const struct hlist_nulls_node *node;
	struct net *net = sock_net(sk);
	struct sock *esk;

	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

	sk_nulls_for_each_rcu(esk, node, list) {
		if (esk->sk_hash != sk->sk_hash)
			continue;
		if (sk->sk_family == AF_INET) {
			if (unlikely(inet_match(net, esk, acookie,
						ports, dif, sdif))) {
				return true;
			}
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			if (unlikely(inet6_match(net, esk,
						 &sk->sk_v6_daddr,
						 &sk->sk_v6_rcv_saddr,
						 ports, dif, sdif))) {
				return true;
			}
		}
#endif
	}
	return false;
}

/* Insert a socket into ehash, and possibly remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket).
 * If a matching socket already exists, sk is not inserted and
 * *found_dup_sk is set to true.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_ehash_bucket *head;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	} else if (found_dup_sk) {
		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
		if (*found_dup_sk)
			ret = false;
	}

	if (ret)
		__sk_nulls_add_node_rcu(sk, list);

	spin_unlock(lock);

	return ret;
}

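/* Like inet_ehash_insert(), but when the insert fails because a duplicate
 * is already hashed, the socket is marked dead and destroyed instead of
 * being left unhashed.
 */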
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		this_cpu_inc(*sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_IPV6_MOD(inet_ehash_nolisten);

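/* Join an existing SO_REUSEPORT group on the same bind bucket, device,
 * family and owner uid, or allocate a fresh group for sk if none matches.
 */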
static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
	struct inet_listen_hashbucket *ilb2;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		local_bh_disable();
		inet_ehash_nolisten(sk, osk, NULL);
		local_bh_enable();
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb2);
		if (err)
			goto unlock;
	}
	sock_set_flag(sk, SOCK_RCU_FREE);
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
	    sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb2->lock);

	return err;
}
EXPORT_IPV6_MOD(__inet_hash);

int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE)
		err = __inet_hash(sk, NULL);

	return err;
}

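/* Unhash sk from either the listening (lhash2) or the established (ehash)
 * table, dropping its protocol's inuse count when it was actually hashed.
 */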
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);

	if (sk_unhashed(sk))
		return;

	sock_rps_delete_flow(sk);
	if (sk->sk_state == TCP_LISTEN) {
		struct inet_listen_hashbucket *ilb2;

		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
		/* Don't disable bottom halves while acquiring the lock to
		 * avoid circular locking dependency on PREEMPT_RT.
		 */
		spin_lock(&ilb2->lock);
		if (sk_unhashed(sk)) {
			spin_unlock(&ilb2->lock);
			return;
		}

		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_stop_listen_sock(sk);

		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock(&ilb2->lock);
	} else {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		spin_lock_bh(lock);
		if (sk_unhashed(sk)) {
			spin_unlock_bh(lock);
			return;
		}
		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(lock);
	}
}
EXPORT_IPV6_MOD(inet_unhash);

static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
				    const struct net *net, unsigned short port,
				    int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

	return inet_bind2_bucket_addr_match(tb, sk);
}

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
				      unsigned short port, int l3mdev, const struct sock *sk)
{
	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
	    tb->l3mdev != l3mdev)
		return false;

#if IS_ENABLED(CONFIG_IPV6)
	if (tb->addr_type == IPV6_ADDR_ANY)
		return true;

	if (tb->addr_type != IPV6_ADDR_MAPPED)
		return false;

	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
		return false;
#endif
	return tb->rcv_saddr == 0;
}

/* The socket's bhash2 hashbucket spinlock must be held when this is called */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
		       unsigned short port, int l3mdev, const struct sock *sk)
{
	struct inet_bind2_bucket *bhash2 = NULL;

	inet_bind_bucket_for_each(bhash2, &head->chain)
		if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
			break;

	return bhash2;
}

struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
	struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
	else
#endif
		hash = ipv4_portaddr_hash(net, 0, port);

	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
	if (family == AF_INET) {
		inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
		sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else {
		sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
	}
#endif
}

static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
	struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct inet_bind2_bucket *tb2, *new_tb2;
	int l3mdev = inet_sk_bound_l3mdev(sk);
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	int bhash;

	if (!inet_csk(sk)->icsk_bind2_hash) {
		/* Not bind()ed before. */
		if (reset)
			inet_reset_saddr(sk);
		else
			inet_update_saddr(sk, saddr, family);

		return 0;
	}

	/* Allocate a bind2 bucket ahead of time to avoid permanently putting
	 * the bhash2 table in an inconsistent state if a new tb2 bucket
	 * allocation fails.
	 */
	new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
	if (!new_tb2) {
		if (reset) {
			/* The (INADDR_ANY, port) bucket might have already
			 * been freed, in which case we cannot fix up
			 * icsk_bind2_hash, so we give up and unlink sk from
			 * bhash/bhash2 rather than leave bhash2 inconsistent.
			 */
			inet_put_port(sk);
			inet_reset_saddr(sk);
		}

		return -ENOMEM;
	}

	bhash = inet_bhashfn(net, port, hinfo->bhash_size);
	head = &hinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	/* If we change saddr locklessly, another thread
	 * iterating over bhash might see a corrupted address.
	 */
	spin_lock_bh(&head->lock);

	spin_lock(&head2->lock);
	__sk_del_bind_node(sk);
	inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
	spin_unlock(&head2->lock);

	if (reset)
		inet_reset_saddr(sk);
	else
		inet_update_saddr(sk, saddr, family);

	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	spin_lock(&head2->lock);
	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = new_tb2;
		inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
	}
	inet_csk(sk)->icsk_bind2_hash = tb2;
	sk_add_bind_node(sk, &tb2->owners);
	spin_unlock(&head2->lock);

	spin_unlock_bh(&head->lock);

	if (tb2 != new_tb2)
		kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

	return 0;
}

int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
	return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_IPV6_MOD(inet_bhash2_update_saddr);

void inet_bhash2_reset_saddr(struct sock *sk)
{
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);

/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by a clever attacker.
 *
 * The RFC claims using TABLE_LENGTH=10 buckets gives an improvement, but
 * attacks have since been demonstrated, so we use 65536 by default instead
 * to really give more isolation and privacy, at the expense of 256kB
 * of kernel memory.
 */
#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
static u32 *table_perturb;

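/* RFC 6056-style ephemeral port selection for connect(). The scan starts
 * at an offset derived from a per-destination table_perturb entry plus the
 * secret port_offset, walks the local port range trying ports of one
 * parity first (inet_csk_get_port() prefers the other parity, keeping
 * bind() and connect() users mostly apart), and validates each candidate
 * with check_established(), first locklessly under RCU and then again
 * under the bind bucket lock.
 */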
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		u32 hash_port0,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **,
			bool rcu_lookup, u32 hash))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_bind_hashbucket *head, *head2;
	struct inet_timewait_sock *tw = NULL;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	bool tb_created = false;
	u32 remaining, offset;
	int ret, i, low, high;
	bool local_ports;
	int step, l3mdev;
	u32 index;

	if (port) {
		local_bh_disable();
		ret = check_established(death_row, sk, port, NULL, false,
					hash_port0 + port);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	local_ports = inet_sk_get_local_port_range(sk, &low, &high);
	step = local_ports ? 1 : 2;

	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (!local_ports && remaining > 1)
		remaining &= ~1U;

	get_random_sleepable_once(table_perturb,
				  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;

	/* In the first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	if (!local_ports)
		offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += step, port += step) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		rcu_read_lock();
		hlist_for_each_entry_rcu(tb, &head->chain, node) {
			if (!inet_bind_bucket_match(tb, net, port, l3mdev))
				continue;
			if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
				rcu_read_unlock();
				goto next_port;
			}
			if (!check_established(death_row, sk, port, &tw, true,
					       hash_port0 + port))
				break;
			rcu_read_unlock();
			goto next_port;
		}
		rcu_read_unlock();

		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port_unlock;
				WARN_ON(hlist_empty(&tb->bhash2));
				if (!check_established(death_row, sk,
						       port, &tw, false,
						       hash_port0 + port))
					goto ok;
				goto next_port_unlock;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb_created = true;
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port_unlock:
		spin_unlock_bh(&head->lock);
next_port:
		cond_resched();
	}

	if (!local_ports) {
		offset++;
		if ((offset & 1) && remaining > 1)
			goto other_parity_scan;
	}
	return -EADDRNOTAVAIL;

ok:
	/* Find the corresponding tb2 bucket since we need to
	 * add the socket to the bhash2 table as well
	 */
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
	spin_lock(&head2->lock);

	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
					       head2, tb, sk);
		if (!tb2)
			goto error;
	}

	/* Here we want to add a little bit of randomness to the next source
	 * port that will be chosen. We use a max() with a random here so that
	 * on low contention the randomness is maximal and on high contention
	 * it may be nonexistent.
	 */
	i = max_t(int, i, get_random_u32_below(8) * step);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, tb2, port);

	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);

	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;

error:
	if (sk_hashed(sk)) {
		spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash);

		sock_prot_inuse_add(net, sk->sk_prot, -1);

		spin_lock(lock);
		__sk_nulls_del_node_init_rcu(sk);
		spin_unlock(lock);

		sk->sk_hash = 0;
		inet_sk(sk)->inet_sport = 0;
		inet_sk(sk)->inet_num = 0;

		if (tw)
			inet_twsk_bind_unhash(tw, hinfo);
	}

	spin_unlock(&head2->lock);
	if (tb_created)
		inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);

	if (tw)
		inet_twsk_deschedule_put(tw);

	local_bh_enable();

	return -ENOMEM;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct net *net = sock_net(sk);
	u64 port_offset = 0;
	u32 hash_port0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);

	hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0,
				  inet->inet_daddr, inet->inet_dport);

	return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
				   __inet_check_established);
}

static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
	}
}

void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);
	init_hashinfo_lhash2(h);

	/* this one is used for source ports of outgoing connections */
	table_perturb = alloc_large_system_hash("Table-perturb",
						sizeof(*table_perturb),
						INET_TABLE_PERTURB_SIZE,
						0, 0, NULL, NULL,
						INET_TABLE_PERTURB_SIZE,
						INET_TABLE_PERTURB_SIZE);
}

int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
{
	h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
	if (!h->lhash2)
		return -ENOMEM;

	h->lhash2_mask = INET_LHTABLE_SIZE - 1;
	/* INET_LHTABLE_SIZE must be a power of 2 */
	BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);

	init_hashinfo_lhash2(h);
	return 0;
}

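/* Size the striped ehash lock array from the CPU count, the NUMA node
 * count and the table size, preferring vmalloc() on multi-node systems so
 * a NUMA policy can spread the lock pages across nodes.
 */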
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;
	spinlock_t *ptr = NULL;

	if (locksz == 0)
		goto set_mask;

	/* Allocate 2 cache lines or at least one spinlock per cpu. */
	nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();

	/* At least one page per NUMA node. */
	nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);

	nblocks = roundup_pow_of_two(nblocks);

	/* No more locks than number of hash buckets. */
	nblocks = min(nblocks, hashinfo->ehash_mask + 1);

	if (num_online_nodes() > 1) {
		/* Use vmalloc() to allow NUMA policy to spread pages
		 * on all available nodes if desired.
		 */
		ptr = vmalloc_array(nblocks, locksz);
	}
	if (!ptr) {
		ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!ptr)
			return -ENOMEM;
	}
	for (i = 0; i < nblocks; i++)
		spin_lock_init(&ptr[i]);
	hashinfo->ehash_locks = ptr;
set_mask:
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}

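/* Clone the template hashinfo with a private ehash of ehash_entries
 * buckets (callers pass a power of two, since ehash_mask is set to
 * ehash_entries - 1) for a per-netns hash table; the other tables are
 * inherited from the template via kmemdup().
 */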
struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
						 unsigned int ehash_entries)
{
	struct inet_hashinfo *new_hashinfo;
	int i;

	new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
	if (!new_hashinfo)
		goto err;

	new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
					   GFP_KERNEL_ACCOUNT);
	if (!new_hashinfo->ehash)
		goto free_hashinfo;

	new_hashinfo->ehash_mask = ehash_entries - 1;

	if (inet_ehash_locks_alloc(new_hashinfo))
		goto free_ehash;

	for (i = 0; i < ehash_entries; i++)
		INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);

	new_hashinfo->pernet = true;

	return new_hashinfo;

free_ehash:
	vfree(new_hashinfo->ehash);
free_hashinfo:
	kfree(new_hashinfo);
err:
	return NULL;
}

void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
	if (!hashinfo->pernet)
		return;

	inet_ehash_locks_free(hashinfo);
	vfree(hashinfo->ehash);
	kfree(hashinfo);
}