treewide: use get_random_u32() when possible
[linux-block.git] / net / netfilter / ipvs / ip_vs_conn.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * IPVS An implementation of the IP virtual server support for the
4 * LINUX operating system. IPVS is now implemented as a module
5 * over the Netfilter framework. IPVS can be used to build a
6 * high-performance and highly available server based on a
7 * cluster of servers.
8 *
1da177e4
LT
9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
10 * Peter Kese <peter.kese@ijs.si>
11 * Julian Anastasov <ja@ssi.bg>
12 *
1da177e4
LT
13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
15 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
16 *
17 * Changes:
1da177e4
LT
18 */
19
9aada7ac
HE
20#define KMSG_COMPONENT "IPVS"
21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
e924283b 23#include <linux/interrupt.h>
14c85021 24#include <linux/in.h>
f18ae720 25#include <linux/inet.h>
f190055f 26#include <linux/net.h>
1da177e4 27#include <linux/kernel.h>
14c85021 28#include <linux/module.h>
1da177e4
LT
29#include <linux/vmalloc.h>
30#include <linux/proc_fs.h> /* for proc_net_* */
5a0e3ad6 31#include <linux/slab.h>
1da177e4
LT
32#include <linux/seq_file.h>
33#include <linux/jhash.h>
34#include <linux/random.h>
35
457c4cbc 36#include <net/net_namespace.h>
1da177e4
LT
37#include <net/ip_vs.h>
38
39
6f7edb48
CB
40#ifndef CONFIG_IP_VS_TAB_BITS
41#define CONFIG_IP_VS_TAB_BITS 12
42#endif
43
44/*
45 * Connection hash size. Default is what was selected at compile time.
46*/
4ecd2944 47static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
6f7edb48
CB
48module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
49MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
50
51/* size and mask values */
4ecd2944
ED
52int ip_vs_conn_tab_size __read_mostly;
53static int ip_vs_conn_tab_mask __read_mostly;
6f7edb48 54
1da177e4
LT
55/*
56 * Connection hash table: for input and output packets lookups of IPVS
57 */
731109e7 58static struct hlist_head *ip_vs_conn_tab __read_mostly;
1da177e4
LT
59
60/* SLAB cache for IPVS connections */
e18b890b 61static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
1da177e4 62
1da177e4
LT
63/* counter for no client port connections */
64static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
65
66/* random value for IPVS connection hash */
4ecd2944 67static unsigned int ip_vs_conn_rnd __read_mostly;
1da177e4
LT
68
69/*
70 * Fine locking granularity for big connection hash table
71 */
6e67e586 72#define CT_LOCKARRAY_BITS 5
1da177e4
LT
73#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
74#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
75
f18ae720
JA
76/* We need an addrstrlen that works with or without v6 */
77#ifdef CONFIG_IP_VS_IPV6
78#define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN
79#else
80#define IP_VS_ADDRSTRLEN (8+1)
81#endif
82
1da177e4
LT
83struct ip_vs_aligned_lock
84{
088339a5 85 spinlock_t l;
1da177e4
LT
86} __attribute__((__aligned__(SMP_CACHE_BYTES)));
87
88/* lock array for conn table */
89static struct ip_vs_aligned_lock
90__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
91
ac69269a 92static inline void ct_write_lock_bh(unsigned int key)
1da177e4 93{
ac69269a 94 spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
1da177e4
LT
95}
96
ac69269a 97static inline void ct_write_unlock_bh(unsigned int key)
1da177e4 98{
ac69269a 99 spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
1da177e4
LT
100}
101
8ef81c65 102static void ip_vs_conn_expire(struct timer_list *t);
1da177e4
LT
103
104/*
105 * Returns hash value for IPVS connection entry
106 */
754b81a3 107static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
28364a59
JV
108 const union nf_inet_addr *addr,
109 __be16 port)
1da177e4 110{
28364a59
JV
111#ifdef CONFIG_IP_VS_IPV6
112 if (af == AF_INET6)
6e67e586
HS
113 return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
114 (__force u32)port, proto, ip_vs_conn_rnd) ^
754b81a3 115 ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
28364a59 116#endif
6e67e586
HS
117 return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
118 ip_vs_conn_rnd) ^
754b81a3 119 ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
1da177e4
LT
120}
121
85999283
SH
122static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
123 bool inverse)
124{
125 const union nf_inet_addr *addr;
126 __be16 port;
127
f71499aa 128 if (p->pe_data && p->pe->hashkey_raw)
85999283
SH
129 return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
130 ip_vs_conn_tab_mask;
131
132 if (likely(!inverse)) {
133 addr = p->caddr;
134 port = p->cport;
135 } else {
136 addr = p->vaddr;
137 port = p->vport;
138 }
139
754b81a3 140 return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port);
85999283
SH
141}
142
143static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
144{
145 struct ip_vs_conn_param p;
146
19913dec 147 ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
6e67e586 148 &cp->caddr, cp->cport, NULL, 0, &p);
85999283 149
e9e5eee8
SH
150 if (cp->pe) {
151 p.pe = cp->pe;
85999283
SH
152 p.pe_data = cp->pe_data;
153 p.pe_data_len = cp->pe_data_len;
154 }
155
156 return ip_vs_conn_hashkey_param(&p, false);
157}
1da177e4
LT
158
159/*
6e67e586 160 * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
1da177e4
LT
161 * returns bool success.
162 */
163static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
164{
95c96174 165 unsigned int hash;
1da177e4
LT
166 int ret;
167
26ec037f
NC
168 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
169 return 0;
170
1da177e4 171 /* Hash by protocol, client address and port */
85999283 172 hash = ip_vs_conn_hashkey_conn(cp);
1da177e4 173
ac69269a 174 ct_write_lock_bh(hash);
aea9d711 175 spin_lock(&cp->lock);
1da177e4
LT
176
177 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
1da177e4 178 cp->flags |= IP_VS_CONN_F_HASHED;
b54ab92b 179 refcount_inc(&cp->refcnt);
088339a5 180 hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
1da177e4
LT
181 ret = 1;
182 } else {
c5cc0c69 183 pr_err("%s(): request for already hashed, called from %pS\n",
1e3e238e 184 __func__, __builtin_return_address(0));
1da177e4
LT
185 ret = 0;
186 }
187
aea9d711 188 spin_unlock(&cp->lock);
ac69269a 189 ct_write_unlock_bh(hash);
1da177e4
LT
190
191 return ret;
192}
193
194
195/*
196 * UNhashes ip_vs_conn from ip_vs_conn_tab.
088339a5 197 * returns bool success. Caller should hold conn reference.
1da177e4
LT
198 */
199static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
200{
95c96174 201 unsigned int hash;
1da177e4
LT
202 int ret;
203
204 /* unhash it and decrease its reference counter */
85999283 205 hash = ip_vs_conn_hashkey_conn(cp);
1da177e4 206
ac69269a 207 ct_write_lock_bh(hash);
aea9d711 208 spin_lock(&cp->lock);
1da177e4
LT
209
210 if (cp->flags & IP_VS_CONN_F_HASHED) {
088339a5 211 hlist_del_rcu(&cp->c_list);
1da177e4 212 cp->flags &= ~IP_VS_CONN_F_HASHED;
b54ab92b 213 refcount_dec(&cp->refcnt);
1da177e4
LT
214 ret = 1;
215 } else
216 ret = 0;
217
aea9d711 218 spin_unlock(&cp->lock);
ac69269a 219 ct_write_unlock_bh(hash);
1da177e4
LT
220
221 return ret;
222}
223
088339a5
JA
224/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
225 * returns bool success.
226 */
227static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
228{
229 unsigned int hash;
a050d345
JA
230 bool ret = false;
231
232 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
233 return refcount_dec_if_one(&cp->refcnt);
088339a5
JA
234
235 hash = ip_vs_conn_hashkey_conn(cp);
236
ac69269a 237 ct_write_lock_bh(hash);
088339a5
JA
238 spin_lock(&cp->lock);
239
240 if (cp->flags & IP_VS_CONN_F_HASHED) {
088339a5 241 /* Decrease refcnt and unlink conn only if we are last user */
b54ab92b 242 if (refcount_dec_if_one(&cp->refcnt)) {
088339a5
JA
243 hlist_del_rcu(&cp->c_list);
244 cp->flags &= ~IP_VS_CONN_F_HASHED;
245 ret = true;
246 }
a050d345 247 }
088339a5
JA
248
249 spin_unlock(&cp->lock);
ac69269a 250 ct_write_unlock_bh(hash);
088339a5
JA
251
252 return ret;
253}
254
1da177e4
LT
255
256/*
257 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
258 * Called for pkts coming from OUTside-to-INside.
f11017ec
SH
259 * p->caddr, p->cport: pkt source address (foreign host)
260 * p->vaddr, p->vport: pkt dest address (load balancer)
1da177e4 261 */
f11017ec
SH
262static inline struct ip_vs_conn *
263__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
1da177e4 264{
95c96174 265 unsigned int hash;
1da177e4
LT
266 struct ip_vs_conn *cp;
267
85999283 268 hash = ip_vs_conn_hashkey_param(p, false);
1da177e4 269
088339a5 270 rcu_read_lock();
1da177e4 271
088339a5 272 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
1845ed0b
JA
273 if (p->cport == cp->cport && p->vport == cp->vport &&
274 cp->af == p->af &&
f11017ec
SH
275 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
276 ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
f11017ec 277 ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
6e67e586 278 p->protocol == cp->protocol &&
e64e2b46 279 cp->ipvs == p->ipvs) {
088339a5
JA
280 if (!__ip_vs_conn_get(cp))
281 continue;
1da177e4 282 /* HIT */
088339a5 283 rcu_read_unlock();
1da177e4
LT
284 return cp;
285 }
286 }
287
088339a5 288 rcu_read_unlock();
1da177e4
LT
289
290 return NULL;
291}
292
f11017ec 293struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
1da177e4
LT
294{
295 struct ip_vs_conn *cp;
296
f11017ec
SH
297 cp = __ip_vs_conn_in_get(p);
298 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
299 struct ip_vs_conn_param cport_zero_p = *p;
300 cport_zero_p.cport = 0;
301 cp = __ip_vs_conn_in_get(&cport_zero_p);
302 }
1da177e4 303
28364a59 304 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
f11017ec
SH
305 ip_vs_proto_name(p->protocol),
306 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
307 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
28364a59 308 cp ? "hit" : "not hit");
1da177e4
LT
309
310 return cp;
311}
312
f11017ec 313static int
f5099dd4
EB
314ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
315 int af, const struct sk_buff *skb,
f11017ec 316 const struct ip_vs_iphdr *iph,
802c41ad 317 struct ip_vs_conn_param *p)
f11017ec
SH
318{
319 __be16 _ports[2], *pptr;
320
6b3d9330 321 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
f11017ec
SH
322 if (pptr == NULL)
323 return 1;
324
802c41ad 325 if (likely(!ip_vs_iph_inverse(iph)))
19913dec 326 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr,
6e67e586 327 pptr[0], &iph->daddr, pptr[1], p);
f11017ec 328 else
19913dec 329 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr,
6e67e586 330 pptr[1], &iph->saddr, pptr[0], p);
f11017ec
SH
331 return 0;
332}
333
5c0d2374 334struct ip_vs_conn *
ab161976
EB
335ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af,
336 const struct sk_buff *skb,
802c41ad 337 const struct ip_vs_iphdr *iph)
5c0d2374 338{
f11017ec 339 struct ip_vs_conn_param p;
5c0d2374 340
f5099dd4 341 if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
5c0d2374
SH
342 return NULL;
343
f11017ec 344 return ip_vs_conn_in_get(&p);
5c0d2374
SH
345}
346EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
347
87375ab4 348/* Get reference to connection template */
f11017ec 349struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
87375ab4 350{
95c96174 351 unsigned int hash;
87375ab4
JA
352 struct ip_vs_conn *cp;
353
85999283 354 hash = ip_vs_conn_hashkey_param(p, false);
87375ab4 355
088339a5 356 rcu_read_lock();
87375ab4 357
088339a5 358 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
1845ed0b 359 if (unlikely(p->pe_data && p->pe->ct_match)) {
e64e2b46 360 if (cp->ipvs != p->ipvs)
1845ed0b 361 continue;
088339a5
JA
362 if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
363 if (__ip_vs_conn_get(cp))
364 goto out;
365 }
85999283
SH
366 continue;
367 }
368
f11017ec
SH
369 if (cp->af == p->af &&
370 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
be8be9ec 371 /* protocol should only be IPPROTO_IP if
f11017ec
SH
372 * p->vaddr is a fwmark */
373 ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
374 p->af, p->vaddr, &cp->vaddr) &&
1845ed0b 375 p->vport == cp->vport && p->cport == cp->cport &&
87375ab4 376 cp->flags & IP_VS_CONN_F_TEMPLATE &&
1845ed0b 377 p->protocol == cp->protocol &&
e64e2b46 378 cp->ipvs == p->ipvs) {
088339a5
JA
379 if (__ip_vs_conn_get(cp))
380 goto out;
381 }
87375ab4
JA
382 }
383 cp = NULL;
384
385 out:
088339a5 386 rcu_read_unlock();
87375ab4 387
28364a59 388 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
f11017ec
SH
389 ip_vs_proto_name(p->protocol),
390 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
391 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
28364a59 392 cp ? "hit" : "not hit");
87375ab4
JA
393
394 return cp;
395}
1da177e4 396
f11017ec
SH
397/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
398 * Called for pkts coming from inside-to-OUTside.
399 * p->caddr, p->cport: pkt source address (inside host)
400 * p->vaddr, p->vport: pkt dest address (foreign host) */
401struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
1da177e4 402{
95c96174 403 unsigned int hash;
1da177e4 404 struct ip_vs_conn *cp, *ret=NULL;
073b04e7 405 const union nf_inet_addr *saddr;
406 __be16 sport;
1da177e4
LT
407
408 /*
409 * Check for "full" addressed entries
410 */
85999283 411 hash = ip_vs_conn_hashkey_param(p, true);
1da177e4 412
088339a5 413 rcu_read_lock();
1da177e4 414
088339a5 415 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
073b04e7 416 if (p->vport != cp->cport)
417 continue;
418
419 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
420 sport = cp->vport;
421 saddr = &cp->vaddr;
422 } else {
423 sport = cp->dport;
424 saddr = &cp->daddr;
425 }
426
427 if (p->cport == sport && cp->af == p->af &&
f11017ec 428 ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
073b04e7 429 ip_vs_addr_equal(p->af, p->caddr, saddr) &&
6e67e586 430 p->protocol == cp->protocol &&
e64e2b46 431 cp->ipvs == p->ipvs) {
088339a5
JA
432 if (!__ip_vs_conn_get(cp))
433 continue;
1da177e4 434 /* HIT */
1da177e4
LT
435 ret = cp;
436 break;
437 }
438 }
439
088339a5 440 rcu_read_unlock();
1da177e4 441
28364a59 442 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
f11017ec
SH
443 ip_vs_proto_name(p->protocol),
444 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
445 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
28364a59 446 ret ? "hit" : "not hit");
1da177e4
LT
447
448 return ret;
449}
450
5c0d2374 451struct ip_vs_conn *
0cf705c8
EB
452ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
453 const struct sk_buff *skb,
802c41ad 454 const struct ip_vs_iphdr *iph)
5c0d2374 455{
f11017ec 456 struct ip_vs_conn_param p;
5c0d2374 457
f5099dd4 458 if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
5c0d2374
SH
459 return NULL;
460
f11017ec 461 return ip_vs_conn_out_get(&p);
5c0d2374
SH
462}
463EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
1da177e4
LT
464
465/*
466 * Put back the conn and restart its timer with its timeout
467 */
013b0424 468static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
1da177e4 469{
26ec037f
NC
470 unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
471 0 : cp->timeout;
472 mod_timer(&cp->timer, jiffies+t);
1da177e4
LT
473
474 __ip_vs_conn_put(cp);
475}
476
013b0424
MA
477void ip_vs_conn_put(struct ip_vs_conn *cp)
478{
479 if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
b54ab92b 480 (refcount_read(&cp->refcnt) == 1) &&
013b0424
MA
481 !timer_pending(&cp->timer))
482 /* expire connection immediately */
a050d345 483 ip_vs_conn_expire(&cp->timer);
013b0424
MA
484 else
485 __ip_vs_conn_put_timer(cp);
486}
1da177e4
LT
487
488/*
489 * Fill a no_client_port connection with a client port number
490 */
014d730d 491void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
1da177e4
LT
492{
493 if (ip_vs_conn_unhash(cp)) {
ac69269a 494 spin_lock_bh(&cp->lock);
1da177e4
LT
495 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
496 atomic_dec(&ip_vs_conn_no_cport_cnt);
497 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
498 cp->cport = cport;
499 }
ac69269a 500 spin_unlock_bh(&cp->lock);
1da177e4
LT
501
502 /* hash on new dport */
503 ip_vs_conn_hash(cp);
504 }
505}
506
507
508/*
509 * Bind a connection entry with the corresponding packet_xmit.
510 * Called by ip_vs_conn_new.
511 */
512static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
513{
514 switch (IP_VS_FWD_METHOD(cp)) {
515 case IP_VS_CONN_F_MASQ:
516 cp->packet_xmit = ip_vs_nat_xmit;
517 break;
518
519 case IP_VS_CONN_F_TUNNEL:
8052ba29
AG
520#ifdef CONFIG_IP_VS_IPV6
521 if (cp->daf == AF_INET6)
522 cp->packet_xmit = ip_vs_tunnel_xmit_v6;
523 else
524#endif
525 cp->packet_xmit = ip_vs_tunnel_xmit;
1da177e4
LT
526 break;
527
528 case IP_VS_CONN_F_DROUTE:
529 cp->packet_xmit = ip_vs_dr_xmit;
530 break;
531
532 case IP_VS_CONN_F_LOCALNODE:
533 cp->packet_xmit = ip_vs_null_xmit;
534 break;
535
536 case IP_VS_CONN_F_BYPASS:
537 cp->packet_xmit = ip_vs_bypass_xmit;
538 break;
539 }
540}
541
b3cdd2a7
JV
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_bind_xmit(): pick packet_xmit by the
 *	connection's forwarding method.
 */
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit_v6;
		break;

	case IP_VS_CONN_F_TUNNEL:
		/* The tunnel transmitter follows the destination family. */
		cp->packet_xmit = (cp->daf == AF_INET6) ?
				  ip_vs_tunnel_xmit_v6 : ip_vs_tunnel_xmit;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit_v6;
		break;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit_v6;
		break;
	}
}
#endif
571
1da177e4
LT
572
573static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
574{
575 return atomic_read(&dest->activeconns)
576 + atomic_read(&dest->inactconns);
577}
578
579/*
580 * Bind a connection entry with a virtual service destination
581 * Called just after a new connection entry is created.
582 */
583static inline void
584ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
585{
3575792e 586 unsigned int conn_flags;
6b324dbf 587 __u32 flags;
3575792e 588
1da177e4
LT
589 /* if dest is NULL, then return directly */
590 if (!dest)
591 return;
592
593 /* Increase the refcnt counter of the dest */
fca9c20a 594 ip_vs_dest_hold(dest);
1da177e4 595
3575792e
JA
596 conn_flags = atomic_read(&dest->conn_flags);
597 if (cp->protocol != IPPROTO_UDP)
598 conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
6b324dbf 599 flags = cp->flags;
1da177e4 600 /* Bind with the destination and its corresponding transmitter */
6b324dbf 601 if (flags & IP_VS_CONN_F_SYNC) {
b209639e
RB
602 /* if the connection is not template and is created
603 * by sync, preserve the activity flag.
604 */
6b324dbf 605 if (!(flags & IP_VS_CONN_F_TEMPLATE))
3575792e 606 conn_flags &= ~IP_VS_CONN_F_INACTIVE;
3233759b 607 /* connections inherit forwarding method from dest */
6b324dbf 608 flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
3575792e 609 }
6b324dbf
PNA
610 flags |= conn_flags;
611 cp->flags = flags;
1da177e4
LT
612 cp->dest = dest;
613
cfc78c5a
JV
614 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
615 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
616 "dest->refcnt:%d\n",
617 ip_vs_proto_name(cp->protocol),
618 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
619 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
f18ae720 620 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
cfc78c5a 621 ip_vs_fwd_tag(cp), cp->state,
b54ab92b
RE
622 cp->flags, refcount_read(&cp->refcnt),
623 refcount_read(&dest->refcnt));
1da177e4
LT
624
625 /* Update the connection counters */
6b324dbf 626 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
06611f82
JA
627 /* It is a normal connection, so modify the counters
628 * according to the flags, later the protocol can
629 * update them on state change
630 */
6b324dbf 631 if (!(flags & IP_VS_CONN_F_INACTIVE))
b209639e
RB
632 atomic_inc(&dest->activeconns);
633 else
634 atomic_inc(&dest->inactconns);
1da177e4
LT
635 } else {
636 /* It is a persistent connection/template, so increase
25985edc 637 the persistent connection counter */
1da177e4
LT
638 atomic_inc(&dest->persistconns);
639 }
640
641 if (dest->u_threshold != 0 &&
642 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
643 dest->flags |= IP_VS_DEST_F_OVERLOAD;
644}
645
646
1e356f9c
RB
647/*
648 * Check if there is a destination for the connection, if so
649 * bind the connection to the destination.
650 */
413c2d04 651void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
1e356f9c
RB
652{
653 struct ip_vs_dest *dest;
654
413c2d04 655 rcu_read_lock();
655eef10
AG
656
657 /* This function is only invoked by the synchronization code. We do
658 * not currently support heterogeneous pools with synchronization,
659 * so we can make the assumption that the svc_af is the same as the
660 * dest_af
661 */
dc2add6f 662 dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr,
882a844b
JA
663 cp->dport, &cp->vaddr, cp->vport,
664 cp->protocol, cp->fwmark, cp->flags);
665 if (dest) {
666 struct ip_vs_proto_data *pd;
667
ac69269a 668 spin_lock_bh(&cp->lock);
f73181c8 669 if (cp->dest) {
ac69269a 670 spin_unlock_bh(&cp->lock);
413c2d04
JA
671 rcu_read_unlock();
672 return;
f73181c8
PNA
673 }
674
882a844b
JA
675 /* Applications work depending on the forwarding method
676 * but better to reassign them always when binding dest */
677 if (cp->app)
678 ip_vs_unbind_app(cp);
679
1e356f9c 680 ip_vs_bind_dest(cp, dest);
ac69269a 681 spin_unlock_bh(&cp->lock);
882a844b
JA
682
683 /* Update its packet transmitter */
684 cp->packet_xmit = NULL;
685#ifdef CONFIG_IP_VS_IPV6
686 if (cp->af == AF_INET6)
687 ip_vs_bind_xmit_v6(cp);
688 else
689#endif
690 ip_vs_bind_xmit(cp);
691
18d6ade6 692 pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol);
882a844b
JA
693 if (pd && atomic_read(&pd->appcnt))
694 ip_vs_bind_app(cp, pd->pp);
695 }
413c2d04 696 rcu_read_unlock();
1e356f9c 697}
1e356f9c
RB
698
699
1da177e4
LT
700/*
701 * Unbind a connection entry with its VS destination
702 * Called by the ip_vs_conn_expire function.
703 */
704static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
705{
706 struct ip_vs_dest *dest = cp->dest;
707
708 if (!dest)
709 return;
710
cfc78c5a
JV
711 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
712 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
713 "dest->refcnt:%d\n",
714 ip_vs_proto_name(cp->protocol),
715 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
716 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
f18ae720 717 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
cfc78c5a 718 ip_vs_fwd_tag(cp), cp->state,
b54ab92b
RE
719 cp->flags, refcount_read(&cp->refcnt),
720 refcount_read(&dest->refcnt));
1da177e4
LT
721
722 /* Update the connection counters */
87375ab4 723 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
1da177e4
LT
724 /* It is a normal connection, so decrease the inactconns
725 or activeconns counter */
726 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
727 atomic_dec(&dest->inactconns);
728 } else {
729 atomic_dec(&dest->activeconns);
730 }
731 } else {
732 /* It is a persistent connection/template, so decrease
25985edc 733 the persistent connection counter */
1da177e4
LT
734 atomic_dec(&dest->persistconns);
735 }
736
737 if (dest->l_threshold != 0) {
738 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
739 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
740 } else if (dest->u_threshold != 0) {
741 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
742 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
743 } else {
744 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
745 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
746 }
747
fca9c20a 748 ip_vs_dest_put(dest);
1da177e4
LT
749}
750
8e1b0b1b
SH
751static int expire_quiescent_template(struct netns_ipvs *ipvs,
752 struct ip_vs_dest *dest)
753{
754#ifdef CONFIG_SYSCTL
755 return ipvs->sysctl_expire_quiescent_template &&
756 (atomic_read(&dest->weight) == 0);
757#else
758 return 0;
759#endif
760}
1da177e4
LT
761
762/*
763 * Checking if the destination of a connection template is available.
764 * If available, return 1, otherwise invalidate this connection
765 * template and return 0.
766 */
3ec10d3a 767int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
1da177e4
LT
768{
769 struct ip_vs_dest *dest = ct->dest;
58dbc6f2 770 struct netns_ipvs *ipvs = ct->ipvs;
1da177e4
LT
771
772 /*
773 * Checking the dest server status.
774 */
775 if ((dest == NULL) ||
e905a9ed 776 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
3ec10d3a
MA
777 expire_quiescent_template(ipvs, dest) ||
778 (cdest && (dest != cdest))) {
cfc78c5a
JV
779 IP_VS_DBG_BUF(9, "check_template: dest not available for "
780 "protocol %s s:%s:%d v:%s:%d "
781 "-> d:%s:%d\n",
782 ip_vs_proto_name(ct->protocol),
783 IP_VS_DBG_ADDR(ct->af, &ct->caddr),
784 ntohs(ct->cport),
785 IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
786 ntohs(ct->vport),
f18ae720 787 IP_VS_DBG_ADDR(ct->daf, &ct->daddr),
cfc78c5a 788 ntohs(ct->dport));
1da177e4
LT
789
790 /*
791 * Invalidate the connection template
792 */
014d730d 793 if (ct->vport != htons(0xffff)) {
1da177e4 794 if (ip_vs_conn_unhash(ct)) {
014d730d
AV
795 ct->dport = htons(0xffff);
796 ct->vport = htons(0xffff);
1da177e4
LT
797 ct->cport = 0;
798 ip_vs_conn_hash(ct);
799 }
800 }
801
802 /*
803 * Simply decrease the refcnt of the template,
804 * don't restart its timer.
805 */
088339a5 806 __ip_vs_conn_put(ct);
1da177e4
LT
807 return 0;
808 }
809 return 1;
810}
811
088339a5
JA
812static void ip_vs_conn_rcu_free(struct rcu_head *head)
813{
814 struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
815 rcu_head);
816
817 ip_vs_pe_put(cp->pe);
818 kfree(cp->pe_data);
819 kmem_cache_free(ip_vs_conn_cachep, cp);
820}
821
f9200a52
JA
822/* Try to delete connection while not holding reference */
823static void ip_vs_conn_del(struct ip_vs_conn *cp)
824{
825 if (del_timer(&cp->timer)) {
826 /* Drop cp->control chain too */
827 if (cp->control)
828 cp->timeout = 0;
829 ip_vs_conn_expire(&cp->timer);
830 }
831}
832
833/* Try to delete connection while holding reference */
834static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
835{
836 if (del_timer(&cp->timer)) {
837 /* Drop cp->control chain too */
838 if (cp->control)
839 cp->timeout = 0;
840 __ip_vs_conn_put(cp);
841 ip_vs_conn_expire(&cp->timer);
842 } else {
843 __ip_vs_conn_put(cp);
844 }
845}
846
8ef81c65 847static void ip_vs_conn_expire(struct timer_list *t)
1da177e4 848{
8ef81c65 849 struct ip_vs_conn *cp = from_timer(cp, t, timer);
58dbc6f2 850 struct netns_ipvs *ipvs = cp->ipvs;
1da177e4 851
1da177e4
LT
852 /*
853 * do I control anybody?
854 */
855 if (atomic_read(&cp->n_control))
856 goto expire_later;
857
088339a5
JA
858 /* Unlink conn if not referenced anymore */
859 if (likely(ip_vs_conn_unlink(cp))) {
762c4007
JA
860 struct ip_vs_conn *ct = cp->control;
861
1da177e4 862 /* delete the timer if it is activated by other users */
25cc4ae9 863 del_timer(&cp->timer);
1da177e4
LT
864
865 /* does anybody control me? */
762c4007 866 if (ct) {
f9200a52
JA
867 bool has_ref = !cp->timeout && __ip_vs_conn_get(ct);
868
1da177e4 869 ip_vs_control_del(cp);
762c4007 870 /* Drop CTL or non-assured TPL if not used anymore */
f9200a52 871 if (has_ref && !atomic_read(&ct->n_control) &&
762c4007
JA
872 (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
873 !(ct->state & IP_VS_CTPL_S_ASSURED))) {
874 IP_VS_DBG(4, "drop controlling connection\n");
f9200a52
JA
875 ip_vs_conn_del_put(ct);
876 } else if (has_ref) {
877 __ip_vs_conn_put(ct);
762c4007
JA
878 }
879 }
1da177e4 880
8fb04d9f
MA
881 if ((cp->flags & IP_VS_CONN_F_NFCT) &&
882 !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
8f4e0a18
HS
883 /* Do not access conntracks during subsys cleanup
884 * because nf_conntrack_find_get can not be used after
885 * conntrack cleanup for the net.
886 */
887 smp_rmb();
888 if (ipvs->enable)
889 ip_vs_conn_drop_conntrack(cp);
890 }
f4bc17cd 891
1da177e4
LT
892 if (unlikely(cp->app != NULL))
893 ip_vs_unbind_app(cp);
894 ip_vs_unbind_dest(cp);
895 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
896 atomic_dec(&ip_vs_conn_no_cport_cnt);
013b0424
MA
897 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
898 ip_vs_conn_rcu_free(&cp->rcu_head);
899 else
900 call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
6e67e586 901 atomic_dec(&ipvs->conn_count);
1da177e4
LT
902 return;
903 }
904
1da177e4 905 expire_later:
088339a5 906 IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
b54ab92b 907 refcount_read(&cp->refcnt),
1da177e4
LT
908 atomic_read(&cp->n_control));
909
b54ab92b 910 refcount_inc(&cp->refcnt);
088339a5
JA
911 cp->timeout = 60*HZ;
912
749c42b6 913 if (ipvs->sync_state & IP_VS_STATE_MASTER)
b61a8c1a 914 ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
749c42b6 915
013b0424 916 __ip_vs_conn_put_timer(cp);
1da177e4
LT
917}
918
088339a5
JA
919/* Modify timer, so that it expires as soon as possible.
920 * Can be called without reference only if under RCU lock.
762c4007
JA
921 * We can have such chain of conns linked with ->control: DATA->CTL->TPL
922 * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
923 * - cp->timeout=0 indicates all conns from chain should be dropped but
924 * TPL is not dropped if in assured state
088339a5 925 */
1da177e4
LT
926void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
927{
088339a5
JA
928 /* Using mod_timer_pending will ensure the timer is not
929 * modified after the final del_timer in ip_vs_conn_expire.
930 */
931 if (timer_pending(&cp->timer) &&
932 time_after(cp->timer.expires, jiffies))
933 mod_timer_pending(&cp->timer, jiffies);
1da177e4
LT
934}
935
936
937/*
938 * Create a new connection entry and hash it into the ip_vs_conn_tab
939 */
940struct ip_vs_conn *
ba38528a 941ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
95c96174 942 const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
0e051e68 943 struct ip_vs_dest *dest, __u32 fwmark)
1da177e4
LT
944{
945 struct ip_vs_conn *cp;
e64e2b46 946 struct netns_ipvs *ipvs = p->ipvs;
18d6ade6 947 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
6e67e586 948 p->protocol);
1da177e4 949
9a05475c 950 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
1da177e4 951 if (cp == NULL) {
1e3e238e 952 IP_VS_ERR_RL("%s(): no memory\n", __func__);
1da177e4
LT
953 return NULL;
954 }
955
731109e7 956 INIT_HLIST_NODE(&cp->c_list);
8ef81c65 957 timer_setup(&cp->timer, ip_vs_conn_expire, 0);
58dbc6f2 958 cp->ipvs = ipvs;
f11017ec 959 cp->af = p->af;
ba38528a 960 cp->daf = dest_af;
f11017ec 961 cp->protocol = p->protocol;
9a05475c 962 ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
f11017ec 963 cp->cport = p->cport;
2a971354 964 /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
9a05475c 965 ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
2a971354
MK
966 &cp->vaddr, p->vaddr);
967 cp->vport = p->vport;
ba38528a 968 ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
1da177e4
LT
969 cp->dport = dport;
970 cp->flags = flags;
0e051e68 971 cp->fwmark = fwmark;
e9e5eee8
SH
972 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
973 ip_vs_pe_get(p->pe);
974 cp->pe = p->pe;
85999283
SH
975 cp->pe_data = p->pe_data;
976 cp->pe_data_len = p->pe_data_len;
9a05475c
JA
977 } else {
978 cp->pe = NULL;
979 cp->pe_data = NULL;
980 cp->pe_data_len = 0;
85999283 981 }
1da177e4
LT
982 spin_lock_init(&cp->lock);
983
984 /*
985 * Set the entry is referenced by the current thread before hashing
986 * it in the table, so that other thread run ip_vs_random_dropentry
987 * but cannot drop this entry.
988 */
b54ab92b 989 refcount_set(&cp->refcnt, 1);
1da177e4 990
9a05475c 991 cp->control = NULL;
1da177e4
LT
992 atomic_set(&cp->n_control, 0);
993 atomic_set(&cp->in_pkts, 0);
994
9a05475c
JA
995 cp->packet_xmit = NULL;
996 cp->app = NULL;
997 cp->app_data = NULL;
998 /* reset struct ip_vs_seq */
999 cp->in_seq.delta = 0;
1000 cp->out_seq.delta = 0;
1001
6e67e586 1002 atomic_inc(&ipvs->conn_count);
1da177e4
LT
1003 if (flags & IP_VS_CONN_F_NO_CPORT)
1004 atomic_inc(&ip_vs_conn_no_cport_cnt);
1005
1006 /* Bind the connection with a destination server */
9a05475c 1007 cp->dest = NULL;
1da177e4
LT
1008 ip_vs_bind_dest(cp, dest);
1009
1010 /* Set its state and timeout */
1011 cp->state = 0;
9a05475c 1012 cp->old_state = 0;
1da177e4 1013 cp->timeout = 3*HZ;
749c42b6 1014 cp->sync_endtime = jiffies & ~3UL;
1da177e4
LT
1015
1016 /* Bind its packet transmitter */
b3cdd2a7 1017#ifdef CONFIG_IP_VS_IPV6
f11017ec 1018 if (p->af == AF_INET6)
b3cdd2a7
JV
1019 ip_vs_bind_xmit_v6(cp);
1020 else
1021#endif
1022 ip_vs_bind_xmit(cp);
1da177e4 1023
9bbac6a9
HS
1024 if (unlikely(pd && atomic_read(&pd->appcnt)))
1025 ip_vs_bind_app(cp, pd->pp);
1da177e4 1026
f4bc17cd
JA
1027 /*
1028 * Allow conntrack to be preserved. By default, conntrack
1029 * is created and destroyed for every packet.
1030 * Sometimes keeping conntrack can be useful for
1031 * IP_VS_CONN_F_ONE_PACKET too.
1032 */
1033
a0840e2e 1034 if (ip_vs_conntrack_enabled(ipvs))
f4bc17cd
JA
1035 cp->flags |= IP_VS_CONN_F_NFCT;
1036
1da177e4
LT
1037 /* Hash it in the ip_vs_conn_tab finally */
1038 ip_vs_conn_hash(cp);
1039
1040 return cp;
1041}
1042
1da177e4
LT
1043/*
1044 * /proc/net/ip_vs_conn entries
1045 */
1046#ifdef CONFIG_PROC_FS
6e67e586 1047struct ip_vs_iter_state {
731109e7
CG
1048 struct seq_net_private p;
1049 struct hlist_head *l;
6e67e586 1050};
1da177e4
LT
1051
1052static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
1053{
1054 int idx;
1055 struct ip_vs_conn *cp;
6e67e586 1056 struct ip_vs_iter_state *iter = seq->private;
e905a9ed 1057
6f7edb48 1058 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
088339a5
JA
1059 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
1060 /* __ip_vs_conn_get() is not needed by
1061 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
1062 */
1da177e4 1063 if (pos-- == 0) {
6e67e586 1064 iter->l = &ip_vs_conn_tab[idx];
731109e7 1065 return cp;
1da177e4
LT
1066 }
1067 }
a38e5e23 1068 cond_resched_rcu();
1da177e4
LT
1069 }
1070
1071 return NULL;
1072}
1073
1074static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
7cf2eb7b 1075 __acquires(RCU)
1da177e4 1076{
6e67e586
HS
1077 struct ip_vs_iter_state *iter = seq->private;
1078
1079 iter->l = NULL;
7cf2eb7b 1080 rcu_read_lock();
1da177e4
LT
1081 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
1082}
1083
1084static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1085{
1086 struct ip_vs_conn *cp = v;
6e67e586 1087 struct ip_vs_iter_state *iter = seq->private;
088339a5 1088 struct hlist_node *e;
731109e7 1089 struct hlist_head *l = iter->l;
1da177e4
LT
1090 int idx;
1091
1092 ++*pos;
e905a9ed 1093 if (v == SEQ_START_TOKEN)
1da177e4
LT
1094 return ip_vs_conn_array(seq, 0);
1095
1096 /* more on same hash chain? */
088339a5
JA
1097 e = rcu_dereference(hlist_next_rcu(&cp->c_list));
1098 if (e)
1099 return hlist_entry(e, struct ip_vs_conn, c_list);
1da177e4
LT
1100
1101 idx = l - ip_vs_conn_tab;
6f7edb48 1102 while (++idx < ip_vs_conn_tab_size) {
088339a5 1103 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
6e67e586 1104 iter->l = &ip_vs_conn_tab[idx];
1da177e4 1105 return cp;
e905a9ed 1106 }
a38e5e23 1107 cond_resched_rcu();
1da177e4 1108 }
6e67e586 1109 iter->l = NULL;
1da177e4
LT
1110 return NULL;
1111}
1112
1113static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
7cf2eb7b 1114 __releases(RCU)
1da177e4 1115{
7cf2eb7b 1116 rcu_read_unlock();
1da177e4
LT
1117}
1118
1119static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
1120{
1121
1122 if (v == SEQ_START_TOKEN)
1123 seq_puts(seq,
a3c918ac 1124 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
1da177e4
LT
1125 else {
1126 const struct ip_vs_conn *cp = v;
6e67e586 1127 struct net *net = seq_file_net(seq);
a3c918ac
SH
1128 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
1129 size_t len = 0;
f18ae720 1130 char dbuf[IP_VS_ADDRSTRLEN];
a3c918ac 1131
58dbc6f2 1132 if (!net_eq(cp->ipvs->net, net))
6e67e586 1133 return 0;
e9e5eee8 1134 if (cp->pe_data) {
a3c918ac 1135 pe_data[0] = ' ';
e9e5eee8
SH
1136 len = strlen(cp->pe->name);
1137 memcpy(pe_data + 1, cp->pe->name, len);
a3c918ac
SH
1138 pe_data[len + 1] = ' ';
1139 len += 2;
e9e5eee8 1140 len += cp->pe->show_pe_data(cp, pe_data + len);
a3c918ac
SH
1141 }
1142 pe_data[len] = '\0';
1da177e4 1143
f18ae720
JA
1144#ifdef CONFIG_IP_VS_IPV6
1145 if (cp->daf == AF_INET6)
1146 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
1147 else
1148#endif
1149 snprintf(dbuf, sizeof(dbuf), "%08X",
1150 ntohl(cp->daddr.ip));
1151
667a5f18
VB
1152#ifdef CONFIG_IP_VS_IPV6
1153 if (cp->af == AF_INET6)
a3c918ac 1154 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
b71ed54d 1155 "%s %04X %-11s %7u%s\n",
667a5f18 1156 ip_vs_proto_name(cp->protocol),
38ff4fa4
HH
1157 &cp->caddr.in6, ntohs(cp->cport),
1158 &cp->vaddr.in6, ntohs(cp->vport),
f18ae720 1159 dbuf, ntohs(cp->dport),
ec1b28ca 1160 ip_vs_state_name(cp),
b71ed54d
MC
1161 jiffies_delta_to_msecs(cp->timer.expires -
1162 jiffies) / 1000,
1163 pe_data);
667a5f18
VB
1164 else
1165#endif
1166 seq_printf(seq,
1167 "%-3s %08X %04X %08X %04X"
b71ed54d 1168 " %s %04X %-11s %7u%s\n",
1da177e4 1169 ip_vs_proto_name(cp->protocol),
e7ade46a
JV
1170 ntohl(cp->caddr.ip), ntohs(cp->cport),
1171 ntohl(cp->vaddr.ip), ntohs(cp->vport),
f18ae720 1172 dbuf, ntohs(cp->dport),
ec1b28ca 1173 ip_vs_state_name(cp),
b71ed54d
MC
1174 jiffies_delta_to_msecs(cp->timer.expires -
1175 jiffies) / 1000,
1176 pe_data);
1da177e4
LT
1177 }
1178 return 0;
1179}
1180
56b3d975 1181static const struct seq_operations ip_vs_conn_seq_ops = {
1da177e4
LT
1182 .start = ip_vs_conn_seq_start,
1183 .next = ip_vs_conn_seq_next,
1184 .stop = ip_vs_conn_seq_stop,
1185 .show = ip_vs_conn_seq_show,
1186};
1187
95c96174 1188static const char *ip_vs_origin_name(unsigned int flags)
7a4fbb1f
RB
1189{
1190 if (flags & IP_VS_CONN_F_SYNC)
1191 return "SYNC";
1192 else
1193 return "LOCAL";
1194}
1195
1196static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
1197{
f18ae720 1198 char dbuf[IP_VS_ADDRSTRLEN];
7a4fbb1f
RB
1199
1200 if (v == SEQ_START_TOKEN)
1201 seq_puts(seq,
1202 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
1203 else {
1204 const struct ip_vs_conn *cp = v;
6e67e586
HS
1205 struct net *net = seq_file_net(seq);
1206
58dbc6f2 1207 if (!net_eq(cp->ipvs->net, net))
6e67e586 1208 return 0;
7a4fbb1f 1209
f18ae720
JA
1210#ifdef CONFIG_IP_VS_IPV6
1211 if (cp->daf == AF_INET6)
1212 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
1213 else
1214#endif
1215 snprintf(dbuf, sizeof(dbuf), "%08X",
1216 ntohl(cp->daddr.ip));
1217
667a5f18
VB
1218#ifdef CONFIG_IP_VS_IPV6
1219 if (cp->af == AF_INET6)
f18ae720 1220 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
b71ed54d 1221 "%s %04X %-11s %-6s %7u\n",
667a5f18 1222 ip_vs_proto_name(cp->protocol),
38ff4fa4
HH
1223 &cp->caddr.in6, ntohs(cp->cport),
1224 &cp->vaddr.in6, ntohs(cp->vport),
f18ae720 1225 dbuf, ntohs(cp->dport),
ec1b28ca 1226 ip_vs_state_name(cp),
667a5f18 1227 ip_vs_origin_name(cp->flags),
b71ed54d
MC
1228 jiffies_delta_to_msecs(cp->timer.expires -
1229 jiffies) / 1000);
667a5f18
VB
1230 else
1231#endif
1232 seq_printf(seq,
1233 "%-3s %08X %04X %08X %04X "
b71ed54d 1234 "%s %04X %-11s %-6s %7u\n",
7a4fbb1f 1235 ip_vs_proto_name(cp->protocol),
e7ade46a
JV
1236 ntohl(cp->caddr.ip), ntohs(cp->cport),
1237 ntohl(cp->vaddr.ip), ntohs(cp->vport),
f18ae720 1238 dbuf, ntohs(cp->dport),
ec1b28ca 1239 ip_vs_state_name(cp),
7a4fbb1f 1240 ip_vs_origin_name(cp->flags),
b71ed54d
MC
1241 jiffies_delta_to_msecs(cp->timer.expires -
1242 jiffies) / 1000);
7a4fbb1f
RB
1243 }
1244 return 0;
1245}
1246
1247static const struct seq_operations ip_vs_conn_sync_seq_ops = {
1248 .start = ip_vs_conn_seq_start,
1249 .next = ip_vs_conn_seq_next,
1250 .stop = ip_vs_conn_seq_stop,
1251 .show = ip_vs_conn_sync_seq_show,
1252};
1da177e4
LT
1253#endif
1254
1255
762c4007
JA
1256/* Randomly drop connection entries before running out of memory
1257 * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
1258 * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
1259 * - traffic for services not in OPS mode does not increase ct->in_pkts in
1260 * all cases, so it is not supported
1da177e4
LT
1261 */
1262static inline int todrop_entry(struct ip_vs_conn *cp)
1263{
1264 /*
1265 * The drop rate array needs tuning for real environments.
1266 * Called from timer bh only => no locking
1267 */
9b5b5cff 1268 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
1da177e4
LT
1269 static char todrop_counter[9] = {0};
1270 int i;
1271
1272 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
1273 This will leave enough time for normal connection to get
1274 through. */
1275 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
1276 return 0;
1277
1278 /* Don't drop the entry if its number of incoming packets is not
1279 located in [0, 8] */
1280 i = atomic_read(&cp->in_pkts);
1281 if (i > 8 || i < 0) return 0;
1282
1283 if (!todrop_rate[i]) return 0;
1284 if (--todrop_counter[i] > 0) return 0;
1285
1286 todrop_counter[i] = todrop_rate[i];
1287 return 1;
1288}
1289
698e2a8d
MA
1290static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
1291{
1292 struct ip_vs_service *svc;
1293
1294 if (!cp->dest)
1295 return false;
1296 svc = rcu_dereference(cp->dest->svc);
1297 return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
1298}
1299
af9debd4 1300/* Called from keventd and must protect itself from softirqs */
423b5595 1301void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
1da177e4
LT
1302{
1303 int idx;
762c4007 1304 struct ip_vs_conn *cp;
1da177e4 1305
a38e5e23 1306 rcu_read_lock();
1da177e4
LT
1307 /*
1308 * Randomly scan 1/32 of the whole table every second
1309 */
6f7edb48 1310 for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
a251c17a 1311 unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask;
1da177e4 1312
088339a5 1313 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
423b5595 1314 if (cp->ipvs != ipvs)
f6340ee0 1315 continue;
762c4007
JA
1316 if (atomic_read(&cp->n_control))
1317 continue;
698e2a8d 1318 if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
762c4007
JA
1319 /* connection template of OPS */
1320 if (ip_vs_conn_ops_mode(cp))
698e2a8d 1321 goto try_drop;
762c4007
JA
1322 if (!(cp->state & IP_VS_CTPL_S_ASSURED))
1323 goto drop;
1324 continue;
698e2a8d 1325 }
1da177e4
LT
1326 if (cp->protocol == IPPROTO_TCP) {
1327 switch(cp->state) {
1328 case IP_VS_TCP_S_SYN_RECV:
1329 case IP_VS_TCP_S_SYNACK:
1330 break;
1331
1332 case IP_VS_TCP_S_ESTABLISHED:
1333 if (todrop_entry(cp))
1334 break;
1335 continue;
1336
1337 default:
1338 continue;
1339 }
acaac5d8
JA
1340 } else if (cp->protocol == IPPROTO_SCTP) {
1341 switch (cp->state) {
1342 case IP_VS_SCTP_S_INIT1:
1343 case IP_VS_SCTP_S_INIT:
1344 break;
1345 case IP_VS_SCTP_S_ESTABLISHED:
1346 if (todrop_entry(cp))
1347 break;
1348 continue;
1349 default:
1350 continue;
1351 }
1da177e4 1352 } else {
698e2a8d 1353try_drop:
1da177e4
LT
1354 if (!todrop_entry(cp))
1355 continue;
1356 }
1357
762c4007
JA
1358drop:
1359 IP_VS_DBG(4, "drop connection\n");
f9200a52 1360 ip_vs_conn_del(cp);
1da177e4 1361 }
a38e5e23 1362 cond_resched_rcu();
1da177e4 1363 }
a38e5e23 1364 rcu_read_unlock();
1da177e4
LT
1365}
1366
1367
1368/*
1369 * Flush all the connection entries in the ip_vs_conn_tab
1370 */
d889717a 1371static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
1da177e4
LT
1372{
1373 int idx;
088339a5 1374 struct ip_vs_conn *cp, *cp_c;
1da177e4 1375
a0840e2e 1376flush_again:
a38e5e23 1377 rcu_read_lock();
6f7edb48 1378 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1da177e4 1379
088339a5 1380 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
58dbc6f2 1381 if (cp->ipvs != ipvs)
6e67e586 1382 continue;
f9200a52
JA
1383 if (atomic_read(&cp->n_control))
1384 continue;
088339a5 1385 cp_c = cp->control;
f9200a52
JA
1386 IP_VS_DBG(4, "del connection\n");
1387 ip_vs_conn_del(cp);
1388 if (cp_c && !atomic_read(&cp_c->n_control)) {
762c4007 1389 IP_VS_DBG(4, "del controlling connection\n");
f9200a52 1390 ip_vs_conn_del(cp_c);
1da177e4 1391 }
1da177e4 1392 }
a38e5e23 1393 cond_resched_rcu();
1da177e4 1394 }
a38e5e23 1395 rcu_read_unlock();
1da177e4
LT
1396
1397 /* the counter may be not NULL, because maybe some conn entries
1398 are run by slow timer handler or unhashed but still referred */
6e67e586 1399 if (atomic_read(&ipvs->conn_count) != 0) {
1da177e4
LT
1400 schedule();
1401 goto flush_again;
1402 }
1403}
35dfb013
ASK
1404
1405#ifdef CONFIG_SYSCTL
1406void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
1407{
1408 int idx;
1409 struct ip_vs_conn *cp, *cp_c;
1410 struct ip_vs_dest *dest;
1411
1412 rcu_read_lock();
1413 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1414 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
1415 if (cp->ipvs != ipvs)
1416 continue;
1417
1418 dest = cp->dest;
1419 if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
1420 continue;
1421
1422 if (atomic_read(&cp->n_control))
1423 continue;
1424
1425 cp_c = cp->control;
1426 IP_VS_DBG(4, "del connection\n");
1427 ip_vs_conn_del(cp);
1428 if (cp_c && !atomic_read(&cp_c->n_control)) {
1429 IP_VS_DBG(4, "del controlling connection\n");
1430 ip_vs_conn_del(cp_c);
1431 }
1432 }
1433 cond_resched_rcu();
1434
1435 /* netns clean up started, abort delayed work */
1436 if (!ipvs->enable)
1437 break;
1438 }
1439 rcu_read_unlock();
1440}
1441#endif
1442
61b1ab45
HS
1443/*
1444 * per netns init and exit
1445 */
2f3edc6a 1446int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
61b1ab45 1447{
6e67e586 1448 atomic_set(&ipvs->conn_count, 0);
1da177e4 1449
c3506372
CH
1450 proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
1451 &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
1452 proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
1453 &ip_vs_conn_sync_seq_ops,
1454 sizeof(struct ip_vs_iter_state));
61b1ab45
HS
1455 return 0;
1456}
1457
2f3edc6a 1458void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
61b1ab45 1459{
6e67e586 1460 /* flush all the connection entries first */
d889717a 1461 ip_vs_conn_flush(ipvs);
92240e8d
SH
1462 remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
1463 remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
61b1ab45 1464}
1da177e4 1465
048cf48b 1466int __init ip_vs_conn_init(void)
1da177e4
LT
1467{
1468 int idx;
1469
6f7edb48 1470 /* Compute size and mask */
69e73dbf
AC
1471 if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
1472 pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
1473 ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
1474 }
6f7edb48
CB
1475 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
1476 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
1477
1da177e4
LT
1478 /*
1479 * Allocate the connection hash table and initialize its list heads
1480 */
42bc47b3
KC
1481 ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size,
1482 sizeof(*ip_vs_conn_tab)));
1da177e4
LT
1483 if (!ip_vs_conn_tab)
1484 return -ENOMEM;
1485
1486 /* Allocate ip_vs_conn slab cache */
1487 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
1488 sizeof(struct ip_vs_conn), 0,
20c2df83 1489 SLAB_HWCACHE_ALIGN, NULL);
1da177e4
LT
1490 if (!ip_vs_conn_cachep) {
1491 vfree(ip_vs_conn_tab);
1492 return -ENOMEM;
1493 }
1494
1e3e238e
HE
1495 pr_info("Connection hash table configured "
1496 "(size=%d, memory=%ldKbytes)\n",
6f7edb48 1497 ip_vs_conn_tab_size,
eba1a872 1498 (long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024);
5b5e0928 1499 IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
1da177e4
LT
1500 sizeof(struct ip_vs_conn));
1501
731109e7
CG
1502 for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
1503 INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
1da177e4
LT
1504
1505 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
088339a5 1506 spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
1da177e4
LT
1507 }
1508
1da177e4
LT
1509 /* calculate the random value for connection hash */
1510 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1511
7a4f0761 1512 return 0;
1da177e4
LT
1513}
1514
1da177e4
LT
1515void ip_vs_conn_cleanup(void)
1516{
088339a5
JA
1517 /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
1518 rcu_barrier();
1da177e4
LT
1519 /* Release the empty cache */
1520 kmem_cache_destroy(ip_vs_conn_cachep);
1da177e4
LT
1521 vfree(ip_vs_conn_tab);
1522}