ipvs: avoid indirect calls when calculating checksums
[linux-2.6-block.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
CommitLineData
1da177e4
LT
/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              The tcp_timeouts table has a copy per netns in a hash table
 *              per protocol (ip_vs_proto_data) and is handled by netns.
 */
19
9aada7ac
HE
20#define KMSG_COMPONENT "IPVS"
21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
1da177e4
LT
23#include <linux/kernel.h>
24#include <linux/ip.h>
25#include <linux/tcp.h> /* for tcphdr */
26#include <net/ip.h>
27#include <net/tcp.h> /* for csum_tcpudp_magic */
63f2c046 28#include <net/ip6_checksum.h>
af1e1cf0 29#include <linux/netfilter.h>
1da177e4
LT
30#include <linux/netfilter_ipv4.h>
31
32#include <net/ip_vs.h>
33
fe19a8fe
MC
/* Declared early: the SNAT/DNAT handlers call it before its definition. */
static int tcp_csum_check(int af, struct sk_buff *skb,
			  struct ip_vs_protocol *pp);
36
1da177e4 37static int
d8f44c33
EB
38tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
39 struct ip_vs_proto_data *pd,
d4383f04
JDB
40 int *verdict, struct ip_vs_conn **cpp,
41 struct ip_vs_iphdr *iph)
1da177e4
LT
42{
43 struct ip_vs_service *svc;
44 struct tcphdr _tcph, *th;
8f88ea68 45 __be16 _ports[2], *ports = NULL;
1da177e4 46
8f88ea68
AG
47 /* In the event of icmp, we're only guaranteed to have the first 8
48 * bytes of the transport header, so we only check the rest of the
49 * TCP packet for non-ICMP packets
50 */
51 if (likely(!ip_vs_iph_icmp(iph))) {
52 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
53 if (th) {
54 if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
55 return 1;
56 ports = &th->source;
57 }
58 } else {
59 ports = skb_header_pointer(
60 skb, iph->len, sizeof(_ports), &_ports);
6044eeff
AG
61 }
62
8f88ea68 63 if (!ports) {
1da177e4
LT
64 *verdict = NF_DROP;
65 return 0;
66 }
6044eeff 67
190ecd27 68 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
8f88ea68
AG
69
70 if (likely(!ip_vs_iph_inverse(iph)))
0a4fd6ce 71 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
8f88ea68
AG
72 &iph->daddr, ports[1]);
73 else
0a4fd6ce 74 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
8f88ea68
AG
75 &iph->saddr, ports[0]);
76
77 if (svc) {
190ecd27
JA
78 int ignored;
79
c6c96c18 80 if (ip_vs_todrop(ipvs)) {
1da177e4
LT
81 /*
82 * It seems that we are very loaded.
83 * We have to drop this packet :(
84 */
1da177e4
LT
85 *verdict = NF_DROP;
86 return 0;
87 }
88
89 /*
90 * Let the virtual server select a real server for the
91 * incoming connection, and create a connection entry.
92 */
d4383f04 93 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
a5959d53
HS
94 if (!*cpp && ignored <= 0) {
95 if (!ignored)
d4383f04 96 *verdict = ip_vs_leave(svc, skb, pd, iph);
ceec4c38 97 else
a5959d53 98 *verdict = NF_DROP;
1da177e4
LT
99 return 0;
100 }
1da177e4 101 }
a5959d53 102 /* NF_ACCEPT */
1da177e4
LT
103 return 1;
104}
105
106
107static inline void
0bbdd42b
JV
108tcp_fast_csum_update(int af, struct tcphdr *tcph,
109 const union nf_inet_addr *oldip,
110 const union nf_inet_addr *newip,
014d730d 111 __be16 oldport, __be16 newport)
1da177e4 112{
0bbdd42b
JV
113#ifdef CONFIG_IP_VS_IPV6
114 if (af == AF_INET6)
115 tcph->check =
116 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
117 ip_vs_check_diff2(oldport, newport,
118 ~csum_unfold(tcph->check))));
119 else
120#endif
1da177e4 121 tcph->check =
0bbdd42b 122 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
f9214b26
AV
123 ip_vs_check_diff2(oldport, newport,
124 ~csum_unfold(tcph->check))));
1da177e4
LT
125}
126
127
503e81f6
SH
128static inline void
129tcp_partial_csum_update(int af, struct tcphdr *tcph,
130 const union nf_inet_addr *oldip,
131 const union nf_inet_addr *newip,
132 __be16 oldlen, __be16 newlen)
133{
134#ifdef CONFIG_IP_VS_IPV6
135 if (af == AF_INET6)
136 tcph->check =
5bc9068e 137 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
503e81f6 138 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 139 csum_unfold(tcph->check))));
503e81f6
SH
140 else
141#endif
142 tcph->check =
5bc9068e 143 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
503e81f6 144 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 145 csum_unfold(tcph->check))));
503e81f6
SH
146}
147
148
1da177e4 149static int
d4383f04
JDB
150tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
151 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
1da177e4
LT
152{
153 struct tcphdr *tcph;
d4383f04 154 unsigned int tcphoff = iph->len;
503e81f6 155 int oldlen;
8b27b10f 156 int payload_csum = 0;
0bbdd42b
JV
157
158#ifdef CONFIG_IP_VS_IPV6
d4383f04 159 if (cp->af == AF_INET6 && iph->fragoffs)
63dca2c0 160 return 1;
0bbdd42b 161#endif
503e81f6 162 oldlen = skb->len - tcphoff;
1da177e4
LT
163
164 /* csum_check requires unshared skb */
3db05fea 165 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
1da177e4
LT
166 return 0;
167
168 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
169 int ret;
170
1da177e4 171 /* Some checks before mangling */
fe19a8fe 172 if (!tcp_csum_check(cp->af, skb, pp))
1da177e4
LT
173 return 0;
174
175 /* Call application helper if needed */
d12e1229 176 if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
1da177e4 177 return 0;
8b27b10f
JA
178 /* ret=2: csum update is needed after payload mangling */
179 if (ret == 1)
180 oldlen = skb->len - tcphoff;
181 else
182 payload_csum = 1;
1da177e4
LT
183 }
184
0bbdd42b 185 tcph = (void *)skb_network_header(skb) + tcphoff;
1da177e4
LT
186 tcph->source = cp->vport;
187
188 /* Adjust TCP checksums */
503e81f6
SH
189 if (skb->ip_summed == CHECKSUM_PARTIAL) {
190 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
ca62059b
HH
191 htons(oldlen),
192 htons(skb->len - tcphoff));
8b27b10f 193 } else if (!payload_csum) {
1da177e4 194 /* Only port and addr are changed, do fast csum update */
0bbdd42b 195 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
1da177e4 196 cp->dport, cp->vport);
3db05fea 197 if (skb->ip_summed == CHECKSUM_COMPLETE)
fe19a8fe 198 skb->ip_summed = cp->app ?
8b27b10f 199 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
200 } else {
201 /* full checksum calculation */
202 tcph->check = 0;
3db05fea 203 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
0bbdd42b
JV
204#ifdef CONFIG_IP_VS_IPV6
205 if (cp->af == AF_INET6)
206 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
207 &cp->caddr.in6,
208 skb->len - tcphoff,
209 cp->protocol, skb->csum);
210 else
211#endif
212 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
213 cp->caddr.ip,
214 skb->len - tcphoff,
215 cp->protocol,
216 skb->csum);
8b27b10f 217 skb->ip_summed = CHECKSUM_UNNECESSARY;
0bbdd42b 218
1da177e4
LT
219 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
220 pp->name, tcph->check,
221 (char*)&(tcph->check) - (char*)tcph);
222 }
223 return 1;
224}
225
226
227static int
d4383f04
JDB
228tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
229 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
1da177e4
LT
230{
231 struct tcphdr *tcph;
d4383f04 232 unsigned int tcphoff = iph->len;
503e81f6 233 int oldlen;
8b27b10f 234 int payload_csum = 0;
0bbdd42b
JV
235
236#ifdef CONFIG_IP_VS_IPV6
d4383f04 237 if (cp->af == AF_INET6 && iph->fragoffs)
63dca2c0 238 return 1;
0bbdd42b 239#endif
503e81f6 240 oldlen = skb->len - tcphoff;
1da177e4
LT
241
242 /* csum_check requires unshared skb */
3db05fea 243 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
1da177e4
LT
244 return 0;
245
246 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
247 int ret;
248
1da177e4 249 /* Some checks before mangling */
fe19a8fe 250 if (!tcp_csum_check(cp->af, skb, pp))
1da177e4
LT
251 return 0;
252
253 /*
254 * Attempt ip_vs_app call.
255 * It will fix ip_vs_conn and iph ack_seq stuff
256 */
d12e1229 257 if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
1da177e4 258 return 0;
8b27b10f
JA
259 /* ret=2: csum update is needed after payload mangling */
260 if (ret == 1)
261 oldlen = skb->len - tcphoff;
262 else
263 payload_csum = 1;
1da177e4
LT
264 }
265
0bbdd42b 266 tcph = (void *)skb_network_header(skb) + tcphoff;
1da177e4
LT
267 tcph->dest = cp->dport;
268
269 /*
270 * Adjust TCP checksums
271 */
503e81f6 272 if (skb->ip_summed == CHECKSUM_PARTIAL) {
5bc9068e 273 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
ca62059b
HH
274 htons(oldlen),
275 htons(skb->len - tcphoff));
8b27b10f 276 } else if (!payload_csum) {
1da177e4 277 /* Only port and addr are changed, do fast csum update */
0bbdd42b 278 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
1da177e4 279 cp->vport, cp->dport);
3db05fea 280 if (skb->ip_summed == CHECKSUM_COMPLETE)
fe19a8fe 281 skb->ip_summed = cp->app ?
8b27b10f 282 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
283 } else {
284 /* full checksum calculation */
285 tcph->check = 0;
3db05fea 286 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
0bbdd42b
JV
287#ifdef CONFIG_IP_VS_IPV6
288 if (cp->af == AF_INET6)
289 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
290 &cp->daddr.in6,
291 skb->len - tcphoff,
292 cp->protocol, skb->csum);
293 else
294#endif
295 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
296 cp->daddr.ip,
297 skb->len - tcphoff,
298 cp->protocol,
299 skb->csum);
3db05fea 300 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4
LT
301 }
302 return 1;
303}
304
305
306static int
51ef348b 307tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
1da177e4 308{
51ef348b
JV
309 unsigned int tcphoff;
310
311#ifdef CONFIG_IP_VS_IPV6
312 if (af == AF_INET6)
313 tcphoff = sizeof(struct ipv6hdr);
314 else
315#endif
316 tcphoff = ip_hdrlen(skb);
1da177e4
LT
317
318 switch (skb->ip_summed) {
319 case CHECKSUM_NONE:
320 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
e8542dce 321 /* fall through */
84fa7933 322 case CHECKSUM_COMPLETE:
51ef348b
JV
323#ifdef CONFIG_IP_VS_IPV6
324 if (af == AF_INET6) {
325 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
326 &ipv6_hdr(skb)->daddr,
327 skb->len - tcphoff,
328 ipv6_hdr(skb)->nexthdr,
329 skb->csum)) {
0d79641a 330 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
331 "Failed checksum for");
332 return 0;
333 }
334 } else
335#endif
336 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
337 ip_hdr(skb)->daddr,
338 skb->len - tcphoff,
339 ip_hdr(skb)->protocol,
340 skb->csum)) {
0d79641a 341 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
342 "Failed checksum for");
343 return 0;
344 }
1da177e4
LT
345 break;
346 default:
84fa7933 347 /* No need to checksum. */
1da177e4
LT
348 break;
349 }
350
351 return 1;
352}
353
354
355#define TCP_DIR_INPUT 0
356#define TCP_DIR_OUTPUT 4
357#define TCP_DIR_INPUT_ONLY 8
358
9b5b5cff 359static const int tcp_state_off[IP_VS_DIR_LAST] = {
1da177e4
LT
360 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
361 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
362 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
363};
364
365/*
366 * Timeout table[state]
367 */
4a85b96c 368static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
1da177e4
LT
369 [IP_VS_TCP_S_NONE] = 2*HZ,
370 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
371 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
372 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
373 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
374 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
375 [IP_VS_TCP_S_CLOSE] = 10*HZ,
376 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
377 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
378 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
379 [IP_VS_TCP_S_SYNACK] = 120*HZ,
380 [IP_VS_TCP_S_LAST] = 2*HZ,
381};
382
36cbd3dc 383static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
1da177e4
LT
384 [IP_VS_TCP_S_NONE] = "NONE",
385 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
386 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
387 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
388 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
389 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
390 [IP_VS_TCP_S_CLOSE] = "CLOSE",
391 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
392 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
393 [IP_VS_TCP_S_LISTEN] = "LISTEN",
394 [IP_VS_TCP_S_SYNACK] = "SYNACK",
395 [IP_VS_TCP_S_LAST] = "BUG!",
396};
397
be2cef49
MK
398static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = {
399 [IP_VS_TCP_S_NONE] = false,
400 [IP_VS_TCP_S_ESTABLISHED] = true,
401 [IP_VS_TCP_S_SYN_SENT] = true,
402 [IP_VS_TCP_S_SYN_RECV] = true,
403 [IP_VS_TCP_S_FIN_WAIT] = false,
404 [IP_VS_TCP_S_TIME_WAIT] = false,
405 [IP_VS_TCP_S_CLOSE] = false,
406 [IP_VS_TCP_S_CLOSE_WAIT] = false,
407 [IP_VS_TCP_S_LAST_ACK] = false,
408 [IP_VS_TCP_S_LISTEN] = false,
409 [IP_VS_TCP_S_SYNACK] = true,
410};
411
1da177e4
LT
412#define sNO IP_VS_TCP_S_NONE
413#define sES IP_VS_TCP_S_ESTABLISHED
414#define sSS IP_VS_TCP_S_SYN_SENT
415#define sSR IP_VS_TCP_S_SYN_RECV
416#define sFW IP_VS_TCP_S_FIN_WAIT
417#define sTW IP_VS_TCP_S_TIME_WAIT
418#define sCL IP_VS_TCP_S_CLOSE
419#define sCW IP_VS_TCP_S_CLOSE_WAIT
420#define sLA IP_VS_TCP_S_LAST_ACK
421#define sLI IP_VS_TCP_S_LISTEN
422#define sSA IP_VS_TCP_S_SYNACK
423
424struct tcp_states_t {
425 int next_state[IP_VS_TCP_S_LAST];
426};
427
428static const char * tcp_state_name(int state)
429{
430 if (state >= IP_VS_TCP_S_LAST)
431 return "ERR!";
432 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
433}
434
be2cef49
MK
435static bool tcp_state_active(int state)
436{
437 if (state >= IP_VS_TCP_S_LAST)
438 return false;
439 return tcp_state_active_table[state];
440}
441
535101ec 442static struct tcp_states_t tcp_states[] = {
1da177e4
LT
443/* INPUT */
444/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
445/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
446/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
c6c96c18 447/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
1da177e4
LT
448/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
449
450/* OUTPUT */
451/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
452/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
453/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
454/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
455/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
456
457/* INPUT-ONLY */
458/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
459/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
460/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
c6c96c18 461/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
1da177e4
LT
462/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
463};
464
535101ec 465static struct tcp_states_t tcp_states_dos[] = {
1da177e4
LT
466/* INPUT */
467/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
468/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
469/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
c6c96c18 470/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
1da177e4
LT
471/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
472
473/* OUTPUT */
474/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
475/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
476/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
477/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
478/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
479
480/* INPUT-ONLY */
481/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
482/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
483/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
c6c96c18 484/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
1da177e4
LT
485/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
486};
487
9330419d 488static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
1da177e4
LT
489{
490 int on = (flags & 1); /* secure_tcp */
491
492 /*
493 ** FIXME: change secure_tcp to independent sysctl var
494 ** or make it per-service or per-app because it is valid
495 ** for most if not for all of the applications. Something
496 ** like "capabilities" (flags) for each object.
497 */
9330419d 498 pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
1da177e4
LT
499}
500
1da177e4
LT
501static inline int tcp_state_idx(struct tcphdr *th)
502{
503 if (th->rst)
504 return 3;
505 if (th->syn)
506 return 0;
507 if (th->fin)
508 return 1;
509 if (th->ack)
510 return 2;
511 return -1;
512}
513
514static inline void
9330419d 515set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
1da177e4
LT
516 int direction, struct tcphdr *th)
517{
518 int state_idx;
519 int new_state = IP_VS_TCP_S_CLOSE;
520 int state_off = tcp_state_off[direction];
521
522 /*
523 * Update state offset to INPUT_ONLY if necessary
524 * or delete NO_OUTPUT flag if output packet detected
525 */
526 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
527 if (state_off == TCP_DIR_OUTPUT)
528 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
529 else
530 state_off = TCP_DIR_INPUT_ONLY;
531 }
532
533 if ((state_idx = tcp_state_idx(th)) < 0) {
534 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
535 goto tcp_state_out;
536 }
537
9330419d
HS
538 new_state =
539 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
1da177e4
LT
540
541 tcp_state_out:
542 if (new_state != cp->state) {
543 struct ip_vs_dest *dest = cp->dest;
544
cfc78c5a
JV
545 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
546 "%s:%d state: %s->%s conn->refcnt:%d\n",
9330419d 547 pd->pp->name,
cfc78c5a
JV
548 ((state_off == TCP_DIR_OUTPUT) ?
549 "output " : "input "),
550 th->syn ? 'S' : '.',
551 th->fin ? 'F' : '.',
552 th->ack ? 'A' : '.',
553 th->rst ? 'R' : '.',
f18ae720 554 IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
cfc78c5a
JV
555 ntohs(cp->dport),
556 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
557 ntohs(cp->cport),
558 tcp_state_name(cp->state),
559 tcp_state_name(new_state),
b54ab92b 560 refcount_read(&cp->refcnt));
cfc78c5a 561
1da177e4
LT
562 if (dest) {
563 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
be2cef49 564 !tcp_state_active(new_state)) {
1da177e4
LT
565 atomic_dec(&dest->activeconns);
566 atomic_inc(&dest->inactconns);
567 cp->flags |= IP_VS_CONN_F_INACTIVE;
568 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
be2cef49 569 tcp_state_active(new_state)) {
1da177e4
LT
570 atomic_inc(&dest->activeconns);
571 atomic_dec(&dest->inactconns);
572 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
573 }
574 }
27541143
JA
575 if (new_state == IP_VS_TCP_S_ESTABLISHED)
576 ip_vs_control_assure_ct(cp);
1da177e4
LT
577 }
578
4a85b96c
HS
579 if (likely(pd))
580 cp->timeout = pd->timeout_table[cp->state = new_state];
581 else /* What to do ? */
582 cp->timeout = tcp_timeouts[cp->state = new_state];
1da177e4
LT
583}
584
1da177e4
LT
585/*
586 * Handle state transitions
587 */
4a516f11 588static void
1da177e4
LT
589tcp_state_transition(struct ip_vs_conn *cp, int direction,
590 const struct sk_buff *skb,
9330419d 591 struct ip_vs_proto_data *pd)
1da177e4
LT
592{
593 struct tcphdr _tcph, *th;
594
0bbdd42b
JV
595#ifdef CONFIG_IP_VS_IPV6
596 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
597#else
598 int ihl = ip_hdrlen(skb);
599#endif
600
601 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
1da177e4 602 if (th == NULL)
4a516f11 603 return;
1da177e4 604
ac69269a 605 spin_lock_bh(&cp->lock);
9330419d 606 set_tcp_state(pd, cp, direction, th);
ac69269a 607 spin_unlock_bh(&cp->lock);
1da177e4
LT
608}
609
75e7ce66 610static inline __u16 tcp_app_hashkey(__be16 port)
1da177e4 611{
75e7ce66
AV
612 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
613 & TCP_APP_TAB_MASK;
1da177e4
LT
614}
615
616
19648918 617static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
1da177e4
LT
618{
619 struct ip_vs_app *i;
75e7ce66
AV
620 __u16 hash;
621 __be16 port = inc->port;
1da177e4 622 int ret = 0;
18d6ade6 623 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
1da177e4
LT
624
625 hash = tcp_app_hashkey(port);
626
4a85b96c 627 list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
1da177e4
LT
628 if (i->port == port) {
629 ret = -EEXIST;
630 goto out;
631 }
632 }
363c97d7 633 list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
9bbac6a9 634 atomic_inc(&pd->appcnt);
1da177e4
LT
635
636 out:
1da177e4
LT
637 return ret;
638}
639
640
641static void
19648918 642tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
1da177e4 643{
19648918 644 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
4a85b96c 645
9bbac6a9 646 atomic_dec(&pd->appcnt);
363c97d7 647 list_del_rcu(&inc->p_list);
1da177e4
LT
648}
649
650
651static int
652tcp_app_conn_bind(struct ip_vs_conn *cp)
653{
58dbc6f2 654 struct netns_ipvs *ipvs = cp->ipvs;
1da177e4
LT
655 int hash;
656 struct ip_vs_app *inc;
657 int result = 0;
658
659 /* Default binding: bind app only for NAT */
660 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
661 return 0;
662
663 /* Lookup application incarnations and bind the right one */
664 hash = tcp_app_hashkey(cp->vport);
665
363c97d7 666 list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
1da177e4
LT
667 if (inc->port == cp->vport) {
668 if (unlikely(!ip_vs_app_inc_get(inc)))
669 break;
1da177e4 670
1e3e238e 671 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
cfc78c5a
JV
672 "%s:%u to app %s on port %u\n",
673 __func__,
674 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
675 ntohs(cp->cport),
676 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
677 ntohs(cp->vport),
678 inc->name, ntohs(inc->port));
679
1da177e4
LT
680 cp->app = inc;
681 if (inc->init_conn)
682 result = inc->init_conn(inc, cp);
0b35f603 683 break;
1da177e4
LT
684 }
685 }
1da177e4 686
1da177e4
LT
687 return result;
688}
689
690
691/*
692 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
693 */
69f39093 694void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
1da177e4 695{
69f39093 696 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
4a85b96c 697
ac69269a 698 spin_lock_bh(&cp->lock);
1da177e4 699 cp->state = IP_VS_TCP_S_LISTEN;
4a85b96c
HS
700 cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
701 : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
ac69269a 702 spin_unlock_bh(&cp->lock);
1da177e4
LT
703}
704
4a85b96c
HS
705/* ---------------------------------------------
706 * timeouts is netns related now.
707 * ---------------------------------------------
708 */
1281a9c2 709static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
1da177e4 710{
4a85b96c 711 ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
4a85b96c
HS
712 pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
713 sizeof(tcp_timeouts));
582b8e3e
HS
714 if (!pd->timeout_table)
715 return -ENOMEM;
9330419d 716 pd->tcp_state_table = tcp_states;
582b8e3e 717 return 0;
4a85b96c 718}
1da177e4 719
1281a9c2 720static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
1da177e4 721{
4a85b96c 722 kfree(pd->timeout_table);
1da177e4
LT
723}
724
725
726struct ip_vs_protocol ip_vs_protocol_tcp = {
727 .name = "TCP",
728 .protocol = IPPROTO_TCP,
2ad17def 729 .num_states = IP_VS_TCP_S_LAST,
1da177e4 730 .dont_defrag = 0,
4a85b96c
HS
731 .init = NULL,
732 .exit = NULL,
733 .init_netns = __ip_vs_tcp_init,
734 .exit_netns = __ip_vs_tcp_exit,
1da177e4
LT
735 .register_app = tcp_register_app,
736 .unregister_app = tcp_unregister_app,
737 .conn_schedule = tcp_conn_schedule,
5c0d2374
SH
738 .conn_in_get = ip_vs_conn_in_get_proto,
739 .conn_out_get = ip_vs_conn_out_get_proto,
1da177e4
LT
740 .snat_handler = tcp_snat_handler,
741 .dnat_handler = tcp_dnat_handler,
1da177e4
LT
742 .state_name = tcp_state_name,
743 .state_transition = tcp_state_transition,
744 .app_conn_bind = tcp_app_conn_bind,
745 .debug_packet = ip_vs_tcpudp_debug_packet,
746 .timeout_change = tcp_timeout_change,
1da177e4 747};