ipvs: Use kthread_run() instead of doing a double-fork via kernel_thread()
[linux-2.6-block.git] net/ipv4/ipvs/ip_vs_sync.c
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system. IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h>                 /* for ip_mc_join_group */
#include <linux/udp.h>
#include <linux/err.h>
#include <linux/kthread.h>

#include <net/ip.h>
#include <net/sock.h>

#include <net/ip_vs.h>

#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT  8848          /* multicast port */


/*
 *	IPVS sync connection entry
 */
struct ip_vs_sync_conn {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;       /* Which protocol (TCP/UDP) */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			caddr;          /* client address */
	__be32			vaddr;          /* virtual address */
	__be32			daddr;          /* destination address */

	/* Flags and state transition */
	__be16			flags;          /* status flags */
	__be16			state;          /* state info */

	/* The sequence options start here */
};

struct ip_vs_sync_conn_options {
	struct ip_vs_seq	in_seq;         /* incoming seq. struct */
	struct ip_vs_seq	out_seq;        /* outgoing seq. struct */
};

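/*
 *	Per-thread data handed to the sync kthreads: the multicast socket
 *	the thread sends or receives on, and (for the backup thread only)
 *	the buffer used for receiving sync messages.
 */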
struct ip_vs_sync_thread_data {
	struct socket *sock;
	char *buf;
};

#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))

/*
  The master multicasts messages to the backup load balancers in the
  following format.

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |  Count Conns  |    SyncID     |            Size               |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (1)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                            .                                  |
      |                            .                                  |
      |                            .                                  |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (n)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/

#define SYNC_MESG_HEADER_LEN	4

struct ip_vs_sync_mesg {
	__u8                    nr_conns;
	__u8                    syncid;
	__u16                   size;

	/* ip_vs_sync_conn entries start here */
};

/* the maximum length of sync (sending/receiving) message */
static int sync_send_mesg_maxlen;
static int sync_recv_mesg_maxlen;

struct ip_vs_sync_buff {
	struct list_head        list;
	unsigned long           firstuse;

	/* pointers for the message data */
	struct ip_vs_sync_mesg  *mesg;
	unsigned char           *head;
	unsigned char           *end;
};


/* the sync_buff list head and the lock */
static LIST_HEAD(ip_vs_sync_queue);
static DEFINE_SPINLOCK(ip_vs_sync_lock);

/* current sync_buff for accepting new conn entries */
static struct ip_vs_sync_buff   *curr_sb = NULL;
static DEFINE_SPINLOCK(curr_sb_lock);

/* ipvs sync daemon state */
volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
volatile int ip_vs_master_syncid = 0;
volatile int ip_vs_backup_syncid = 0;

/* multicast interface name */
char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];

/* sync daemon tasks */
static struct task_struct *sync_master_thread;
static struct task_struct *sync_backup_thread;

/* multicast addr */
static struct sockaddr_in mcast_addr = {
	.sin_family		= AF_INET,
	.sin_port		= __constant_htons(IP_VS_SYNC_PORT),
	.sin_addr.s_addr	= __constant_htonl(IP_VS_SYNC_GROUP),
};

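/*
 *	Pop the first queued sync buffer, or return NULL if the queue is
 *	empty.  The caller owns the returned buffer and must release it
 *	with ip_vs_sync_buff_release() after sending it.
 */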
static inline struct ip_vs_sync_buff *sb_dequeue(void)
{
	struct ip_vs_sync_buff *sb;

	spin_lock_bh(&ip_vs_sync_lock);
	if (list_empty(&ip_vs_sync_queue)) {
		sb = NULL;
	} else {
		sb = list_entry(ip_vs_sync_queue.next,
				struct ip_vs_sync_buff,
				list);
		list_del(&sb->list);
	}
	spin_unlock_bh(&ip_vs_sync_lock);

	return sb;
}

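/*
 *	Allocate a sync buffer sized for the maximum sending message
 *	length, with an empty message header (no connection entries yet)
 *	already initialized.
 */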
static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
{
	struct ip_vs_sync_buff *sb;

	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
		return NULL;

	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
		kfree(sb);
		return NULL;
	}
	sb->mesg->nr_conns = 0;
	sb->mesg->syncid = ip_vs_master_syncid;
	sb->mesg->size = 4;
	sb->head = (unsigned char *)sb->mesg + 4;
	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
	sb->firstuse = jiffies;
	return sb;
}

static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
{
	kfree(sb->mesg);
	kfree(sb);
}

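/*
 *	Queue a filled sync buffer for transmission by the master thread.
 *	If the master state has already been cleared (the daemon is being
 *	stopped), release the buffer instead of queueing it.
 */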
static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
{
	spin_lock(&ip_vs_sync_lock);
	if (ip_vs_sync_state & IP_VS_STATE_MASTER)
		list_add_tail(&sb->list, &ip_vs_sync_queue);
	else
		ip_vs_sync_buff_release(sb);
	spin_unlock(&ip_vs_sync_lock);
}

/*
 *	Get the current sync buffer if it has been created for more
 *	than the specified time or the specified time is zero.
 */
static inline struct ip_vs_sync_buff *
get_curr_sync_buff(unsigned long time)
{
	struct ip_vs_sync_buff *sb;

	spin_lock_bh(&curr_sb_lock);
	if (curr_sb && (time == 0 ||
			time_before(jiffies - curr_sb->firstuse, time))) {
		sb = curr_sb;
		curr_sb = NULL;
	} else
		sb = NULL;
	spin_unlock_bh(&curr_sb_lock);
	return sb;
}

/*
 *	Add an ip_vs_conn entry into the current sync_buff.
 *	Called by ip_vs_in.
 */
void ip_vs_sync_conn(struct ip_vs_conn *cp)
{
	struct ip_vs_sync_mesg *m;
	struct ip_vs_sync_conn *s;
	int len;

	spin_lock(&curr_sb_lock);
	if (!curr_sb) {
		if (!(curr_sb=ip_vs_sync_buff_create())) {
			spin_unlock(&curr_sb_lock);
			IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
			return;
		}
	}

	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	m = curr_sb->mesg;
	s = (struct ip_vs_sync_conn *)curr_sb->head;

	/* copy members */
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr;
	s->vaddr = cp->vaddr;
	s->daddr = cp->daddr;
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size += len;
	curr_sb->head += len;

	/* check if there is space for the next one */
	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
		sb_queue_tail(curr_sb);
		curr_sb = NULL;
	}
	spin_unlock(&curr_sb_lock);

	/* synchronize its controller if it has one */
	if (cp->control)
		ip_vs_sync_conn(cp->control);
}

/*
 *	Process received multicast message and create the corresponding
 *	ip_vs_conn entries.
 */
static void ip_vs_process_message(const char *buffer, const size_t buflen)
{
	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
	struct ip_vs_sync_conn *s;
	struct ip_vs_sync_conn_options *opt;
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	struct ip_vs_dest *dest;
	char *p;
	int i;

	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
		IP_VS_ERR_RL("sync message header too short\n");
		return;
	}

	/* Convert size back to host byte order */
	m->size = ntohs(m->size);

	if (buflen != m->size) {
		IP_VS_ERR_RL("bogus sync message size\n");
		return;
	}

	/* SyncID sanity check */
	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
			  m->syncid);
		return;
	}

	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
	for (i=0; i<m->nr_conns; i++) {
		unsigned flags, state;

		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
			IP_VS_ERR_RL("bogus conn in sync message\n");
			return;
		}
		s = (struct ip_vs_sync_conn *) p;
		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
		flags &= ~IP_VS_CONN_F_HASHED;
		if (flags & IP_VS_CONN_F_SEQ_MASK) {
			opt = (struct ip_vs_sync_conn_options *)&s[1];
			p += FULL_CONN_SIZE;
			if (p > buffer+buflen) {
				IP_VS_ERR_RL("bogus conn options in sync message\n");
				return;
			}
		} else {
			opt = NULL;
			p += SIMPLE_CONN_SIZE;
		}

		state = ntohs(s->state);
		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
			pp = ip_vs_proto_get(s->protocol);
			if (!pp) {
				IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
					s->protocol);
				continue;
			}
			if (state >= pp->num_states) {
				IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
					pp->name, state);
				continue;
			}
		} else {
			/* protocol in templates is not used for state/timeout */
			pp = NULL;
			if (state > 0) {
				IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
					state);
				state = 0;
			}
		}

		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			cp = ip_vs_conn_in_get(s->protocol,
					       s->caddr, s->cport,
					       s->vaddr, s->vport);
		else
			cp = ip_vs_ct_in_get(s->protocol,
					     s->caddr, s->cport,
					     s->vaddr, s->vport);
		if (!cp) {
			/*
			 * Find the appropriate destination for the connection.
			 * If it is not found the connection will remain unbound
			 * but still handled.
			 */
			dest = ip_vs_find_dest(s->daddr, s->dport,
					       s->vaddr, s->vport,
					       s->protocol);
			/* Set the appropriate activity flag */
			if (s->protocol == IPPROTO_TCP) {
				if (state != IP_VS_TCP_S_ESTABLISHED)
					flags |= IP_VS_CONN_F_INACTIVE;
				else
					flags &= ~IP_VS_CONN_F_INACTIVE;
			}
			cp = ip_vs_conn_new(s->protocol,
					    s->caddr, s->cport,
					    s->vaddr, s->vport,
					    s->daddr, s->dport,
					    flags, dest);
			if (dest)
				atomic_dec(&dest->refcnt);
			if (!cp) {
				IP_VS_ERR("ip_vs_conn_new failed\n");
				return;
			}
		} else if (!cp->dest) {
			dest = ip_vs_try_bind_dest(cp);
			if (dest)
				atomic_dec(&dest->refcnt);
		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
			   (cp->state != state)) {
			/* update active/inactive flag for the connection */
			dest = cp->dest;
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
				(state != IP_VS_TCP_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				(state == IP_VS_TCP_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}

		if (opt)
			memcpy(&cp->in_seq, opt, sizeof(*opt));
		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
		cp->state = state;
		cp->old_state = cp->state;
		/*
		 * We cannot recover the right timeout for templates
		 * in all cases because we cannot find the right fwmark
		 * virtual service. If needed, we can do it for
		 * non-fwmark persistent services.
		 */
		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
			cp->timeout = pp->timeout_table[state];
		else
			cp->timeout = (3*60*HZ);
		ip_vs_conn_put(cp);
	}
}

/*
 *	Setup loopback of outgoing multicasts on a sending socket
 */
static void set_mcast_loop(struct sock *sk, u_char loop)
{
	struct inet_sock *inet = inet_sk(sk);

	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
	lock_sock(sk);
	inet->mc_loop = loop ? 1 : 0;
	release_sock(sk);
}

/*
 *	Specify TTL for outgoing multicasts on a sending socket
 */
static void set_mcast_ttl(struct sock *sk, u_char ttl)
{
	struct inet_sock *inet = inet_sk(sk);

	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
	lock_sock(sk);
	inet->mc_ttl = ttl;
	release_sock(sk);
}

/*
 *	Specify default interface for outgoing multicasts
 */
static int set_mcast_if(struct sock *sk, char *ifname)
{
	struct net_device *dev;
	struct inet_sock *inet = inet_sk(sk);

	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
		return -ENODEV;

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	lock_sock(sk);
	inet->mc_index = dev->ifindex;
	/*  inet->mc_addr  = 0; */
	release_sock(sk);

	return 0;
}


/*
 *	Set the maximum length of sync message according to the
 *	specified interface's MTU.
 */
static int set_sync_mesg_maxlen(int sync_state)
{
	struct net_device *dev;
	int num;

	if (sync_state == IP_VS_STATE_MASTER) {
		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
			return -ENODEV;

		num = (dev->mtu - sizeof(struct iphdr) -
		       sizeof(struct udphdr) -
		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
		sync_send_mesg_maxlen =
			SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
		IP_VS_DBG(7, "setting the maximum length of sync sending "
			  "message %d.\n", sync_send_mesg_maxlen);
	} else if (sync_state == IP_VS_STATE_BACKUP) {
		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
			return -ENODEV;

		sync_recv_mesg_maxlen = dev->mtu -
			sizeof(struct iphdr) - sizeof(struct udphdr);
		IP_VS_DBG(7, "setting the maximum length of sync receiving "
			  "message %d.\n", sync_recv_mesg_maxlen);
	}

	return 0;
}


/*
 *	Join a multicast group.
 *	The group is specified by a class D multicast address 224.0.0.0/4
 *	in the in_addr structure passed in as a parameter.
 */
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
	struct ip_mreqn mreq;
	struct net_device *dev;
	int ret;

	memset(&mreq, 0, sizeof(mreq));
	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));

	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
		return -ENODEV;
	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	mreq.imr_ifindex = dev->ifindex;

	lock_sock(sk);
	ret = ip_mc_join_group(sk, &mreq);
	release_sock(sk);

	return ret;
}


static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
	struct net_device *dev;
	__be32 addr;
	struct sockaddr_in sin;

	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
		return -ENODEV;

	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	if (!addr)
		IP_VS_ERR("You probably need to specify IP address on "
			  "multicast interface.\n");

	IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
		  ifname, NIPQUAD(addr));

	/* Now bind the socket with the address of multicast interface */
	sin.sin_family	     = AF_INET;
	sin.sin_addr.s_addr  = addr;
	sin.sin_port         = 0;

	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
}

/*
 *	Set up sending multicast socket over UDP
 */
static struct socket * make_send_sock(void)
{
	struct socket *sock;
	int result;

	/* First create a socket */
	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (result < 0) {
		IP_VS_ERR("Error during creation of socket; terminating\n");
		return ERR_PTR(result);
	}

	result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
	if (result < 0) {
		IP_VS_ERR("Error setting outbound mcast interface\n");
		goto error;
	}

	set_mcast_loop(sock->sk, 0);
	set_mcast_ttl(sock->sk, 1);

	result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
	if (result < 0) {
		IP_VS_ERR("Error binding address of the mcast interface\n");
		goto error;
	}

	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
				    sizeof(struct sockaddr), 0);
	if (result < 0) {
		IP_VS_ERR("Error connecting to the multicast addr\n");
		goto error;
	}

	return sock;

error:
	sock_release(sock);
	return ERR_PTR(result);
}

/*
 *	Set up receiving multicast socket over UDP
 */
static struct socket * make_receive_sock(void)
{
	struct socket *sock;
	int result;

	/* First create a socket */
	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (result < 0) {
		IP_VS_ERR("Error during creation of socket; terminating\n");
		return ERR_PTR(result);
	}

	/* it is equivalent to the REUSEADDR option in user-space */
	sock->sk->sk_reuse = 1;

	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
				 sizeof(struct sockaddr));
	if (result < 0) {
		IP_VS_ERR("Error binding to the multicast addr\n");
		goto error;
	}

	/* join the multicast group */
	result = join_mcast_group(sock->sk,
				  (struct in_addr *) &mcast_addr.sin_addr,
				  ip_vs_backup_mcast_ifn);
	if (result < 0) {
		IP_VS_ERR("Error joining to the multicast group\n");
		goto error;
	}

	return sock;

error:
	sock_release(sock);
	return ERR_PTR(result);
}

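/*
 *	Send a buffer on the connected sending multicast socket, without
 *	blocking and without raising SIGPIPE on errors.
 */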
static int
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
{
	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
	struct kvec	iov;
	int		len;

	EnterFunction(7);
	iov.iov_base     = (void *)buffer;
	iov.iov_len      = length;

	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));

	LeaveFunction(7);
	return len;
}

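/*
 *	Convert the message size to network byte order and send the sync
 *	message on the master's multicast socket.
 */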
static void
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
	int msize;

	msize = msg->size;

	/* Put size in network byte order */
	msg->size = htons(msg->size);

	if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
		IP_VS_ERR("ip_vs_send_async error\n");
}

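/*
 *	Receive one packet from the backup's multicast socket into the
 *	supplied buffer; return the number of bytes received, or -1 on
 *	error.
 */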
static int
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
{
	struct msghdr		msg = {NULL,};
	struct kvec		iov;
	int			len;

	EnterFunction(7);

	/* Receive a packet */
	iov.iov_base     = buffer;
	iov.iov_len      = (size_t)buflen;

	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);

	if (len < 0)
		return -1;

	LeaveFunction(7);
	return len;
}

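/*
 *	Master sync daemon loop: send every queued sync buffer to the
 *	multicast group, flush the partially filled current buffer
 *	depending on its age (see get_curr_sync_buff()), sleep for a
 *	second and repeat until kthread_stop() is called.
 */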
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct ip_vs_sync_buff *sb;

	IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
		   "syncid = %d\n",
		   ip_vs_master_mcast_ifn, ip_vs_master_syncid);

	while (!kthread_should_stop()) {
		while ((sb = sb_dequeue())) {
			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
			ip_vs_sync_buff_release(sb);
		}

		/* check if entries stay in curr_sb for 2 seconds */
		sb = get_curr_sync_buff(2 * HZ);
		if (sb) {
			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
			ip_vs_sync_buff_release(sb);
		}

		msleep_interruptible(1000);
	}

	/* clean up the sync_buff queue */
	while ((sb=sb_dequeue())) {
		ip_vs_sync_buff_release(sb);
	}

	/* clean up the current sync_buff */
	if ((sb = get_curr_sync_buff(0))) {
		ip_vs_sync_buff_release(sb);
	}

	/* release the sending multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo);

	return 0;
}

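/*
 *	Backup sync daemon loop: drain the receive queue of the multicast
 *	socket, feed each received message to ip_vs_process_message() with
 *	bottom halves disabled, sleep for a second and repeat until
 *	kthread_stop() is called.
 */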
static int sync_thread_backup(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	int len;

	IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
		   "syncid = %d\n",
		   ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);

	while (!kthread_should_stop()) {
		/* do we have data now? */
		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
			len = ip_vs_receive(tinfo->sock, tinfo->buf,
					    sync_recv_mesg_maxlen);
			if (len <= 0) {
				IP_VS_ERR("receiving message error\n");
				break;
			}

			/* disable bottom half, because it accesses the data
			   shared by softirq while getting/creating conns */
			local_bh_disable();
			ip_vs_process_message(tinfo->buf, len);
			local_bh_enable();
		}

		msleep_interruptible(1000);
	}

	/* release the receiving multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo->buf);
	kfree(tinfo);

	return 0;
}

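/*
 *	Create the multicast socket and the thread data for the requested
 *	state (master or backup) and start the corresponding sync daemon
 *	with kthread_run().
 */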
int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
{
	struct ip_vs_sync_thread_data *tinfo;
	struct task_struct **realtask, *task;
	struct socket *sock;
	char *name, *buf = NULL;
	int (*threadfn)(void *data);
	int result = -ENOMEM;

	IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
		  sizeof(struct ip_vs_sync_conn));

	if (state == IP_VS_STATE_MASTER) {
		if (sync_master_thread)
			return -EEXIST;

		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
			sizeof(ip_vs_master_mcast_ifn));
		ip_vs_master_syncid = syncid;
		realtask = &sync_master_thread;
		name = "ipvs_syncmaster";
		threadfn = sync_thread_master;
		sock = make_send_sock();
	} else if (state == IP_VS_STATE_BACKUP) {
		if (sync_backup_thread)
			return -EEXIST;

		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
			sizeof(ip_vs_backup_mcast_ifn));
		ip_vs_backup_syncid = syncid;
		realtask = &sync_backup_thread;
		name = "ipvs_syncbackup";
		threadfn = sync_thread_backup;
		sock = make_receive_sock();
	} else {
		return -EINVAL;
	}

	if (IS_ERR(sock)) {
		result = PTR_ERR(sock);
		goto out;
	}

	set_sync_mesg_maxlen(state);
	if (state == IP_VS_STATE_BACKUP) {
		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
		if (!buf)
			goto outsocket;
	}

	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
	if (!tinfo)
		goto outbuf;

	tinfo->sock = sock;
	tinfo->buf = buf;

	task = kthread_run(threadfn, tinfo, name);
	if (IS_ERR(task)) {
		result = PTR_ERR(task);
		goto outtinfo;
	}

	/* mark as active */
	*realtask = task;
	ip_vs_sync_state |= state;

	/* increase the module use count */
	ip_vs_use_count_inc();

	return 0;

outtinfo:
	kfree(tinfo);
outbuf:
	kfree(buf);
outsocket:
	sock_release(sock);
out:
	return result;
}

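/*
 *	Stop the sync daemon for the given state: clear the corresponding
 *	state bit and wait for the kthread to exit via kthread_stop().
 */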
int stop_sync_thread(int state)
{
	IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));

	if (state == IP_VS_STATE_MASTER) {
		if (!sync_master_thread)
			return -ESRCH;

		IP_VS_INFO("stopping master sync thread %d ...\n",
			   task_pid_nr(sync_master_thread));

		/*
		 * The lock synchronizes with sb_queue_tail(), so that we don't
		 * add sync buffers to the queue while we are already in the
		 * process of stopping the master sync daemon.
		 */
		spin_lock(&ip_vs_sync_lock);
		ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
		spin_unlock(&ip_vs_sync_lock);
		kthread_stop(sync_master_thread);
		sync_master_thread = NULL;
	} else if (state == IP_VS_STATE_BACKUP) {
		if (!sync_backup_thread)
			return -ESRCH;

		IP_VS_INFO("stopping backup sync thread %d ...\n",
			   task_pid_nr(sync_backup_thread));

		ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
		kthread_stop(sync_backup_thread);
		sync_backup_thread = NULL;
	} else {
		return -EINVAL;
	}

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return 0;
}