cfb1d20993d1d412042a61501d73b57c27f3d8af
[linux-2.6-block.git] / net / ipv4 / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #include <linux/module.h>
22 #include <linux/init.h>
23 #include <linux/types.h>
24 #include <linux/capability.h>
25 #include <linux/fs.h>
26 #include <linux/sysctl.h>
27 #include <linux/proc_fs.h>
28 #include <linux/workqueue.h>
29 #include <linux/swap.h>
30 #include <linux/seq_file.h>
31
32 #include <linux/netfilter.h>
33 #include <linux/netfilter_ipv4.h>
34 #include <linux/mutex.h>
35
36 #include <net/net_namespace.h>
37 #include <net/ip.h>
38 #include <net/route.h>
39 #include <net/sock.h>
40
41 #include <asm/uaccess.h>
42
43 #include <net/ip_vs.h>
44
45 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
46 static DEFINE_MUTEX(__ip_vs_mutex);
47
48 /* lock for service table */
49 static DEFINE_RWLOCK(__ip_vs_svc_lock);
50
51 /* lock for table with the real services */
52 static DEFINE_RWLOCK(__ip_vs_rs_lock);
53
54 /* lock for state and timeout tables */
55 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
56
57 /* lock for drop entry handling */
58 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
59
60 /* lock for drop packet handling */
61 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
62
63 /* 1/rate drop and drop-entry variables */
64 int ip_vs_drop_rate = 0;
65 int ip_vs_drop_counter = 0;
66 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67
68 /* number of virtual services */
69 static int ip_vs_num_services = 0;
70
71 /* sysctl variables */
72 static int sysctl_ip_vs_drop_entry = 0;
73 static int sysctl_ip_vs_drop_packet = 0;
74 static int sysctl_ip_vs_secure_tcp = 0;
75 static int sysctl_ip_vs_amemthresh = 1024;
76 static int sysctl_ip_vs_am_droprate = 10;
77 int sysctl_ip_vs_cache_bypass = 0;
78 int sysctl_ip_vs_expire_nodest_conn = 0;
79 int sysctl_ip_vs_expire_quiescent_template = 0;
80 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
81 int sysctl_ip_vs_nat_icmp_send = 0;
82
83
#ifdef CONFIG_IP_VS_DEBUG
/* debug verbosity; only present when IPVS debugging is compiled in */
static int sysctl_ip_vs_debug_level;

/*
 *      Returns the current value of the debug-level variable.
 */
int ip_vs_get_debug_level(void)
{
        return sysctl_ip_vs_debug_level;
}
#endif /* CONFIG_IP_VS_DEBUG */
92
93 /*
94  *      update_defense_level is called from keventd and from sysctl,
95  *      so it needs to protect itself from softirqs
96  */
97 static void update_defense_level(void)
98 {
99         struct sysinfo i;
100         static int old_secure_tcp = 0;
101         int availmem;
102         int nomem;
103         int to_change = -1;
104
105         /* we only count free and buffered memory (in pages) */
106         si_meminfo(&i);
107         availmem = i.freeram + i.bufferram;
108         /* however in linux 2.5 the i.bufferram is total page cache size,
109            we need adjust it */
110         /* si_swapinfo(&i); */
111         /* availmem = availmem - (i.totalswap - i.freeswap); */
112
113         nomem = (availmem < sysctl_ip_vs_amemthresh);
114
115         local_bh_disable();
116
117         /* drop_entry */
118         spin_lock(&__ip_vs_dropentry_lock);
119         switch (sysctl_ip_vs_drop_entry) {
120         case 0:
121                 atomic_set(&ip_vs_dropentry, 0);
122                 break;
123         case 1:
124                 if (nomem) {
125                         atomic_set(&ip_vs_dropentry, 1);
126                         sysctl_ip_vs_drop_entry = 2;
127                 } else {
128                         atomic_set(&ip_vs_dropentry, 0);
129                 }
130                 break;
131         case 2:
132                 if (nomem) {
133                         atomic_set(&ip_vs_dropentry, 1);
134                 } else {
135                         atomic_set(&ip_vs_dropentry, 0);
136                         sysctl_ip_vs_drop_entry = 1;
137                 };
138                 break;
139         case 3:
140                 atomic_set(&ip_vs_dropentry, 1);
141                 break;
142         }
143         spin_unlock(&__ip_vs_dropentry_lock);
144
145         /* drop_packet */
146         spin_lock(&__ip_vs_droppacket_lock);
147         switch (sysctl_ip_vs_drop_packet) {
148         case 0:
149                 ip_vs_drop_rate = 0;
150                 break;
151         case 1:
152                 if (nomem) {
153                         ip_vs_drop_rate = ip_vs_drop_counter
154                                 = sysctl_ip_vs_amemthresh /
155                                 (sysctl_ip_vs_amemthresh-availmem);
156                         sysctl_ip_vs_drop_packet = 2;
157                 } else {
158                         ip_vs_drop_rate = 0;
159                 }
160                 break;
161         case 2:
162                 if (nomem) {
163                         ip_vs_drop_rate = ip_vs_drop_counter
164                                 = sysctl_ip_vs_amemthresh /
165                                 (sysctl_ip_vs_amemthresh-availmem);
166                 } else {
167                         ip_vs_drop_rate = 0;
168                         sysctl_ip_vs_drop_packet = 1;
169                 }
170                 break;
171         case 3:
172                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
173                 break;
174         }
175         spin_unlock(&__ip_vs_droppacket_lock);
176
177         /* secure_tcp */
178         write_lock(&__ip_vs_securetcp_lock);
179         switch (sysctl_ip_vs_secure_tcp) {
180         case 0:
181                 if (old_secure_tcp >= 2)
182                         to_change = 0;
183                 break;
184         case 1:
185                 if (nomem) {
186                         if (old_secure_tcp < 2)
187                                 to_change = 1;
188                         sysctl_ip_vs_secure_tcp = 2;
189                 } else {
190                         if (old_secure_tcp >= 2)
191                                 to_change = 0;
192                 }
193                 break;
194         case 2:
195                 if (nomem) {
196                         if (old_secure_tcp < 2)
197                                 to_change = 1;
198                 } else {
199                         if (old_secure_tcp >= 2)
200                                 to_change = 0;
201                         sysctl_ip_vs_secure_tcp = 1;
202                 }
203                 break;
204         case 3:
205                 if (old_secure_tcp < 2)
206                         to_change = 1;
207                 break;
208         }
209         old_secure_tcp = sysctl_ip_vs_secure_tcp;
210         if (to_change >= 0)
211                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
212         write_unlock(&__ip_vs_securetcp_lock);
213
214         local_bh_enable();
215 }
216
217
218 /*
219  *      Timer for checking the defense
220  */
221 #define DEFENSE_TIMER_PERIOD    1*HZ
222 static void defense_work_handler(struct work_struct *work);
223 static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
224
225 static void defense_work_handler(struct work_struct *work)
226 {
227         update_defense_level();
228         if (atomic_read(&ip_vs_dropentry))
229                 ip_vs_random_dropentry();
230
231         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
232 }
233
234 int
235 ip_vs_use_count_inc(void)
236 {
237         return try_module_get(THIS_MODULE);
238 }
239
240 void
241 ip_vs_use_count_dec(void)
242 {
243         module_put(THIS_MODULE);
244 }
245
246
247 /*
248  *      Hash table: for virtual service lookups
249  */
250 #define IP_VS_SVC_TAB_BITS 8
251 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
252 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
253
254 /* the service table hashed by <protocol, addr, port> */
255 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256 /* the service table hashed by fwmark */
257 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
258
259 /*
260  *      Hash table: for real service lookups
261  */
262 #define IP_VS_RTAB_BITS 4
263 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
264 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
265
266 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
267
268 /*
269  *      Trash for destinations
270  */
271 static LIST_HEAD(ip_vs_dest_trash);
272
273 /*
274  *      FTP & NULL virtual service counters
275  */
276 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
277 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
278
279
280 /*
281  *      Returns hash value for virtual service
282  */
283 static __inline__ unsigned
284 ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
285 {
286         register unsigned porth = ntohs(port);
287
288         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289                 & IP_VS_SVC_TAB_MASK;
290 }
291
292 /*
293  *      Returns hash value of fwmark for virtual service lookup
294  */
295 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
296 {
297         return fwmark & IP_VS_SVC_TAB_MASK;
298 }
299
300 /*
301  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
302  *      or in the ip_vs_svc_fwm_table by fwmark.
303  *      Should be called with locked tables.
304  */
305 static int ip_vs_svc_hash(struct ip_vs_service *svc)
306 {
307         unsigned hash;
308
309         if (svc->flags & IP_VS_SVC_F_HASHED) {
310                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
311                           "called from %p\n", __builtin_return_address(0));
312                 return 0;
313         }
314
315         if (svc->fwmark == 0) {
316                 /*
317                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
318                  */
319                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
320                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
321         } else {
322                 /*
323                  *  Hash it by fwmark in ip_vs_svc_fwm_table
324                  */
325                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
326                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
327         }
328
329         svc->flags |= IP_VS_SVC_F_HASHED;
330         /* increase its refcnt because it is referenced by the svc table */
331         atomic_inc(&svc->refcnt);
332         return 1;
333 }
334
335
336 /*
337  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
338  *      Should be called with locked tables.
339  */
340 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
341 {
342         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
343                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
344                           "called from %p\n", __builtin_return_address(0));
345                 return 0;
346         }
347
348         if (svc->fwmark == 0) {
349                 /* Remove it from the ip_vs_svc_table table */
350                 list_del(&svc->s_list);
351         } else {
352                 /* Remove it from the ip_vs_svc_fwm_table table */
353                 list_del(&svc->f_list);
354         }
355
356         svc->flags &= ~IP_VS_SVC_F_HASHED;
357         atomic_dec(&svc->refcnt);
358         return 1;
359 }
360
361
362 /*
363  *      Get service by {proto,addr,port} in the service table.
364  */
365 static __inline__ struct ip_vs_service *
366 __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
367 {
368         unsigned hash;
369         struct ip_vs_service *svc;
370
371         /* Check for "full" addressed entries */
372         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
373
374         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375                 if ((svc->addr == vaddr)
376                     && (svc->port == vport)
377                     && (svc->protocol == protocol)) {
378                         /* HIT */
379                         atomic_inc(&svc->usecnt);
380                         return svc;
381                 }
382         }
383
384         return NULL;
385 }
386
387
388 /*
389  *      Get service by {fwmark} in the service table.
390  */
391 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
392 {
393         unsigned hash;
394         struct ip_vs_service *svc;
395
396         /* Check for fwmark addressed entries */
397         hash = ip_vs_svc_fwm_hashkey(fwmark);
398
399         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400                 if (svc->fwmark == fwmark) {
401                         /* HIT */
402                         atomic_inc(&svc->usecnt);
403                         return svc;
404                 }
405         }
406
407         return NULL;
408 }
409
410 struct ip_vs_service *
411 ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
412 {
413         struct ip_vs_service *svc;
414
415         read_lock(&__ip_vs_svc_lock);
416
417         /*
418          *      Check the table hashed by fwmark first
419          */
420         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
421                 goto out;
422
423         /*
424          *      Check the table hashed by <protocol,addr,port>
425          *      for "full" addressed entries
426          */
427         svc = __ip_vs_service_get(protocol, vaddr, vport);
428
429         if (svc == NULL
430             && protocol == IPPROTO_TCP
431             && atomic_read(&ip_vs_ftpsvc_counter)
432             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433                 /*
434                  * Check if ftp service entry exists, the packet
435                  * might belong to FTP data connections.
436                  */
437                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
438         }
439
440         if (svc == NULL
441             && atomic_read(&ip_vs_nullsvc_counter)) {
442                 /*
443                  * Check if the catch-all port (port zero) exists
444                  */
445                 svc = __ip_vs_service_get(protocol, vaddr, 0);
446         }
447
448   out:
449         read_unlock(&__ip_vs_svc_lock);
450
451         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
452                   fwmark, ip_vs_proto_name(protocol),
453                   NIPQUAD(vaddr), ntohs(vport),
454                   svc?"hit":"not hit");
455
456         return svc;
457 }
458
459
460 static inline void
461 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
462 {
463         atomic_inc(&svc->refcnt);
464         dest->svc = svc;
465 }
466
467 static inline void
468 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
469 {
470         struct ip_vs_service *svc = dest->svc;
471
472         dest->svc = NULL;
473         if (atomic_dec_and_test(&svc->refcnt))
474                 kfree(svc);
475 }
476
477
478 /*
479  *      Returns hash value for real service
480  */
481 static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
482 {
483         register unsigned porth = ntohs(port);
484
485         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
486                 & IP_VS_RTAB_MASK;
487 }
488
489 /*
490  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
491  *      should be called with locked tables.
492  */
493 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
494 {
495         unsigned hash;
496
497         if (!list_empty(&dest->d_list)) {
498                 return 0;
499         }
500
501         /*
502          *      Hash by proto,addr,port,
503          *      which are the parameters of the real service.
504          */
505         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
506         list_add(&dest->d_list, &ip_vs_rtable[hash]);
507
508         return 1;
509 }
510
511 /*
512  *      UNhashes ip_vs_dest from ip_vs_rtable.
513  *      should be called with locked tables.
514  */
515 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
516 {
517         /*
518          * Remove it from the ip_vs_rtable table.
519          */
520         if (!list_empty(&dest->d_list)) {
521                 list_del(&dest->d_list);
522                 INIT_LIST_HEAD(&dest->d_list);
523         }
524
525         return 1;
526 }
527
528 /*
529  *      Lookup real service by <proto,addr,port> in the real service table.
530  */
531 struct ip_vs_dest *
532 ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
533 {
534         unsigned hash;
535         struct ip_vs_dest *dest;
536
537         /*
538          *      Check for "full" addressed entries
539          *      Return the first found entry
540          */
541         hash = ip_vs_rs_hashkey(daddr, dport);
542
543         read_lock(&__ip_vs_rs_lock);
544         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545                 if ((dest->addr == daddr)
546                     && (dest->port == dport)
547                     && ((dest->protocol == protocol) ||
548                         dest->vfwmark)) {
549                         /* HIT */
550                         read_unlock(&__ip_vs_rs_lock);
551                         return dest;
552                 }
553         }
554         read_unlock(&__ip_vs_rs_lock);
555
556         return NULL;
557 }
558
559 /*
560  *      Lookup destination by {addr,port} in the given service
561  */
562 static struct ip_vs_dest *
563 ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
564 {
565         struct ip_vs_dest *dest;
566
567         /*
568          * Find the destination for the given service
569          */
570         list_for_each_entry(dest, &svc->destinations, n_list) {
571                 if ((dest->addr == daddr) && (dest->port == dport)) {
572                         /* HIT */
573                         return dest;
574                 }
575         }
576
577         return NULL;
578 }
579
580 /*
581  * Find destination by {daddr,dport,vaddr,protocol}
582  * Cretaed to be used in ip_vs_process_message() in
583  * the backup synchronization daemon. It finds the
584  * destination to be bound to the received connection
585  * on the backup.
586  *
587  * ip_vs_lookup_real_service() looked promissing, but
588  * seems not working as expected.
589  */
590 struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
591                                     __be32 vaddr, __be16 vport, __u16 protocol)
592 {
593         struct ip_vs_dest *dest;
594         struct ip_vs_service *svc;
595
596         svc = ip_vs_service_get(0, protocol, vaddr, vport);
597         if (!svc)
598                 return NULL;
599         dest = ip_vs_lookup_dest(svc, daddr, dport);
600         if (dest)
601                 atomic_inc(&dest->refcnt);
602         ip_vs_service_put(svc);
603         return dest;
604 }
605
606 /*
607  *  Lookup dest by {svc,addr,port} in the destination trash.
608  *  The destination trash is used to hold the destinations that are removed
609  *  from the service table but are still referenced by some conn entries.
610  *  The reason to add the destination trash is when the dest is temporary
611  *  down (either by administrator or by monitor program), the dest can be
612  *  picked back from the trash, the remaining connections to the dest can
613  *  continue, and the counting information of the dest is also useful for
614  *  scheduling.
615  */
616 static struct ip_vs_dest *
617 ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
618 {
619         struct ip_vs_dest *dest, *nxt;
620
621         /*
622          * Find the destination in trash
623          */
624         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
625                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
626                           "dest->refcnt=%d\n",
627                           dest->vfwmark,
628                           NIPQUAD(dest->addr), ntohs(dest->port),
629                           atomic_read(&dest->refcnt));
630                 if (dest->addr == daddr &&
631                     dest->port == dport &&
632                     dest->vfwmark == svc->fwmark &&
633                     dest->protocol == svc->protocol &&
634                     (svc->fwmark ||
635                      (dest->vaddr == svc->addr &&
636                       dest->vport == svc->port))) {
637                         /* HIT */
638                         return dest;
639                 }
640
641                 /*
642                  * Try to purge the destination from trash if not referenced
643                  */
644                 if (atomic_read(&dest->refcnt) == 1) {
645                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
646                                   "from trash\n",
647                                   dest->vfwmark,
648                                   NIPQUAD(dest->addr), ntohs(dest->port));
649                         list_del(&dest->n_list);
650                         ip_vs_dst_reset(dest);
651                         __ip_vs_unbind_svc(dest);
652                         kfree(dest);
653                 }
654         }
655
656         return NULL;
657 }
658
659
660 /*
661  *  Clean up all the destinations in the trash
662  *  Called by the ip_vs_control_cleanup()
663  *
664  *  When the ip_vs_control_clearup is activated by ipvs module exit,
665  *  the service tables must have been flushed and all the connections
666  *  are expired, and the refcnt of each destination in the trash must
667  *  be 1, so we simply release them here.
668  */
669 static void ip_vs_trash_cleanup(void)
670 {
671         struct ip_vs_dest *dest, *nxt;
672
673         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
674                 list_del(&dest->n_list);
675                 ip_vs_dst_reset(dest);
676                 __ip_vs_unbind_svc(dest);
677                 kfree(dest);
678         }
679 }
680
681
682 static void
683 ip_vs_zero_stats(struct ip_vs_stats *stats)
684 {
685         spin_lock_bh(&stats->lock);
686         memset(stats, 0, (char *)&stats->lock - (char *)stats);
687         ip_vs_zero_estimator(stats);
688         spin_unlock_bh(&stats->lock);
689 }
690
691 /*
692  *      Update a destination in the given service
693  */
694 static void
695 __ip_vs_update_dest(struct ip_vs_service *svc,
696                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
697 {
698         int conn_flags;
699
700         /* set the weight and the flags */
701         atomic_set(&dest->weight, udest->weight);
702         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
703
704         /* check if local node and update the flags */
705         if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) {
706                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
707                         | IP_VS_CONN_F_LOCALNODE;
708         }
709
710         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
711         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
712                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
713         } else {
714                 /*
715                  *    Put the real service in ip_vs_rtable if not present.
716                  *    For now only for NAT!
717                  */
718                 write_lock_bh(&__ip_vs_rs_lock);
719                 ip_vs_rs_hash(dest);
720                 write_unlock_bh(&__ip_vs_rs_lock);
721         }
722         atomic_set(&dest->conn_flags, conn_flags);
723
724         /* bind the service */
725         if (!dest->svc) {
726                 __ip_vs_bind_svc(dest, svc);
727         } else {
728                 if (dest->svc != svc) {
729                         __ip_vs_unbind_svc(dest);
730                         ip_vs_zero_stats(&dest->stats);
731                         __ip_vs_bind_svc(dest, svc);
732                 }
733         }
734
735         /* set the dest status flags */
736         dest->flags |= IP_VS_DEST_F_AVAILABLE;
737
738         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
739                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
740         dest->u_threshold = udest->u_threshold;
741         dest->l_threshold = udest->l_threshold;
742 }
743
744
745 /*
746  *      Create a destination for the given service
747  */
748 static int
749 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
750                struct ip_vs_dest **dest_p)
751 {
752         struct ip_vs_dest *dest;
753         unsigned atype;
754
755         EnterFunction(2);
756
757         atype = inet_addr_type(&init_net, udest->addr);
758         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
759                 return -EINVAL;
760
761         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
762         if (dest == NULL) {
763                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
764                 return -ENOMEM;
765         }
766
767         dest->protocol = svc->protocol;
768         dest->vaddr = svc->addr;
769         dest->vport = svc->port;
770         dest->vfwmark = svc->fwmark;
771         dest->addr = udest->addr;
772         dest->port = udest->port;
773
774         atomic_set(&dest->activeconns, 0);
775         atomic_set(&dest->inactconns, 0);
776         atomic_set(&dest->persistconns, 0);
777         atomic_set(&dest->refcnt, 0);
778
779         INIT_LIST_HEAD(&dest->d_list);
780         spin_lock_init(&dest->dst_lock);
781         spin_lock_init(&dest->stats.lock);
782         __ip_vs_update_dest(svc, dest, udest);
783         ip_vs_new_estimator(&dest->stats);
784
785         *dest_p = dest;
786
787         LeaveFunction(2);
788         return 0;
789 }
790
791
792 /*
793  *      Add a destination into an existing service
794  */
795 static int
796 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
797 {
798         struct ip_vs_dest *dest;
799         __be32 daddr = udest->addr;
800         __be16 dport = udest->port;
801         int ret;
802
803         EnterFunction(2);
804
805         if (udest->weight < 0) {
806                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
807                 return -ERANGE;
808         }
809
810         if (udest->l_threshold > udest->u_threshold) {
811                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
812                           "upper threshold\n");
813                 return -ERANGE;
814         }
815
816         /*
817          * Check if the dest already exists in the list
818          */
819         dest = ip_vs_lookup_dest(svc, daddr, dport);
820         if (dest != NULL) {
821                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
822                 return -EEXIST;
823         }
824
825         /*
826          * Check if the dest already exists in the trash and
827          * is from the same service
828          */
829         dest = ip_vs_trash_get_dest(svc, daddr, dport);
830         if (dest != NULL) {
831                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
832                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
833                           NIPQUAD(daddr), ntohs(dport),
834                           atomic_read(&dest->refcnt),
835                           dest->vfwmark,
836                           NIPQUAD(dest->vaddr),
837                           ntohs(dest->vport));
838                 __ip_vs_update_dest(svc, dest, udest);
839
840                 /*
841                  * Get the destination from the trash
842                  */
843                 list_del(&dest->n_list);
844
845                 ip_vs_new_estimator(&dest->stats);
846
847                 write_lock_bh(&__ip_vs_svc_lock);
848
849                 /*
850                  * Wait until all other svc users go away.
851                  */
852                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
853
854                 list_add(&dest->n_list, &svc->destinations);
855                 svc->num_dests++;
856
857                 /* call the update_service function of its scheduler */
858                 svc->scheduler->update_service(svc);
859
860                 write_unlock_bh(&__ip_vs_svc_lock);
861                 return 0;
862         }
863
864         /*
865          * Allocate and initialize the dest structure
866          */
867         ret = ip_vs_new_dest(svc, udest, &dest);
868         if (ret) {
869                 return ret;
870         }
871
872         /*
873          * Add the dest entry into the list
874          */
875         atomic_inc(&dest->refcnt);
876
877         write_lock_bh(&__ip_vs_svc_lock);
878
879         /*
880          * Wait until all other svc users go away.
881          */
882         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
883
884         list_add(&dest->n_list, &svc->destinations);
885         svc->num_dests++;
886
887         /* call the update_service function of its scheduler */
888         svc->scheduler->update_service(svc);
889
890         write_unlock_bh(&__ip_vs_svc_lock);
891
892         LeaveFunction(2);
893
894         return 0;
895 }
896
897
898 /*
899  *      Edit a destination in the given service
900  */
901 static int
902 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
903 {
904         struct ip_vs_dest *dest;
905         __be32 daddr = udest->addr;
906         __be16 dport = udest->port;
907
908         EnterFunction(2);
909
910         if (udest->weight < 0) {
911                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
912                 return -ERANGE;
913         }
914
915         if (udest->l_threshold > udest->u_threshold) {
916                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
917                           "upper threshold\n");
918                 return -ERANGE;
919         }
920
921         /*
922          *  Lookup the destination list
923          */
924         dest = ip_vs_lookup_dest(svc, daddr, dport);
925         if (dest == NULL) {
926                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
927                 return -ENOENT;
928         }
929
930         __ip_vs_update_dest(svc, dest, udest);
931
932         write_lock_bh(&__ip_vs_svc_lock);
933
934         /* Wait until all other svc users go away */
935         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
936
937         /* call the update_service, because server weight may be changed */
938         svc->scheduler->update_service(svc);
939
940         write_unlock_bh(&__ip_vs_svc_lock);
941
942         LeaveFunction(2);
943
944         return 0;
945 }
946
947
948 /*
949  *      Delete a destination (must be already unlinked from the service)
950  */
951 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
952 {
953         ip_vs_kill_estimator(&dest->stats);
954
955         /*
956          *  Remove it from the d-linked list with the real services.
957          */
958         write_lock_bh(&__ip_vs_rs_lock);
959         ip_vs_rs_unhash(dest);
960         write_unlock_bh(&__ip_vs_rs_lock);
961
962         /*
963          *  Decrease the refcnt of the dest, and free the dest
964          *  if nobody refers to it (refcnt=0). Otherwise, throw
965          *  the destination into the trash.
966          */
967         if (atomic_dec_and_test(&dest->refcnt)) {
968                 ip_vs_dst_reset(dest);
969                 /* simply decrease svc->refcnt here, let the caller check
970                    and release the service if nobody refers to it.
971                    Only user context can release destination and service,
972                    and only one user context can update virtual service at a
973                    time, so the operation here is OK */
974                 atomic_dec(&dest->svc->refcnt);
975                 kfree(dest);
976         } else {
977                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
978                           "dest->refcnt=%d\n",
979                           NIPQUAD(dest->addr), ntohs(dest->port),
980                           atomic_read(&dest->refcnt));
981                 list_add(&dest->n_list, &ip_vs_dest_trash);
982                 atomic_inc(&dest->refcnt);
983         }
984 }
985
986
987 /*
988  *      Unlink a destination from the given service
989  */
990 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
991                                 struct ip_vs_dest *dest,
992                                 int svcupd)
993 {
994         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
995
996         /*
997          *  Remove it from the d-linked destination list.
998          */
999         list_del(&dest->n_list);
1000         svc->num_dests--;
1001         if (svcupd) {
1002                 /*
1003                  *  Call the update_service function of its scheduler
1004                  */
1005                 svc->scheduler->update_service(svc);
1006         }
1007 }
1008
1009
1010 /*
1011  *      Delete a destination server in the given service
1012  */
1013 static int
1014 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1015 {
1016         struct ip_vs_dest *dest;
1017         __be32 daddr = udest->addr;
1018         __be16 dport = udest->port;
1019
1020         EnterFunction(2);
1021
1022         dest = ip_vs_lookup_dest(svc, daddr, dport);
1023         if (dest == NULL) {
1024                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1025                 return -ENOENT;
1026         }
1027
1028         write_lock_bh(&__ip_vs_svc_lock);
1029
1030         /*
1031          *      Wait until all other svc users go away.
1032          */
1033         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1034
1035         /*
1036          *      Unlink dest from the service
1037          */
1038         __ip_vs_unlink_dest(svc, dest, 1);
1039
1040         write_unlock_bh(&__ip_vs_svc_lock);
1041
1042         /*
1043          *      Delete the destination
1044          */
1045         __ip_vs_del_dest(dest);
1046
1047         LeaveFunction(2);
1048
1049         return 0;
1050 }
1051
1052
1053 /*
1054  *      Add a service into the service hash table
1055  */
1056 static int
1057 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1058 {
1059         int ret = 0;
1060         struct ip_vs_scheduler *sched = NULL;
1061         struct ip_vs_service *svc = NULL;
1062
1063         /* increase the module use count */
1064         ip_vs_use_count_inc();
1065
1066         /* Lookup the scheduler by 'u->sched_name' */
1067         sched = ip_vs_scheduler_get(u->sched_name);
1068         if (sched == NULL) {
1069                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1070                            u->sched_name);
1071                 ret = -ENOENT;
1072                 goto out_mod_dec;
1073         }
1074
1075         svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1076         if (svc == NULL) {
1077                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1078                 ret = -ENOMEM;
1079                 goto out_err;
1080         }
1081
1082         /* I'm the first user of the service */
1083         atomic_set(&svc->usecnt, 1);
1084         atomic_set(&svc->refcnt, 0);
1085
1086         svc->protocol = u->protocol;
1087         svc->addr = u->addr;
1088         svc->port = u->port;
1089         svc->fwmark = u->fwmark;
1090         svc->flags = u->flags;
1091         svc->timeout = u->timeout * HZ;
1092         svc->netmask = u->netmask;
1093
1094         INIT_LIST_HEAD(&svc->destinations);
1095         rwlock_init(&svc->sched_lock);
1096         spin_lock_init(&svc->stats.lock);
1097
1098         /* Bind the scheduler */
1099         ret = ip_vs_bind_scheduler(svc, sched);
1100         if (ret)
1101                 goto out_err;
1102         sched = NULL;
1103
1104         /* Update the virtual service counters */
1105         if (svc->port == FTPPORT)
1106                 atomic_inc(&ip_vs_ftpsvc_counter);
1107         else if (svc->port == 0)
1108                 atomic_inc(&ip_vs_nullsvc_counter);
1109
1110         ip_vs_new_estimator(&svc->stats);
1111         ip_vs_num_services++;
1112
1113         /* Hash the service into the service table */
1114         write_lock_bh(&__ip_vs_svc_lock);
1115         ip_vs_svc_hash(svc);
1116         write_unlock_bh(&__ip_vs_svc_lock);
1117
1118         *svc_p = svc;
1119         return 0;
1120
1121   out_err:
1122         if (svc != NULL) {
1123                 if (svc->scheduler)
1124                         ip_vs_unbind_scheduler(svc);
1125                 if (svc->inc) {
1126                         local_bh_disable();
1127                         ip_vs_app_inc_put(svc->inc);
1128                         local_bh_enable();
1129                 }
1130                 kfree(svc);
1131         }
1132         ip_vs_scheduler_put(sched);
1133
1134   out_mod_dec:
1135         /* decrease the module use count */
1136         ip_vs_use_count_dec();
1137
1138         return ret;
1139 }
1140
1141
1142 /*
1143  *      Edit a service and bind it with a new scheduler
1144  */
1145 static int
1146 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1147 {
1148         struct ip_vs_scheduler *sched, *old_sched;
1149         int ret = 0;
1150
1151         /*
1152          * Lookup the scheduler, by 'u->sched_name'
1153          */
1154         sched = ip_vs_scheduler_get(u->sched_name);
1155         if (sched == NULL) {
1156                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1157                            u->sched_name);
1158                 return -ENOENT;
1159         }
1160         old_sched = sched;
1161
1162         write_lock_bh(&__ip_vs_svc_lock);
1163
1164         /*
1165          * Wait until all other svc users go away.
1166          */
1167         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1168
1169         /*
1170          * Set the flags and timeout value
1171          */
1172         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1173         svc->timeout = u->timeout * HZ;
1174         svc->netmask = u->netmask;
1175
1176         old_sched = svc->scheduler;
1177         if (sched != old_sched) {
1178                 /*
1179                  * Unbind the old scheduler
1180                  */
1181                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1182                         old_sched = sched;
1183                         goto out;
1184                 }
1185
1186                 /*
1187                  * Bind the new scheduler
1188                  */
1189                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1190                         /*
1191                          * If ip_vs_bind_scheduler fails, restore the old
1192                          * scheduler.
1193                          * The main reason of failure is out of memory.
1194                          *
1195                          * The question is if the old scheduler can be
1196                          * restored all the time. TODO: if it cannot be
1197                          * restored some time, we must delete the service,
1198                          * otherwise the system may crash.
1199                          */
1200                         ip_vs_bind_scheduler(svc, old_sched);
1201                         old_sched = sched;
1202                         goto out;
1203                 }
1204         }
1205
1206   out:
1207         write_unlock_bh(&__ip_vs_svc_lock);
1208
1209         if (old_sched)
1210                 ip_vs_scheduler_put(old_sched);
1211
1212         return ret;
1213 }
1214
1215
1216 /*
1217  *      Delete a service from the service list
1218  *      - The service must be unlinked, unlocked and not referenced!
1219  *      - We are called under _bh lock
1220  */
1221 static void __ip_vs_del_service(struct ip_vs_service *svc)
1222 {
1223         struct ip_vs_dest *dest, *nxt;
1224         struct ip_vs_scheduler *old_sched;
1225
1226         ip_vs_num_services--;
1227         ip_vs_kill_estimator(&svc->stats);
1228
1229         /* Unbind scheduler */
1230         old_sched = svc->scheduler;
1231         ip_vs_unbind_scheduler(svc);
1232         if (old_sched)
1233                 ip_vs_scheduler_put(old_sched);
1234
1235         /* Unbind app inc */
1236         if (svc->inc) {
1237                 ip_vs_app_inc_put(svc->inc);
1238                 svc->inc = NULL;
1239         }
1240
1241         /*
1242          *    Unlink the whole destination list
1243          */
1244         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1245                 __ip_vs_unlink_dest(svc, dest, 0);
1246                 __ip_vs_del_dest(dest);
1247         }
1248
1249         /*
1250          *    Update the virtual service counters
1251          */
1252         if (svc->port == FTPPORT)
1253                 atomic_dec(&ip_vs_ftpsvc_counter);
1254         else if (svc->port == 0)
1255                 atomic_dec(&ip_vs_nullsvc_counter);
1256
1257         /*
1258          *    Free the service if nobody refers to it
1259          */
1260         if (atomic_read(&svc->refcnt) == 0)
1261                 kfree(svc);
1262
1263         /* decrease the module use count */
1264         ip_vs_use_count_dec();
1265 }
1266
1267 /*
1268  *      Delete a service from the service list
1269  */
1270 static int ip_vs_del_service(struct ip_vs_service *svc)
1271 {
1272         if (svc == NULL)
1273                 return -EEXIST;
1274
1275         /*
1276          * Unhash it from the service table
1277          */
1278         write_lock_bh(&__ip_vs_svc_lock);
1279
1280         ip_vs_svc_unhash(svc);
1281
1282         /*
1283          * Wait until all the svc users go away.
1284          */
1285         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1286
1287         __ip_vs_del_service(svc);
1288
1289         write_unlock_bh(&__ip_vs_svc_lock);
1290
1291         return 0;
1292 }
1293
1294
1295 /*
1296  *      Flush all the virtual services
1297  */
1298 static int ip_vs_flush(void)
1299 {
1300         int idx;
1301         struct ip_vs_service *svc, *nxt;
1302
1303         /*
1304          * Flush the service table hashed by <protocol,addr,port>
1305          */
1306         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1307                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1308                         write_lock_bh(&__ip_vs_svc_lock);
1309                         ip_vs_svc_unhash(svc);
1310                         /*
1311                          * Wait until all the svc users go away.
1312                          */
1313                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1314                         __ip_vs_del_service(svc);
1315                         write_unlock_bh(&__ip_vs_svc_lock);
1316                 }
1317         }
1318
1319         /*
1320          * Flush the service table hashed by fwmark
1321          */
1322         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1323                 list_for_each_entry_safe(svc, nxt,
1324                                          &ip_vs_svc_fwm_table[idx], f_list) {
1325                         write_lock_bh(&__ip_vs_svc_lock);
1326                         ip_vs_svc_unhash(svc);
1327                         /*
1328                          * Wait until all the svc users go away.
1329                          */
1330                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1331                         __ip_vs_del_service(svc);
1332                         write_unlock_bh(&__ip_vs_svc_lock);
1333                 }
1334         }
1335
1336         return 0;
1337 }
1338
1339
1340 /*
1341  *      Zero counters in a service or all services
1342  */
1343 static int ip_vs_zero_service(struct ip_vs_service *svc)
1344 {
1345         struct ip_vs_dest *dest;
1346
1347         write_lock_bh(&__ip_vs_svc_lock);
1348         list_for_each_entry(dest, &svc->destinations, n_list) {
1349                 ip_vs_zero_stats(&dest->stats);
1350         }
1351         ip_vs_zero_stats(&svc->stats);
1352         write_unlock_bh(&__ip_vs_svc_lock);
1353         return 0;
1354 }
1355
1356 static int ip_vs_zero_all(void)
1357 {
1358         int idx;
1359         struct ip_vs_service *svc;
1360
1361         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1362                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1363                         ip_vs_zero_service(svc);
1364                 }
1365         }
1366
1367         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1368                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1369                         ip_vs_zero_service(svc);
1370                 }
1371         }
1372
1373         ip_vs_zero_stats(&ip_vs_stats);
1374         return 0;
1375 }
1376
1377
1378 static int
1379 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1380                      void __user *buffer, size_t *lenp, loff_t *ppos)
1381 {
1382         int *valp = table->data;
1383         int val = *valp;
1384         int rc;
1385
1386         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1387         if (write && (*valp != val)) {
1388                 if ((*valp < 0) || (*valp > 3)) {
1389                         /* Restore the correct value */
1390                         *valp = val;
1391                 } else {
1392                         update_defense_level();
1393                 }
1394         }
1395         return rc;
1396 }
1397
1398
1399 static int
1400 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1401                        void __user *buffer, size_t *lenp, loff_t *ppos)
1402 {
1403         int *valp = table->data;
1404         int val[2];
1405         int rc;
1406
1407         /* backup the value first */
1408         memcpy(val, valp, sizeof(val));
1409
1410         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1411         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1412                 /* Restore the correct value */
1413                 memcpy(valp, val, sizeof(val));
1414         }
1415         return rc;
1416 }
1417
1418
1419 /*
1420  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1421  */
1422
1423 static struct ctl_table vs_vars[] = {
1424         {
1425                 .procname       = "amemthresh",
1426                 .data           = &sysctl_ip_vs_amemthresh,
1427                 .maxlen         = sizeof(int),
1428                 .mode           = 0644,
1429                 .proc_handler   = &proc_dointvec,
1430         },
1431 #ifdef CONFIG_IP_VS_DEBUG
1432         {
1433                 .procname       = "debug_level",
1434                 .data           = &sysctl_ip_vs_debug_level,
1435                 .maxlen         = sizeof(int),
1436                 .mode           = 0644,
1437                 .proc_handler   = &proc_dointvec,
1438         },
1439 #endif
1440         {
1441                 .procname       = "am_droprate",
1442                 .data           = &sysctl_ip_vs_am_droprate,
1443                 .maxlen         = sizeof(int),
1444                 .mode           = 0644,
1445                 .proc_handler   = &proc_dointvec,
1446         },
1447         {
1448                 .procname       = "drop_entry",
1449                 .data           = &sysctl_ip_vs_drop_entry,
1450                 .maxlen         = sizeof(int),
1451                 .mode           = 0644,
1452                 .proc_handler   = &proc_do_defense_mode,
1453         },
1454         {
1455                 .procname       = "drop_packet",
1456                 .data           = &sysctl_ip_vs_drop_packet,
1457                 .maxlen         = sizeof(int),
1458                 .mode           = 0644,
1459                 .proc_handler   = &proc_do_defense_mode,
1460         },
1461         {
1462                 .procname       = "secure_tcp",
1463                 .data           = &sysctl_ip_vs_secure_tcp,
1464                 .maxlen         = sizeof(int),
1465                 .mode           = 0644,
1466                 .proc_handler   = &proc_do_defense_mode,
1467         },
1468 #if 0
1469         {
1470                 .procname       = "timeout_established",
1471                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1472                 .maxlen         = sizeof(int),
1473                 .mode           = 0644,
1474                 .proc_handler   = &proc_dointvec_jiffies,
1475         },
1476         {
1477                 .procname       = "timeout_synsent",
1478                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1479                 .maxlen         = sizeof(int),
1480                 .mode           = 0644,
1481                 .proc_handler   = &proc_dointvec_jiffies,
1482         },
1483         {
1484                 .procname       = "timeout_synrecv",
1485                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1486                 .maxlen         = sizeof(int),
1487                 .mode           = 0644,
1488                 .proc_handler   = &proc_dointvec_jiffies,
1489         },
1490         {
1491                 .procname       = "timeout_finwait",
1492                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1493                 .maxlen         = sizeof(int),
1494                 .mode           = 0644,
1495                 .proc_handler   = &proc_dointvec_jiffies,
1496         },
1497         {
1498                 .procname       = "timeout_timewait",
1499                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1500                 .maxlen         = sizeof(int),
1501                 .mode           = 0644,
1502                 .proc_handler   = &proc_dointvec_jiffies,
1503         },
1504         {
1505                 .procname       = "timeout_close",
1506                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1507                 .maxlen         = sizeof(int),
1508                 .mode           = 0644,
1509                 .proc_handler   = &proc_dointvec_jiffies,
1510         },
1511         {
1512                 .procname       = "timeout_closewait",
1513                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1514                 .maxlen         = sizeof(int),
1515                 .mode           = 0644,
1516                 .proc_handler   = &proc_dointvec_jiffies,
1517         },
1518         {
1519                 .procname       = "timeout_lastack",
1520                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1521                 .maxlen         = sizeof(int),
1522                 .mode           = 0644,
1523                 .proc_handler   = &proc_dointvec_jiffies,
1524         },
1525         {
1526                 .procname       = "timeout_listen",
1527                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1528                 .maxlen         = sizeof(int),
1529                 .mode           = 0644,
1530                 .proc_handler   = &proc_dointvec_jiffies,
1531         },
1532         {
1533                 .procname       = "timeout_synack",
1534                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1535                 .maxlen         = sizeof(int),
1536                 .mode           = 0644,
1537                 .proc_handler   = &proc_dointvec_jiffies,
1538         },
1539         {
1540                 .procname       = "timeout_udp",
1541                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1542                 .maxlen         = sizeof(int),
1543                 .mode           = 0644,
1544                 .proc_handler   = &proc_dointvec_jiffies,
1545         },
1546         {
1547                 .procname       = "timeout_icmp",
1548                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1549                 .maxlen         = sizeof(int),
1550                 .mode           = 0644,
1551                 .proc_handler   = &proc_dointvec_jiffies,
1552         },
1553 #endif
1554         {
1555                 .procname       = "cache_bypass",
1556                 .data           = &sysctl_ip_vs_cache_bypass,
1557                 .maxlen         = sizeof(int),
1558                 .mode           = 0644,
1559                 .proc_handler   = &proc_dointvec,
1560         },
1561         {
1562                 .procname       = "expire_nodest_conn",
1563                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1564                 .maxlen         = sizeof(int),
1565                 .mode           = 0644,
1566                 .proc_handler   = &proc_dointvec,
1567         },
1568         {
1569                 .procname       = "expire_quiescent_template",
1570                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1571                 .maxlen         = sizeof(int),
1572                 .mode           = 0644,
1573                 .proc_handler   = &proc_dointvec,
1574         },
1575         {
1576                 .procname       = "sync_threshold",
1577                 .data           = &sysctl_ip_vs_sync_threshold,
1578                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1579                 .mode           = 0644,
1580                 .proc_handler   = &proc_do_sync_threshold,
1581         },
1582         {
1583                 .procname       = "nat_icmp_send",
1584                 .data           = &sysctl_ip_vs_nat_icmp_send,
1585                 .maxlen         = sizeof(int),
1586                 .mode           = 0644,
1587                 .proc_handler   = &proc_dointvec,
1588         },
1589         { .ctl_name = 0 }
1590 };
1591
1592 const struct ctl_path net_vs_ctl_path[] = {
1593         { .procname = "net", .ctl_name = CTL_NET, },
1594         { .procname = "ipv4", .ctl_name = NET_IPV4, },
1595         { .procname = "vs", },
1596         { }
1597 };
1598 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1599
/* sysctl table header for the IPVS sysctl entries */
static struct ctl_table_header *sysctl_header;
1601
1602 #ifdef CONFIG_PROC_FS
1603
/* Per-open iterator state of the service listing seq_file */
struct ip_vs_iter {
        /* table being walked: ip_vs_svc_table or ip_vs_svc_fwm_table */
        struct list_head *table;
        /* index of the current hash bucket within that table */
        int bucket;
};
1608
1609 /*
1610  *      Write the contents of the VS rule table to a PROCfs file.
1611  *      (It is kept just for backward compatibility)
1612  */
1613 static inline const char *ip_vs_fwd_name(unsigned flags)
1614 {
1615         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1616         case IP_VS_CONN_F_LOCALNODE:
1617                 return "Local";
1618         case IP_VS_CONN_F_TUNNEL:
1619                 return "Tunnel";
1620         case IP_VS_CONN_F_DROUTE:
1621                 return "Route";
1622         default:
1623                 return "Masq";
1624         }
1625 }
1626
1627
1628 /* Get the Nth entry in the two lists */
1629 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1630 {
1631         struct ip_vs_iter *iter = seq->private;
1632         int idx;
1633         struct ip_vs_service *svc;
1634
1635         /* look in hash by protocol */
1636         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1637                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1638                         if (pos-- == 0){
1639                                 iter->table = ip_vs_svc_table;
1640                                 iter->bucket = idx;
1641                                 return svc;
1642                         }
1643                 }
1644         }
1645
1646         /* keep looking in fwmark */
1647         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1648                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1649                         if (pos-- == 0) {
1650                                 iter->table = ip_vs_svc_fwm_table;
1651                                 iter->bucket = idx;
1652                                 return svc;
1653                         }
1654                 }
1655         }
1656
1657         return NULL;
1658 }
1659
1660 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1661 {
1662
1663         read_lock_bh(&__ip_vs_svc_lock);
1664         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1665 }
1666
1667
1668 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1669 {
1670         struct list_head *e;
1671         struct ip_vs_iter *iter;
1672         struct ip_vs_service *svc;
1673
1674         ++*pos;
1675         if (v == SEQ_START_TOKEN)
1676                 return ip_vs_info_array(seq,0);
1677
1678         svc = v;
1679         iter = seq->private;
1680
1681         if (iter->table == ip_vs_svc_table) {
1682                 /* next service in table hashed by protocol */
1683                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1684                         return list_entry(e, struct ip_vs_service, s_list);
1685
1686
1687                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1688                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1689                                             s_list) {
1690                                 return svc;
1691                         }
1692                 }
1693
1694                 iter->table = ip_vs_svc_fwm_table;
1695                 iter->bucket = -1;
1696                 goto scan_fwmark;
1697         }
1698
1699         /* next service in hashed by fwmark */
1700         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1701                 return list_entry(e, struct ip_vs_service, f_list);
1702
1703  scan_fwmark:
1704         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1705                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1706                                     f_list)
1707                         return svc;
1708         }
1709
1710         return NULL;
1711 }
1712
1713 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1714 {
1715         read_unlock_bh(&__ip_vs_svc_lock);
1716 }
1717
1718
1719 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1720 {
1721         if (v == SEQ_START_TOKEN) {
1722                 seq_printf(seq,
1723                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1724                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1725                 seq_puts(seq,
1726                          "Prot LocalAddress:Port Scheduler Flags\n");
1727                 seq_puts(seq,
1728                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1729         } else {
1730                 const struct ip_vs_service *svc = v;
1731                 const struct ip_vs_iter *iter = seq->private;
1732                 const struct ip_vs_dest *dest;
1733
1734                 if (iter->table == ip_vs_svc_table)
1735                         seq_printf(seq, "%s  %08X:%04X %s ",
1736                                    ip_vs_proto_name(svc->protocol),
1737                                    ntohl(svc->addr),
1738                                    ntohs(svc->port),
1739                                    svc->scheduler->name);
1740                 else
1741                         seq_printf(seq, "FWM  %08X %s ",
1742                                    svc->fwmark, svc->scheduler->name);
1743
1744                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1745                         seq_printf(seq, "persistent %d %08X\n",
1746                                 svc->timeout,
1747                                 ntohl(svc->netmask));
1748                 else
1749                         seq_putc(seq, '\n');
1750
1751                 list_for_each_entry(dest, &svc->destinations, n_list) {
1752                         seq_printf(seq,
1753                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1754                                    ntohl(dest->addr), ntohs(dest->port),
1755                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1756                                    atomic_read(&dest->weight),
1757                                    atomic_read(&dest->activeconns),
1758                                    atomic_read(&dest->inactconns));
1759                 }
1760         }
1761         return 0;
1762 }
1763
1764 static const struct seq_operations ip_vs_info_seq_ops = {
1765         .start = ip_vs_info_seq_start,
1766         .next  = ip_vs_info_seq_next,
1767         .stop  = ip_vs_info_seq_stop,
1768         .show  = ip_vs_info_seq_show,
1769 };
1770
1771 static int ip_vs_info_open(struct inode *inode, struct file *file)
1772 {
1773         return seq_open_private(file, &ip_vs_info_seq_ops,
1774                         sizeof(struct ip_vs_iter));
1775 }
1776
1777 static const struct file_operations ip_vs_info_fops = {
1778         .owner   = THIS_MODULE,
1779         .open    = ip_vs_info_open,
1780         .read    = seq_read,
1781         .llseek  = seq_lseek,
1782         .release = seq_release_private,
1783 };
1784
1785 #endif
1786
1787 struct ip_vs_stats ip_vs_stats = {
1788         .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
1789 };
1790
1791 #ifdef CONFIG_PROC_FS
1792 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1793 {
1794
1795 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1796         seq_puts(seq,
1797                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1798         seq_printf(seq,
1799                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1800
1801         spin_lock_bh(&ip_vs_stats.lock);
1802         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1803                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1804                    (unsigned long long) ip_vs_stats.inbytes,
1805                    (unsigned long long) ip_vs_stats.outbytes);
1806
1807 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1808         seq_puts(seq,
1809                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1810         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1811                         ip_vs_stats.cps,
1812                         ip_vs_stats.inpps,
1813                         ip_vs_stats.outpps,
1814                         ip_vs_stats.inbps,
1815                         ip_vs_stats.outbps);
1816         spin_unlock_bh(&ip_vs_stats.lock);
1817
1818         return 0;
1819 }
1820
1821 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1822 {
1823         return single_open(file, ip_vs_stats_show, NULL);
1824 }
1825
1826 static const struct file_operations ip_vs_stats_fops = {
1827         .owner = THIS_MODULE,
1828         .open = ip_vs_stats_seq_open,
1829         .read = seq_read,
1830         .llseek = seq_lseek,
1831         .release = single_release,
1832 };
1833
1834 #endif
1835
1836 /*
1837  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1838  */
1839 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1840 {
1841         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1842                   u->tcp_timeout,
1843                   u->tcp_fin_timeout,
1844                   u->udp_timeout);
1845
1846 #ifdef CONFIG_IP_VS_PROTO_TCP
1847         if (u->tcp_timeout) {
1848                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1849                         = u->tcp_timeout * HZ;
1850         }
1851
1852         if (u->tcp_fin_timeout) {
1853                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1854                         = u->tcp_fin_timeout * HZ;
1855         }
1856 #endif
1857
1858 #ifdef CONFIG_IP_VS_PROTO_UDP
1859         if (u->udp_timeout) {
1860                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1861                         = u->udp_timeout * HZ;
1862         }
1863 #endif
1864         return 0;
1865 }
1866
1867
/*
 * Helpers describing the IP_VS_SO_SET_* arguments.
 *
 * SET_CMDID() turns a command number into a zero-based index relative to
 * IP_VS_BASE_CTL; its argument is parenthesised so that any expression can
 * be passed safely.  The *_ARG_LEN macros give the exact argument size of
 * each command family, and MAX_ARG_LEN is the largest of them (a service
 * description followed by a destination description).
 */
#define SET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
                                 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN             SVCDEST_ARG_LEN
1875
1876 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1877         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1878         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1879         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1880         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1881         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1882         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1883         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1884         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1885         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1886         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1887         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1888 };
1889
1890 static int
1891 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1892 {
1893         int ret;
1894         unsigned char arg[MAX_ARG_LEN];
1895         struct ip_vs_service_user *usvc;
1896         struct ip_vs_service *svc;
1897         struct ip_vs_dest_user *udest;
1898
1899         if (!capable(CAP_NET_ADMIN))
1900                 return -EPERM;
1901
1902         if (len != set_arglen[SET_CMDID(cmd)]) {
1903                 IP_VS_ERR("set_ctl: len %u != %u\n",
1904                           len, set_arglen[SET_CMDID(cmd)]);
1905                 return -EINVAL;
1906         }
1907
1908         if (copy_from_user(arg, user, len) != 0)
1909                 return -EFAULT;
1910
1911         /* increase the module use count */
1912         ip_vs_use_count_inc();
1913
1914         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1915                 ret = -ERESTARTSYS;
1916                 goto out_dec;
1917         }
1918
1919         if (cmd == IP_VS_SO_SET_FLUSH) {
1920                 /* Flush the virtual service */
1921                 ret = ip_vs_flush();
1922                 goto out_unlock;
1923         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1924                 /* Set timeout values for (tcp tcpfin udp) */
1925                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1926                 goto out_unlock;
1927         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1928                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1929                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1930                 goto out_unlock;
1931         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1932                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1933                 ret = stop_sync_thread(dm->state);
1934                 goto out_unlock;
1935         }
1936
1937         usvc = (struct ip_vs_service_user *)arg;
1938         udest = (struct ip_vs_dest_user *)(usvc + 1);
1939
1940         if (cmd == IP_VS_SO_SET_ZERO) {
1941                 /* if no service address is set, zero counters in all */
1942                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1943                         ret = ip_vs_zero_all();
1944                         goto out_unlock;
1945                 }
1946         }
1947
1948         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1949         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1950                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1951                           usvc->protocol, NIPQUAD(usvc->addr),
1952                           ntohs(usvc->port), usvc->sched_name);
1953                 ret = -EFAULT;
1954                 goto out_unlock;
1955         }
1956
1957         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1958         if (usvc->fwmark == 0)
1959                 svc = __ip_vs_service_get(usvc->protocol,
1960                                           usvc->addr, usvc->port);
1961         else
1962                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1963
1964         if (cmd != IP_VS_SO_SET_ADD
1965             && (svc == NULL || svc->protocol != usvc->protocol)) {
1966                 ret = -ESRCH;
1967                 goto out_unlock;
1968         }
1969
1970         switch (cmd) {
1971         case IP_VS_SO_SET_ADD:
1972                 if (svc != NULL)
1973                         ret = -EEXIST;
1974                 else
1975                         ret = ip_vs_add_service(usvc, &svc);
1976                 break;
1977         case IP_VS_SO_SET_EDIT:
1978                 ret = ip_vs_edit_service(svc, usvc);
1979                 break;
1980         case IP_VS_SO_SET_DEL:
1981                 ret = ip_vs_del_service(svc);
1982                 if (!ret)
1983                         goto out_unlock;
1984                 break;
1985         case IP_VS_SO_SET_ZERO:
1986                 ret = ip_vs_zero_service(svc);
1987                 break;
1988         case IP_VS_SO_SET_ADDDEST:
1989                 ret = ip_vs_add_dest(svc, udest);
1990                 break;
1991         case IP_VS_SO_SET_EDITDEST:
1992                 ret = ip_vs_edit_dest(svc, udest);
1993                 break;
1994         case IP_VS_SO_SET_DELDEST:
1995                 ret = ip_vs_del_dest(svc, udest);
1996                 break;
1997         default:
1998                 ret = -EINVAL;
1999         }
2000
2001         if (svc)
2002                 ip_vs_service_put(svc);
2003
2004   out_unlock:
2005         mutex_unlock(&__ip_vs_mutex);
2006   out_dec:
2007         /* decrease the module use count */
2008         ip_vs_use_count_dec();
2009
2010         return ret;
2011 }
2012
2013
2014 static void
2015 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2016 {
2017         spin_lock_bh(&src->lock);
2018         memcpy(dst, src, (char*)&src->lock - (char*)src);
2019         spin_unlock_bh(&src->lock);
2020 }
2021
2022 static void
2023 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2024 {
2025         dst->protocol = src->protocol;
2026         dst->addr = src->addr;
2027         dst->port = src->port;
2028         dst->fwmark = src->fwmark;
2029         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2030         dst->flags = src->flags;
2031         dst->timeout = src->timeout / HZ;
2032         dst->netmask = src->netmask;
2033         dst->num_dests = src->num_dests;
2034         ip_vs_copy_stats(&dst->stats, &src->stats);
2035 }
2036
2037 static inline int
2038 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2039                             struct ip_vs_get_services __user *uptr)
2040 {
2041         int idx, count=0;
2042         struct ip_vs_service *svc;
2043         struct ip_vs_service_entry entry;
2044         int ret = 0;
2045
2046         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2047                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2048                         if (count >= get->num_services)
2049                                 goto out;
2050                         memset(&entry, 0, sizeof(entry));
2051                         ip_vs_copy_service(&entry, svc);
2052                         if (copy_to_user(&uptr->entrytable[count],
2053                                          &entry, sizeof(entry))) {
2054                                 ret = -EFAULT;
2055                                 goto out;
2056                         }
2057                         count++;
2058                 }
2059         }
2060
2061         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2062                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2063                         if (count >= get->num_services)
2064                                 goto out;
2065                         memset(&entry, 0, sizeof(entry));
2066                         ip_vs_copy_service(&entry, svc);
2067                         if (copy_to_user(&uptr->entrytable[count],
2068                                          &entry, sizeof(entry))) {
2069                                 ret = -EFAULT;
2070                                 goto out;
2071                         }
2072                         count++;
2073                 }
2074         }
2075   out:
2076         return ret;
2077 }
2078
2079 static inline int
2080 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2081                          struct ip_vs_get_dests __user *uptr)
2082 {
2083         struct ip_vs_service *svc;
2084         int ret = 0;
2085
2086         if (get->fwmark)
2087                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2088         else
2089                 svc = __ip_vs_service_get(get->protocol,
2090                                           get->addr, get->port);
2091         if (svc) {
2092                 int count = 0;
2093                 struct ip_vs_dest *dest;
2094                 struct ip_vs_dest_entry entry;
2095
2096                 list_for_each_entry(dest, &svc->destinations, n_list) {
2097                         if (count >= get->num_dests)
2098                                 break;
2099
2100                         entry.addr = dest->addr;
2101                         entry.port = dest->port;
2102                         entry.conn_flags = atomic_read(&dest->conn_flags);
2103                         entry.weight = atomic_read(&dest->weight);
2104                         entry.u_threshold = dest->u_threshold;
2105                         entry.l_threshold = dest->l_threshold;
2106                         entry.activeconns = atomic_read(&dest->activeconns);
2107                         entry.inactconns = atomic_read(&dest->inactconns);
2108                         entry.persistconns = atomic_read(&dest->persistconns);
2109                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2110                         if (copy_to_user(&uptr->entrytable[count],
2111                                          &entry, sizeof(entry))) {
2112                                 ret = -EFAULT;
2113                                 break;
2114                         }
2115                         count++;
2116                 }
2117                 ip_vs_service_put(svc);
2118         } else
2119                 ret = -ESRCH;
2120         return ret;
2121 }
2122
2123 static inline void
2124 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2125 {
2126 #ifdef CONFIG_IP_VS_PROTO_TCP
2127         u->tcp_timeout =
2128                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2129         u->tcp_fin_timeout =
2130                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2131 #endif
2132 #ifdef CONFIG_IP_VS_PROTO_UDP
2133         u->udp_timeout =
2134                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2135 #endif
2136 }
2137
2138
/*
 * Helpers describing the IP_VS_SO_GET_* arguments.
 *
 * GET_CMDID() turns a command number into a zero-based index relative to
 * IP_VS_BASE_CTL; its argument is parenthesised so that any expression can
 * be passed safely.  The GET_*_ARG_LEN macros give the minimum argument
 * size of each command (the fixed header for the variable-length ones).
 */
#define GET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2146
2147 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2148         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2149         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2150         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2151         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2152         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2153         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2154         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2155 };
2156
2157 static int
2158 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2159 {
2160         unsigned char arg[128];
2161         int ret = 0;
2162
2163         if (!capable(CAP_NET_ADMIN))
2164                 return -EPERM;
2165
2166         if (*len < get_arglen[GET_CMDID(cmd)]) {
2167                 IP_VS_ERR("get_ctl: len %u < %u\n",
2168                           *len, get_arglen[GET_CMDID(cmd)]);
2169                 return -EINVAL;
2170         }
2171
2172         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2173                 return -EFAULT;
2174
2175         if (mutex_lock_interruptible(&__ip_vs_mutex))
2176                 return -ERESTARTSYS;
2177
2178         switch (cmd) {
2179         case IP_VS_SO_GET_VERSION:
2180         {
2181                 char buf[64];
2182
2183                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2184                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2185                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2186                         ret = -EFAULT;
2187                         goto out;
2188                 }
2189                 *len = strlen(buf)+1;
2190         }
2191         break;
2192
2193         case IP_VS_SO_GET_INFO:
2194         {
2195                 struct ip_vs_getinfo info;
2196                 info.version = IP_VS_VERSION_CODE;
2197                 info.size = IP_VS_CONN_TAB_SIZE;
2198                 info.num_services = ip_vs_num_services;
2199                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2200                         ret = -EFAULT;
2201         }
2202         break;
2203
2204         case IP_VS_SO_GET_SERVICES:
2205         {
2206                 struct ip_vs_get_services *get;
2207                 int size;
2208
2209                 get = (struct ip_vs_get_services *)arg;
2210                 size = sizeof(*get) +
2211                         sizeof(struct ip_vs_service_entry) * get->num_services;
2212                 if (*len != size) {
2213                         IP_VS_ERR("length: %u != %u\n", *len, size);
2214                         ret = -EINVAL;
2215                         goto out;
2216                 }
2217                 ret = __ip_vs_get_service_entries(get, user);
2218         }
2219         break;
2220
2221         case IP_VS_SO_GET_SERVICE:
2222         {
2223                 struct ip_vs_service_entry *entry;
2224                 struct ip_vs_service *svc;
2225
2226                 entry = (struct ip_vs_service_entry *)arg;
2227                 if (entry->fwmark)
2228                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2229                 else
2230                         svc = __ip_vs_service_get(entry->protocol,
2231                                                   entry->addr, entry->port);
2232                 if (svc) {
2233                         ip_vs_copy_service(entry, svc);
2234                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2235                                 ret = -EFAULT;
2236                         ip_vs_service_put(svc);
2237                 } else
2238                         ret = -ESRCH;
2239         }
2240         break;
2241
2242         case IP_VS_SO_GET_DESTS:
2243         {
2244                 struct ip_vs_get_dests *get;
2245                 int size;
2246
2247                 get = (struct ip_vs_get_dests *)arg;
2248                 size = sizeof(*get) +
2249                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2250                 if (*len != size) {
2251                         IP_VS_ERR("length: %u != %u\n", *len, size);
2252                         ret = -EINVAL;
2253                         goto out;
2254                 }
2255                 ret = __ip_vs_get_dest_entries(get, user);
2256         }
2257         break;
2258
2259         case IP_VS_SO_GET_TIMEOUT:
2260         {
2261                 struct ip_vs_timeout_user t;
2262
2263                 __ip_vs_get_timeouts(&t);
2264                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2265                         ret = -EFAULT;
2266         }
2267         break;
2268
2269         case IP_VS_SO_GET_DAEMON:
2270         {
2271                 struct ip_vs_daemon_user d[2];
2272
2273                 memset(&d, 0, sizeof(d));
2274                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2275                         d[0].state = IP_VS_STATE_MASTER;
2276                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2277                         d[0].syncid = ip_vs_master_syncid;
2278                 }
2279                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2280                         d[1].state = IP_VS_STATE_BACKUP;
2281                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2282                         d[1].syncid = ip_vs_backup_syncid;
2283                 }
2284                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2285                         ret = -EFAULT;
2286         }
2287         break;
2288
2289         default:
2290                 ret = -EINVAL;
2291         }
2292
2293   out:
2294         mutex_unlock(&__ip_vs_mutex);
2295         return ret;
2296 }
2297
2298
2299 static struct nf_sockopt_ops ip_vs_sockopts = {
2300         .pf             = PF_INET,
2301         .set_optmin     = IP_VS_BASE_CTL,
2302         .set_optmax     = IP_VS_SO_SET_MAX+1,
2303         .set            = do_ip_vs_set_ctl,
2304         .get_optmin     = IP_VS_BASE_CTL,
2305         .get_optmax     = IP_VS_SO_GET_MAX+1,
2306         .get            = do_ip_vs_get_ctl,
2307         .owner          = THIS_MODULE,
2308 };
2309
2310
2311 int __init ip_vs_control_init(void)
2312 {
2313         int ret;
2314         int idx;
2315
2316         EnterFunction(2);
2317
2318         ret = nf_register_sockopt(&ip_vs_sockopts);
2319         if (ret) {
2320                 IP_VS_ERR("cannot register sockopt.\n");
2321                 return ret;
2322         }
2323
2324         proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2325         proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
2326
2327         sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
2328
2329         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2330         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2331                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2332                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2333         }
2334         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2335                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2336         }
2337
2338         ip_vs_new_estimator(&ip_vs_stats);
2339
2340         /* Hook the defense timer */
2341         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2342
2343         LeaveFunction(2);
2344         return 0;
2345 }
2346
2347
2348 void ip_vs_control_cleanup(void)
2349 {
2350         EnterFunction(2);
2351         ip_vs_trash_cleanup();
2352         cancel_rearming_delayed_work(&defense_work);
2353         cancel_work_sync(&defense_work.work);
2354         ip_vs_kill_estimator(&ip_vs_stats);
2355         unregister_sysctl_table(sysctl_header);
2356         proc_net_remove(&init_net, "ip_vs_stats");
2357         proc_net_remove(&init_net, "ip_vs");
2358         nf_unregister_sockopt(&ip_vs_sockopts);
2359         LeaveFunction(2);
2360 }