/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <linux/sysctl.h>
#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
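
/*
 * A note on RT_FL_TOS: it keeps only the TOS bits that matter for
 * routing (IPTOS_RT_MASK) plus RTO_ONLINK, which rides in an otherwise
 * unused low TOS bit to request a link-scope route.
 */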
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
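
/*
 * All of the above intervals are in jiffies; in a full tree they are
 * exposed for tuning through the net.ipv4.route sysctl table (not part
 * of this excerpt).  Note that ip_rt_min_pmtu is 512 bytes of payload
 * plus a 20-byte IP header plus a 20-byte TCP header = 552, the
 * classical minimum path MTU.
 */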
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};
#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
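
/*
 * ip_tos2prio is indexed by (tos & IPTOS_TOS_MASK) >> 1: the low bit of
 * the index is the old "minimise monetary cost" bit, reused by ECN,
 * hence the ECN_OR_COST() naming of every second entry.  For example,
 * IPTOS_LOWDELAY (0x10) indexes entry 8 and maps to TC_PRIO_INTERACTIVE.
 */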
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    RCU read-side lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
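
/*
 * A sketch of the resulting pattern (it matches the lookup loops later
 * in this file): readers walk a chain under rcu_read_lock_bh() using
 * rcu_dereference(), taking a reference with dst_hold() on a hit, while
 * writers take the per-chain spinlock below before unlinking:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.rt_next))
 *		...
 *	rcu_read_unlock_bh();
 */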
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ	4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ	2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ	1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ	512
#else
#define RT_HASH_LOCK_SZ	256
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
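
/*
 * Keying the hash with rt_hash_rnd, which is re-randomized periodically
 * by the secret timer below, makes the chain a given flow lands in
 * unpredictable, guarding the cache against remotely induced hash
 * collisions.
 */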
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}
static struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */
static __inline__ void rt_free(struct rtable *rt)
{
	multipath_remove(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	multipath_remove(rt);
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
	       fl1->oif == fl2->oif &&
	       fl1->iif == fl2->iif;
}
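
/*
 * Note: the memcmp covers the whole IPv4 flow key (addresses, TOS and,
 * when configured, the firewall mark), so two cache entries compare
 * equal only if every field of the flow agrees.
 */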
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	int passedexpired = 0;
	struct rtable **nextstep = NULL;
	struct rtable **rthp = chain_head;
	struct rtable *rth;

	if (removed_count)
		*removed_count = 0;

	while ((rth = *rthp) != NULL) {
		if (rth == expentry)
			passedexpired = 1;

		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
			if (*rthp == expentry) {
				*rthp = rth->u.rt_next;
				continue;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
				if (removed_count)
					++(*removed_count);
			}
		} else {
			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
			    passedexpired && !nextstep)
				nextstep = &rth->u.rt_next;

			rthp = &rth->u.rt_next;
		}
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static int rover;
	int i = rover, t;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
	     t -= ip_rt_gc_timeout) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		spin_lock(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Clean up aged-off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
			/* remove all related balanced entries if necessary */
			if (rth->u.dst.flags & DST_BALANCED) {
				rthp = rt_remove_balanced_route(
					&rt_hash_table[i].chain,
					rth, NULL);
				if (!rthp)
					break;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
			}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			*rthp = rth->u.rt_next;
			rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
		}
		spin_unlock(rt_hash_lock_addr(i));

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}
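
/*
 * Scan budget, worked through: each run visits roughly
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets, and
 * runs repeat every ip_rt_gc_interval, so the whole table is covered
 * about once per ip_rt_gc_timeout regardless of its size.
 */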
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);
void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	/* flush existing multipath state */
	multipath_flush();

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached yet, prolong the
		   timer to "delay", otherwise fire it at the deadline.
		 */

		if (user_mode && tmo < ip_rt_max_delay - ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now + delay);
	spin_unlock_bh(&rt_flush_lock);
}
static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
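
/*
 * In rt_garbage_collect() below this shows up as: "expire" is halved on
 * each pass that misses its eviction goal, and is credited back by
 * ip_rt_gc_min_interval (capped at ip_rt_gc_timeout) once the cache is
 * comfortably below ip_rt_max_size again.
 */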
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries that we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
				/* remove all related balanced entries
				 * if necessary
				 */
				if (rth->u.dst.flags & DST_BALANCED) {
					int r;

					rthp = rt_remove_balanced_route(
						&rt_hash_table[k].chain,
						rth,
						&r);
					goal -= r;
					if (!rthp)
						break;
				} else {
					*rthp = rth->u.rt_next;
					rt_free(rth);
					goal--;
				}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
				*rthp = rth->u.rt_next;
				rt_free(rth);
				goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero (otherwise it is halved).
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
		if (!(rth->u.dst.flags & DST_BALANCED) &&
		    compare_keys(&rth->fl, &rt->fl)) {
#else
		if (compare_keys(&rth->fl, &rt->fl)) {
#endif
			/* Put it first */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind the route to an arp entry only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* The neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it is most likely holding some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
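
/*
 * Note the eviction policy above: while walking the chain we remember
 * the unreferenced entry with the lowest rt_score(), and drop it once
 * the chain grows past ip_rt_gc_elasticity, so an insertion never lets
 * a bucket grow without bound.
 */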
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
						     rt->fl.fl4_src ^
							(rt->fl.oif << 5),
						     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
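
/*
 * Worked example, assuming HZ=1000: the k-th redirect is load-limited
 * by ip_rt_redirect_load << k, i.e. 20ms, 40ms, 80ms, ... for up to
 * ip_rt_redirect_number (9) redirects, after which we go silent until
 * ip_rt_redirect_silence ((HZ/50) << 10, about 20.5 seconds) passes
 * without further packets requiring a redirect.
 */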
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
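
/*
 * The above is a classic token bucket: rate_tokens accumulates elapsed
 * jiffies (capped at ip_rt_error_burst, five seconds' worth) and each
 * ICMP error sent spends ip_rt_error_cost (one second's worth), i.e. a
 * sustained rate of at most one ICMP per second with bursts of up to
 * five.
 */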
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
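
/*
 * The table is the RFC 1191 plateau list: e.g. a pre-RFC1191 "frag
 * needed" ICMP carrying no next-hop MTU for a 1500-byte datagram
 * guesses the next plateau down, 1492 (Ethernet with LLC/SNAP), while
 * 68 is the minimum valid IPv4 MTU.
 */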
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
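
/*
 * Clamping learned PMTUs below ip_rt_min_pmtu (552 by default) and then
 * locking the metric means a forged "frag needed" cannot wedge a path
 * MTU below the minimum: e.g. an advertised MTU of 100 is recorded as
 * 552, and RTAX_MTU is locked against further changes until the entry
 * expires.
 */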
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != &loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
						 ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     u32 daddr,
				     u32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  u32 daddr, u32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 spec_dst, itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
static inline int ip_mkroute_input_def(struct sk_buff *skb,
				       struct fib_result* res,
				       const struct flowi *fl,
				       struct in_device *in_dev,
				       u32 daddr, u32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;
	atomic_set(&rth->u.dst.__refcnt, 1);

	/* put it into the cache */
	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL;
	unsigned char hop, hopcount, lasthop;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	lasthop = hopcount - 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));

		/* only for the last hop is the reference count handled
		 * outside
		 */
		if (hop == lasthop)
			atomic_set(&(skb->dst->__refcnt), 1);
	}
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output
 *	routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic was moved from the route cache to
	   here.  The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-(  As a result, a
	   host on a multicast network may acquire a lot of useless route
	   cache entries (e.g. for SDR messages from all over the world).
	   Now we try to get rid of them.  Really, provided the software
	   IP multicast filter is organized reasonably (at least, hashed),
	   this does not result in a slowdown compared with route cache
	   reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == 0xFFFFFFFF)
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If no multicast route exists, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi) {
		rth->rt_multipath_alg = res->fi->fib_mp_alg;
		if (res->fi->fib_nhs > 1)
			rth->u.dst.flags |= DST_BALANCED;
	}
#endif
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	   routing cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);
	return err;
}
static inline int ip_mkroute_output_def(struct rtable **rp,
					struct fib_result* res,
					const struct flowi *fl,
					const struct flowi *oldflp,
					struct net_device *dev_out,
					unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		u32 tos = RT_FL_TOS(oldflp);

		atomic_set(&rth->u.dst.__refcnt, 1);

		hash = rt_hash_code(oldflp->fl4_dst,
				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		atomic_set(&(*rp)->u.dst.__refcnt, 1);
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = oldflp->fl4_fwmark
#endif
				      } },
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   that the packet will not leave this host and the
			   route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;
		if (__in_dev_get(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		if (fl.fl4_src)
			fl.fl4_dst = fl.fl4_src;
		else
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both the routing
			   tables and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {

			/* check for multipath routes and choose one if
			 * necessary
			 */
			if (multipath_select_route(flp, rth, rp)) {
				dst_hold(&(*rp)->u.dst);
				RT_CACHE_STAT_INC(out_hit);
				rcu_read_unlock_bh();
				return 0;
			}

			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
	}

	return 0;
}

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}
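
/*
 * Typical call sequence, for reference: a fast hash lookup through
 * __ip_route_output_key(), falling back to ip_route_output_slow() to
 * consult the FIB and insert a fresh cache entry on a miss;
 * ip_route_output_flow() additionally runs the result through
 * xfrm_lookup() when a transport protocol is set in the flow.
 */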
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
	r = NLMSG_DATA(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
		__u32 alg = rt->rt_multipath_alg;

		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
	}
#endif
	if (rt->fl.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	ci.rta_used	= rt->u.dst.__use;
	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_error	= rt->u.dst.error;
	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
	if (rt->peer) {
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
		}
	}
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
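/*
 * Added note (not in the original file): for a typical unicast reply
 * the attributes above are emitted in the order RTA_DST, RTA_SRC (when
 * the flow had a source), RTA_OIF, optionally RTA_FLOW / RTA_MP_ALGO,
 * RTA_PREFSRC, RTA_GATEWAY (only when it differs from the destination),
 * the metrics, RTA_CACHEINFO, and RTA_IIF for input routes. Parsers
 * must not rely on that order; netlink attributes are always walked
 * by type.
 */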
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err = -ENOBUFS;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC - 1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
	if (rta[RTA_DST - 1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
	if (rta[RTA_IIF - 1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

	if (iif) {
		/* Input route: replay the dummy packet via ip_route_input(). */
		struct net_device *dev = __dev_get_by_index(iif);
		err = -ENODEV;
		if (!dev)
			goto out_free;
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();
		rt = (struct rtable*)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
							 .saddr = src,
							 .tos = rtm->rtm_tos } } };
		int oif = 0;
		if (rta[RTA_OIF - 1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
		fl.oif = oif;
		err = ip_route_output_key(&rt, &fl);
	}
	if (err)
		goto out_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (!err)
		goto out_free;
	if (err < 0) {
		err = -EMSGSIZE;
		goto out_free;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err > 0)
		err = 0;
out:	return err;

out_free:
	kfree_skb(skb);
	goto out;
}
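/*
 * Added illustration (not in the original file): this handler is what
 * answers "ip route get <addr>". Conceptually the request carries just
 * a destination attribute (a real request must also honor the netlink
 * alignment macros):
 *
 *	struct {
 *		struct nlmsghdr	nlh;	// nlmsg_type = RTM_GETROUTE,
 *					// nlmsg_flags = NLM_F_REQUEST
 *		struct rtmsg	rtm;
 *		struct rtattr	rta;	// rta_type = RTA_DST
 *		__u32		dst;	// e.g. 10.0.0.1
 *	} req;
 *
 * sent on a NETLINK_ROUTE socket; the unicast reply is built by
 * rt_fill_info() above.
 */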
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h) continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
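/*
 * Added note (not in the original file): cb->args[0]/args[1] persist
 * the (hash bucket, chain index) position between dump callbacks, so a
 * large cache is delivered in several NLM_F_MULTI batches rather than
 * one oversized skb. This is the path exercised when userspace dumps
 * cloned routes, e.g. "ip route show cache".
 */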
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen,
						void **context)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}
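/*
 * Added illustration (not in the original file): both interfaces end
 * up in rt_cache_flush(). From userspace, for example:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * Writing 0 requests an immediate flush; a negative value falls back
 * to the min_delay setting.
 */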
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
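/*
 * Added illustration (not in the original file): every entry in
 * ipv4_route_table surfaces under /proc/sys/net/ipv4/route/, e.g.
 *
 *	sysctl -w net.ipv4.route.gc_timeout=300
 *	cat /proc/sys/net/ipv4/route/gc_thresh
 *
 * The *_jiffies handlers translate between seconds on the userspace
 * side and jiffies in the kernel variables.
 */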
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	/* Only word-aligned, word-sized reads are supported. */
	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time */
		for (i = 1; i < NR_CPUS; i++) {
			unsigned int j;

			if (!cpu_possible(i))
				continue;

			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */
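/*
 * Added note (not in the original file): because the handler rejects
 * unaligned reads, a consumer should read word-multiple chunks, e.g.
 * (hypothetical):
 *
 *	dd if=/proc/net/rt_acct bs=4096 count=1 | od -A d -t u4
 *
 * Each counter is summed across all possible CPUs before being
 * returned, so userspace sees a single aggregate table.
 */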
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
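/*
 * Added illustration (not in the original file): the hash table size
 * chosen in ip_rt_init() below can be overridden from the kernel
 * command line, e.g.
 *
 *	linux ... rhash_entries=16384
 *
 * The value is a bucket-count goal; ip_rt_init() still rounds the
 * final size to a power of two.
 */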
int __init ip_rt_init(void)
{
	int order, goal, rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
#endif

	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);

	if (!ipv4_dst_ops.kmem_cachep)
		panic("IP: failed to allocate ip_dst_cache\n");

	/* Size the hash table from available memory, unless overridden
	 * by the rhash_entries= boot parameter.
	 */
	goal = num_physpages >> (26 - PAGE_SHIFT);
	if (rhash_entries)
		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
	for (order = 0; (1UL << order) < goal; order++)
		/* NOTHING */;

	do {
		rt_hash_mask = (1UL << order) * PAGE_SIZE /
			sizeof(struct rt_hash_bucket);
		while (rt_hash_mask & (rt_hash_mask - 1))
			rt_hash_mask--;
		rt_hash_table = (struct rt_hash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (rt_hash_table == NULL && --order > 0);

	if (!rt_hash_table)
		panic("Failed to allocate IP route cache hash table\n");

	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
	       rt_hash_mask,
	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
		/* NOTHING */;

	rt_hash_mask--;
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
	if (!rt_cache_stat)
		return -ENOMEM;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
					ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     proc_net_stat))) {
		free_percpu(rt_cache_stat);
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	return rc;
}
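/*
 * Added worked example (not in the original file), assuming 4 KB pages
 * and an 8-byte rt_hash_bucket on a 128 MB machine:
 *
 *	num_physpages = 32768
 *	goal  = 32768 >> (26 - 12)	= 2 pages
 *	order = 1			(first order with 1UL << order >= 2)
 *	rt_hash_mask = 2 * 4096 / 8	= 1024 buckets (already a power of 2)
 *	rt_hash_log  = 10, then rt_hash_mask-- leaves the mask 0x3ff
 *	gc_thresh = 1024, ip_rt_max_size = 16384 cached entries
 */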
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);