/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};

#define ECN_OR_COST(class) TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

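/*
 * Illustrative sketch of the scheme above (not part of the original file;
 * names are the ones defined later in this file):
 *
 *	// reader: lockless lookup under RCU
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (compare_keys(rth, rt)) {
 *			dst_use(&rth->dst, jiffies);	// atomic refcount bump
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 *
 *	// writer: unlink under the per-bucket spinlock, free via RCU
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->dst.rt_next;
 *	rt_free(rth);			// defers freeing past a grace period
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */
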
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks. The size of this table is a power of two and depends on the
 * number of CPUs. (on lockdep we have a quite big spinlock_t, so keep the
 * size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
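
/*
 * Example (illustrative only): with RT_HASH_LOCK_SZ == 256, buckets that
 * are 256 apart share a lock, e.g. slots 5, 261 and 517 all map to
 * &rt_hash_locks[5]. The lock table stays small and cache-friendly, while
 * two concurrent writers rarely collide on the same lock.
 */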

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

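/*
 * Usage sketch (illustrative, not in the original source): an input-path
 * lookup hashes the addresses, the interface index and the current
 * generation, e.g.
 *
 *	hash = rt_hash(daddr, saddr, skb->dev->ifindex, rt_genid(net));
 *	rth  = rcu_dereference_bh(rt_hash_table[hash].chain);
 *
 * Folding rt_genid() into the hash means a cache flush only has to bump
 * the generation; stale entries simply stop being found and are reaped
 * lazily via rt_is_expired().
 */
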
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->dst.hh ? (r->dst.hh->hh_output ==
					dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

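/*
 * Worked example (illustrative, not in the original source): score starts
 * as the bitwise NOT of the entry's age, so an older entry yields a
 * smaller 30-bit usage field. A valuable entry (e.g. RTCF_REDIRECTED)
 * additionally gets bit 31, and an output route gets bit 30.
 * rt_intern_hash() below tracks the entry with the *lowest* score as its
 * eviction candidate, i.e. the oldest, least valuable one goes first.
 */
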
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_iif ^ rt2->rt_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_oif ^ rt2->rt_oif) |
		(rt1->rt_iif ^ rt2->rt_iif)) == 0;
}

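/*
 * The XOR/OR pattern above is a branchless "all fields equal" test: each
 * XOR is zero iff the two fields match, and OR-ing the results is zero
 * iff every field matched. An equivalent but branchier rewrite
 * (illustrative only, not in the original source) would be:
 *
 *	return rt1->rt_key_dst == rt2->rt_key_dst &&
 *	       rt1->rt_key_src == rt2->rt_key_src &&
 *	       rt1->rt_mark == rt2->rt_mark && ...;
 */
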
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

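/*
 * Example (illustrative): if rt_genid(net) is 1000 and get_random_bytes()
 * yields 41, the generation becomes 1042. Every cached rtable still
 * carries rt_genid == 1000, so rt_is_expired() is now true for all of
 * them; they are dropped lazily on lookup or by the flush path, without
 * walking the whole table synchronously.
 */
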
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to that of newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it shrinks to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

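/*
 * Worked example (illustrative): has_noalias() contributes ONE
 * (1 << 3 == 8) per entry whose hash inputs were not already seen earlier
 * in the chain. A chain of 24 entries covering only 2 distinct flows
 * accumulates 2 * 8 = 16, and 16 >> FRACT_BITS == 2. The emergency
 * rebuild in rt_intern_hash() therefore only triggers when a chain is
 * long *and* genuinely diverse, not when one flow merely has many aliases
 * differing in tos, mark or oif.
 */
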
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID being unique in a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	peer = inet_getpeer_v4(daddr, 1);
	if (peer) {
		peer->redirect_learned.a4 = new_gw;

		inet_putpeer(peer);

		atomic_inc(&__rt_peer_genid);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

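/*
 * Both helpers above use cmpxchg() as a "claim" on the pending expiry:
 * of all CPUs racing on the same non-zero pmtu_expires value, exactly one
 * swaps it to 0 and wins the right to restore pmtu_orig. Illustrative
 * race (assumed timeline, not from the original source): CPU0 and CPU1
 * both read orig == T; CPU0's cmpxchg(&peer->pmtu_expires, T, 0) returns
 * T and succeeds, while CPU1's returns 0 and its helper reports false.
 */
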
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

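/*
 * Worked example (illustrative): a broken router reports new_mtu == 0 for
 * a 1500-byte datagram. ip_rt_frag_needed() below then calls
 * guess_mtu(1500), which scans the plateau table for the first value
 * strictly below 1500 and returns 1492 (the Ethernet-over-PPPoE plateau).
 * An old_mtu at or below 68, the minimum IPv4 MTU, falls through to 68.
 */
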
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
		}

		inet_putpeer(peer);

		atomic_inc(&__rt_peer_genid);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;

	dst_confirm(&rt->dst);

	neigh_release(rt->dst.neighbour);
	rt->dst.neighbour = NULL;

	rt->rt_gateway = peer->redirect_learned.a4;
	if (arp_bind_neighbour(&rt->dst) ||
	    !(rt->dst.neighbour->nud_state & NUD_VALID)) {
		if (rt->dst.neighbour)
			neigh_event_send(rt->dst.neighbour, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
					rt->dst.neighbour);
	}
	return 0;
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = iph->tos;
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

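/*
 * Example (illustrative): on a 1500-byte MTU device with no explicit
 * RTAX_ADVMSS metric, the advertised MSS defaults to 1500 - 40 = 1460,
 * i.e. the MTU minus 20 bytes of IPv4 header and 20 bytes of TCP header,
 * clamped below by ip_rt_min_advmss and above by 65535 - 40.
 */
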
static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

813b3b5d 1762static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1763 struct fib_info *fi)
a4daad6b 1764{
0131ba45
DM
1765 struct inet_peer *peer;
1766 int create = 0;
a4daad6b 1767
0131ba45
DM
1768 /* If a peer entry exists for this destination, we must hook
1769 * it up in order to get at cached metrics.
1770 */
813b3b5d 1771 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
0131ba45
DM
1772 create = 1;
1773
3c0afdca 1774 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
0131ba45 1775 if (peer) {
3c0afdca 1776 rt->rt_peer_genid = rt_peer_genid();
a4daad6b
DM
1777 if (inet_metrics_new(peer))
1778 memcpy(peer->metrics, fi->fib_metrics,
1779 sizeof(u32) * RTAX_MAX);
1780 dst_init_metrics(&rt->dst, peer->metrics, false);
2c8cec5c 1781
fe6fe792 1782 check_peer_pmtu(&rt->dst, peer);
f39925db
DM
1783 if (peer->redirect_learned.a4 &&
1784 peer->redirect_learned.a4 != rt->rt_gateway) {
1785 rt->rt_gateway = peer->redirect_learned.a4;
1786 rt->rt_flags |= RTCF_REDIRECTED;
1787 }
0131ba45
DM
1788 } else {
1789 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1790 rt->fi = fi;
1791 atomic_inc(&fi->fib_clntref);
1792 }
1793 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1794 }
1795}
1796
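/* Illustrative sketch (editorial addition): callers that expect to
 * write metrics, e.g. TCP sockets, set FLOWI_FLAG_PRECOW_METRICS in
 * their flow key, so the test above passes create = 1 and
 * inet_getpeer_v4() allocates a peer entry; the route then carries
 * writable per-destination metrics instead of the shared, read-only
 * fib defaults installed by the else branch.
 */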
813b3b5d 1797static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1798 const struct fib_result *res,
982721f3 1799 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1800{
defb3519 1801 struct dst_entry *dst = &rt->dst;
1da177e4
LT
1802
1803 if (fi) {
1804 if (FIB_RES_GW(*res) &&
1805 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1806 rt->rt_gateway = FIB_RES_GW(*res);
813b3b5d 1807 rt_init_metrics(rt, fl4, fi);
c7066f70 1808#ifdef CONFIG_IP_ROUTE_CLASSID
defb3519 1809 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1da177e4 1810#endif
d33e4553 1811 }
defb3519 1812
defb3519
DM
1813 if (dst_mtu(dst) > IP_MAX_MTU)
1814 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
0dbaee3b 1815 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
defb3519 1816 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1da177e4 1817
c7066f70 1818#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1819#ifdef CONFIG_IP_MULTIPLE_TABLES
1820 set_class_tag(rt, fib_rules_tclass(res));
1821#endif
1822 set_class_tag(rt, itag);
1823#endif
1da177e4
LT
1824}
1825
5c1e6aa3
DM
1826static struct rtable *rt_dst_alloc(struct net_device *dev,
1827 bool nopolicy, bool noxfrm)
0c4dcd58 1828{
5c1e6aa3
DM
1829 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1830 DST_HOST |
1831 (nopolicy ? DST_NOPOLICY : 0) |
1832 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1833}
1834
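/* Illustrative sketch (hypothetical call, mirroring the callers below):
 *
 *	rth = rt_dst_alloc(dev_out, true, false);
 *
 * requests a DST_HOST | DST_NOPOLICY cache entry from dst_alloc();
 * the constants 1 and -1 above are the initial reference count and
 * initial "obsolete" value handed to dst_alloc().
 */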
96d36220 1835/* called in rcu_read_lock() section */
9e12bb22 1836static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1837 u8 tos, struct net_device *dev, int our)
1838{
96d36220 1839 unsigned int hash;
1da177e4 1840 struct rtable *rth;
a61ced5d 1841 __be32 spec_dst;
96d36220 1842 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1843 u32 itag = 0;
b5f7e755 1844 int err;
1da177e4
LT
1845
1846 /* Primary sanity checks. */
1847
1848 if (in_dev == NULL)
1849 return -EINVAL;
1850
1e637c74 1851 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1852 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1853 goto e_inval;
1854
f97c1e0c
JP
1855 if (ipv4_is_zeronet(saddr)) {
1856 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
1857 goto e_inval;
1858 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
b5f7e755 1859 } else {
5c04c819
MS
1860 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1861 &itag);
b5f7e755
ED
1862 if (err < 0)
1863 goto e_err;
1864 }
5c1e6aa3
DM
1865 rth = rt_dst_alloc(init_net.loopback_dev,
1866 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
1867 if (!rth)
1868 goto e_nobufs;
1869
cf911662
DM
1870#ifdef CONFIG_IP_ROUTE_CLASSID
1871 rth->dst.tclassid = itag;
1872#endif
d8d1f30b 1873 rth->dst.output = ip_rt_bug;
1da177e4 1874
5e2b61f7 1875 rth->rt_key_dst = daddr;
5e2b61f7 1876 rth->rt_key_src = saddr;
cf911662
DM
1877 rth->rt_genid = rt_genid(dev_net(dev));
1878 rth->rt_flags = RTCF_MULTICAST;
1879 rth->rt_type = RTN_MULTICAST;
475949d8 1880 rth->rt_key_tos = tos;
cf911662 1881 rth->rt_dst = daddr;
1da177e4 1882 rth->rt_src = saddr;
1b86a58f 1883 rth->rt_route_iif = dev->ifindex;
5e2b61f7 1884 rth->rt_iif = dev->ifindex;
5e2b61f7 1885 rth->rt_oif = 0;
cf911662 1886 rth->rt_mark = skb->mark;
1da177e4
LT
1887 rth->rt_gateway = daddr;
1888 rth->rt_spec_dst = spec_dst;
cf911662
DM
1889 rth->rt_peer_genid = 0;
1890 rth->peer = NULL;
1891 rth->fi = NULL;
1da177e4 1892 if (our) {
d8d1f30b 1893 rth->dst.input = ip_local_deliver;
1da177e4
LT
1894 rth->rt_flags |= RTCF_LOCAL;
1895 }
1896
1897#ifdef CONFIG_IP_MROUTE
f97c1e0c 1898 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1899 rth->dst.input = ip_mr_input;
1da177e4
LT
1900#endif
1901 RT_CACHE_STAT_INC(in_slow_mc);
1902
e84f84f2 1903 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
b23dd4fe 1904 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
9aa3c94c 1905 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1da177e4
LT
1906
1907e_nobufs:
1da177e4 1908 return -ENOBUFS;
1da177e4 1909e_inval:
96d36220 1910 return -EINVAL;
b5f7e755 1911e_err:
b5f7e755 1912 return err;
1da177e4
LT
1913}
1914
1915
1916static void ip_handle_martian_source(struct net_device *dev,
1917 struct in_device *in_dev,
1918 struct sk_buff *skb,
9e12bb22
AV
1919 __be32 daddr,
1920 __be32 saddr)
1da177e4
LT
1921{
1922 RT_CACHE_STAT_INC(in_martian_src);
1923#ifdef CONFIG_IP_ROUTE_VERBOSE
1924 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1925 /*
1926 * RFC1812 recommendation: if the source is martian,
1927 * the only hint is the MAC header.
1928 */
673d57e7
HH
1929 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1930 &daddr, &saddr, dev->name);
98e399f8 1931 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1932 int i;
98e399f8 1933 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1934 printk(KERN_WARNING "ll header: ");
1935 for (i = 0; i < dev->hard_header_len; i++, p++) {
1936 printk("%02x", *p);
1937 if (i < (dev->hard_header_len - 1))
1938 printk(":");
1939 }
1940 printk("\n");
1941 }
1942 }
1943#endif
1944}
1945
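/* Illustrative sample of the log output above (hypothetical addresses
 * and an Ethernet device, so hard_header_len == 14):
 *
 *	martian source 10.0.0.1 from 127.0.0.2, on dev eth0
 *	ll header: 00:1a:2b:3c:4d:5e:00:11:22:33:44:55:08:00
 *
 * i.e. destination MAC, source MAC and EtherType of the offending frame.
 */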
47360228 1946/* called in rcu_read_lock() section */
5969f71d 1947static int __mkroute_input(struct sk_buff *skb,
982721f3 1948 const struct fib_result *res,
5969f71d
SH
1949 struct in_device *in_dev,
1950 __be32 daddr, __be32 saddr, u32 tos,
1951 struct rtable **result)
1da177e4 1952{
1da177e4
LT
1953 struct rtable *rth;
1954 int err;
1955 struct in_device *out_dev;
47360228 1956 unsigned int flags = 0;
d9c9df8c
AV
1957 __be32 spec_dst;
1958 u32 itag;
1da177e4
LT
1959
1960 /* get a working reference to the output device */
47360228 1961 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1da177e4
LT
1962 if (out_dev == NULL) {
1963 if (net_ratelimit())
1964 printk(KERN_CRIT "Bug in ip_route_input"
1965 "_slow(). Please report\n");
1966 return -EINVAL;
1967 }
1968
1969
5c04c819
MS
1970 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1971 in_dev->dev, &spec_dst, &itag);
1da177e4 1972 if (err < 0) {
e905a9ed 1973 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1974 saddr);
e905a9ed 1975
1da177e4
LT
1976 goto cleanup;
1977 }
1978
1979 if (err)
1980 flags |= RTCF_DIRECTSRC;
1981
51b77cae 1982 if (out_dev == in_dev && err &&
1da177e4
LT
1983 (IN_DEV_SHARED_MEDIA(out_dev) ||
1984 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1985 flags |= RTCF_DOREDIRECT;
1986
1987 if (skb->protocol != htons(ETH_P_IP)) {
1988 /* Not IP (i.e. ARP). Do not create a route if it is
1989 * invalid for proxy ARP. DNAT routes are always valid.
65324144
JDB
1990 *
1991 * The proxy ARP feature has been extended to allow ARP
1992 * replies back out the same interface, to support
1993 * Private VLAN switch technologies. See arp.c.
1da177e4 1994 */
65324144
JDB
1995 if (out_dev == in_dev &&
1996 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1997 err = -EINVAL;
1998 goto cleanup;
1999 }
2000 }
2001
5c1e6aa3
DM
2002 rth = rt_dst_alloc(out_dev->dev,
2003 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2004 IN_DEV_CONF_GET(out_dev, NOXFRM));
1da177e4
LT
2005 if (!rth) {
2006 err = -ENOBUFS;
2007 goto cleanup;
2008 }
2009
5e2b61f7 2010 rth->rt_key_dst = daddr;
5e2b61f7 2011 rth->rt_key_src = saddr;
cf911662
DM
2012 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2013 rth->rt_flags = flags;
2014 rth->rt_type = res->type;
475949d8 2015 rth->rt_key_tos = tos;
cf911662 2016 rth->rt_dst = daddr;
1da177e4 2017 rth->rt_src = saddr;
1b86a58f 2018 rth->rt_route_iif = in_dev->dev->ifindex;
5e2b61f7 2019 rth->rt_iif = in_dev->dev->ifindex;
5e2b61f7 2020 rth->rt_oif = 0;
cf911662
DM
2021 rth->rt_mark = skb->mark;
2022 rth->rt_gateway = daddr;
1da177e4 2023 rth->rt_spec_dst = spec_dst;
cf911662
DM
2024 rth->rt_peer_genid = 0;
2025 rth->peer = NULL;
2026 rth->fi = NULL;
1da177e4 2027
d8d1f30b
CG
2028 rth->dst.input = ip_forward;
2029 rth->dst.output = ip_output;
1da177e4 2030
5e2b61f7 2031 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1da177e4 2032
1da177e4
LT
2033 *result = rth;
2034 err = 0;
2035 cleanup:
1da177e4 2036 return err;
e905a9ed 2037}
1da177e4 2038
5969f71d
SH
2039static int ip_mkroute_input(struct sk_buff *skb,
2040 struct fib_result *res,
68a5e3dd 2041 const struct flowi4 *fl4,
5969f71d
SH
2042 struct in_device *in_dev,
2043 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 2044{
7abaa27c 2045 struct rtable *rth = NULL;
1da177e4
LT
2046 int err;
2047 unsigned hash;
2048
2049#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 2050 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 2051 fib_select_multipath(res);
1da177e4
LT
2052#endif
2053
2054 /* create a routing cache entry */
2055 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2056 if (err)
2057 return err;
1da177e4
LT
2058
2059 /* put it into the cache */
68a5e3dd 2060 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
d8d1f30b 2061 rt_genid(dev_net(rth->dst.dev)));
68a5e3dd 2062 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
b23dd4fe
DM
2063 if (IS_ERR(rth))
2064 return PTR_ERR(rth);
2065 return 0;
1da177e4
LT
2066}
2067
1da177e4
LT
2068/*
2069 * NOTE. We drop all packets that have local source
2070 * addresses, because every properly looped-back packet
2071 * must already have the correct destination attached by the output routine.
2072 *
2073 * Such an approach solves two big problems:
2074 * 1. Non-simplex devices are handled properly.
2075 * 2. IP spoofing attempts are filtered with a 100% guarantee.
ebc0ffae 2076 * called with rcu_read_lock()
1da177e4
LT
2077 */
2078
9e12bb22 2079static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2080 u8 tos, struct net_device *dev)
2081{
2082 struct fib_result res;
96d36220 2083 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 2084 struct flowi4 fl4;
1da177e4
LT
2085 unsigned flags = 0;
2086 u32 itag = 0;
2087 struct rtable *rth;
2088 unsigned hash;
9e12bb22 2089 __be32 spec_dst;
1da177e4 2090 int err = -EINVAL;
c346dca1 2091 struct net *net = dev_net(dev);
1da177e4
LT
2092
2093 /* IP on this device is disabled. */
2094
2095 if (!in_dev)
2096 goto out;
2097
2098 /* Check for the most weird martians, which cannot be detected
2099 by fib_lookup.
2100 */
2101
1e637c74 2102 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 2103 ipv4_is_loopback(saddr))
1da177e4
LT
2104 goto martian_source;
2105
27a954bd 2106 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2107 goto brd_input;
2108
2109 /* Accept zero addresses only to limited broadcast;
2110 * I do not even know whether to fix this or not. Waiting for complaints :-)
2111 */
f97c1e0c 2112 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2113 goto martian_source;
2114
27a954bd 2115 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
1da177e4
LT
2116 goto martian_destination;
2117
2118 /*
2119 * Now we are ready to route packet.
2120 */
68a5e3dd
DM
2121 fl4.flowi4_oif = 0;
2122 fl4.flowi4_iif = dev->ifindex;
2123 fl4.flowi4_mark = skb->mark;
2124 fl4.flowi4_tos = tos;
2125 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2126 fl4.daddr = daddr;
2127 fl4.saddr = saddr;
2128 err = fib_lookup(net, &fl4, &res);
ebc0ffae 2129 if (err != 0) {
1da177e4 2130 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2131 goto e_hostunreach;
1da177e4
LT
2132 goto no_route;
2133 }
1da177e4
LT
2134
2135 RT_CACHE_STAT_INC(in_slow_tot);
2136
2137 if (res.type == RTN_BROADCAST)
2138 goto brd_input;
2139
2140 if (res.type == RTN_LOCAL) {
5c04c819 2141 err = fib_validate_source(skb, saddr, daddr, tos,
ebc0ffae 2142 net->loopback_dev->ifindex,
5c04c819 2143 dev, &spec_dst, &itag);
b5f7e755
ED
2144 if (err < 0)
2145 goto martian_source_keep_err;
2146 if (err)
1da177e4
LT
2147 flags |= RTCF_DIRECTSRC;
2148 spec_dst = daddr;
2149 goto local_input;
2150 }
2151
2152 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2153 goto e_hostunreach;
1da177e4
LT
2154 if (res.type != RTN_UNICAST)
2155 goto martian_destination;
2156
68a5e3dd 2157 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
2158out: return err;
2159
2160brd_input:
2161 if (skb->protocol != htons(ETH_P_IP))
2162 goto e_inval;
2163
f97c1e0c 2164 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2165 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2166 else {
5c04c819
MS
2167 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2168 &itag);
1da177e4 2169 if (err < 0)
b5f7e755 2170 goto martian_source_keep_err;
1da177e4
LT
2171 if (err)
2172 flags |= RTCF_DIRECTSRC;
2173 }
2174 flags |= RTCF_BROADCAST;
2175 res.type = RTN_BROADCAST;
2176 RT_CACHE_STAT_INC(in_brd);
2177
2178local_input:
5c1e6aa3
DM
2179 rth = rt_dst_alloc(net->loopback_dev,
2180 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2181 if (!rth)
2182 goto e_nobufs;
2183
cf911662 2184 rth->dst.input = ip_local_deliver;
d8d1f30b 2185 rth->dst.output = ip_rt_bug;
cf911662
DM
2186#ifdef CONFIG_IP_ROUTE_CLASSID
2187 rth->dst.tclassid = itag;
2188#endif
1da177e4 2189
5e2b61f7 2190 rth->rt_key_dst = daddr;
5e2b61f7 2191 rth->rt_key_src = saddr;
cf911662
DM
2192 rth->rt_genid = rt_genid(net);
2193 rth->rt_flags = flags|RTCF_LOCAL;
2194 rth->rt_type = res.type;
475949d8 2195 rth->rt_key_tos = tos;
cf911662 2196 rth->rt_dst = daddr;
1da177e4 2197 rth->rt_src = saddr;
c7066f70 2198#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b 2199 rth->dst.tclassid = itag;
1da177e4 2200#endif
1b86a58f 2201 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2202 rth->rt_iif = dev->ifindex;
cf911662
DM
2203 rth->rt_oif = 0;
2204 rth->rt_mark = skb->mark;
1da177e4
LT
2205 rth->rt_gateway = daddr;
2206 rth->rt_spec_dst = spec_dst;
cf911662
DM
2207 rth->rt_peer_genid = 0;
2208 rth->peer = NULL;
2209 rth->fi = NULL;
1da177e4 2210 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
2211 rth->dst.input = ip_error;
2212 rth->dst.error = -err;
1da177e4
LT
2213 rth->rt_flags &= ~RTCF_LOCAL;
2214 }
68a5e3dd
DM
2215 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2216 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
b23dd4fe
DM
2217 err = 0;
2218 if (IS_ERR(rth))
2219 err = PTR_ERR(rth);
ebc0ffae 2220 goto out;
1da177e4
LT
2221
2222no_route:
2223 RT_CACHE_STAT_INC(in_no_route);
2224 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2225 res.type = RTN_UNREACHABLE;
7f53878d
MC
2226 if (err == -ESRCH)
2227 err = -ENETUNREACH;
1da177e4
LT
2228 goto local_input;
2229
2230 /*
2231 * Do not cache martian addresses: they should be logged (RFC1812)
2232 */
2233martian_destination:
2234 RT_CACHE_STAT_INC(in_martian_dst);
2235#ifdef CONFIG_IP_ROUTE_VERBOSE
2236 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
673d57e7
HH
2237 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2238 &daddr, &saddr, dev->name);
1da177e4 2239#endif
2c2910a4
DE
2240
2241e_hostunreach:
e905a9ed 2242 err = -EHOSTUNREACH;
ebc0ffae 2243 goto out;
2c2910a4 2244
1da177e4
LT
2245e_inval:
2246 err = -EINVAL;
ebc0ffae 2247 goto out;
1da177e4
LT
2248
2249e_nobufs:
2250 err = -ENOBUFS;
ebc0ffae 2251 goto out;
1da177e4
LT
2252
2253martian_source:
b5f7e755
ED
2254 err = -EINVAL;
2255martian_source_keep_err:
1da177e4 2256 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2257 goto out;
1da177e4
LT
2258}
2259
407eadd9
ED
2260int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2261 u8 tos, struct net_device *dev, bool noref)
1da177e4
LT
2262{
2263 struct rtable *rth;
2264 unsigned hash;
2265 int iif = dev->ifindex;
b5921910 2266 struct net *net;
96d36220 2267 int res;
1da177e4 2268
c346dca1 2269 net = dev_net(dev);
1080d709 2270
96d36220
ED
2271 rcu_read_lock();
2272
1080d709
NH
2273 if (!rt_caching(net))
2274 goto skip_cache;
2275
1da177e4 2276 tos &= IPTOS_RT_MASK;
e84f84f2 2277 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4 2278
1da177e4 2279 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 2280 rth = rcu_dereference(rth->dst.rt_next)) {
5e2b61f7
DM
2281 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2282 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2283 (rth->rt_iif ^ iif) |
2284 rth->rt_oif |
475949d8 2285 (rth->rt_key_tos ^ tos)) == 0 &&
5e2b61f7 2286 rth->rt_mark == skb->mark &&
d8d1f30b 2287 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2288 !rt_is_expired(rth)) {
407eadd9 2289 if (noref) {
d8d1f30b
CG
2290 dst_use_noref(&rth->dst, jiffies);
2291 skb_dst_set_noref(skb, &rth->dst);
407eadd9 2292 } else {
d8d1f30b
CG
2293 dst_use(&rth->dst, jiffies);
2294 skb_dst_set(skb, &rth->dst);
407eadd9 2295 }
1da177e4
LT
2296 RT_CACHE_STAT_INC(in_hit);
2297 rcu_read_unlock();
1da177e4
LT
2298 return 0;
2299 }
2300 RT_CACHE_STAT_INC(in_hlist_search);
2301 }
1da177e4 2302
1080d709 2303skip_cache:
1da177e4
LT
2304 /* Multicast recognition logic is moved from route cache to here.
2305 The problem was that too many Ethernet cards have broken/missing
2306 hardware multicast filters :-( As a result, a host on a multicast
2307 network acquires a lot of useless route cache entries, e.g. for
2308 SDR messages from all over the world. Now we try to get rid of them.
2309 Really, provided the software IP multicast filter is organized
2310 reasonably (at least, hashed), it does not result in a slowdown
2311 compared with route cache reject entries.
2312 Note that multicast routers are not affected, because a
2313 route cache entry is created eventually.
2314 */
f97c1e0c 2315 if (ipv4_is_multicast(daddr)) {
96d36220 2316 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2317
96d36220 2318 if (in_dev) {
dbdd9a52
DM
2319 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2320 ip_hdr(skb)->protocol);
1da177e4
LT
2321 if (our
2322#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2323 ||
2324 (!ipv4_is_local_multicast(daddr) &&
2325 IN_DEV_MFORWARD(in_dev))
1da177e4 2326#endif
9d4fb27d 2327 ) {
96d36220
ED
2328 int res = ip_route_input_mc(skb, daddr, saddr,
2329 tos, dev, our);
1da177e4 2330 rcu_read_unlock();
96d36220 2331 return res;
1da177e4
LT
2332 }
2333 }
2334 rcu_read_unlock();
2335 return -EINVAL;
2336 }
96d36220
ED
2337 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2338 rcu_read_unlock();
2339 return res;
1da177e4 2340}
407eadd9 2341EXPORT_SYMBOL(ip_route_input_common);
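/* Illustrative sketch (editorial addition): the cache lookup above
 * folds the whole key into one branch-free test; it is equivalent to
 *
 *	rth->rt_key_dst == daddr && rth->rt_key_src == saddr &&
 *	rth->rt_iif == iif && rth->rt_oif == 0 &&
 *	rth->rt_key_tos == tos
 *
 * with the mark, namespace and expiry checks kept as ordinary clauses;
 * XORing each field with its key and ORing the residues lets a single
 * compare against zero decide the match.
 */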
1da177e4 2342
ebc0ffae 2343/* called with rcu_read_lock() */
982721f3 2344static struct rtable *__mkroute_output(const struct fib_result *res,
68a5e3dd 2345 const struct flowi4 *fl4,
813b3b5d
DM
2346 __be32 orig_daddr, __be32 orig_saddr,
2347 int orig_oif, struct net_device *dev_out,
5ada5527 2348 unsigned int flags)
1da177e4 2349{
982721f3 2350 struct fib_info *fi = res->fi;
813b3b5d 2351 u32 tos = RT_FL_TOS(fl4);
5ada5527 2352 struct in_device *in_dev;
982721f3 2353 u16 type = res->type;
5ada5527 2354 struct rtable *rth;
1da177e4 2355
68a5e3dd 2356 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
5ada5527 2357 return ERR_PTR(-EINVAL);
1da177e4 2358
68a5e3dd 2359 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2360 type = RTN_BROADCAST;
68a5e3dd 2361 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2362 type = RTN_MULTICAST;
68a5e3dd 2363 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2364 return ERR_PTR(-EINVAL);
1da177e4
LT
2365
2366 if (dev_out->flags & IFF_LOOPBACK)
2367 flags |= RTCF_LOCAL;
2368
dd28d1a0 2369 in_dev = __in_dev_get_rcu(dev_out);
ebc0ffae 2370 if (!in_dev)
5ada5527 2371 return ERR_PTR(-EINVAL);
ebc0ffae 2372
982721f3 2373 if (type == RTN_BROADCAST) {
1da177e4 2374 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2375 fi = NULL;
2376 } else if (type == RTN_MULTICAST) {
dd28d1a0 2377 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2378 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2379 fl4->flowi4_proto))
1da177e4
LT
2380 flags &= ~RTCF_LOCAL;
2381 /* If multicast route do not exist use
dd28d1a0
ED
2382 * default one, but do not gateway in this case.
2383 * Yes, it is hack.
1da177e4 2384 */
982721f3
DM
2385 if (fi && res->prefixlen < 4)
2386 fi = NULL;
1da177e4
LT
2387 }
2388
5c1e6aa3
DM
2389 rth = rt_dst_alloc(dev_out,
2390 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2391 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2392 if (!rth)
5ada5527 2393 return ERR_PTR(-ENOBUFS);
8391d07b 2394
cf911662
DM
2395 rth->dst.output = ip_output;
2396
813b3b5d
DM
2397 rth->rt_key_dst = orig_daddr;
2398 rth->rt_key_src = orig_saddr;
cf911662
DM
2399 rth->rt_genid = rt_genid(dev_net(dev_out));
2400 rth->rt_flags = flags;
2401 rth->rt_type = type;
475949d8 2402 rth->rt_key_tos = tos;
68a5e3dd
DM
2403 rth->rt_dst = fl4->daddr;
2404 rth->rt_src = fl4->saddr;
1b86a58f 2405 rth->rt_route_iif = 0;
813b3b5d
DM
2406 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2407 rth->rt_oif = orig_oif;
2408 rth->rt_mark = fl4->flowi4_mark;
68a5e3dd
DM
2409 rth->rt_gateway = fl4->daddr;
2410 rth->rt_spec_dst = fl4->saddr;
cf911662
DM
2411 rth->rt_peer_genid = 0;
2412 rth->peer = NULL;
2413 rth->fi = NULL;
1da177e4
LT
2414
2415 RT_CACHE_STAT_INC(out_slow_tot);
2416
2417 if (flags & RTCF_LOCAL) {
d8d1f30b 2418 rth->dst.input = ip_local_deliver;
68a5e3dd 2419 rth->rt_spec_dst = fl4->daddr;
1da177e4
LT
2420 }
2421 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
68a5e3dd 2422 rth->rt_spec_dst = fl4->saddr;
e905a9ed 2423 if (flags & RTCF_LOCAL &&
1da177e4 2424 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2425 rth->dst.output = ip_mc_output;
1da177e4
LT
2426 RT_CACHE_STAT_INC(out_slow_mc);
2427 }
2428#ifdef CONFIG_IP_MROUTE
982721f3 2429 if (type == RTN_MULTICAST) {
1da177e4 2430 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2431 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2432 rth->dst.input = ip_mr_input;
2433 rth->dst.output = ip_mc_output;
1da177e4
LT
2434 }
2435 }
2436#endif
2437 }
2438
813b3b5d 2439 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1da177e4 2440
5ada5527 2441 return rth;
1da177e4
LT
2442}
2443
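/* Illustrative worked example (editorial addition, assuming the sender
 * has not joined the group and is not using IGMP): an output route to
 * 224.0.0.251 takes the RTN_MULTICAST branch above, ip_check_mc_rcu()
 * fails, and RTCF_LOCAL is cleared, so the packet is transmitted but
 * no local copy is delivered back to this host.
 */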
1da177e4
LT
2444/*
2445 * Major route resolver routine.
0197aa38 2446 * called with rcu_read_lock();
1da177e4
LT
2447 */
2448
813b3b5d 2449static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2450{
1da177e4 2451 struct net_device *dev_out = NULL;
813b3b5d
DM
2452 u32 tos = RT_FL_TOS(fl4);
2453 unsigned int flags = 0;
2454 struct fib_result res;
5ada5527 2455 struct rtable *rth;
813b3b5d
DM
2456 __be32 orig_daddr;
2457 __be32 orig_saddr;
2458 int orig_oif;
1da177e4
LT
2459
2460 res.fi = NULL;
2461#ifdef CONFIG_IP_MULTIPLE_TABLES
2462 res.r = NULL;
2463#endif
2464
813b3b5d
DM
2465 orig_daddr = fl4->daddr;
2466 orig_saddr = fl4->saddr;
2467 orig_oif = fl4->flowi4_oif;
2468
2469 fl4->flowi4_iif = net->loopback_dev->ifindex;
2470 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2471 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2472 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2473
010c2708 2474 rcu_read_lock();
813b3b5d 2475 if (fl4->saddr) {
b23dd4fe 2476 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2477 if (ipv4_is_multicast(fl4->saddr) ||
2478 ipv4_is_lbcast(fl4->saddr) ||
2479 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2480 goto out;
2481
1da177e4
LT
2482 /* I removed the check for oif == dev_out->oif here.
2483 It was wrong for two reasons:
1ab35276
DL
2484 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2485 is assigned to multiple interfaces.
1da177e4
LT
2486 2. Moreover, we are allowed to send packets with a saddr
2487 of another iface. --ANK
2488 */
2489
813b3b5d
DM
2490 if (fl4->flowi4_oif == 0 &&
2491 (ipv4_is_multicast(fl4->daddr) ||
2492 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2493 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2494 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2495 if (dev_out == NULL)
2496 goto out;
2497
1da177e4
LT
2498 /* Special hack: the user can direct multicasts
2499 and limited broadcast via the necessary interface
2500 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2501 This hack is not just for fun, it allows
2502 vic, vat and friends to work.
2503 They bind a socket to loopback, set the ttl to zero
2504 and expect that it will work.
2505 From the viewpoint of the routing cache they are broken,
2506 because we are not allowed to build a multicast path
2507 with a loopback source addr (the routing cache
2508 cannot know that the ttl is zero, so that the packet
2509 will not leave this host and the route is valid).
2510 Luckily, this hack is a good workaround.
2511 */
2512
813b3b5d 2513 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2514 goto make_route;
2515 }
a210d01a 2516
813b3b5d 2517 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2518 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2519 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2520 goto out;
a210d01a 2521 }
1da177e4
LT
2522 }
2523
2524
813b3b5d
DM
2525 if (fl4->flowi4_oif) {
2526 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2527 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2528 if (dev_out == NULL)
2529 goto out;
e5ed6399
HX
2530
2531 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2532 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2533 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2534 goto out;
2535 }
813b3b5d
DM
2536 if (ipv4_is_local_multicast(fl4->daddr) ||
2537 ipv4_is_lbcast(fl4->daddr)) {
2538 if (!fl4->saddr)
2539 fl4->saddr = inet_select_addr(dev_out, 0,
2540 RT_SCOPE_LINK);
1da177e4
LT
2541 goto make_route;
2542 }
813b3b5d
DM
2543 if (fl4->saddr) {
2544 if (ipv4_is_multicast(fl4->daddr))
2545 fl4->saddr = inet_select_addr(dev_out, 0,
2546 fl4->flowi4_scope);
2547 else if (!fl4->daddr)
2548 fl4->saddr = inet_select_addr(dev_out, 0,
2549 RT_SCOPE_HOST);
1da177e4
LT
2550 }
2551 }
2552
813b3b5d
DM
2553 if (!fl4->daddr) {
2554 fl4->daddr = fl4->saddr;
2555 if (!fl4->daddr)
2556 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2557 dev_out = net->loopback_dev;
813b3b5d 2558 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2559 res.type = RTN_LOCAL;
2560 flags |= RTCF_LOCAL;
2561 goto make_route;
2562 }
2563
813b3b5d 2564 if (fib_lookup(net, fl4, &res)) {
1da177e4 2565 res.fi = NULL;
813b3b5d 2566 if (fl4->flowi4_oif) {
1da177e4
LT
2567 /* Apparently, the routing tables are wrong. Assume
2568 that the destination is on-link.
2569
2570 WHY? DW.
2571 Because we are allowed to send to an iface
2572 even if it has NO routes and NO assigned
2573 addresses. When oif is specified, routing
2574 tables are looked up with only one purpose:
2575 to catch whether the destination is gatewayed, rather than
2576 direct. Moreover, if MSG_DONTROUTE is set,
2577 we send the packet, ignoring both routing tables
2578 and ifaddr state. --ANK
2579
2580
2581 We could do this even when oif is unknown,
2582 as IPv6 likely does, but we do not.
2583 */
2584
813b3b5d
DM
2585 if (fl4->saddr == 0)
2586 fl4->saddr = inet_select_addr(dev_out, 0,
2587 RT_SCOPE_LINK);
1da177e4
LT
2588 res.type = RTN_UNICAST;
2589 goto make_route;
2590 }
b23dd4fe 2591 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2592 goto out;
2593 }
1da177e4
LT
2594
2595 if (res.type == RTN_LOCAL) {
813b3b5d 2596 if (!fl4->saddr) {
9fc3bbb4 2597 if (res.fi->fib_prefsrc)
813b3b5d 2598 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2599 else
813b3b5d 2600 fl4->saddr = fl4->daddr;
9fc3bbb4 2601 }
b40afd0e 2602 dev_out = net->loopback_dev;
813b3b5d 2603 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2604 res.fi = NULL;
2605 flags |= RTCF_LOCAL;
2606 goto make_route;
2607 }
2608
2609#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2610 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2611 fib_select_multipath(&res);
1da177e4
LT
2612 else
2613#endif
21d8c49e
DM
2614 if (!res.prefixlen &&
2615 res.table->tb_num_default > 1 &&
813b3b5d 2616 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2617 fib_select_default(&res);
1da177e4 2618
813b3b5d
DM
2619 if (!fl4->saddr)
2620 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2621
1da177e4 2622 dev_out = FIB_RES_DEV(res);
813b3b5d 2623 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2624
2625
2626make_route:
813b3b5d
DM
2627 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2628 dev_out, flags);
b23dd4fe 2629 if (!IS_ERR(rth)) {
5ada5527
DM
2630 unsigned int hash;
2631
813b3b5d 2632 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2633 rt_genid(dev_net(dev_out)));
813b3b5d 2634 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2635 }
1da177e4 2636
010c2708
DM
2637out:
2638 rcu_read_unlock();
b23dd4fe 2639 return rth;
1da177e4
LT
2640}
2641
813b3b5d 2642struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
1da177e4 2643{
1da177e4 2644 struct rtable *rth;
010c2708 2645 unsigned int hash;
1da177e4 2646
1080d709
NH
2647 if (!rt_caching(net))
2648 goto slow_output;
2649
9d6ec938 2650 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
1da177e4
LT
2651
2652 rcu_read_lock_bh();
a898def2 2653 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
d8d1f30b 2654 rth = rcu_dereference_bh(rth->dst.rt_next)) {
9d6ec938
DM
2655 if (rth->rt_key_dst == flp4->daddr &&
2656 rth->rt_key_src == flp4->saddr &&
c7537967 2657 rt_is_output_route(rth) &&
9d6ec938
DM
2658 rth->rt_oif == flp4->flowi4_oif &&
2659 rth->rt_mark == flp4->flowi4_mark &&
475949d8 2660 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
b5921910 2661 (IPTOS_RT_MASK | RTO_ONLINK)) &&
d8d1f30b 2662 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2663 !rt_is_expired(rth)) {
d8d1f30b 2664 dst_use(&rth->dst, jiffies);
1da177e4
LT
2665 RT_CACHE_STAT_INC(out_hit);
2666 rcu_read_unlock_bh();
56157872
DM
2667 if (!flp4->saddr)
2668 flp4->saddr = rth->rt_src;
2669 if (!flp4->daddr)
2670 flp4->daddr = rth->rt_dst;
b23dd4fe 2671 return rth;
1da177e4
LT
2672 }
2673 RT_CACHE_STAT_INC(out_hlist_search);
2674 }
2675 rcu_read_unlock_bh();
2676
1080d709 2677slow_output:
9d6ec938 2678 return ip_route_output_slow(net, flp4);
1da177e4 2679}
d8c97a94
ACM
2680EXPORT_SYMBOL_GPL(__ip_route_output_key);
2681
ae2688d5
JW
2682static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2683{
2684 return NULL;
2685}
2686
ec831ea7
RD
2687static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2688{
2689 return 0;
2690}
2691
14e50e57
DM
2692static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2693{
2694}
2695
0972ddb2
HB
2696static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2697 unsigned long old)
2698{
2699 return NULL;
2700}
2701
14e50e57
DM
2702static struct dst_ops ipv4_dst_blackhole_ops = {
2703 .family = AF_INET,
09640e63 2704 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57 2705 .destroy = ipv4_dst_destroy,
ae2688d5 2706 .check = ipv4_blackhole_dst_check,
ec831ea7 2707 .default_mtu = ipv4_blackhole_default_mtu,
214f45c9 2708 .default_advmss = ipv4_default_advmss,
14e50e57 2709 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
0972ddb2 2710 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
14e50e57
DM
2711};
2712
2774c131 2713struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2714{
5c1e6aa3 2715 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2774c131 2716 struct rtable *ort = (struct rtable *) dst_orig;
14e50e57
DM
2717
2718 if (rt) {
d8d1f30b 2719 struct dst_entry *new = &rt->dst;
14e50e57 2720
14e50e57 2721 new->__use = 1;
352e512c
HX
2722 new->input = dst_discard;
2723 new->output = dst_discard;
defb3519 2724 dst_copy_metrics(new, &ort->dst);
14e50e57 2725
d8d1f30b 2726 new->dev = ort->dst.dev;
14e50e57
DM
2727 if (new->dev)
2728 dev_hold(new->dev);
2729
5e2b61f7
DM
2730 rt->rt_key_dst = ort->rt_key_dst;
2731 rt->rt_key_src = ort->rt_key_src;
475949d8 2732 rt->rt_key_tos = ort->rt_key_tos;
1b86a58f 2733 rt->rt_route_iif = ort->rt_route_iif;
5e2b61f7
DM
2734 rt->rt_iif = ort->rt_iif;
2735 rt->rt_oif = ort->rt_oif;
2736 rt->rt_mark = ort->rt_mark;
14e50e57 2737
e84f84f2 2738 rt->rt_genid = rt_genid(net);
14e50e57
DM
2739 rt->rt_flags = ort->rt_flags;
2740 rt->rt_type = ort->rt_type;
2741 rt->rt_dst = ort->rt_dst;
2742 rt->rt_src = ort->rt_src;
14e50e57
DM
2743 rt->rt_gateway = ort->rt_gateway;
2744 rt->rt_spec_dst = ort->rt_spec_dst;
2745 rt->peer = ort->peer;
2746 if (rt->peer)
2747 atomic_inc(&rt->peer->refcnt);
62fa8a84
DM
2748 rt->fi = ort->fi;
2749 if (rt->fi)
2750 atomic_inc(&rt->fi->fib_clntref);
14e50e57
DM
2751
2752 dst_free(new);
2753 }
2754
2774c131
DM
2755 dst_release(dst_orig);
2756
2757 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2758}
2759
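/* Illustrative sketch of the intent (editorial addition): the xfrm
 * layer uses this constructor when a packet must be held while IPsec
 * state is still resolving; the clone keeps the original route keys,
 * but both dst.input and dst.output point at dst_discard, so anything
 * steered through it is silently dropped.
 */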
9d6ec938 2760struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2761 struct sock *sk)
1da177e4 2762{
9d6ec938 2763 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2764
b23dd4fe
DM
2765 if (IS_ERR(rt))
2766 return rt;
1da177e4 2767
56157872 2768 if (flp4->flowi4_proto)
9d6ec938
DM
2769 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2770 flowi4_to_flowi(flp4),
2771 sk, 0);
1da177e4 2772
b23dd4fe 2773 return rt;
1da177e4 2774}
d8c97a94
ACM
2775EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776
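/* Illustrative sketch (hypothetical helper, not part of the original
 * file): a typical caller fills a flowi4 key, resolves it through
 * ip_route_output_flow() and eventually drops its reference with
 * ip_rt_put().
 */
static struct rtable *example_route_udp(struct net *net,
					__be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
		.saddr = saddr,
		/* a real protocol triggers the xfrm_lookup() pass above */
		.flowi4_proto = IPPROTO_UDP,
	};

	return ip_route_output_flow(net, &fl4, NULL);
}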
4feb88e5
BT
2777static int rt_fill_info(struct net *net,
2778 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2779 int nowait, unsigned int flags)
1da177e4 2780{
511c3f92 2781 struct rtable *rt = skb_rtable(skb);
1da177e4 2782 struct rtmsg *r;
be403ea1 2783 struct nlmsghdr *nlh;
fe6fe792
ED
2784 long expires = 0;
2785 const struct inet_peer *peer = rt->peer;
e3703b3d 2786 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2787
2788 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2789 if (nlh == NULL)
26932566 2790 return -EMSGSIZE;
be403ea1
TG
2791
2792 r = nlmsg_data(nlh);
1da177e4
LT
2793 r->rtm_family = AF_INET;
2794 r->rtm_dst_len = 32;
2795 r->rtm_src_len = 0;
475949d8 2796 r->rtm_tos = rt->rt_key_tos;
1da177e4 2797 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2798 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2799 r->rtm_type = rt->rt_type;
2800 r->rtm_scope = RT_SCOPE_UNIVERSE;
2801 r->rtm_protocol = RTPROT_UNSPEC;
2802 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2803 if (rt->rt_flags & RTCF_NOTIFY)
2804 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2805
17fb2c64 2806 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2807
5e2b61f7 2808 if (rt->rt_key_src) {
1da177e4 2809 r->rtm_src_len = 32;
5e2b61f7 2810 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
1da177e4 2811 }
d8d1f30b
CG
2812 if (rt->dst.dev)
2813 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
c7066f70 2814#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b
CG
2815 if (rt->dst.tclassid)
2816 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
1da177e4 2817#endif
c7537967 2818 if (rt_is_input_route(rt))
17fb2c64 2819 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
5e2b61f7 2820 else if (rt->rt_src != rt->rt_key_src)
17fb2c64 2821 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2822
1da177e4 2823 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2824 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2825
defb3519 2826 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
be403ea1
TG
2827 goto nla_put_failure;
2828
5e2b61f7
DM
2829 if (rt->rt_mark)
2830 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
963bfeee 2831
d8d1f30b 2832 error = rt->dst.error;
fe6fe792 2833 if (peer) {
317fe0e6 2834 inet_peer_refcheck(rt->peer);
fe6fe792
ED
2835 id = atomic_read(&peer->ip_id_count) & 0xffff;
2836 if (peer->tcp_ts_stamp) {
2837 ts = peer->tcp_ts;
2838 tsage = get_seconds() - peer->tcp_ts_stamp;
1da177e4 2839 }
fe6fe792
ED
2840 expires = ACCESS_ONCE(peer->pmtu_expires);
2841 if (expires)
2842 expires -= jiffies;
1da177e4 2843 }
be403ea1 2844
c7537967 2845 if (rt_is_input_route(rt)) {
1da177e4 2846#ifdef CONFIG_IP_MROUTE
e448515c 2847 __be32 dst = rt->rt_dst;
1da177e4 2848
f97c1e0c 2849 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5 2850 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
9a1b9496
DM
2851 int err = ipmr_get_route(net, skb,
2852 rt->rt_src, rt->rt_dst,
2853 r, nowait);
1da177e4
LT
2854 if (err <= 0) {
2855 if (!nowait) {
2856 if (err == 0)
2857 return 0;
be403ea1 2858 goto nla_put_failure;
1da177e4
LT
2859 } else {
2860 if (err == -EMSGSIZE)
be403ea1 2861 goto nla_put_failure;
e3703b3d 2862 error = err;
1da177e4
LT
2863 }
2864 }
2865 } else
2866#endif
5e2b61f7 2867 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
1da177e4
LT
2868 }
2869
d8d1f30b 2870 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
e3703b3d
TG
2871 expires, error) < 0)
2872 goto nla_put_failure;
be403ea1
TG
2873
2874 return nlmsg_end(skb, nlh);
1da177e4 2875
be403ea1 2876nla_put_failure:
26932566
PM
2877 nlmsg_cancel(skb, nlh);
2878 return -EMSGSIZE;
1da177e4
LT
2879}
2880
63f3444f 2881static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
1da177e4 2882{
3b1e0a65 2883 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2884 struct rtmsg *rtm;
2885 struct nlattr *tb[RTA_MAX+1];
1da177e4 2886 struct rtable *rt = NULL;
9e12bb22
AV
2887 __be32 dst = 0;
2888 __be32 src = 0;
2889 u32 iif;
d889ce3b 2890 int err;
963bfeee 2891 int mark;
1da177e4
LT
2892 struct sk_buff *skb;
2893
d889ce3b
TG
2894 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2895 if (err < 0)
2896 goto errout;
2897
2898 rtm = nlmsg_data(nlh);
2899
1da177e4 2900 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2901 if (skb == NULL) {
2902 err = -ENOBUFS;
2903 goto errout;
2904 }
1da177e4
LT
2905
2906 /* Reserve room for dummy headers; this skb can pass
2907 through a good chunk of the routing engine.
2908 */
459a98ed 2909 skb_reset_mac_header(skb);
c1d2bbe1 2910 skb_reset_network_header(skb);
d2c962b8
SH
2911
2912 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2913 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2914 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2915
17fb2c64
AV
2916 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2917 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2918 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2919 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4
LT
2920
2921 if (iif) {
d889ce3b
TG
2922 struct net_device *dev;
2923
1937504d 2924 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2925 if (dev == NULL) {
2926 err = -ENODEV;
2927 goto errout_free;
2928 }
2929
1da177e4
LT
2930 skb->protocol = htons(ETH_P_IP);
2931 skb->dev = dev;
963bfeee 2932 skb->mark = mark;
1da177e4
LT
2933 local_bh_disable();
2934 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2935 local_bh_enable();
d889ce3b 2936
511c3f92 2937 rt = skb_rtable(skb);
d8d1f30b
CG
2938 if (err == 0 && rt->dst.error)
2939 err = -rt->dst.error;
1da177e4 2940 } else {
68a5e3dd
DM
2941 struct flowi4 fl4 = {
2942 .daddr = dst,
2943 .saddr = src,
2944 .flowi4_tos = rtm->rtm_tos,
2945 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2946 .flowi4_mark = mark,
d889ce3b 2947 };
9d6ec938 2948 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
2949
2950 err = 0;
2951 if (IS_ERR(rt))
2952 err = PTR_ERR(rt);
1da177e4 2953 }
d889ce3b 2954
1da177e4 2955 if (err)
d889ce3b 2956 goto errout_free;
1da177e4 2957
d8d1f30b 2958 skb_dst_set(skb, &rt->dst);
1da177e4
LT
2959 if (rtm->rtm_flags & RTM_F_NOTIFY)
2960 rt->rt_flags |= RTCF_NOTIFY;
2961
4feb88e5 2962 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 2963 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2964 if (err <= 0)
2965 goto errout_free;
1da177e4 2966
1937504d 2967 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 2968errout:
2942e900 2969 return err;
1da177e4 2970
d889ce3b 2971errout_free:
1da177e4 2972 kfree_skb(skb);
d889ce3b 2973 goto errout;
1da177e4
LT
2974}
2975
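/* Illustrative usage (editorial addition; hypothetical addresses,
 * output abridged): this handler is what answers "ip route get", e.g.
 *
 *	$ ip route get 198.51.100.7
 *	198.51.100.7 via 192.0.2.1 dev eth0 src 192.0.2.10
 *
 * and with an "iif" attribute it instead simulates input routing by
 * feeding a dummy skb to ip_route_input().
 */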
2976int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2977{
2978 struct rtable *rt;
2979 int h, s_h;
2980 int idx, s_idx;
1937504d
DL
2981 struct net *net;
2982
3b1e0a65 2983 net = sock_net(skb->sk);
1da177e4
LT
2984
2985 s_h = cb->args[0];
d8c92830
ED
2986 if (s_h < 0)
2987 s_h = 0;
1da177e4 2988 s_idx = idx = cb->args[1];
a6272665
ED
2989 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2990 if (!rt_hash_table[h].chain)
2991 continue;
1da177e4 2992 rcu_read_lock_bh();
a898def2 2993 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
d8d1f30b
CG
2994 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2995 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
1da177e4 2996 continue;
e84f84f2 2997 if (rt_is_expired(rt))
29e75252 2998 continue;
d8d1f30b 2999 skb_dst_set_noref(skb, &rt->dst);
4feb88e5 3000 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 3001 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 3002 1, NLM_F_MULTI) <= 0) {
adf30907 3003 skb_dst_drop(skb);
1da177e4
LT
3004 rcu_read_unlock_bh();
3005 goto done;
3006 }
adf30907 3007 skb_dst_drop(skb);
1da177e4
LT
3008 }
3009 rcu_read_unlock_bh();
3010 }
3011
3012done:
3013 cb->args[0] = h;
3014 cb->args[1] = idx;
3015 return skb->len;
3016}
3017
3018void ip_rt_multicast_event(struct in_device *in_dev)
3019{
76e6ebfb 3020 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
3021}
3022
3023#ifdef CONFIG_SYSCTL
81c684d1 3024static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 3025 void __user *buffer,
1da177e4
LT
3026 size_t *lenp, loff_t *ppos)
3027{
3028 if (write) {
639e104f 3029 int flush_delay;
81c684d1 3030 ctl_table ctl;
39a23e75 3031 struct net *net;
639e104f 3032
81c684d1
DL
3033 memcpy(&ctl, __ctl, sizeof(ctl));
3034 ctl.data = &flush_delay;
8d65af78 3035 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 3036
81c684d1 3037 net = (struct net *)__ctl->extra1;
39a23e75 3038 rt_cache_flush(net, flush_delay);
1da177e4 3039 return 0;
e905a9ed 3040 }
1da177e4
LT
3041
3042 return -EINVAL;
3043}
3044
eeb61f71 3045static ctl_table ipv4_route_table[] = {
1da177e4 3046 {
1da177e4
LT
3047 .procname = "gc_thresh",
3048 .data = &ipv4_dst_ops.gc_thresh,
3049 .maxlen = sizeof(int),
3050 .mode = 0644,
6d9f239a 3051 .proc_handler = proc_dointvec,
1da177e4
LT
3052 },
3053 {
1da177e4
LT
3054 .procname = "max_size",
3055 .data = &ip_rt_max_size,
3056 .maxlen = sizeof(int),
3057 .mode = 0644,
6d9f239a 3058 .proc_handler = proc_dointvec,
1da177e4
LT
3059 },
3060 {
3061 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3062
1da177e4
LT
3063 .procname = "gc_min_interval",
3064 .data = &ip_rt_gc_min_interval,
3065 .maxlen = sizeof(int),
3066 .mode = 0644,
6d9f239a 3067 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3068 },
3069 {
1da177e4
LT
3070 .procname = "gc_min_interval_ms",
3071 .data = &ip_rt_gc_min_interval,
3072 .maxlen = sizeof(int),
3073 .mode = 0644,
6d9f239a 3074 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
3075 },
3076 {
1da177e4
LT
3077 .procname = "gc_timeout",
3078 .data = &ip_rt_gc_timeout,
3079 .maxlen = sizeof(int),
3080 .mode = 0644,
6d9f239a 3081 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3082 },
3083 {
1da177e4
LT
3084 .procname = "gc_interval",
3085 .data = &ip_rt_gc_interval,
3086 .maxlen = sizeof(int),
3087 .mode = 0644,
6d9f239a 3088 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3089 },
3090 {
1da177e4
LT
3091 .procname = "redirect_load",
3092 .data = &ip_rt_redirect_load,
3093 .maxlen = sizeof(int),
3094 .mode = 0644,
6d9f239a 3095 .proc_handler = proc_dointvec,
1da177e4
LT
3096 },
3097 {
1da177e4
LT
3098 .procname = "redirect_number",
3099 .data = &ip_rt_redirect_number,
3100 .maxlen = sizeof(int),
3101 .mode = 0644,
6d9f239a 3102 .proc_handler = proc_dointvec,
1da177e4
LT
3103 },
3104 {
1da177e4
LT
3105 .procname = "redirect_silence",
3106 .data = &ip_rt_redirect_silence,
3107 .maxlen = sizeof(int),
3108 .mode = 0644,
6d9f239a 3109 .proc_handler = proc_dointvec,
1da177e4
LT
3110 },
3111 {
1da177e4
LT
3112 .procname = "error_cost",
3113 .data = &ip_rt_error_cost,
3114 .maxlen = sizeof(int),
3115 .mode = 0644,
6d9f239a 3116 .proc_handler = proc_dointvec,
1da177e4
LT
3117 },
3118 {
1da177e4
LT
3119 .procname = "error_burst",
3120 .data = &ip_rt_error_burst,
3121 .maxlen = sizeof(int),
3122 .mode = 0644,
6d9f239a 3123 .proc_handler = proc_dointvec,
1da177e4
LT
3124 },
3125 {
1da177e4
LT
3126 .procname = "gc_elasticity",
3127 .data = &ip_rt_gc_elasticity,
3128 .maxlen = sizeof(int),
3129 .mode = 0644,
6d9f239a 3130 .proc_handler = proc_dointvec,
1da177e4
LT
3131 },
3132 {
1da177e4
LT
3133 .procname = "mtu_expires",
3134 .data = &ip_rt_mtu_expires,
3135 .maxlen = sizeof(int),
3136 .mode = 0644,
6d9f239a 3137 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3138 },
3139 {
1da177e4
LT
3140 .procname = "min_pmtu",
3141 .data = &ip_rt_min_pmtu,
3142 .maxlen = sizeof(int),
3143 .mode = 0644,
6d9f239a 3144 .proc_handler = proc_dointvec,
1da177e4
LT
3145 },
3146 {
1da177e4
LT
3147 .procname = "min_adv_mss",
3148 .data = &ip_rt_min_advmss,
3149 .maxlen = sizeof(int),
3150 .mode = 0644,
6d9f239a 3151 .proc_handler = proc_dointvec,
1da177e4 3152 },
f8572d8f 3153 { }
1da177e4 3154};
39a23e75 3155
2f4520d3
AV
3156static struct ctl_table empty[1];
3157
3158static struct ctl_table ipv4_skeleton[] =
3159{
f8572d8f 3160 { .procname = "route",
d994af0d 3161 .mode = 0555, .child = ipv4_route_table},
f8572d8f 3162 { .procname = "neigh",
d994af0d 3163 .mode = 0555, .child = empty},
2f4520d3
AV
3164 { }
3165};
3166
3167static __net_initdata struct ctl_path ipv4_path[] = {
f8572d8f
EB
3168 { .procname = "net", },
3169 { .procname = "ipv4", },
39a23e75
DL
3170 { },
3171};
3172
39a23e75
DL
3173static struct ctl_table ipv4_route_flush_table[] = {
3174 {
39a23e75
DL
3175 .procname = "flush",
3176 .maxlen = sizeof(int),
3177 .mode = 0200,
6d9f239a 3178 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3179 },
f8572d8f 3180 { },
39a23e75
DL
3181};
3182
2f4520d3 3183static __net_initdata struct ctl_path ipv4_route_path[] = {
f8572d8f
EB
3184 { .procname = "net", },
3185 { .procname = "ipv4", },
3186 { .procname = "route", },
2f4520d3
AV
3187 { },
3188};
3189
39a23e75
DL
3190static __net_init int sysctl_route_net_init(struct net *net)
3191{
3192 struct ctl_table *tbl;
3193
3194 tbl = ipv4_route_flush_table;
09ad9bc7 3195 if (!net_eq(net, &init_net)) {
39a23e75
DL
3196 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3197 if (tbl == NULL)
3198 goto err_dup;
3199 }
3200 tbl[0].extra1 = net;
3201
3202 net->ipv4.route_hdr =
3203 register_net_sysctl_table(net, ipv4_route_path, tbl);
3204 if (net->ipv4.route_hdr == NULL)
3205 goto err_reg;
3206 return 0;
3207
3208err_reg:
3209 if (tbl != ipv4_route_flush_table)
3210 kfree(tbl);
3211err_dup:
3212 return -ENOMEM;
3213}
3214
3215static __net_exit void sysctl_route_net_exit(struct net *net)
3216{
3217 struct ctl_table *tbl;
3218
3219 tbl = net->ipv4.route_hdr->ctl_table_arg;
3220 unregister_net_sysctl_table(net->ipv4.route_hdr);
3221 BUG_ON(tbl == ipv4_route_flush_table);
3222 kfree(tbl);
3223}
3224
3225static __net_initdata struct pernet_operations sysctl_route_ops = {
3226 .init = sysctl_route_net_init,
3227 .exit = sysctl_route_net_exit,
3228};
1da177e4
LT
3229#endif
3230
3ee94372 3231static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3232{
3ee94372
NH
3233 get_random_bytes(&net->ipv4.rt_genid,
3234 sizeof(net->ipv4.rt_genid));
436c3b66
DM
3235 get_random_bytes(&net->ipv4.dev_addr_genid,
3236 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
3237 return 0;
3238}
3239
3ee94372
NH
3240static __net_initdata struct pernet_operations rt_genid_ops = {
3241 .init = rt_genid_init,
9f5e97e5
DL
3242};
3243
3244
c7066f70 3245#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3246struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3247#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3248
3249static __initdata unsigned long rhash_entries;
3250static int __init set_rhash_entries(char *str)
3251{
3252 if (!str)
3253 return 0;
3254 rhash_entries = simple_strtoul(str, &str, 0);
3255 return 1;
3256}
3257__setup("rhash_entries=", set_rhash_entries);
3258
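/* Illustrative usage (editorial addition): the route cache hash is
 * normally sized from available memory in ip_rt_init() below, but can
 * be pinned on the kernel command line, e.g.
 *
 *	linux ... rhash_entries=262144
 *
 * which the parser above feeds to alloc_large_system_hash().
 */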
3259int __init ip_rt_init(void)
3260{
424c4b70 3261 int rc = 0;
1da177e4 3262
c7066f70 3263#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3264 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3265 if (!ip_rt_acct)
3266 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3267#endif
3268
e5d679f3
AD
3269 ipv4_dst_ops.kmem_cachep =
3270 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3271 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3272
14e50e57
DM
3273 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3274
fc66f95c
ED
3275 if (dst_entries_init(&ipv4_dst_ops) < 0)
3276 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3277
3278 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3279 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3280
424c4b70
ED
3281 rt_hash_table = (struct rt_hash_bucket *)
3282 alloc_large_system_hash("IP route cache",
3283 sizeof(struct rt_hash_bucket),
3284 rhash_entries,
4481374c 3285 (totalram_pages >= 128 * 1024) ?
18955cfc 3286 15 : 17,
8d1502de 3287 0,
424c4b70
ED
3288 &rt_hash_log,
3289 &rt_hash_mask,
c9503e0f 3290 rhash_entries ? 0 : 512 * 1024);
22c047cc
ED
3291 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3292 rt_hash_lock_init();
1da177e4
LT
3293
3294 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3295 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3296
1da177e4
LT
3297 devinet_init();
3298 ip_fib_init();
3299
73b38711 3300 if (ip_rt_proc_init())
107f1634 3301 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3302#ifdef CONFIG_XFRM
3303 xfrm_init();
a33bc5c1 3304 xfrm4_init(ip_rt_max_size);
1da177e4 3305#endif
c7ac8679 3306 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3307
39a23e75
DL
3308#ifdef CONFIG_SYSCTL
3309 register_pernet_subsys(&sysctl_route_ops);
3310#endif
3ee94372 3311 register_pernet_subsys(&rt_genid_ops);
1da177e4
LT
3312 return rc;
3313}
3314
a1bc6eb4 3315#ifdef CONFIG_SYSCTL
eeb61f71
AV
3316/*
3317 * We really need to sanitize the damn ipv4 init order, then all
3318 * this nonsense will go away.
3319 */
3320void __init ip_static_sysctl_init(void)
3321{
2f4520d3 3322 register_sysctl_paths(ipv4_path, ipv4_skeleton);
eeb61f71 3323}
a1bc6eb4 3324#endif