/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
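
/*
 * Worked example (illustrative numbers, not taken from a real run): an entry
 * last used 100 jiffies ago gets ~100 complemented and masked into the low
 * 30 bits, so a recently used entry scores higher than a stale one.  An
 * output route (or a non-broadcast/multicast/local one) additionally gets
 * bit 30, and a "valuable" route (redirected/notify, or with a pending PMTU)
 * gets bit 31.  The eviction logic in rt_intern_hash() then frees the
 * lowest-scoring candidate first.
 */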

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute the average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives an estimation of rt_chain_length_max:
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
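
/*
 * Illustrative example (hypothetical values): with FRACT_BITS == 3, ONE == 8,
 * a true average chain length of 2.5 is stored as 20 in fixed point.  If
 * AVG == 20 and SD == 4 in fixed point, then
 * (AVG + 4*SD) >> FRACT_BITS == 36 >> 3 == 4, so rt_chain_length_max
 * becomes max(ip_rt_gc_elasticity, 4).
 */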

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}
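
/*
 * Note: each cached rtable records the rt_genid it was created under, and
 * rt_is_expired() compares it against the current value, so bumping the
 * generation above logically flushes the whole cache in O(1); stale entries
 * are then reaped lazily by lookups, rt_check_expire() and the garbage
 * collector.
 */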

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm that keeps the routing cache at some
   equilibrium point, where the number of aged-off entries stays
   approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   expire stays large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		   We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;
	else if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (thanks Alexey).
		 * Note: to avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst
		 * without waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that stays unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else if (!rt)
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
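
/*
 * Note on the cmpxchg above: several CPUs may observe the same expired
 * timestamp concurrently, but only the one that successfully swaps
 * pmtu_expires back to 0 reports the expiry, so the saved pmtu_orig is
 * restored exactly once per expiration.
 */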

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
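
/*
 * The backoff above is exponential: redirect k (k == rate_tokens) is only
 * sent once jiffies exceeds rate_last + (ip_rt_redirect_load << k), i.e.
 * the required gap doubles with every unanswered redirect until
 * ip_rt_redirect_number is reached and we go silent.
 */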

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
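
/*
 * Illustrative example: an ICMP "fragmentation needed" that reports a zero
 * next-hop MTU for a 1500-byte datagram makes guess_mtu(1500) walk the
 * plateau table and return 1492, the largest plateau strictly below the
 * old MTU (the RFC 1191 plateau search); 68 is the floor if nothing matches.
 */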

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
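
/*
 * In other words (a summary, not new behaviour): while the learned PMTU is
 * still fresh it overrides the route's MTU metric, with the pre-override
 * value saved in pmtu_orig; once it expires, the cmpxchg lets exactly one
 * caller restore the original metric.
 */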
1756
1757static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1758{
1759 struct rtable *rt = (struct rtable *) dst;
1760 struct inet_peer *peer;
1761
1762 dst_confirm(dst);
1763
1764 if (!rt->peer)
1765 rt_bind_peer(rt, rt->rt_dst, 1);
1766 peer = rt->peer;
1767 if (peer) {
1768 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1769
1770 if (mtu < ip_rt_min_pmtu)
1771 mtu = ip_rt_min_pmtu;
1772 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1773
1774 pmtu_expires = jiffies + ip_rt_mtu_expires;
1775 if (!pmtu_expires)
1776 pmtu_expires = 1UL;
1777
1778 peer->pmtu_learned = mtu;
1779 peer->pmtu_expires = pmtu_expires;
1780
1781 atomic_inc(&__rt_peer_genid);
1782 rt->rt_peer_genid = rt_peer_genid();
1783 }
1784 check_peer_pmtu(dst, peer);
1785 }
1786}
1787
1788
1789static void ipv4_validate_peer(struct rtable *rt)
1790{
1791 if (rt->rt_peer_genid != rt_peer_genid()) {
1792 struct inet_peer *peer;
1793
1794 if (!rt->peer)
1795 rt_bind_peer(rt, rt->rt_dst, 0);
1796
1797 peer = rt->peer;
1798 if (peer) {
1799 check_peer_pmtu(&rt->dst, peer);
1800
1801 if (peer->redirect_genid != redirect_genid)
1802 peer->redirect_learned.a4 = 0;
1803 if (peer->redirect_learned.a4 &&
1804 peer->redirect_learned.a4 != rt->rt_gateway)
1805 check_peer_redir(&rt->dst, peer);
1806 }
1807
1808 rt->rt_peer_genid = rt_peer_genid();
1809 }
1810}
1811
1812static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1813{
1814 struct rtable *rt = (struct rtable *) dst;
1815
1816 if (rt_is_expired(rt))
1817 return NULL;
de398fb8 1818 ipv4_validate_peer(rt);
d11a4dc1 1819 return dst;
1da177e4
LT
1820}
1821
1822static void ipv4_dst_destroy(struct dst_entry *dst)
1823{
1824 struct rtable *rt = (struct rtable *) dst;
1825 struct inet_peer *peer = rt->peer;
1da177e4 1826
62fa8a84
DM
1827 if (rt->fi) {
1828 fib_info_put(rt->fi);
1829 rt->fi = NULL;
1830 }
1da177e4
LT
1831 if (peer) {
1832 rt->peer = NULL;
1833 inet_putpeer(peer);
1834 }
1da177e4
LT
1835}
1836
1da177e4
LT
1837
1838static void ipv4_link_failure(struct sk_buff *skb)
1839{
1840 struct rtable *rt;
1841
1842 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1843
511c3f92 1844 rt = skb_rtable(skb);
fe6fe792
ED
1845 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1846 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1da177e4
LT
1847}
1848
1849static int ip_rt_bug(struct sk_buff *skb)
1850{
673d57e7
HH
1851 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1852 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1da177e4
LT
1853 skb->dev ? skb->dev->name : "?");
1854 kfree_skb(skb);
c378a9c0 1855 WARN_ON(1);
1da177e4
LT
1856 return 0;
1857}
1858
1859/*
1860 We do not cache the source address of the outgoing interface,
1861 because it is used only by the IP RR, TS and SRR options,
1862 so it is out of the fast path.
1863
1864 BTW remember: "addr" is allowed to be unaligned
1865 in IP options!
1866 */
1867
8e36360a 1868void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1869{
a61ced5d 1870 __be32 src;
1da177e4 1871
c7537967 1872 if (rt_is_output_route(rt))
c5be24ff 1873 src = ip_hdr(skb)->saddr;
ebc0ffae 1874 else {
8e36360a
DM
1875 struct fib_result res;
1876 struct flowi4 fl4;
1877 struct iphdr *iph;
1878
1879 iph = ip_hdr(skb);
1880
1881 memset(&fl4, 0, sizeof(fl4));
1882 fl4.daddr = iph->daddr;
1883 fl4.saddr = iph->saddr;
b0fe4a31 1884 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1885 fl4.flowi4_oif = rt->dst.dev->ifindex;
1886 fl4.flowi4_iif = skb->dev->ifindex;
1887 fl4.flowi4_mark = skb->mark;
5e2b61f7 1888
ebc0ffae 1889 rcu_read_lock();
68a5e3dd 1890 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
436c3b66 1891 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae
ED
1892 else
1893 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1da177e4 1894 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1895 rcu_read_unlock();
1896 }
1da177e4
LT
1897 memcpy(addr, &src, 4);
1898}
1899
c7066f70 1900#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1901static void set_class_tag(struct rtable *rt, u32 tag)
1902{
d8d1f30b
CG
1903 if (!(rt->dst.tclassid & 0xFFFF))
1904 rt->dst.tclassid |= tag & 0xFFFF;
1905 if (!(rt->dst.tclassid & 0xFFFF0000))
1906 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1907}
1908#endif
1909
0dbaee3b
DM
1910static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1911{
1912 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1913
1914 if (advmss == 0) {
1915 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1916 ip_rt_min_advmss);
1917 if (advmss > 65535 - 40)
1918 advmss = 65535 - 40;
1919 }
1920 return advmss;
1921}
1922
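/*
 * Illustrative sketch, not part of this file: the default advertised
 * MSS above is the device MTU minus 40 bytes (minimal IPv4 + TCP
 * headers), clamped between the ip_rt_min_advmss sysctl and the
 * 65535 - 40 TCP maximum. As a hypothetical pure function:
 */
static unsigned int default_advmss(unsigned int dev_mtu, unsigned int min_advmss)
{
	unsigned int advmss = dev_mtu - 40;	/* strip IPv4 + TCP headers */

	if (advmss < min_advmss)
		advmss = min_advmss;
	if (advmss > 65535 - 40)		/* cap at the TCP maximum */
		advmss = 65535 - 40;
	return advmss;
}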
ebb762f2 1923static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1924{
261663b0 1925 const struct rtable *rt = (const struct rtable *) dst;
618f9bc7
SK
1926 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1927
261663b0 1928 if (mtu && rt_is_output_route(rt))
618f9bc7
SK
1929 return mtu;
1930
1931 mtu = dst->dev->mtu;
d33e4553
DM
1932
1933 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
d33e4553
DM
1934
1935 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1936 mtu = 576;
1937 }
1938
1939 if (mtu > IP_MAX_MTU)
1940 mtu = IP_MAX_MTU;
1941
1942 return mtu;
1943}
1944
813b3b5d 1945static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1946 struct fib_info *fi)
a4daad6b 1947{
0131ba45
DM
1948 struct inet_peer *peer;
1949 int create = 0;
a4daad6b 1950
0131ba45
DM
1951 /* If a peer entry exists for this destination, we must hook
1952 * it up in order to get at cached metrics.
1953 */
813b3b5d 1954 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
0131ba45
DM
1955 create = 1;
1956
3c0afdca 1957 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
0131ba45 1958 if (peer) {
3c0afdca 1959 rt->rt_peer_genid = rt_peer_genid();
a4daad6b
DM
1960 if (inet_metrics_new(peer))
1961 memcpy(peer->metrics, fi->fib_metrics,
1962 sizeof(u32) * RTAX_MAX);
1963 dst_init_metrics(&rt->dst, peer->metrics, false);
2c8cec5c 1964
fe6fe792 1965 check_peer_pmtu(&rt->dst, peer);
de68dca1
ED
1966 if (peer->redirect_genid != redirect_genid)
1967 peer->redirect_learned.a4 = 0;
f39925db
DM
1968 if (peer->redirect_learned.a4 &&
1969 peer->redirect_learned.a4 != rt->rt_gateway) {
1970 rt->rt_gateway = peer->redirect_learned.a4;
1971 rt->rt_flags |= RTCF_REDIRECTED;
1972 }
0131ba45
DM
1973 } else {
1974 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1975 rt->fi = fi;
1976 atomic_inc(&fi->fib_clntref);
1977 }
1978 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1979 }
1980}
1981
813b3b5d 1982static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1983 const struct fib_result *res,
982721f3 1984 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1985{
defb3519 1986 struct dst_entry *dst = &rt->dst;
1da177e4
LT
1987
1988 if (fi) {
1989 if (FIB_RES_GW(*res) &&
1990 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1991 rt->rt_gateway = FIB_RES_GW(*res);
813b3b5d 1992 rt_init_metrics(rt, fl4, fi);
c7066f70 1993#ifdef CONFIG_IP_ROUTE_CLASSID
defb3519 1994 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1da177e4 1995#endif
d33e4553 1996 }
defb3519 1997
defb3519
DM
1998 if (dst_mtu(dst) > IP_MAX_MTU)
1999 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
0dbaee3b 2000 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
defb3519 2001 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1da177e4 2002
c7066f70 2003#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
2004#ifdef CONFIG_IP_MULTIPLE_TABLES
2005 set_class_tag(rt, fib_rules_tclass(res));
2006#endif
2007 set_class_tag(rt, itag);
2008#endif
1da177e4
LT
2009}
2010
5c1e6aa3
DM
2011static struct rtable *rt_dst_alloc(struct net_device *dev,
2012 bool nopolicy, bool noxfrm)
0c4dcd58 2013{
5c1e6aa3
DM
2014 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2015 DST_HOST |
2016 (nopolicy ? DST_NOPOLICY : 0) |
2017 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
2018}
2019
96d36220 2020/* called in rcu_read_lock() section */
9e12bb22 2021static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2022 u8 tos, struct net_device *dev, int our)
2023{
96d36220 2024 unsigned int hash;
1da177e4 2025 struct rtable *rth;
a61ced5d 2026 __be32 spec_dst;
96d36220 2027 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2028 u32 itag = 0;
b5f7e755 2029 int err;
1da177e4
LT
2030
2031 /* Primary sanity checks. */
2032
2033 if (in_dev == NULL)
2034 return -EINVAL;
2035
1e637c74 2036 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 2037 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
2038 goto e_inval;
2039
f97c1e0c
JP
2040 if (ipv4_is_zeronet(saddr)) {
2041 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
2042 goto e_inval;
2043 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
b5f7e755 2044 } else {
5c04c819
MS
2045 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2046 &itag);
b5f7e755
ED
2047 if (err < 0)
2048 goto e_err;
2049 }
5c1e6aa3
DM
2050 rth = rt_dst_alloc(init_net.loopback_dev,
2051 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2052 if (!rth)
2053 goto e_nobufs;
2054
cf911662
DM
2055#ifdef CONFIG_IP_ROUTE_CLASSID
2056 rth->dst.tclassid = itag;
2057#endif
d8d1f30b 2058 rth->dst.output = ip_rt_bug;
1da177e4 2059
5e2b61f7 2060 rth->rt_key_dst = daddr;
5e2b61f7 2061 rth->rt_key_src = saddr;
cf911662
DM
2062 rth->rt_genid = rt_genid(dev_net(dev));
2063 rth->rt_flags = RTCF_MULTICAST;
2064 rth->rt_type = RTN_MULTICAST;
475949d8 2065 rth->rt_key_tos = tos;
cf911662 2066 rth->rt_dst = daddr;
1da177e4 2067 rth->rt_src = saddr;
1b86a58f 2068 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2069 rth->rt_iif = dev->ifindex;
5e2b61f7 2070 rth->rt_oif = 0;
cf911662 2071 rth->rt_mark = skb->mark;
1da177e4
LT
2072 rth->rt_gateway = daddr;
2073 rth->rt_spec_dst= spec_dst;
cf911662
DM
2074 rth->rt_peer_genid = 0;
2075 rth->peer = NULL;
2076 rth->fi = NULL;
1da177e4 2077 if (our) {
d8d1f30b 2078 rth->dst.input= ip_local_deliver;
1da177e4
LT
2079 rth->rt_flags |= RTCF_LOCAL;
2080 }
2081
2082#ifdef CONFIG_IP_MROUTE
f97c1e0c 2083 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 2084 rth->dst.input = ip_mr_input;
1da177e4
LT
2085#endif
2086 RT_CACHE_STAT_INC(in_slow_mc);
2087
e84f84f2 2088 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
b23dd4fe 2089 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
9aa3c94c 2090 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1da177e4
LT
2091
2092e_nobufs:
1da177e4 2093 return -ENOBUFS;
1da177e4 2094e_inval:
96d36220 2095 return -EINVAL;
b5f7e755 2096e_err:
b5f7e755 2097 return err;
1da177e4
LT
2098}
2099
2100
2101static void ip_handle_martian_source(struct net_device *dev,
2102 struct in_device *in_dev,
2103 struct sk_buff *skb,
9e12bb22
AV
2104 __be32 daddr,
2105 __be32 saddr)
1da177e4
LT
2106{
2107 RT_CACHE_STAT_INC(in_martian_src);
2108#ifdef CONFIG_IP_ROUTE_VERBOSE
2109 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2110 /*
2111 * RFC1812 recommendation: if the source is martian,
2112 * the only hint is the MAC header.
2113 */
673d57e7
HH
2114 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2115 &daddr, &saddr, dev->name);
98e399f8 2116 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 2117 int i;
98e399f8 2118 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
2119 printk(KERN_WARNING "ll header: ");
2120 for (i = 0; i < dev->hard_header_len; i++, p++) {
2121 printk("%02x", *p);
2122 if (i < (dev->hard_header_len - 1))
2123 printk(":");
2124 }
2125 printk("\n");
2126 }
2127 }
2128#endif
2129}
2130
47360228 2131/* called in rcu_read_lock() section */
5969f71d 2132static int __mkroute_input(struct sk_buff *skb,
982721f3 2133 const struct fib_result *res,
5969f71d
SH
2134 struct in_device *in_dev,
2135 __be32 daddr, __be32 saddr, u32 tos,
2136 struct rtable **result)
1da177e4 2137{
1da177e4
LT
2138 struct rtable *rth;
2139 int err;
2140 struct in_device *out_dev;
47360228 2141 unsigned int flags = 0;
d9c9df8c
AV
2142 __be32 spec_dst;
2143 u32 itag;
1da177e4
LT
2144
2145 /* get a working reference to the output device */
47360228 2146 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1da177e4
LT
2147 if (out_dev == NULL) {
2148 if (net_ratelimit())
2149 printk(KERN_CRIT "Bug in ip_route_input" \
2150 "_slow(). Please, report\n");
2151 return -EINVAL;
2152 }
2153
2154
5c04c819
MS
2155 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2156 in_dev->dev, &spec_dst, &itag);
1da177e4 2157 if (err < 0) {
e905a9ed 2158 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 2159 saddr);
e905a9ed 2160
1da177e4
LT
2161 goto cleanup;
2162 }
2163
2164 if (err)
2165 flags |= RTCF_DIRECTSRC;
2166
51b77cae 2167 if (out_dev == in_dev && err &&
1da177e4
LT
2168 (IN_DEV_SHARED_MEDIA(out_dev) ||
2169 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2170 flags |= RTCF_DOREDIRECT;
2171
2172 if (skb->protocol != htons(ETH_P_IP)) {
2173 /* Not IP (i.e. ARP). Do not create a route if it is
2174 * invalid for proxy arp. DNAT routes are always valid.
2175 *
2176 * The proxy arp feature has been extended to allow ARP
2177 * replies back out the same interface, to support
2178 * Private VLAN switch technologies. See arp.c.
2179 */
65324144
JDB
2180 if (out_dev == in_dev &&
2181 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
2182 err = -EINVAL;
2183 goto cleanup;
2184 }
2185 }
2186
5c1e6aa3
DM
2187 rth = rt_dst_alloc(out_dev->dev,
2188 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2189 IN_DEV_CONF_GET(out_dev, NOXFRM));
1da177e4
LT
2190 if (!rth) {
2191 err = -ENOBUFS;
2192 goto cleanup;
2193 }
2194
5e2b61f7 2195 rth->rt_key_dst = daddr;
5e2b61f7 2196 rth->rt_key_src = saddr;
cf911662
DM
2197 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2198 rth->rt_flags = flags;
2199 rth->rt_type = res->type;
475949d8 2200 rth->rt_key_tos = tos;
cf911662 2201 rth->rt_dst = daddr;
1da177e4 2202 rth->rt_src = saddr;
1b86a58f 2203 rth->rt_route_iif = in_dev->dev->ifindex;
5e2b61f7 2204 rth->rt_iif = in_dev->dev->ifindex;
5e2b61f7 2205 rth->rt_oif = 0;
cf911662
DM
2206 rth->rt_mark = skb->mark;
2207 rth->rt_gateway = daddr;
1da177e4 2208 rth->rt_spec_dst= spec_dst;
cf911662
DM
2209 rth->rt_peer_genid = 0;
2210 rth->peer = NULL;
2211 rth->fi = NULL;
1da177e4 2212
d8d1f30b
CG
2213 rth->dst.input = ip_forward;
2214 rth->dst.output = ip_output;
1da177e4 2215
5e2b61f7 2216 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1da177e4 2217
1da177e4
LT
2218 *result = rth;
2219 err = 0;
2220 cleanup:
1da177e4 2221 return err;
e905a9ed 2222}
1da177e4 2223
5969f71d
SH
2224static int ip_mkroute_input(struct sk_buff *skb,
2225 struct fib_result *res,
68a5e3dd 2226 const struct flowi4 *fl4,
5969f71d
SH
2227 struct in_device *in_dev,
2228 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 2229{
7abaa27c 2230 struct rtable* rth = NULL;
1da177e4
LT
2231 int err;
2232 unsigned hash;
2233
2234#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 2235 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 2236 fib_select_multipath(res);
1da177e4
LT
2237#endif
2238
2239 /* create a routing cache entry */
2240 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2241 if (err)
2242 return err;
1da177e4
LT
2243
2244 /* put it into the cache */
68a5e3dd 2245 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
d8d1f30b 2246 rt_genid(dev_net(rth->dst.dev)));
68a5e3dd 2247 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
b23dd4fe
DM
2248 if (IS_ERR(rth))
2249 return PTR_ERR(rth);
2250 return 0;
1da177e4
LT
2251}
2252
1da177e4
LT
2253/*
2254 * NOTE. We drop all packets that have a local source
2255 * address, because every properly looped-back packet
2256 * must already have the correct destination attached by the output routine.
2257 *
2258 * This approach solves two big problems:
2259 * 1. Non-simplex devices are handled properly.
2260 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2261 * called with rcu_read_lock()
2262 */
2263
9e12bb22 2264static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2265 u8 tos, struct net_device *dev)
2266{
2267 struct fib_result res;
96d36220 2268 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 2269 struct flowi4 fl4;
1da177e4
LT
2270 unsigned flags = 0;
2271 u32 itag = 0;
2272 struct rtable * rth;
2273 unsigned hash;
9e12bb22 2274 __be32 spec_dst;
1da177e4 2275 int err = -EINVAL;
c346dca1 2276 struct net * net = dev_net(dev);
1da177e4
LT
2277
2278 /* IP on this device is disabled. */
2279
2280 if (!in_dev)
2281 goto out;
2282
2283 /* Check for the most weird martians, which cannot be detected
2284 by fib_lookup.
2285 */
2286
1e637c74 2287 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 2288 ipv4_is_loopback(saddr))
1da177e4
LT
2289 goto martian_source;
2290
27a954bd 2291 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2292 goto brd_input;
2293
2294 /* Accept zero addresses only to limited broadcast;
2295 * I do not even know whether to fix this or not. Waiting for complaints :-)
2296 */
f97c1e0c 2297 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2298 goto martian_source;
2299
27a954bd 2300 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
1da177e4
LT
2301 goto martian_destination;
2302
2303 /*
2304 * Now we are ready to route packet.
2305 */
68a5e3dd
DM
2306 fl4.flowi4_oif = 0;
2307 fl4.flowi4_iif = dev->ifindex;
2308 fl4.flowi4_mark = skb->mark;
2309 fl4.flowi4_tos = tos;
2310 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2311 fl4.daddr = daddr;
2312 fl4.saddr = saddr;
2313 err = fib_lookup(net, &fl4, &res);
ebc0ffae 2314 if (err != 0) {
1da177e4 2315 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2316 goto e_hostunreach;
1da177e4
LT
2317 goto no_route;
2318 }
1da177e4
LT
2319
2320 RT_CACHE_STAT_INC(in_slow_tot);
2321
2322 if (res.type == RTN_BROADCAST)
2323 goto brd_input;
2324
2325 if (res.type == RTN_LOCAL) {
5c04c819 2326 err = fib_validate_source(skb, saddr, daddr, tos,
ebc0ffae 2327 net->loopback_dev->ifindex,
5c04c819 2328 dev, &spec_dst, &itag);
b5f7e755
ED
2329 if (err < 0)
2330 goto martian_source_keep_err;
2331 if (err)
1da177e4
LT
2332 flags |= RTCF_DIRECTSRC;
2333 spec_dst = daddr;
2334 goto local_input;
2335 }
2336
2337 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2338 goto e_hostunreach;
1da177e4
LT
2339 if (res.type != RTN_UNICAST)
2340 goto martian_destination;
2341
68a5e3dd 2342 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
2343out: return err;
2344
2345brd_input:
2346 if (skb->protocol != htons(ETH_P_IP))
2347 goto e_inval;
2348
f97c1e0c 2349 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2350 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2351 else {
5c04c819
MS
2352 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2353 &itag);
1da177e4 2354 if (err < 0)
b5f7e755 2355 goto martian_source_keep_err;
1da177e4
LT
2356 if (err)
2357 flags |= RTCF_DIRECTSRC;
2358 }
2359 flags |= RTCF_BROADCAST;
2360 res.type = RTN_BROADCAST;
2361 RT_CACHE_STAT_INC(in_brd);
2362
2363local_input:
5c1e6aa3
DM
2364 rth = rt_dst_alloc(net->loopback_dev,
2365 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2366 if (!rth)
2367 goto e_nobufs;
2368
cf911662 2369 rth->dst.input= ip_local_deliver;
d8d1f30b 2370 rth->dst.output= ip_rt_bug;
cf911662
DM
2371#ifdef CONFIG_IP_ROUTE_CLASSID
2372 rth->dst.tclassid = itag;
2373#endif
1da177e4 2374
5e2b61f7 2375 rth->rt_key_dst = daddr;
5e2b61f7 2376 rth->rt_key_src = saddr;
cf911662
DM
2377 rth->rt_genid = rt_genid(net);
2378 rth->rt_flags = flags|RTCF_LOCAL;
2379 rth->rt_type = res.type;
475949d8 2380 rth->rt_key_tos = tos;
cf911662 2381 rth->rt_dst = daddr;
1da177e4 2382 rth->rt_src = saddr;
c7066f70 2383#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b 2384 rth->dst.tclassid = itag;
1da177e4 2385#endif
1b86a58f 2386 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2387 rth->rt_iif = dev->ifindex;
cf911662
DM
2388 rth->rt_oif = 0;
2389 rth->rt_mark = skb->mark;
1da177e4
LT
2390 rth->rt_gateway = daddr;
2391 rth->rt_spec_dst= spec_dst;
cf911662
DM
2392 rth->rt_peer_genid = 0;
2393 rth->peer = NULL;
2394 rth->fi = NULL;
1da177e4 2395 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
2396 rth->dst.input= ip_error;
2397 rth->dst.error= -err;
1da177e4
LT
2398 rth->rt_flags &= ~RTCF_LOCAL;
2399 }
68a5e3dd
DM
2400 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2401 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
b23dd4fe
DM
2402 err = 0;
2403 if (IS_ERR(rth))
2404 err = PTR_ERR(rth);
ebc0ffae 2405 goto out;
1da177e4
LT
2406
2407no_route:
2408 RT_CACHE_STAT_INC(in_no_route);
2409 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2410 res.type = RTN_UNREACHABLE;
7f53878d
MC
2411 if (err == -ESRCH)
2412 err = -ENETUNREACH;
1da177e4
LT
2413 goto local_input;
2414
2415 /*
2416 * Do not cache martian addresses: they should be logged (RFC1812)
2417 */
2418martian_destination:
2419 RT_CACHE_STAT_INC(in_martian_dst);
2420#ifdef CONFIG_IP_ROUTE_VERBOSE
2421 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
673d57e7
HH
2422 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2423 &daddr, &saddr, dev->name);
1da177e4 2424#endif
2c2910a4
DE
2425
2426e_hostunreach:
e905a9ed 2427 err = -EHOSTUNREACH;
ebc0ffae 2428 goto out;
2c2910a4 2429
1da177e4
LT
2430e_inval:
2431 err = -EINVAL;
ebc0ffae 2432 goto out;
1da177e4
LT
2433
2434e_nobufs:
2435 err = -ENOBUFS;
ebc0ffae 2436 goto out;
1da177e4
LT
2437
2438martian_source:
b5f7e755
ED
2439 err = -EINVAL;
2440martian_source_keep_err:
1da177e4 2441 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2442 goto out;
1da177e4
LT
2443}
2444
407eadd9
ED
2445int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2446 u8 tos, struct net_device *dev, bool noref)
1da177e4
LT
2447{
2448 struct rtable * rth;
2449 unsigned hash;
2450 int iif = dev->ifindex;
b5921910 2451 struct net *net;
96d36220 2452 int res;
1da177e4 2453
c346dca1 2454 net = dev_net(dev);
1080d709 2455
96d36220
ED
2456 rcu_read_lock();
2457
1080d709
NH
2458 if (!rt_caching(net))
2459 goto skip_cache;
2460
1da177e4 2461 tos &= IPTOS_RT_MASK;
e84f84f2 2462 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4 2463
1da177e4 2464 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 2465 rth = rcu_dereference(rth->dst.rt_next)) {
5e2b61f7
DM
2466 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2467 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
97a80410 2468 (rth->rt_route_iif ^ iif) |
475949d8 2469 (rth->rt_key_tos ^ tos)) == 0 &&
5e2b61f7 2470 rth->rt_mark == skb->mark &&
d8d1f30b 2471 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2472 !rt_is_expired(rth)) {
de398fb8 2473 ipv4_validate_peer(rth);
407eadd9 2474 if (noref) {
d8d1f30b
CG
2475 dst_use_noref(&rth->dst, jiffies);
2476 skb_dst_set_noref(skb, &rth->dst);
407eadd9 2477 } else {
d8d1f30b
CG
2478 dst_use(&rth->dst, jiffies);
2479 skb_dst_set(skb, &rth->dst);
407eadd9 2480 }
1da177e4
LT
2481 RT_CACHE_STAT_INC(in_hit);
2482 rcu_read_unlock();
1da177e4
LT
2483 return 0;
2484 }
2485 RT_CACHE_STAT_INC(in_hlist_search);
2486 }
1da177e4 2487
1080d709 2488skip_cache:
1da177e4
LT
2489 /* Multicast recognition logic is moved from the route cache to here.
2490 The problem was that too many Ethernet cards have broken/missing
2491 hardware multicast filters :-( As a result, a host on a multicast
2492 network acquires a lot of useless route cache entries, e.g. for
2493 SDR messages from all over the world. Now we try to get rid of them.
2494 Really, provided the software IP multicast filter is organized
2495 reasonably (at least, hashed), it does not result in a slowdown
2496 compared with route cache reject entries.
2497 Note that multicast routers are not affected, because
2498 a route cache entry is created eventually.
2499 */
f97c1e0c 2500 if (ipv4_is_multicast(daddr)) {
96d36220 2501 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2502
96d36220 2503 if (in_dev) {
dbdd9a52
DM
2504 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2505 ip_hdr(skb)->protocol);
1da177e4
LT
2506 if (our
2507#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2508 ||
2509 (!ipv4_is_local_multicast(daddr) &&
2510 IN_DEV_MFORWARD(in_dev))
1da177e4 2511#endif
9d4fb27d 2512 ) {
96d36220
ED
2513 int res = ip_route_input_mc(skb, daddr, saddr,
2514 tos, dev, our);
1da177e4 2515 rcu_read_unlock();
96d36220 2516 return res;
1da177e4
LT
2517 }
2518 }
2519 rcu_read_unlock();
2520 return -EINVAL;
2521 }
96d36220
ED
2522 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2523 rcu_read_unlock();
2524 return res;
1da177e4 2525}
407eadd9 2526EXPORT_SYMBOL(ip_route_input_common);
1da177e4 2527
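/*
 * Illustrative sketch, not part of this file: the cache lookup above
 * compares all four key fields without branches by OR-ing together the
 * XOR of each pair; the result is zero iff every field matches. The
 * same trick as a hypothetical stand-alone predicate:
 */
#include <stdint.h>

struct rt_key_sketch {
	uint32_t dst, src;	/* destination and source addresses */
	int iif;		/* input interface index */
	uint32_t tos;		/* type of service bits */
};

static int rt_key_match(const struct rt_key_sketch *a,
			const struct rt_key_sketch *b)
{
	/* Each XOR is zero on a match; OR-ing them folds four
	 * comparisons into a single test against zero. */
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(uint32_t)(a->iif ^ b->iif) |
		(a->tos ^ b->tos)) == 0;
}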
ebc0ffae 2528/* called with rcu_read_lock() */
982721f3 2529static struct rtable *__mkroute_output(const struct fib_result *res,
68a5e3dd 2530 const struct flowi4 *fl4,
813b3b5d 2531 __be32 orig_daddr, __be32 orig_saddr,
f61759e6
JA
2532 int orig_oif, __u8 orig_rtos,
2533 struct net_device *dev_out,
5ada5527 2534 unsigned int flags)
1da177e4 2535{
982721f3 2536 struct fib_info *fi = res->fi;
5ada5527 2537 struct in_device *in_dev;
982721f3 2538 u16 type = res->type;
5ada5527 2539 struct rtable *rth;
1da177e4 2540
68a5e3dd 2541 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
5ada5527 2542 return ERR_PTR(-EINVAL);
1da177e4 2543
68a5e3dd 2544 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2545 type = RTN_BROADCAST;
68a5e3dd 2546 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2547 type = RTN_MULTICAST;
68a5e3dd 2548 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2549 return ERR_PTR(-EINVAL);
1da177e4
LT
2550
2551 if (dev_out->flags & IFF_LOOPBACK)
2552 flags |= RTCF_LOCAL;
2553
dd28d1a0 2554 in_dev = __in_dev_get_rcu(dev_out);
ebc0ffae 2555 if (!in_dev)
5ada5527 2556 return ERR_PTR(-EINVAL);
ebc0ffae 2557
982721f3 2558 if (type == RTN_BROADCAST) {
1da177e4 2559 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2560 fi = NULL;
2561 } else if (type == RTN_MULTICAST) {
dd28d1a0 2562 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2563 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2564 fl4->flowi4_proto))
1da177e4
LT
2565 flags &= ~RTCF_LOCAL;
2566 /* If a multicast route does not exist, use the
2567 * default one, but do not gateway in this case.
2568 * Yes, it is a hack.
2569 */
982721f3
DM
2570 if (fi && res->prefixlen < 4)
2571 fi = NULL;
1da177e4
LT
2572 }
2573
5c1e6aa3
DM
2574 rth = rt_dst_alloc(dev_out,
2575 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2576 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2577 if (!rth)
5ada5527 2578 return ERR_PTR(-ENOBUFS);
8391d07b 2579
cf911662
DM
2580 rth->dst.output = ip_output;
2581
813b3b5d
DM
2582 rth->rt_key_dst = orig_daddr;
2583 rth->rt_key_src = orig_saddr;
cf911662
DM
2584 rth->rt_genid = rt_genid(dev_net(dev_out));
2585 rth->rt_flags = flags;
2586 rth->rt_type = type;
f61759e6 2587 rth->rt_key_tos = orig_rtos;
68a5e3dd
DM
2588 rth->rt_dst = fl4->daddr;
2589 rth->rt_src = fl4->saddr;
1b86a58f 2590 rth->rt_route_iif = 0;
813b3b5d
DM
2591 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2592 rth->rt_oif = orig_oif;
2593 rth->rt_mark = fl4->flowi4_mark;
68a5e3dd
DM
2594 rth->rt_gateway = fl4->daddr;
2595 rth->rt_spec_dst= fl4->saddr;
cf911662
DM
2596 rth->rt_peer_genid = 0;
2597 rth->peer = NULL;
2598 rth->fi = NULL;
1da177e4
LT
2599
2600 RT_CACHE_STAT_INC(out_slow_tot);
2601
2602 if (flags & RTCF_LOCAL) {
d8d1f30b 2603 rth->dst.input = ip_local_deliver;
68a5e3dd 2604 rth->rt_spec_dst = fl4->daddr;
1da177e4
LT
2605 }
2606 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
68a5e3dd 2607 rth->rt_spec_dst = fl4->saddr;
e905a9ed 2608 if (flags & RTCF_LOCAL &&
1da177e4 2609 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2610 rth->dst.output = ip_mc_output;
1da177e4
LT
2611 RT_CACHE_STAT_INC(out_slow_mc);
2612 }
2613#ifdef CONFIG_IP_MROUTE
982721f3 2614 if (type == RTN_MULTICAST) {
1da177e4 2615 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2616 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2617 rth->dst.input = ip_mr_input;
2618 rth->dst.output = ip_mc_output;
1da177e4
LT
2619 }
2620 }
2621#endif
2622 }
2623
813b3b5d 2624 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1da177e4 2625
5ada5527 2626 return rth;
1da177e4
LT
2627}
2628
1da177e4
LT
2629/*
2630 * Major route resolver routine.
0197aa38 2631 * called with rcu_read_lock();
1da177e4
LT
2632 */
2633
813b3b5d 2634static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2635{
1da177e4 2636 struct net_device *dev_out = NULL;
f61759e6 2637 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
2638 unsigned int flags = 0;
2639 struct fib_result res;
5ada5527 2640 struct rtable *rth;
813b3b5d
DM
2641 __be32 orig_daddr;
2642 __be32 orig_saddr;
2643 int orig_oif;
1da177e4
LT
2644
2645 res.fi = NULL;
2646#ifdef CONFIG_IP_MULTIPLE_TABLES
2647 res.r = NULL;
2648#endif
2649
813b3b5d
DM
2650 orig_daddr = fl4->daddr;
2651 orig_saddr = fl4->saddr;
2652 orig_oif = fl4->flowi4_oif;
2653
2654 fl4->flowi4_iif = net->loopback_dev->ifindex;
2655 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2656 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2657 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2658
010c2708 2659 rcu_read_lock();
813b3b5d 2660 if (fl4->saddr) {
b23dd4fe 2661 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2662 if (ipv4_is_multicast(fl4->saddr) ||
2663 ipv4_is_lbcast(fl4->saddr) ||
2664 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2665 goto out;
2666
1da177e4
LT
2667 /* I removed check for oif == dev_out->oif here.
2668 It was wrong for two reasons:
1ab35276
DL
2669 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2670 is assigned to multiple interfaces.
1da177e4
LT
2671 2. Moreover, we are allowed to send packets with saddr
2672 of another iface. --ANK
2673 */
2674
813b3b5d
DM
2675 if (fl4->flowi4_oif == 0 &&
2676 (ipv4_is_multicast(fl4->daddr) ||
2677 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2678 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2679 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2680 if (dev_out == NULL)
2681 goto out;
2682
2683 /* Special hack: the user can direct multicasts
2684 and limited broadcast via the necessary interface
2685 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2686 This hack is not just for fun, it allows
2687 vic, vat and friends to work.
2688 They bind a socket to loopback, set ttl to zero
2689 and expect that it will work.
2690 From the viewpoint of the routing cache they are broken,
2691 because we are not allowed to build a multicast path
2692 with a loopback source addr (look, the routing cache
2693 cannot know that ttl is zero, so the packet
2694 will not leave this host and the route is valid).
2695 Luckily, this hack is a good workaround.
2696 */
2697
813b3b5d 2698 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2699 goto make_route;
2700 }
a210d01a 2701
813b3b5d 2702 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2703 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2704 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2705 goto out;
a210d01a 2706 }
1da177e4
LT
2707 }
2708
2709
813b3b5d
DM
2710 if (fl4->flowi4_oif) {
2711 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2712 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2713 if (dev_out == NULL)
2714 goto out;
e5ed6399
HX
2715
2716 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2717 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2718 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2719 goto out;
2720 }
813b3b5d
DM
2721 if (ipv4_is_local_multicast(fl4->daddr) ||
2722 ipv4_is_lbcast(fl4->daddr)) {
2723 if (!fl4->saddr)
2724 fl4->saddr = inet_select_addr(dev_out, 0,
2725 RT_SCOPE_LINK);
1da177e4
LT
2726 goto make_route;
2727 }
813b3b5d
DM
2728 if (fl4->saddr) {
2729 if (ipv4_is_multicast(fl4->daddr))
2730 fl4->saddr = inet_select_addr(dev_out, 0,
2731 fl4->flowi4_scope);
2732 else if (!fl4->daddr)
2733 fl4->saddr = inet_select_addr(dev_out, 0,
2734 RT_SCOPE_HOST);
1da177e4
LT
2735 }
2736 }
2737
813b3b5d
DM
2738 if (!fl4->daddr) {
2739 fl4->daddr = fl4->saddr;
2740 if (!fl4->daddr)
2741 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2742 dev_out = net->loopback_dev;
813b3b5d 2743 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2744 res.type = RTN_LOCAL;
2745 flags |= RTCF_LOCAL;
2746 goto make_route;
2747 }
2748
813b3b5d 2749 if (fib_lookup(net, fl4, &res)) {
1da177e4 2750 res.fi = NULL;
813b3b5d 2751 if (fl4->flowi4_oif) {
1da177e4
LT
2752 /* Apparently, routing tables are wrong. Assume,
2753 that the destination is on link.
2754
2755 WHY? DW.
2756 Because we are allowed to send to iface
2757 even if it has NO routes and NO assigned
2758 addresses. When oif is specified, routing
2759 tables are looked up with only one purpose:
2760 to catch if destination is gatewayed, rather than
2761 direct. Moreover, if MSG_DONTROUTE is set,
2762 we send packet, ignoring both routing tables
2763 and ifaddr state. --ANK
2764
2765
2766 We could make it even if oif is unknown,
2767 likely IPv6, but we do not.
2768 */
2769
813b3b5d
DM
2770 if (fl4->saddr == 0)
2771 fl4->saddr = inet_select_addr(dev_out, 0,
2772 RT_SCOPE_LINK);
1da177e4
LT
2773 res.type = RTN_UNICAST;
2774 goto make_route;
2775 }
b23dd4fe 2776 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2777 goto out;
2778 }
1da177e4
LT
2779
2780 if (res.type == RTN_LOCAL) {
813b3b5d 2781 if (!fl4->saddr) {
9fc3bbb4 2782 if (res.fi->fib_prefsrc)
813b3b5d 2783 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2784 else
813b3b5d 2785 fl4->saddr = fl4->daddr;
9fc3bbb4 2786 }
b40afd0e 2787 dev_out = net->loopback_dev;
813b3b5d 2788 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2789 res.fi = NULL;
2790 flags |= RTCF_LOCAL;
2791 goto make_route;
2792 }
2793
2794#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2795 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2796 fib_select_multipath(&res);
1da177e4
LT
2797 else
2798#endif
21d8c49e
DM
2799 if (!res.prefixlen &&
2800 res.table->tb_num_default > 1 &&
813b3b5d 2801 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2802 fib_select_default(&res);
1da177e4 2803
813b3b5d
DM
2804 if (!fl4->saddr)
2805 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2806
1da177e4 2807 dev_out = FIB_RES_DEV(res);
813b3b5d 2808 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2809
2810
2811make_route:
813b3b5d 2812 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
f61759e6 2813 tos, dev_out, flags);
b23dd4fe 2814 if (!IS_ERR(rth)) {
5ada5527
DM
2815 unsigned int hash;
2816
813b3b5d 2817 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2818 rt_genid(dev_net(dev_out)));
813b3b5d 2819 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2820 }
1da177e4 2821
010c2708
DM
2822out:
2823 rcu_read_unlock();
b23dd4fe 2824 return rth;
1da177e4
LT
2825}
2826
813b3b5d 2827struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
1da177e4 2828{
1da177e4 2829 struct rtable *rth;
010c2708 2830 unsigned int hash;
1da177e4 2831
1080d709
NH
2832 if (!rt_caching(net))
2833 goto slow_output;
2834
9d6ec938 2835 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
1da177e4
LT
2836
2837 rcu_read_lock_bh();
a898def2 2838 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
d8d1f30b 2839 rth = rcu_dereference_bh(rth->dst.rt_next)) {
9d6ec938
DM
2840 if (rth->rt_key_dst == flp4->daddr &&
2841 rth->rt_key_src == flp4->saddr &&
c7537967 2842 rt_is_output_route(rth) &&
9d6ec938
DM
2843 rth->rt_oif == flp4->flowi4_oif &&
2844 rth->rt_mark == flp4->flowi4_mark &&
475949d8 2845 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
b5921910 2846 (IPTOS_RT_MASK | RTO_ONLINK)) &&
d8d1f30b 2847 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2848 !rt_is_expired(rth)) {
de398fb8 2849 ipv4_validate_peer(rth);
d8d1f30b 2850 dst_use(&rth->dst, jiffies);
1da177e4
LT
2851 RT_CACHE_STAT_INC(out_hit);
2852 rcu_read_unlock_bh();
56157872
DM
2853 if (!flp4->saddr)
2854 flp4->saddr = rth->rt_src;
2855 if (!flp4->daddr)
2856 flp4->daddr = rth->rt_dst;
b23dd4fe 2857 return rth;
1da177e4
LT
2858 }
2859 RT_CACHE_STAT_INC(out_hlist_search);
2860 }
2861 rcu_read_unlock_bh();
2862
1080d709 2863slow_output:
9d6ec938 2864 return ip_route_output_slow(net, flp4);
1da177e4 2865}
d8c97a94
ACM
2866EXPORT_SYMBOL_GPL(__ip_route_output_key);
2867
ae2688d5
JW
2868static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2869{
2870 return NULL;
2871}
2872
ebb762f2 2873static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2874{
618f9bc7
SK
2875 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2876
2877 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2878}
2879
14e50e57
DM
2880static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2881{
2882}
2883
0972ddb2
HB
2884static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2885 unsigned long old)
2886{
2887 return NULL;
2888}
2889
14e50e57
DM
2890static struct dst_ops ipv4_dst_blackhole_ops = {
2891 .family = AF_INET,
09640e63 2892 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57 2893 .destroy = ipv4_dst_destroy,
ae2688d5 2894 .check = ipv4_blackhole_dst_check,
ebb762f2 2895 .mtu = ipv4_blackhole_mtu,
214f45c9 2896 .default_advmss = ipv4_default_advmss,
14e50e57 2897 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
0972ddb2 2898 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2899 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2900};
2901
2774c131 2902struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2903{
5c1e6aa3 2904 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2774c131 2905 struct rtable *ort = (struct rtable *) dst_orig;
14e50e57
DM
2906
2907 if (rt) {
d8d1f30b 2908 struct dst_entry *new = &rt->dst;
14e50e57 2909
14e50e57 2910 new->__use = 1;
352e512c
HX
2911 new->input = dst_discard;
2912 new->output = dst_discard;
defb3519 2913 dst_copy_metrics(new, &ort->dst);
14e50e57 2914
d8d1f30b 2915 new->dev = ort->dst.dev;
14e50e57
DM
2916 if (new->dev)
2917 dev_hold(new->dev);
2918
5e2b61f7
DM
2919 rt->rt_key_dst = ort->rt_key_dst;
2920 rt->rt_key_src = ort->rt_key_src;
475949d8 2921 rt->rt_key_tos = ort->rt_key_tos;
1b86a58f 2922 rt->rt_route_iif = ort->rt_route_iif;
5e2b61f7
DM
2923 rt->rt_iif = ort->rt_iif;
2924 rt->rt_oif = ort->rt_oif;
2925 rt->rt_mark = ort->rt_mark;
14e50e57 2926
e84f84f2 2927 rt->rt_genid = rt_genid(net);
14e50e57
DM
2928 rt->rt_flags = ort->rt_flags;
2929 rt->rt_type = ort->rt_type;
2930 rt->rt_dst = ort->rt_dst;
2931 rt->rt_src = ort->rt_src;
14e50e57
DM
2932 rt->rt_gateway = ort->rt_gateway;
2933 rt->rt_spec_dst = ort->rt_spec_dst;
2934 rt->peer = ort->peer;
2935 if (rt->peer)
2936 atomic_inc(&rt->peer->refcnt);
62fa8a84
DM
2937 rt->fi = ort->fi;
2938 if (rt->fi)
2939 atomic_inc(&rt->fi->fib_clntref);
14e50e57
DM
2940
2941 dst_free(new);
2942 }
2943
2774c131
DM
2944 dst_release(dst_orig);
2945
2946 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2947}
2948
9d6ec938 2949struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2950 struct sock *sk)
1da177e4 2951{
9d6ec938 2952 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2953
b23dd4fe
DM
2954 if (IS_ERR(rt))
2955 return rt;
1da177e4 2956
56157872 2957 if (flp4->flowi4_proto)
9d6ec938
DM
2958 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2959 flowi4_to_flowi(flp4),
2960 sk, 0);
1da177e4 2961
b23dd4fe 2962 return rt;
1da177e4 2963}
d8c97a94
ACM
2964EXPORT_SYMBOL_GPL(ip_route_output_flow);
2965
4feb88e5
BT
2966static int rt_fill_info(struct net *net,
2967 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2968 int nowait, unsigned int flags)
1da177e4 2969{
511c3f92 2970 struct rtable *rt = skb_rtable(skb);
1da177e4 2971 struct rtmsg *r;
be403ea1 2972 struct nlmsghdr *nlh;
2bc8ca40 2973 unsigned long expires = 0;
fe6fe792 2974 const struct inet_peer *peer = rt->peer;
e3703b3d 2975 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2976
2977 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2978 if (nlh == NULL)
26932566 2979 return -EMSGSIZE;
be403ea1
TG
2980
2981 r = nlmsg_data(nlh);
1da177e4
LT
2982 r->rtm_family = AF_INET;
2983 r->rtm_dst_len = 32;
2984 r->rtm_src_len = 0;
475949d8 2985 r->rtm_tos = rt->rt_key_tos;
1da177e4 2986 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2987 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2988 r->rtm_type = rt->rt_type;
2989 r->rtm_scope = RT_SCOPE_UNIVERSE;
2990 r->rtm_protocol = RTPROT_UNSPEC;
2991 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2992 if (rt->rt_flags & RTCF_NOTIFY)
2993 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2994
17fb2c64 2995 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2996
5e2b61f7 2997 if (rt->rt_key_src) {
1da177e4 2998 r->rtm_src_len = 32;
5e2b61f7 2999 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
1da177e4 3000 }
d8d1f30b
CG
3001 if (rt->dst.dev)
3002 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
c7066f70 3003#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b
CG
3004 if (rt->dst.tclassid)
3005 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
1da177e4 3006#endif
c7537967 3007 if (rt_is_input_route(rt))
17fb2c64 3008 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
5e2b61f7 3009 else if (rt->rt_src != rt->rt_key_src)
17fb2c64 3010 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 3011
1da177e4 3012 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 3013 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 3014
defb3519 3015 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
be403ea1
TG
3016 goto nla_put_failure;
3017
5e2b61f7
DM
3018 if (rt->rt_mark)
3019 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
963bfeee 3020
d8d1f30b 3021 error = rt->dst.error;
fe6fe792 3022 if (peer) {
317fe0e6 3023 inet_peer_refcheck(rt->peer);
fe6fe792
ED
3024 id = atomic_read(&peer->ip_id_count) & 0xffff;
3025 if (peer->tcp_ts_stamp) {
3026 ts = peer->tcp_ts;
3027 tsage = get_seconds() - peer->tcp_ts_stamp;
1da177e4 3028 }
fe6fe792 3029 expires = ACCESS_ONCE(peer->pmtu_expires);
2bc8ca40
SK
3030 if (expires) {
3031 if (time_before(jiffies, expires))
3032 expires -= jiffies;
3033 else
3034 expires = 0;
3035 }
1da177e4 3036 }
be403ea1 3037
c7537967 3038 if (rt_is_input_route(rt)) {
1da177e4 3039#ifdef CONFIG_IP_MROUTE
e448515c 3040 __be32 dst = rt->rt_dst;
1da177e4 3041
f97c1e0c 3042 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5 3043 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
9a1b9496
DM
3044 int err = ipmr_get_route(net, skb,
3045 rt->rt_src, rt->rt_dst,
3046 r, nowait);
1da177e4
LT
3047 if (err <= 0) {
3048 if (!nowait) {
3049 if (err == 0)
3050 return 0;
be403ea1 3051 goto nla_put_failure;
1da177e4
LT
3052 } else {
3053 if (err == -EMSGSIZE)
be403ea1 3054 goto nla_put_failure;
e3703b3d 3055 error = err;
1da177e4
LT
3056 }
3057 }
3058 } else
3059#endif
5e2b61f7 3060 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
1da177e4
LT
3061 }
3062
d8d1f30b 3063 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
e3703b3d
TG
3064 expires, error) < 0)
3065 goto nla_put_failure;
be403ea1
TG
3066
3067 return nlmsg_end(skb, nlh);
1da177e4 3068
be403ea1 3069nla_put_failure:
26932566
PM
3070 nlmsg_cancel(skb, nlh);
3071 return -EMSGSIZE;
1da177e4
LT
3072}
3073
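/*
 * Illustrative sketch, not part of this file: rt_fill_info() above
 * emits the reply as a sequence of netlink attributes, each a
 * type/length header followed by a 4-byte-aligned payload. A
 * hypothetical minimal user-space encoder of that TLV layout, using
 * struct nlattr, NLA_HDRLEN and NLA_ALIGN from <linux/netlink.h>:
 */
#include <linux/netlink.h>
#include <string.h>

static int put_attr(char *buf, size_t len, size_t *off,
		    unsigned short type, const void *data, unsigned short dlen)
{
	struct nlattr *nla = (struct nlattr *)(buf + *off);
	size_t total = NLA_ALIGN(NLA_HDRLEN + dlen);

	if (*off + total > len)
		return -1;		/* no room: caller cancels the message */
	nla->nla_type = type;
	nla->nla_len = NLA_HDRLEN + dlen;	/* length excludes the pad */
	memcpy((char *)nla + NLA_HDRLEN, data, dlen);
	*off += total;			/* keep the next attribute aligned */
	return 0;
}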
63f3444f 3074static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 3075{
3b1e0a65 3076 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
3077 struct rtmsg *rtm;
3078 struct nlattr *tb[RTA_MAX+1];
1da177e4 3079 struct rtable *rt = NULL;
9e12bb22
AV
3080 __be32 dst = 0;
3081 __be32 src = 0;
3082 u32 iif;
d889ce3b 3083 int err;
963bfeee 3084 int mark;
1da177e4
LT
3085 struct sk_buff *skb;
3086
d889ce3b
TG
3087 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3088 if (err < 0)
3089 goto errout;
3090
3091 rtm = nlmsg_data(nlh);
3092
1da177e4 3093 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
3094 if (skb == NULL) {
3095 err = -ENOBUFS;
3096 goto errout;
3097 }
1da177e4
LT
3098
3099 /* Reserve room for dummy headers; this skb can pass
3100 through a good chunk of the routing engine.
3101 */
459a98ed 3102 skb_reset_mac_header(skb);
c1d2bbe1 3103 skb_reset_network_header(skb);
d2c962b8
SH
3104
3105 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 3106 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
3107 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3108
17fb2c64
AV
3109 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3110 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 3111 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 3112 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4
LT
3113
3114 if (iif) {
d889ce3b
TG
3115 struct net_device *dev;
3116
1937504d 3117 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
3118 if (dev == NULL) {
3119 err = -ENODEV;
3120 goto errout_free;
3121 }
3122
1da177e4
LT
3123 skb->protocol = htons(ETH_P_IP);
3124 skb->dev = dev;
963bfeee 3125 skb->mark = mark;
1da177e4
LT
3126 local_bh_disable();
3127 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3128 local_bh_enable();
d889ce3b 3129
511c3f92 3130 rt = skb_rtable(skb);
d8d1f30b
CG
3131 if (err == 0 && rt->dst.error)
3132 err = -rt->dst.error;
1da177e4 3133 } else {
68a5e3dd
DM
3134 struct flowi4 fl4 = {
3135 .daddr = dst,
3136 .saddr = src,
3137 .flowi4_tos = rtm->rtm_tos,
3138 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3139 .flowi4_mark = mark,
d889ce3b 3140 };
9d6ec938 3141 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
3142
3143 err = 0;
3144 if (IS_ERR(rt))
3145 err = PTR_ERR(rt);
1da177e4 3146 }
d889ce3b 3147
1da177e4 3148 if (err)
d889ce3b 3149 goto errout_free;
1da177e4 3150
d8d1f30b 3151 skb_dst_set(skb, &rt->dst);
1da177e4
LT
3152 if (rtm->rtm_flags & RTM_F_NOTIFY)
3153 rt->rt_flags |= RTCF_NOTIFY;
3154
4feb88e5 3155 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 3156 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
3157 if (err <= 0)
3158 goto errout_free;
1da177e4 3159
1937504d 3160 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 3161errout:
2942e900 3162 return err;
1da177e4 3163
d889ce3b 3164errout_free:
1da177e4 3165 kfree_skb(skb);
d889ce3b 3166 goto errout;
1da177e4
LT
3167}
3168
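/*
 * Usage sketch: this handler services RTM_GETROUTE requests, which is
 * what iproute2 issues for a one-off lookup, e.g.
 *
 *	ip route get 198.51.100.1
 *
 * The dummy ICMP header built above is just enough for the input path
 * to route the request as if it were a real packet.
 */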
3169int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3170{
3171 struct rtable *rt;
3172 int h, s_h;
3173 int idx, s_idx;
1937504d
DL
3174 struct net *net;
3175
3b1e0a65 3176 net = sock_net(skb->sk);
1da177e4
LT
3177
3178 s_h = cb->args[0];
d8c92830
ED
3179 if (s_h < 0)
3180 s_h = 0;
1da177e4 3181 s_idx = idx = cb->args[1];
a6272665
ED
3182 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3183 if (!rt_hash_table[h].chain)
3184 continue;
1da177e4 3185 rcu_read_lock_bh();
a898def2 3186 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
d8d1f30b
CG
3187 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3188 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
1da177e4 3189 continue;
e84f84f2 3190 if (rt_is_expired(rt))
29e75252 3191 continue;
d8d1f30b 3192 skb_dst_set_noref(skb, &rt->dst);
4feb88e5 3193 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 3194 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 3195 1, NLM_F_MULTI) <= 0) {
adf30907 3196 skb_dst_drop(skb);
1da177e4
LT
3197 rcu_read_unlock_bh();
3198 goto done;
3199 }
adf30907 3200 skb_dst_drop(skb);
1da177e4
LT
3201 }
3202 rcu_read_unlock_bh();
3203 }
3204
3205done:
3206 cb->args[0] = h;
3207 cb->args[1] = idx;
3208 return skb->len;
3209}
3210
3211void ip_rt_multicast_event(struct in_device *in_dev)
3212{
76e6ebfb 3213 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
3214}
3215
3216#ifdef CONFIG_SYSCTL
81c684d1 3217static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 3218 void __user *buffer,
1da177e4
LT
3219 size_t *lenp, loff_t *ppos)
3220{
3221 if (write) {
639e104f 3222 int flush_delay;
81c684d1 3223 ctl_table ctl;
39a23e75 3224 struct net *net;
639e104f 3225
81c684d1
DL
3226 memcpy(&ctl, __ctl, sizeof(ctl));
3227 ctl.data = &flush_delay;
8d65af78 3228 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 3229
81c684d1 3230 net = (struct net *)__ctl->extra1;
39a23e75 3231 rt_cache_flush(net, flush_delay);
1da177e4 3232 return 0;
e905a9ed 3233 }
1da177e4
LT
3234
3235 return -EINVAL;
3236}
3237
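/*
 * Usage sketch (assuming the standard /proc/sys mount): writing an
 * integer to the flush file invokes the handler above, with the value
 * parsed as the flush delay, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */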
eeb61f71 3238static ctl_table ipv4_route_table[] = {
1da177e4 3239 {
1da177e4
LT
3240 .procname = "gc_thresh",
3241 .data = &ipv4_dst_ops.gc_thresh,
3242 .maxlen = sizeof(int),
3243 .mode = 0644,
6d9f239a 3244 .proc_handler = proc_dointvec,
1da177e4
LT
3245 },
3246 {
1da177e4
LT
3247 .procname = "max_size",
3248 .data = &ip_rt_max_size,
3249 .maxlen = sizeof(int),
3250 .mode = 0644,
6d9f239a 3251 .proc_handler = proc_dointvec,
1da177e4
LT
3252 },
3253 {
3254 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3255
1da177e4
LT
3256 .procname = "gc_min_interval",
3257 .data = &ip_rt_gc_min_interval,
3258 .maxlen = sizeof(int),
3259 .mode = 0644,
6d9f239a 3260 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3261 },
3262 {
1da177e4
LT
3263 .procname = "gc_min_interval_ms",
3264 .data = &ip_rt_gc_min_interval,
3265 .maxlen = sizeof(int),
3266 .mode = 0644,
6d9f239a 3267 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
3268 },
3269 {
1da177e4
LT
3270 .procname = "gc_timeout",
3271 .data = &ip_rt_gc_timeout,
3272 .maxlen = sizeof(int),
3273 .mode = 0644,
6d9f239a 3274 .proc_handler = proc_dointvec_jiffies,
1da177e4 3275 },
9f28a2fc
ED
3276 {
3277 .procname = "gc_interval",
3278 .data = &ip_rt_gc_interval,
3279 .maxlen = sizeof(int),
3280 .mode = 0644,
3281 .proc_handler = proc_dointvec_jiffies,
3282 },
1da177e4 3283 {
1da177e4
LT
3284 .procname = "redirect_load",
3285 .data = &ip_rt_redirect_load,
3286 .maxlen = sizeof(int),
3287 .mode = 0644,
6d9f239a 3288 .proc_handler = proc_dointvec,
1da177e4
LT
3289 },
3290 {
1da177e4
LT
3291 .procname = "redirect_number",
3292 .data = &ip_rt_redirect_number,
3293 .maxlen = sizeof(int),
3294 .mode = 0644,
6d9f239a 3295 .proc_handler = proc_dointvec,
1da177e4
LT
3296 },
3297 {
1da177e4
LT
3298 .procname = "redirect_silence",
3299 .data = &ip_rt_redirect_silence,
3300 .maxlen = sizeof(int),
3301 .mode = 0644,
6d9f239a 3302 .proc_handler = proc_dointvec,
1da177e4
LT
3303 },
3304 {
1da177e4
LT
3305 .procname = "error_cost",
3306 .data = &ip_rt_error_cost,
3307 .maxlen = sizeof(int),
3308 .mode = 0644,
6d9f239a 3309 .proc_handler = proc_dointvec,
1da177e4
LT
3310 },
3311 {
1da177e4
LT
3312 .procname = "error_burst",
3313 .data = &ip_rt_error_burst,
3314 .maxlen = sizeof(int),
3315 .mode = 0644,
6d9f239a 3316 .proc_handler = proc_dointvec,
1da177e4
LT
3317 },
3318 {
1da177e4
LT
3319 .procname = "gc_elasticity",
3320 .data = &ip_rt_gc_elasticity,
3321 .maxlen = sizeof(int),
3322 .mode = 0644,
6d9f239a 3323 .proc_handler = proc_dointvec,
1da177e4
LT
3324 },
3325 {
1da177e4
LT
3326 .procname = "mtu_expires",
3327 .data = &ip_rt_mtu_expires,
3328 .maxlen = sizeof(int),
3329 .mode = 0644,
6d9f239a 3330 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3331 },
3332 {
1da177e4
LT
3333 .procname = "min_pmtu",
3334 .data = &ip_rt_min_pmtu,
3335 .maxlen = sizeof(int),
3336 .mode = 0644,
6d9f239a 3337 .proc_handler = proc_dointvec,
1da177e4
LT
3338 },
3339 {
1da177e4
LT
3340 .procname = "min_adv_mss",
3341 .data = &ip_rt_min_advmss,
3342 .maxlen = sizeof(int),
3343 .mode = 0644,
6d9f239a 3344 .proc_handler = proc_dointvec,
1da177e4 3345 },
f8572d8f 3346 { }
1da177e4 3347};
39a23e75 3348
2f4520d3
AV
3349static struct ctl_table empty[1];
3350
3351static struct ctl_table ipv4_skeleton[] =
3352{
f8572d8f 3353 { .procname = "route",
d994af0d 3354 .mode = 0555, .child = ipv4_route_table},
f8572d8f 3355 { .procname = "neigh",
d994af0d 3356 .mode = 0555, .child = empty},
2f4520d3
AV
3357 { }
3358};
3359
3360static __net_initdata struct ctl_path ipv4_path[] = {
f8572d8f
EB
3361 { .procname = "net", },
3362 { .procname = "ipv4", },
39a23e75
DL
3363 { },
3364};
3365
39a23e75
DL
3366static struct ctl_table ipv4_route_flush_table[] = {
3367 {
39a23e75
DL
3368 .procname = "flush",
3369 .maxlen = sizeof(int),
3370 .mode = 0200,
6d9f239a 3371 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3372 },
f8572d8f 3373 { },
39a23e75
DL
3374};
3375
2f4520d3 3376static __net_initdata struct ctl_path ipv4_route_path[] = {
f8572d8f
EB
3377 { .procname = "net", },
3378 { .procname = "ipv4", },
3379 { .procname = "route", },
2f4520d3
AV
3380 { },
3381};
3382
39a23e75
DL
3383static __net_init int sysctl_route_net_init(struct net *net)
3384{
3385 struct ctl_table *tbl;
3386
3387 tbl = ipv4_route_flush_table;
09ad9bc7 3388 if (!net_eq(net, &init_net)) {
39a23e75
DL
3389 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3390 if (tbl == NULL)
3391 goto err_dup;
3392 }
3393 tbl[0].extra1 = net;
3394
3395 net->ipv4.route_hdr =
3396 register_net_sysctl_table(net, ipv4_route_path, tbl);
3397 if (net->ipv4.route_hdr == NULL)
3398 goto err_reg;
3399 return 0;
3400
3401err_reg:
3402 if (tbl != ipv4_route_flush_table)
3403 kfree(tbl);
3404err_dup:
3405 return -ENOMEM;
3406}
3407
3408static __net_exit void sysctl_route_net_exit(struct net *net)
3409{
3410 struct ctl_table *tbl;
3411
3412 tbl = net->ipv4.route_hdr->ctl_table_arg;
3413 unregister_net_sysctl_table(net->ipv4.route_hdr);
3414 BUG_ON(tbl == ipv4_route_flush_table);
3415 kfree(tbl);
3416}
3417
3418static __net_initdata struct pernet_operations sysctl_route_ops = {
3419 .init = sysctl_route_net_init,
3420 .exit = sysctl_route_net_exit,
3421};
1da177e4
LT
3422#endif
3423
3ee94372 3424static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3425{
3ee94372
NH
3426 get_random_bytes(&net->ipv4.rt_genid,
3427 sizeof(net->ipv4.rt_genid));
436c3b66
DM
3428 get_random_bytes(&net->ipv4.dev_addr_genid,
3429 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
3430 return 0;
3431}
3432
3ee94372
NH
3433static __net_initdata struct pernet_operations rt_genid_ops = {
3434 .init = rt_genid_init,
9f5e97e5
DL
3435};
3436
3437
c7066f70 3438#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3439struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3440#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3441
3442static __initdata unsigned long rhash_entries;
3443static int __init set_rhash_entries(char *str)
3444{
3445 if (!str)
3446 return 0;
3447 rhash_entries = simple_strtoul(str, &str, 0);
3448 return 1;
3449}
3450__setup("rhash_entries=", set_rhash_entries);
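/*
 * Usage sketch: rhash_entries= is a kernel boot parameter, e.g.
 *
 *	linux ... rhash_entries=262144
 *
 * which sizes the route cache hash table explicitly instead of letting
 * alloc_large_system_hash() pick a size from available memory.
 */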
3451
3452int __init ip_rt_init(void)
3453{
424c4b70 3454 int rc = 0;
1da177e4 3455
c7066f70 3456#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3457 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3458 if (!ip_rt_acct)
3459 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3460#endif
3461
e5d679f3
AD
3462 ipv4_dst_ops.kmem_cachep =
3463 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3464 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3465
14e50e57
DM
3466 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3467
fc66f95c
ED
3468 if (dst_entries_init(&ipv4_dst_ops) < 0)
3469 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3470
3471 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3472 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3473
424c4b70
ED
3474 rt_hash_table = (struct rt_hash_bucket *)
3475 alloc_large_system_hash("IP route cache",
3476 sizeof(struct rt_hash_bucket),
3477 rhash_entries,
4481374c 3478 (totalram_pages >= 128 * 1024) ?
18955cfc 3479 15 : 17,
8d1502de 3480 0,
424c4b70
ED
3481 &rt_hash_log,
3482 &rt_hash_mask,
c9503e0f 3483 rhash_entries ? 0 : 512 * 1024);
22c047cc
ED
3484 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3485 rt_hash_lock_init();
1da177e4
LT
3486
3487 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3488 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3489
1da177e4
LT
3490 devinet_init();
3491 ip_fib_init();
3492
9f28a2fc
ED
3493 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3494 expires_ljiffies = jiffies;
3495 schedule_delayed_work(&expires_work,
3496 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3497
73b38711 3498 if (ip_rt_proc_init())
107f1634 3499 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3500#ifdef CONFIG_XFRM
3501 xfrm_init();
a33bc5c1 3502 xfrm4_init(ip_rt_max_size);
1da177e4 3503#endif
c7ac8679 3504 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3505
39a23e75
DL
3506#ifdef CONFIG_SYSCTL
3507 register_pernet_subsys(&sysctl_route_ops);
3508#endif
3ee94372 3509 register_pernet_subsys(&rt_genid_ops);
1da177e4
LT
3510 return rc;
3511}
3512
a1bc6eb4 3513#ifdef CONFIG_SYSCTL
eeb61f71
AV
3514/*
3515 * We really need to sanitize the damn ipv4 init order, then all
3516 * this nonsense will go away.
3517 */
3518void __init ip_static_sysctl_init(void)
3519{
2f4520d3 3520 register_sysctl_paths(ipv4_path, ipv4_skeleton);
eeb61f71 3521}
a1bc6eb4 3522#endif