ipv4: Make neigh lookups directly in output packet path.
[linux-2.6-block.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4 72#include <linux/mm.h>
424c4b70 73#include <linux/bootmem.h>
1da177e4
LT
74#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
39c90ece 83#include <linux/workqueue.h>
1da177e4 84#include <linux/skbuff.h>
1da177e4
LT
85#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
5a0e3ad6 94#include <linux/slab.h>
b9eda06f 95#include <linux/prefetch.h>
352e512c 96#include <net/dst.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
8d71740c 108#include <net/netevent.h>
63f3444f 109#include <net/rtnetlink.h>
1da177e4
LT
110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
7426a564 112#include <linux/kmemleak.h>
1da177e4 113#endif
6e5714ea 114#include <net/secure_seq.h>
1da177e4 115
68a5e3dd 116#define RT_FL_TOS(oldflp4) \
f61759e6 117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
118
119#define IP_MAX_MTU 0xFFF0
120
121#define RT_GC_TIMEOUT (300*HZ)
122
1da177e4 123static int ip_rt_max_size;
817bc4db 124static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
9f28a2fc 125static int ip_rt_gc_interval __read_mostly = 60 * HZ;
817bc4db
SH
126static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127static int ip_rt_redirect_number __read_mostly = 9;
128static int ip_rt_redirect_load __read_mostly = HZ / 50;
129static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130static int ip_rt_error_cost __read_mostly = HZ;
131static int ip_rt_error_burst __read_mostly = 5 * HZ;
132static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256;
1080d709 136static int rt_chain_length_max __read_mostly = 20;
1da177e4 137
9f28a2fc
ED
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140
1da177e4
LT
141/*
142 * Interface to generic destination cache.
143 */
144
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 146static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 147static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4
LT
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
569d3645 152static int rt_garbage_collect(struct dst_ops *ops);
1da177e4 153
72cdd1d9
ED
/* dst_ops ->ifdown hook: intentionally empty — this code keeps no
 * per-device state in an IPv4 dst entry that needs tearing down here. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
1da177e4 158
62fa8a84
DM
159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160{
06582540
DM
161 struct rtable *rt = (struct rtable *) dst;
162 struct inet_peer *peer;
163 u32 *p = NULL;
164
fbfe95a4 165 peer = rt_get_peer_create(rt, rt->rt_dst);
06582540 166 if (peer) {
62fa8a84
DM
167 u32 *old_p = __DST_METRICS_PTR(old);
168 unsigned long prev, new;
169
06582540
DM
170 p = peer->metrics;
171 if (inet_metrics_new(peer))
172 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
62fa8a84
DM
173
174 new = (unsigned long) p;
175 prev = cmpxchg(&dst->_metrics, old, new);
176
177 if (prev != old) {
62fa8a84
DM
178 p = __DST_METRICS_PTR(prev);
179 if (prev & DST_METRICS_READ_ONLY)
180 p = NULL;
181 } else {
62fa8a84
DM
182 if (rt->fi) {
183 fib_info_put(rt->fi);
184 rt->fi = NULL;
185 }
186 }
187 }
188 return p;
189}
190
d3aaeb38
DM
191static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
192
1da177e4
LT
193static struct dst_ops ipv4_dst_ops = {
194 .family = AF_INET,
09640e63 195 .protocol = cpu_to_be16(ETH_P_IP),
1da177e4
LT
196 .gc = rt_garbage_collect,
197 .check = ipv4_dst_check,
0dbaee3b 198 .default_advmss = ipv4_default_advmss,
ebb762f2 199 .mtu = ipv4_mtu,
62fa8a84 200 .cow_metrics = ipv4_cow_metrics,
1da177e4
LT
201 .destroy = ipv4_dst_destroy,
202 .ifdown = ipv4_dst_ifdown,
203 .negative_advice = ipv4_negative_advice,
204 .link_failure = ipv4_link_failure,
205 .update_pmtu = ip_rt_update_pmtu,
1ac06e03 206 .local_out = __ip_local_out,
d3aaeb38 207 .neigh_lookup = ipv4_neigh_lookup,
1da177e4
LT
208};
209
210#define ECN_OR_COST(class) TC_PRIO_##class
211
4839c52b 212const __u8 ip_tos2prio[16] = {
1da177e4 213 TC_PRIO_BESTEFFORT,
4a2b9c37 214 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
215 TC_PRIO_BESTEFFORT,
216 ECN_OR_COST(BESTEFFORT),
217 TC_PRIO_BULK,
218 ECN_OR_COST(BULK),
219 TC_PRIO_BULK,
220 ECN_OR_COST(BULK),
221 TC_PRIO_INTERACTIVE,
222 ECN_OR_COST(INTERACTIVE),
223 TC_PRIO_INTERACTIVE,
224 ECN_OR_COST(INTERACTIVE),
225 TC_PRIO_INTERACTIVE_BULK,
226 ECN_OR_COST(INTERACTIVE_BULK),
227 TC_PRIO_INTERACTIVE_BULK,
228 ECN_OR_COST(INTERACTIVE_BULK)
229};
d4a96865 230EXPORT_SYMBOL(ip_tos2prio);
1da177e4
LT
231
232/*
233 * Route cache.
234 */
235
236/* The locking scheme is rather straight forward:
237 *
238 * 1) Read-Copy Update protects the buckets of the central route hash.
239 * 2) Only writers remove entries, and they hold the lock
240 * as they look at rtable reference counts.
241 * 3) Only readers acquire references to rtable entries,
242 * they do so with atomic increments and with the
243 * lock held.
244 */
245
246struct rt_hash_bucket {
1c31720a 247 struct rtable __rcu *chain;
22c047cc 248};
1080d709 249
8a25d5de
IM
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a hash slot onto its (shared) spinlock. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialise the hashed spinlock pool; boot-fatal on OOM. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: bucket locking compiles away entirely. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
1da177e4 295
817bc4db 296static struct rt_hash_bucket *rt_hash_table __read_mostly;
95c96174 297static unsigned int rt_hash_mask __read_mostly;
817bc4db 298static unsigned int rt_hash_log __read_mostly;
1da177e4 299
2f970d83 300static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 301#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 302
b00180de 303static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
0eae88f3 304 int genid)
1da177e4 305{
0eae88f3 306 return jhash_3words((__force u32)daddr, (__force u32)saddr,
b00180de 307 idx, genid)
29e75252 308 & rt_hash_mask;
1da177e4
LT
309}
310
e84f84f2
DL
311static inline int rt_genid(struct net *net)
312{
313 return atomic_read(&net->ipv4.rt_genid);
314}
315
1da177e4
LT
316#ifdef CONFIG_PROC_FS
317struct rt_cache_iter_state {
a75e936f 318 struct seq_net_private p;
1da177e4 319 int bucket;
29e75252 320 int genid;
1da177e4
LT
321};
322
1218854a 323static struct rtable *rt_cache_get_first(struct seq_file *seq)
1da177e4 324{
1218854a 325 struct rt_cache_iter_state *st = seq->private;
1da177e4 326 struct rtable *r = NULL;
1da177e4
LT
327
328 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
33d480ce 329 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
a6272665 330 continue;
1da177e4 331 rcu_read_lock_bh();
a898def2 332 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
29e75252 333 while (r) {
d8d1f30b 334 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
a75e936f 335 r->rt_genid == st->genid)
29e75252 336 return r;
d8d1f30b 337 r = rcu_dereference_bh(r->dst.rt_next);
29e75252 338 }
1da177e4
LT
339 rcu_read_unlock_bh();
340 }
29e75252 341 return r;
1da177e4
LT
342}
343
1218854a 344static struct rtable *__rt_cache_get_next(struct seq_file *seq,
642d6318 345 struct rtable *r)
1da177e4 346{
1218854a 347 struct rt_cache_iter_state *st = seq->private;
a6272665 348
1c31720a 349 r = rcu_dereference_bh(r->dst.rt_next);
1da177e4
LT
350 while (!r) {
351 rcu_read_unlock_bh();
a6272665
ED
352 do {
353 if (--st->bucket < 0)
354 return NULL;
33d480ce 355 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
1da177e4 356 rcu_read_lock_bh();
1c31720a 357 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
1da177e4 358 }
1c31720a 359 return r;
1da177e4
LT
360}
361
1218854a 362static struct rtable *rt_cache_get_next(struct seq_file *seq,
642d6318
DL
363 struct rtable *r)
364{
1218854a
YH
365 struct rt_cache_iter_state *st = seq->private;
366 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
d8d1f30b 367 if (dev_net(r->dst.dev) != seq_file_net(seq))
a75e936f 368 continue;
642d6318
DL
369 if (r->rt_genid == st->genid)
370 break;
371 }
372 return r;
373}
374
1218854a 375static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 376{
1218854a 377 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
378
379 if (r)
1218854a 380 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
381 --pos;
382 return pos ? NULL : r;
383}
384
385static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
386{
29e75252 387 struct rt_cache_iter_state *st = seq->private;
29e75252 388 if (*pos)
1218854a 389 return rt_cache_get_idx(seq, *pos - 1);
e84f84f2 390 st->genid = rt_genid(seq_file_net(seq));
29e75252 391 return SEQ_START_TOKEN;
1da177e4
LT
392}
393
394static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
395{
29e75252 396 struct rtable *r;
1da177e4
LT
397
398 if (v == SEQ_START_TOKEN)
1218854a 399 r = rt_cache_get_first(seq);
1da177e4 400 else
1218854a 401 r = rt_cache_get_next(seq, v);
1da177e4
LT
402 ++*pos;
403 return r;
404}
405
406static void rt_cache_seq_stop(struct seq_file *seq, void *v)
407{
408 if (v && v != SEQ_START_TOKEN)
409 rcu_read_unlock_bh();
410}
411
412static int rt_cache_seq_show(struct seq_file *seq, void *v)
413{
414 if (v == SEQ_START_TOKEN)
415 seq_printf(seq, "%-127s\n",
416 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
417 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
418 "HHUptod\tSpecDst");
419 else {
420 struct rtable *r = v;
69cce1d1 421 struct neighbour *n;
218fa90f 422 int len, HHUptod;
1da177e4 423
218fa90f 424 rcu_read_lock();
27217455 425 n = dst_get_neighbour_noref(&r->dst);
218fa90f
ED
426 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
427 rcu_read_unlock();
428
0eae88f3
ED
429 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
430 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
d8d1f30b 431 r->dst.dev ? r->dst.dev->name : "*",
0eae88f3
ED
432 (__force u32)r->rt_dst,
433 (__force u32)r->rt_gateway,
d8d1f30b
CG
434 r->rt_flags, atomic_read(&r->dst.__refcnt),
435 r->dst.__use, 0, (__force u32)r->rt_src,
0dbaee3b 436 dst_metric_advmss(&r->dst) + 40,
d8d1f30b
CG
437 dst_metric(&r->dst, RTAX_WINDOW),
438 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
439 dst_metric(&r->dst, RTAX_RTTVAR)),
475949d8 440 r->rt_key_tos,
f6b72b62 441 -1,
218fa90f 442 HHUptod,
41347dcd 443 0, &len);
5e659e4c
PE
444
445 seq_printf(seq, "%*s\n", 127 - len, "");
e905a9ed
YH
446 }
447 return 0;
1da177e4
LT
448}
449
f690808e 450static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
451 .start = rt_cache_seq_start,
452 .next = rt_cache_seq_next,
453 .stop = rt_cache_seq_stop,
454 .show = rt_cache_seq_show,
455};
456
457static int rt_cache_seq_open(struct inode *inode, struct file *file)
458{
a75e936f 459 return seq_open_net(inode, file, &rt_cache_seq_ops,
cf7732e4 460 sizeof(struct rt_cache_iter_state));
1da177e4
LT
461}
462
9a32144e 463static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
464 .owner = THIS_MODULE,
465 .open = rt_cache_seq_open,
466 .read = seq_read,
467 .llseek = seq_lseek,
a75e936f 468 .release = seq_release_net,
1da177e4
LT
469};
470
471
472static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
473{
474 int cpu;
475
476 if (*pos == 0)
477 return SEQ_START_TOKEN;
478
0f23174a 479 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
480 if (!cpu_possible(cpu))
481 continue;
482 *pos = cpu+1;
2f970d83 483 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
484 }
485 return NULL;
486}
487
488static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
489{
490 int cpu;
491
0f23174a 492 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
493 if (!cpu_possible(cpu))
494 continue;
495 *pos = cpu+1;
2f970d83 496 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
497 }
498 return NULL;
e905a9ed 499
1da177e4
LT
500}
501
/* seq_file ->stop: nothing to release for the per-CPU stats walk. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
506
507static int rt_cpu_seq_show(struct seq_file *seq, void *v)
508{
509 struct rt_cache_stat *st = v;
510
511 if (v == SEQ_START_TOKEN) {
5bec0039 512 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
513 return 0;
514 }
e905a9ed 515
1da177e4
LT
516 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
517 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 518 dst_entries_get_slow(&ipv4_dst_ops),
1da177e4
LT
519 st->in_hit,
520 st->in_slow_tot,
521 st->in_slow_mc,
522 st->in_no_route,
523 st->in_brd,
524 st->in_martian_dst,
525 st->in_martian_src,
526
527 st->out_hit,
528 st->out_slow_tot,
e905a9ed 529 st->out_slow_mc,
1da177e4
LT
530
531 st->gc_total,
532 st->gc_ignored,
533 st->gc_goal_miss,
534 st->gc_dst_overflow,
535 st->in_hlist_search,
536 st->out_hlist_search
537 );
538 return 0;
539}
540
f690808e 541static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
542 .start = rt_cpu_seq_start,
543 .next = rt_cpu_seq_next,
544 .stop = rt_cpu_seq_stop,
545 .show = rt_cpu_seq_show,
546};
547
548
549static int rt_cpu_seq_open(struct inode *inode, struct file *file)
550{
551 return seq_open(file, &rt_cpu_seq_ops);
552}
553
9a32144e 554static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
555 .owner = THIS_MODULE,
556 .open = rt_cpu_seq_open,
557 .read = seq_read,
558 .llseek = seq_lseek,
559 .release = seq_release,
560};
561
#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: dump per-realm byte/packet accounting, summed
 * across all possible CPUs into a temporary 256-entry table. */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	/* Binary dump, consumed as raw struct ip_rt_acct records. */
	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
107f1634 600
73b38711 601static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
602{
603 struct proc_dir_entry *pde;
604
605 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
606 &rt_cache_seq_fops);
607 if (!pde)
608 goto err1;
609
77020720
WC
610 pde = proc_create("rt_cache", S_IRUGO,
611 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
612 if (!pde)
613 goto err2;
614
c7066f70 615#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 616 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
617 if (!pde)
618 goto err3;
619#endif
620 return 0;
621
c7066f70 622#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
623err3:
624 remove_proc_entry("rt_cache", net->proc_net_stat);
625#endif
626err2:
627 remove_proc_entry("rt_cache", net->proc_net);
628err1:
629 return -ENOMEM;
630}
73b38711
DL
631
632static void __net_exit ip_rt_do_proc_exit(struct net *net)
633{
634 remove_proc_entry("rt_cache", net->proc_net_stat);
635 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 636#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 637 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 638#endif
73b38711
DL
639}
640
641static struct pernet_operations ip_rt_proc_ops __net_initdata = {
642 .init = ip_rt_do_proc_init,
643 .exit = ip_rt_do_proc_exit,
644};
645
646static int __init ip_rt_proc_init(void)
647{
648 return register_pernet_subsys(&ip_rt_proc_ops);
649}
650
107f1634 651#else
73b38711 652static inline int ip_rt_proc_init(void)
107f1634
PE
653{
654 return 0;
655}
1da177e4 656#endif /* CONFIG_PROC_FS */
e905a9ed 657
5969f71d 658static inline void rt_free(struct rtable *rt)
1da177e4 659{
d8d1f30b 660 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1da177e4
LT
661}
662
5969f71d 663static inline void rt_drop(struct rtable *rt)
1da177e4 664{
1da177e4 665 ip_rt_put(rt);
d8d1f30b 666 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1da177e4
LT
667}
668
5969f71d 669static inline int rt_fast_clean(struct rtable *rth)
1da177e4
LT
670{
671 /* Kill broadcast/multicast entries very aggresively, if they
672 collide in hash table with more useful entries */
673 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
c7537967 674 rt_is_input_route(rth) && rth->dst.rt_next;
1da177e4
LT
675}
676
5969f71d 677static inline int rt_valuable(struct rtable *rth)
1da177e4
LT
678{
679 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
97bab73f 680 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
1da177e4
LT
681}
682
683static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
684{
685 unsigned long age;
686 int ret = 0;
687
d8d1f30b 688 if (atomic_read(&rth->dst.__refcnt))
1da177e4
LT
689 goto out;
690
d8d1f30b 691 age = jiffies - rth->dst.lastuse;
1da177e4
LT
692 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
693 (age <= tmo2 && rt_valuable(rth)))
694 goto out;
695 ret = 1;
696out: return ret;
697}
698
699/* Bits of score are:
700 * 31: very valuable
701 * 30: not quite useless
702 * 29..0: usage counter
703 */
704static inline u32 rt_score(struct rtable *rt)
705{
d8d1f30b 706 u32 score = jiffies - rt->dst.lastuse;
1da177e4
LT
707
708 score = ~score & ~(3<<30);
709
710 if (rt_valuable(rt))
711 score |= (1<<31);
712
c7537967 713 if (rt_is_output_route(rt) ||
1da177e4
LT
714 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
715 score |= (1<<30);
716
717 return score;
718}
719
1080d709
NH
720static inline bool rt_caching(const struct net *net)
721{
722 return net->ipv4.current_rt_cache_rebuild_count <=
723 net->ipv4.sysctl_rt_cache_rebuild_count;
724}
725
5e2b61f7
DM
726static inline bool compare_hash_inputs(const struct rtable *rt1,
727 const struct rtable *rt2)
1080d709 728{
5e2b61f7
DM
729 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
97a80410 731 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
1080d709
NH
732}
733
5e2b61f7 734static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
1da177e4 735{
5e2b61f7
DM
736 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
737 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
738 (rt1->rt_mark ^ rt2->rt_mark) |
475949d8 739 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
d547f727 740 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
97a80410 741 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
1da177e4
LT
742}
743
b5921910
DL
744static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
745{
d8d1f30b 746 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
b5921910
DL
747}
748
e84f84f2
DL
749static inline int rt_is_expired(struct rtable *rth)
750{
d8d1f30b 751 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
e84f84f2
DL
752}
753
beb659bd
ED
754/*
755 * Perform a full scan of hash table and free all entries.
756 * Can be called by a softirq or a process.
757 * In the later case, we want to be reschedule if necessary
758 */
6561a3b1 759static void rt_do_flush(struct net *net, int process_context)
beb659bd
ED
760{
761 unsigned int i;
762 struct rtable *rth, *next;
763
764 for (i = 0; i <= rt_hash_mask; i++) {
6561a3b1
DM
765 struct rtable __rcu **pprev;
766 struct rtable *list;
767
beb659bd
ED
768 if (process_context && need_resched())
769 cond_resched();
33d480ce 770 rth = rcu_access_pointer(rt_hash_table[i].chain);
beb659bd
ED
771 if (!rth)
772 continue;
773
774 spin_lock_bh(rt_hash_lock_addr(i));
32cb5b4e 775
6561a3b1
DM
776 list = NULL;
777 pprev = &rt_hash_table[i].chain;
778 rth = rcu_dereference_protected(*pprev,
1c31720a 779 lockdep_is_held(rt_hash_lock_addr(i)));
32cb5b4e 780
6561a3b1
DM
781 while (rth) {
782 next = rcu_dereference_protected(rth->dst.rt_next,
1c31720a 783 lockdep_is_held(rt_hash_lock_addr(i)));
6561a3b1
DM
784
785 if (!net ||
786 net_eq(dev_net(rth->dst.dev), net)) {
787 rcu_assign_pointer(*pprev, next);
788 rcu_assign_pointer(rth->dst.rt_next, list);
789 list = rth;
32cb5b4e 790 } else {
6561a3b1 791 pprev = &rth->dst.rt_next;
32cb5b4e 792 }
6561a3b1 793 rth = next;
32cb5b4e 794 }
6561a3b1 795
beb659bd
ED
796 spin_unlock_bh(rt_hash_lock_addr(i));
797
6561a3b1
DM
798 for (; list; list = next) {
799 next = rcu_dereference_protected(list->dst.rt_next, 1);
800 rt_free(list);
beb659bd
ED
801 }
802 }
803}
804
1080d709
NH
805/*
806 * While freeing expired entries, we compute average chain length
807 * and standard deviation, using fixed-point arithmetic.
808 * This to have an estimation of rt_chain_length_max
809 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
810 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
811 */
812
813#define FRACT_BITS 3
814#define ONE (1UL << FRACT_BITS)
815
98376387
ED
816/*
817 * Given a hash chain and an item in this hash chain,
818 * find if a previous entry has the same hash_inputs
819 * (but differs on tos, mark or oif)
820 * Returns 0 if an alias is found.
821 * Returns ONE if rth has no alias before itself.
822 */
823static int has_noalias(const struct rtable *head, const struct rtable *rth)
824{
825 const struct rtable *aux = head;
826
827 while (aux != rth) {
5e2b61f7 828 if (compare_hash_inputs(aux, rth))
98376387 829 return 0;
1c31720a 830 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
98376387
ED
831 }
832 return ONE;
833}
834
9f28a2fc
ED
835static void rt_check_expire(void)
836{
837 static unsigned int rover;
838 unsigned int i = rover, goal;
839 struct rtable *rth;
840 struct rtable __rcu **rthp;
841 unsigned long samples = 0;
842 unsigned long sum = 0, sum2 = 0;
843 unsigned long delta;
844 u64 mult;
845
846 delta = jiffies - expires_ljiffies;
847 expires_ljiffies = jiffies;
848 mult = ((u64)delta) << rt_hash_log;
849 if (ip_rt_gc_timeout > 1)
850 do_div(mult, ip_rt_gc_timeout);
851 goal = (unsigned int)mult;
852 if (goal > rt_hash_mask)
853 goal = rt_hash_mask + 1;
854 for (; goal > 0; goal--) {
855 unsigned long tmo = ip_rt_gc_timeout;
856 unsigned long length;
857
858 i = (i + 1) & rt_hash_mask;
859 rthp = &rt_hash_table[i].chain;
860
861 if (need_resched())
862 cond_resched();
863
864 samples++;
865
866 if (rcu_dereference_raw(*rthp) == NULL)
867 continue;
868 length = 0;
869 spin_lock_bh(rt_hash_lock_addr(i));
870 while ((rth = rcu_dereference_protected(*rthp,
871 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
872 prefetch(rth->dst.rt_next);
df67e6c9
DM
873 if (rt_is_expired(rth) ||
874 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
9f28a2fc
ED
875 *rthp = rth->dst.rt_next;
876 rt_free(rth);
877 continue;
878 }
df67e6c9
DM
879
880 /* We only count entries on a chain with equal
881 * hash inputs once so that entries for
882 * different QOS levels, and other non-hash
883 * input attributes don't unfairly skew the
884 * length computation
885 */
886 tmo >>= 1;
887 rthp = &rth->dst.rt_next;
888 length += has_noalias(rt_hash_table[i].chain, rth);
9f28a2fc
ED
889 }
890 spin_unlock_bh(rt_hash_lock_addr(i));
891 sum += length;
892 sum2 += length*length;
893 }
894 if (samples) {
895 unsigned long avg = sum / samples;
896 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
897 rt_chain_length_max = max_t(unsigned long,
898 ip_rt_gc_elasticity,
899 (avg + 4*sd) >> FRACT_BITS);
900 }
901 rover = i;
902}
903
904/*
905 * rt_worker_func() is run in process context.
906 * we call rt_check_expire() to scan part of the hash table
907 */
908static void rt_worker_func(struct work_struct *work)
909{
910 rt_check_expire();
911 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
912}
913
29e75252 914/*
25985edc 915 * Perturbation of rt_genid by a small quantity [1..256]
29e75252
ED
916 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
917 * many times (2^24) without giving recent rt_genid.
918 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
1da177e4 919 */
86c657f6 920static void rt_cache_invalidate(struct net *net)
1da177e4 921{
29e75252 922 unsigned char shuffle;
1da177e4 923
29e75252 924 get_random_bytes(&shuffle, sizeof(shuffle));
e84f84f2 925 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
b48c80ec 926 inetpeer_invalidate_family(AF_INET);
1da177e4
LT
927}
928
29e75252
ED
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
939
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
945
1080d709
NH
/* Hash chains grew past rt_chain_length_max: warn (ratelimited) and
 * invalidate the whole cache so it rebuilds with a new generation. */
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
951
1da177e4
LT
952/*
953 Short description of GC goals.
954
955 We want to build algorithm, which will keep routing cache
956 at some equilibrium point, when number of aged off entries
957 is kept approximately equal to newly generated ones.
958
959 Current expiration strength is variable "expire".
960 We try to adjust it dynamically, so that if networking
961 is idle expires is large enough to keep enough of warm entries,
962 and when load increases it reduces to limit cache size.
963 */
964
569d3645 965static int rt_garbage_collect(struct dst_ops *ops)
1da177e4
LT
966{
967 static unsigned long expire = RT_GC_TIMEOUT;
968 static unsigned long last_gc;
969 static int rover;
970 static int equilibrium;
1c31720a
ED
971 struct rtable *rth;
972 struct rtable __rcu **rthp;
1da177e4
LT
973 unsigned long now = jiffies;
974 int goal;
fc66f95c 975 int entries = dst_entries_get_fast(&ipv4_dst_ops);
1da177e4
LT
976
977 /*
978 * Garbage collection is pretty expensive,
979 * do not make it too frequently.
980 */
981
982 RT_CACHE_STAT_INC(gc_total);
983
984 if (now - last_gc < ip_rt_gc_min_interval &&
fc66f95c 985 entries < ip_rt_max_size) {
1da177e4
LT
986 RT_CACHE_STAT_INC(gc_ignored);
987 goto out;
988 }
989
fc66f95c 990 entries = dst_entries_get_slow(&ipv4_dst_ops);
1da177e4 991 /* Calculate number of entries, which we want to expire now. */
fc66f95c 992 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1da177e4
LT
993 if (goal <= 0) {
994 if (equilibrium < ipv4_dst_ops.gc_thresh)
995 equilibrium = ipv4_dst_ops.gc_thresh;
fc66f95c 996 goal = entries - equilibrium;
1da177e4 997 if (goal > 0) {
b790cedd 998 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
fc66f95c 999 goal = entries - equilibrium;
1da177e4
LT
1000 }
1001 } else {
1002 /* We are in dangerous area. Try to reduce cache really
1003 * aggressively.
1004 */
b790cedd 1005 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
fc66f95c 1006 equilibrium = entries - goal;
1da177e4
LT
1007 }
1008
1009 if (now - last_gc >= ip_rt_gc_min_interval)
1010 last_gc = now;
1011
1012 if (goal <= 0) {
1013 equilibrium += goal;
1014 goto work_done;
1015 }
1016
1017 do {
1018 int i, k;
1019
1020 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1021 unsigned long tmo = expire;
1022
1023 k = (k + 1) & rt_hash_mask;
1024 rthp = &rt_hash_table[k].chain;
22c047cc 1025 spin_lock_bh(rt_hash_lock_addr(k));
1c31720a
ED
1026 while ((rth = rcu_dereference_protected(*rthp,
1027 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
e84f84f2 1028 if (!rt_is_expired(rth) &&
29e75252 1029 !rt_may_expire(rth, tmo, expire)) {
1da177e4 1030 tmo >>= 1;
d8d1f30b 1031 rthp = &rth->dst.rt_next;
1da177e4
LT
1032 continue;
1033 }
d8d1f30b 1034 *rthp = rth->dst.rt_next;
1da177e4
LT
1035 rt_free(rth);
1036 goal--;
1da177e4 1037 }
22c047cc 1038 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
1039 if (goal <= 0)
1040 break;
1041 }
1042 rover = k;
1043
1044 if (goal <= 0)
1045 goto work_done;
1046
1047 /* Goal is not achieved. We stop process if:
1048
1049 - if expire reduced to zero. Otherwise, expire is halfed.
1050 - if table is not full.
1051 - if we are called from interrupt.
1052 - jiffies check is just fallback/debug loop breaker.
1053 We will not spin here for long time in any case.
1054 */
1055
1056 RT_CACHE_STAT_INC(gc_goal_miss);
1057
1058 if (expire == 0)
1059 break;
1060
1061 expire >>= 1;
1da177e4 1062
fc66f95c 1063 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1da177e4
LT
1064 goto out;
1065 } while (!in_softirq() && time_before_eq(jiffies, now));
1066
fc66f95c
ED
1067 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1068 goto out;
1069 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1da177e4 1070 goto out;
e87cc472 1071 net_warn_ratelimited("dst cache overflow\n");
1da177e4
LT
1072 RT_CACHE_STAT_INC(gc_dst_overflow);
1073 return 1;
1074
1075work_done:
1076 expire += ip_rt_gc_min_interval;
1077 if (expire > ip_rt_gc_timeout ||
fc66f95c
ED
1078 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1079 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1da177e4 1080 expire = ip_rt_gc_timeout;
1da177e4
LT
1081out: return 0;
1082}
1083
98376387
ED
1084/*
1085 * Returns number of entries in a hash chain that have different hash_inputs
1086 */
1087static int slow_chain_length(const struct rtable *head)
1088{
1089 int length = 0;
1090 const struct rtable *rth = head;
1091
1092 while (rth) {
1093 length += has_noalias(head, rth);
1c31720a 1094 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
98376387
ED
1095 }
1096 return length >> FRACT_BITS;
1097}
1098
d3aaeb38 1099static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
3769cffb 1100{
d3aaeb38
DM
1101 struct net_device *dev = dst->dev;
1102 const __be32 *pkey = daddr;
39232973 1103 const struct rtable *rt;
3769cffb
DM
1104 struct neighbour *n;
1105
39232973 1106 rt = (const struct rtable *) dst;
a263b309 1107 if (rt->rt_gateway)
39232973 1108 pkey = (const __be32 *) &rt->rt_gateway;
d3aaeb38 1109
80703d26 1110 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
1111 if (n)
1112 return n;
32092ecf 1113 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
1114}
1115
1116static int rt_bind_neighbour(struct rtable *rt)
1117{
1118 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
3769cffb
DM
1119 if (IS_ERR(n))
1120 return PTR_ERR(n);
69cce1d1 1121 dst_set_neighbour(&rt->dst, n);
3769cffb
DM
1122
1123 return 0;
1124}
1125
95c96174 1126static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
b23dd4fe 1127 struct sk_buff *skb, int ifindex)
1da177e4 1128{
1c31720a
ED
1129 struct rtable *rth, *cand;
1130 struct rtable __rcu **rthp, **candp;
1da177e4 1131 unsigned long now;
1da177e4
LT
1132 u32 min_score;
1133 int chain_length;
1134 int attempts = !in_softirq();
1135
1136restart:
1137 chain_length = 0;
1138 min_score = ~(u32)0;
1139 cand = NULL;
1140 candp = NULL;
1141 now = jiffies;
1142
7586eceb 1143 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
73e42897
NH
1144 /*
1145 * If we're not caching, just tell the caller we
1146 * were successful and don't touch the route. The
1147 * caller hold the sole reference to the cache entry, and
1148 * it will be released when the caller is done with it.
1149 * If we drop it here, the callers have no way to resolve routes
1150 * when we're not caching. Instead, just point *rp at rt, so
1151 * the caller gets a single use out of the route
b6280b47
NH
1152 * Note that we do rt_free on this new route entry, so that
1153 * once its refcount hits zero, we are still able to reap it
1154 * (Thanks Alexey)
27b75c95
ED
1155 * Note: To avoid expensive rcu stuff for this uncached dst,
1156 * we set DST_NOCACHE so that dst_release() can free dst without
1157 * waiting a grace period.
73e42897 1158 */
b6280b47 1159
c7d4426a 1160 rt->dst.flags |= DST_NOCACHE;
c7537967 1161 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
3769cffb 1162 int err = rt_bind_neighbour(rt);
b6280b47 1163 if (err) {
e87cc472 1164 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
27b75c95 1165 ip_rt_put(rt);
b23dd4fe 1166 return ERR_PTR(err);
b6280b47
NH
1167 }
1168 }
1169
b6280b47 1170 goto skip_hashing;
1080d709
NH
1171 }
1172
1da177e4
LT
1173 rthp = &rt_hash_table[hash].chain;
1174
22c047cc 1175 spin_lock_bh(rt_hash_lock_addr(hash));
1c31720a
ED
1176 while ((rth = rcu_dereference_protected(*rthp,
1177 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1178 if (rt_is_expired(rth)) {
d8d1f30b 1179 *rthp = rth->dst.rt_next;
29e75252
ED
1180 rt_free(rth);
1181 continue;
1182 }
5e2b61f7 1183 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1da177e4 1184 /* Put it first */
d8d1f30b 1185 *rthp = rth->dst.rt_next;
1da177e4
LT
1186 /*
1187 * Since lookup is lockfree, the deletion
1188 * must be visible to another weakly ordered CPU before
1189 * the insertion at the start of the hash chain.
1190 */
d8d1f30b 1191 rcu_assign_pointer(rth->dst.rt_next,
1da177e4
LT
1192 rt_hash_table[hash].chain);
1193 /*
1194 * Since lookup is lockfree, the update writes
1195 * must be ordered for consistency on SMP.
1196 */
1197 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1198
d8d1f30b 1199 dst_use(&rth->dst, now);
22c047cc 1200 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1201
1202 rt_drop(rt);
b23dd4fe 1203 if (skb)
d8d1f30b 1204 skb_dst_set(skb, &rth->dst);
b23dd4fe 1205 return rth;
1da177e4
LT
1206 }
1207
d8d1f30b 1208 if (!atomic_read(&rth->dst.__refcnt)) {
1da177e4
LT
1209 u32 score = rt_score(rth);
1210
1211 if (score <= min_score) {
1212 cand = rth;
1213 candp = rthp;
1214 min_score = score;
1215 }
1216 }
1217
1218 chain_length++;
1219
d8d1f30b 1220 rthp = &rth->dst.rt_next;
1da177e4
LT
1221 }
1222
1223 if (cand) {
1224 /* ip_rt_gc_elasticity used to be average length of chain
1225 * length, when exceeded gc becomes really aggressive.
1226 *
1227 * The second limit is less certain. At the moment it allows
1228 * only 2 entries per bucket. We will see.
1229 */
1230 if (chain_length > ip_rt_gc_elasticity) {
d8d1f30b 1231 *candp = cand->dst.rt_next;
1da177e4
LT
1232 rt_free(cand);
1233 }
1080d709 1234 } else {
98376387
ED
1235 if (chain_length > rt_chain_length_max &&
1236 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
d8d1f30b 1237 struct net *net = dev_net(rt->dst.dev);
1080d709 1238 int num = ++net->ipv4.current_rt_cache_rebuild_count;
b35ecb5d 1239 if (!rt_caching(net)) {
058bd4d2 1240 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
d8d1f30b 1241 rt->dst.dev->name, num);
1080d709 1242 }
b35ecb5d 1243 rt_emergency_hash_rebuild(net);
6a2bad70
PE
1244 spin_unlock_bh(rt_hash_lock_addr(hash));
1245
5e2b61f7 1246 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
6a2bad70
PE
1247 ifindex, rt_genid(net));
1248 goto restart;
1080d709 1249 }
1da177e4
LT
1250 }
1251
1252 /* Try to bind route to arp only if it is output
1253 route or unicast forwarding path.
1254 */
c7537967 1255 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
3769cffb 1256 int err = rt_bind_neighbour(rt);
1da177e4 1257 if (err) {
22c047cc 1258 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1259
1260 if (err != -ENOBUFS) {
1261 rt_drop(rt);
b23dd4fe 1262 return ERR_PTR(err);
1da177e4
LT
1263 }
1264
1265 /* Neighbour tables are full and nothing
1266 can be released. Try to shrink route cache,
1267 it is most likely it holds some neighbour records.
1268 */
1269 if (attempts-- > 0) {
1270 int saved_elasticity = ip_rt_gc_elasticity;
1271 int saved_int = ip_rt_gc_min_interval;
1272 ip_rt_gc_elasticity = 1;
1273 ip_rt_gc_min_interval = 0;
569d3645 1274 rt_garbage_collect(&ipv4_dst_ops);
1da177e4
LT
1275 ip_rt_gc_min_interval = saved_int;
1276 ip_rt_gc_elasticity = saved_elasticity;
1277 goto restart;
1278 }
1279
e87cc472 1280 net_warn_ratelimited("Neighbour table overflow\n");
1da177e4 1281 rt_drop(rt);
b23dd4fe 1282 return ERR_PTR(-ENOBUFS);
1da177e4
LT
1283 }
1284 }
1285
d8d1f30b 1286 rt->dst.rt_next = rt_hash_table[hash].chain;
1080d709 1287
00269b54
ED
1288 /*
1289 * Since lookup is lockfree, we must make sure
25985edc 1290 * previous writes to rt are committed to memory
00269b54
ED
1291 * before making rt visible to other CPUS.
1292 */
1ddbcb00 1293 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1080d709 1294
22c047cc 1295 spin_unlock_bh(rt_hash_lock_addr(hash));
73e42897 1296
b6280b47 1297skip_hashing:
b23dd4fe 1298 if (skb)
d8d1f30b 1299 skb_dst_set(skb, &rt->dst);
b23dd4fe 1300 return rt;
1da177e4
LT
1301}
1302
6431cbc2
DM
1303static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1304
1305static u32 rt_peer_genid(void)
1306{
1307 return atomic_read(&__rt_peer_genid);
1308}
1309
a48eff12 1310void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1da177e4 1311{
97bab73f 1312 struct inet_peer_base *base;
1da177e4
LT
1313 struct inet_peer *peer;
1314
97bab73f
DM
1315 base = inetpeer_base_ptr(rt->_peer);
1316 if (!base)
1317 return;
1318
1319 peer = inet_getpeer_v4(base, daddr, create);
7b34ca2a
DM
1320 if (peer) {
1321 if (!rt_set_peer(rt, peer))
1322 inet_putpeer(peer);
1323 else
1324 rt->rt_peer_genid = rt_peer_genid();
1325 }
1da177e4
LT
1326}
1327
1328/*
1329 * Peer allocation may fail only in serious out-of-memory conditions. However
1330 * we still can generate some output.
1331 * Random ID selection looks a bit dangerous because we have no chances to
1332 * select ID being unique in a reasonable period of time.
1333 * But broken packet identifier may be better than no packet at all.
1334 */
1335static void ip_select_fb_ident(struct iphdr *iph)
1336{
1337 static DEFINE_SPINLOCK(ip_fb_id_lock);
1338 static u32 ip_fallback_id;
1339 u32 salt;
1340
1341 spin_lock_bh(&ip_fb_id_lock);
e448515c 1342 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1343 iph->id = htons(salt & 0xFFFF);
1344 ip_fallback_id = salt;
1345 spin_unlock_bh(&ip_fb_id_lock);
1346}
1347
1348void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1349{
1350 struct rtable *rt = (struct rtable *) dst;
1351
e688a604 1352 if (rt && !(rt->dst.flags & DST_NOPEER)) {
fbfe95a4 1353 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1da177e4
LT
1354
1355 /* If peer is attached to destination, it is never detached,
1356 so that we need not to grab a lock to dereference it.
1357 */
fbfe95a4
DM
1358 if (peer) {
1359 iph->id = htons(inet_getid(peer, more));
1da177e4
LT
1360 return;
1361 }
e688a604 1362 } else if (!rt)
91df42be 1363 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1da177e4
LT
1364
1365 ip_select_fb_ident(iph);
1366}
4bc2f18b 1367EXPORT_SYMBOL(__ip_select_ident);
1da177e4 1368
95c96174 1369static void rt_del(unsigned int hash, struct rtable *rt)
1da177e4 1370{
1c31720a
ED
1371 struct rtable __rcu **rthp;
1372 struct rtable *aux;
1da177e4 1373
29e75252 1374 rthp = &rt_hash_table[hash].chain;
22c047cc 1375 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1376 ip_rt_put(rt);
1c31720a
ED
1377 while ((aux = rcu_dereference_protected(*rthp,
1378 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1379 if (aux == rt || rt_is_expired(aux)) {
d8d1f30b 1380 *rthp = aux->dst.rt_next;
29e75252
ED
1381 rt_free(aux);
1382 continue;
1da177e4 1383 }
d8d1f30b 1384 rthp = &aux->dst.rt_next;
29e75252 1385 }
22c047cc 1386 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1387}
1388
de398fb8 1389static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
9cc20b26
ED
1390{
1391 struct rtable *rt = (struct rtable *) dst;
1392 __be32 orig_gw = rt->rt_gateway;
1393 struct neighbour *n, *old_n;
1394
1395 dst_confirm(&rt->dst);
1396
1397 rt->rt_gateway = peer->redirect_learned.a4;
1398
1399 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
de398fb8
DM
1400 if (IS_ERR(n)) {
1401 rt->rt_gateway = orig_gw;
1402 return;
1403 }
9cc20b26
ED
1404 old_n = xchg(&rt->dst._neighbour, n);
1405 if (old_n)
1406 neigh_release(old_n);
de398fb8
DM
1407 if (!(n->nud_state & NUD_VALID)) {
1408 neigh_event_send(n, NULL);
9cc20b26
ED
1409 } else {
1410 rt->rt_flags |= RTCF_REDIRECTED;
1411 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1412 }
9cc20b26
ED
1413}
1414
ed7865a4 1415/* called in rcu_read_lock() section */
f7655229
AV
1416void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1417 __be32 saddr, struct net_device *dev)
1da177e4 1418{
7cc9150e 1419 int s, i;
ed7865a4 1420 struct in_device *in_dev = __in_dev_get_rcu(dev);
7cc9150e
FL
1421 __be32 skeys[2] = { saddr, 0 };
1422 int ikeys[2] = { dev->ifindex, 0 };
f39925db 1423 struct inet_peer *peer;
317805b8 1424 struct net *net;
1da177e4 1425
1da177e4
LT
1426 if (!in_dev)
1427 return;
1428
c346dca1 1429 net = dev_net(dev);
9d4fb27d
JP
1430 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1431 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1432 ipv4_is_zeronet(new_gw))
1da177e4
LT
1433 goto reject_redirect;
1434
1435 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1436 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1437 goto reject_redirect;
1438 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1439 goto reject_redirect;
1440 } else {
317805b8 1441 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
1442 goto reject_redirect;
1443 }
1444
7cc9150e
FL
1445 for (s = 0; s < 2; s++) {
1446 for (i = 0; i < 2; i++) {
9cc20b26
ED
1447 unsigned int hash;
1448 struct rtable __rcu **rthp;
1449 struct rtable *rt;
1450
1451 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1452
1453 rthp = &rt_hash_table[hash].chain;
1454
1455 while ((rt = rcu_dereference(*rthp)) != NULL) {
1456 rthp = &rt->dst.rt_next;
1457
1458 if (rt->rt_key_dst != daddr ||
1459 rt->rt_key_src != skeys[s] ||
1460 rt->rt_oif != ikeys[i] ||
1461 rt_is_input_route(rt) ||
1462 rt_is_expired(rt) ||
1463 !net_eq(dev_net(rt->dst.dev), net) ||
1464 rt->dst.error ||
1465 rt->dst.dev != dev ||
1466 rt->rt_gateway != old_gw)
1467 continue;
e905a9ed 1468
fbfe95a4 1469 peer = rt_get_peer_create(rt, rt->rt_dst);
9cc20b26 1470 if (peer) {
ac3f48de 1471 if (peer->redirect_learned.a4 != new_gw) {
9cc20b26
ED
1472 peer->redirect_learned.a4 = new_gw;
1473 atomic_inc(&__rt_peer_genid);
1474 }
1475 check_peer_redir(&rt->dst, peer);
1476 }
7cc9150e 1477 }
7cc9150e 1478 }
1da177e4 1479 }
1da177e4
LT
1480 return;
1481
1482reject_redirect:
1483#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
1484 if (IN_DEV_LOG_MARTIANS(in_dev))
1485 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1486 " Advised path = %pI4 -> %pI4\n",
1487 &old_gw, dev->name, &new_gw,
1488 &saddr, &daddr);
1da177e4 1489#endif
ed7865a4 1490 ;
1da177e4
LT
1491}
1492
fe6fe792
ED
1493static bool peer_pmtu_expired(struct inet_peer *peer)
1494{
1495 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1496
1497 return orig &&
1498 time_after_eq(jiffies, orig) &&
1499 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1500}
1501
1502static bool peer_pmtu_cleaned(struct inet_peer *peer)
1503{
1504 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1505
1506 return orig &&
1507 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1508}
1509
1da177e4
LT
1510static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1511{
ee6b9673 1512 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1513 struct dst_entry *ret = dst;
1514
1515 if (rt) {
d11a4dc1 1516 if (dst->obsolete > 0) {
1da177e4
LT
1517 ip_rt_put(rt);
1518 ret = NULL;
2c8cec5c 1519 } else if (rt->rt_flags & RTCF_REDIRECTED) {
95c96174 1520 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
5e2b61f7 1521 rt->rt_oif,
e84f84f2 1522 rt_genid(dev_net(dst->dev)));
1da177e4
LT
1523 rt_del(hash, rt);
1524 ret = NULL;
97bab73f
DM
1525 } else if (rt_has_peer(rt)) {
1526 struct inet_peer *peer = rt_peer_ptr(rt);
1527 if (peer_pmtu_expired(peer))
1528 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1da177e4
LT
1529 }
1530 }
1531 return ret;
1532}
1533
1534/*
1535 * Algorithm:
1536 * 1. The first ip_rt_redirect_number redirects are sent
1537 * with exponential backoff, then we stop sending them at all,
1538 * assuming that the host ignores our redirects.
1539 * 2. If we did not see packets requiring redirects
1540 * during ip_rt_redirect_silence, we assume that the host
1541 * forgot redirected route and start to send redirects again.
1542 *
1543 * This algorithm is much cheaper and more intelligent than dumb load limiting
1544 * in icmp.c.
1545 *
1546 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1547 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1548 */
1549
1550void ip_rt_send_redirect(struct sk_buff *skb)
1551{
511c3f92 1552 struct rtable *rt = skb_rtable(skb);
30038fc6 1553 struct in_device *in_dev;
92d86829 1554 struct inet_peer *peer;
30038fc6 1555 int log_martians;
1da177e4 1556
30038fc6 1557 rcu_read_lock();
d8d1f30b 1558 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
1559 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1560 rcu_read_unlock();
1da177e4 1561 return;
30038fc6
ED
1562 }
1563 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1564 rcu_read_unlock();
1da177e4 1565
fbfe95a4 1566 peer = rt_get_peer_create(rt, rt->rt_dst);
92d86829
DM
1567 if (!peer) {
1568 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1569 return;
1570 }
1571
1da177e4
LT
1572 /* No redirected packets during ip_rt_redirect_silence;
1573 * reset the algorithm.
1574 */
92d86829
DM
1575 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1576 peer->rate_tokens = 0;
1da177e4
LT
1577
1578 /* Too many ignored redirects; do not send anything
d8d1f30b 1579 * set dst.rate_last to the last seen redirected packet.
1da177e4 1580 */
92d86829
DM
1581 if (peer->rate_tokens >= ip_rt_redirect_number) {
1582 peer->rate_last = jiffies;
30038fc6 1583 return;
1da177e4
LT
1584 }
1585
1586 /* Check for load limit; set rate_last to the latest sent
1587 * redirect.
1588 */
92d86829 1589 if (peer->rate_tokens == 0 ||
14fb8a76 1590 time_after(jiffies,
92d86829
DM
1591 (peer->rate_last +
1592 (ip_rt_redirect_load << peer->rate_tokens)))) {
1da177e4 1593 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
92d86829
DM
1594 peer->rate_last = jiffies;
1595 ++peer->rate_tokens;
1da177e4 1596#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 1597 if (log_martians &&
e87cc472
JP
1598 peer->rate_tokens == ip_rt_redirect_number)
1599 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1600 &ip_hdr(skb)->saddr, rt->rt_iif,
1601 &rt->rt_dst, &rt->rt_gateway);
1da177e4
LT
1602#endif
1603 }
1da177e4
LT
1604}
1605
1606static int ip_error(struct sk_buff *skb)
1607{
251da413 1608 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 1609 struct rtable *rt = skb_rtable(skb);
92d86829 1610 struct inet_peer *peer;
1da177e4 1611 unsigned long now;
251da413 1612 struct net *net;
92d86829 1613 bool send;
1da177e4
LT
1614 int code;
1615
251da413
DM
1616 net = dev_net(rt->dst.dev);
1617 if (!IN_DEV_FORWARD(in_dev)) {
1618 switch (rt->dst.error) {
1619 case EHOSTUNREACH:
1620 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1621 break;
1622
1623 case ENETUNREACH:
1624 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1625 break;
1626 }
1627 goto out;
1628 }
1629
d8d1f30b 1630 switch (rt->dst.error) {
4500ebf8
JP
1631 case EINVAL:
1632 default:
1633 goto out;
1634 case EHOSTUNREACH:
1635 code = ICMP_HOST_UNREACH;
1636 break;
1637 case ENETUNREACH:
1638 code = ICMP_NET_UNREACH;
251da413 1639 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
1640 break;
1641 case EACCES:
1642 code = ICMP_PKT_FILTERED;
1643 break;
1da177e4
LT
1644 }
1645
fbfe95a4 1646 peer = rt_get_peer_create(rt, rt->rt_dst);
92d86829
DM
1647
1648 send = true;
1649 if (peer) {
1650 now = jiffies;
1651 peer->rate_tokens += now - peer->rate_last;
1652 if (peer->rate_tokens > ip_rt_error_burst)
1653 peer->rate_tokens = ip_rt_error_burst;
1654 peer->rate_last = now;
1655 if (peer->rate_tokens >= ip_rt_error_cost)
1656 peer->rate_tokens -= ip_rt_error_cost;
1657 else
1658 send = false;
1da177e4 1659 }
92d86829
DM
1660 if (send)
1661 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1662
1663out: kfree_skb(skb);
1664 return 0;
e905a9ed 1665}
1da177e4 1666
2c8cec5c
DM
1667static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1668{
fe6fe792 1669 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
2c8cec5c 1670
fe6fe792
ED
1671 if (!expires)
1672 return;
46af3180 1673 if (time_before(jiffies, expires)) {
2c8cec5c
DM
1674 u32 orig_dst_mtu = dst_mtu(dst);
1675 if (peer->pmtu_learned < orig_dst_mtu) {
1676 if (!peer->pmtu_orig)
1677 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1678 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1679 }
1680 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1681 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1682}
1683
1da177e4
LT
1684static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1685{
2c8cec5c
DM
1686 struct rtable *rt = (struct rtable *) dst;
1687 struct inet_peer *peer;
1688
1689 dst_confirm(dst);
1690
fbfe95a4 1691 peer = rt_get_peer_create(rt, rt->rt_dst);
2c8cec5c 1692 if (peer) {
fe6fe792
ED
1693 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1694
2c8cec5c 1695 if (mtu < ip_rt_min_pmtu)
1da177e4 1696 mtu = ip_rt_min_pmtu;
fe6fe792 1697 if (!pmtu_expires || mtu < peer->pmtu_learned) {
46af3180
HS
1698
1699 pmtu_expires = jiffies + ip_rt_mtu_expires;
1700 if (!pmtu_expires)
1701 pmtu_expires = 1UL;
1702
2c8cec5c 1703 peer->pmtu_learned = mtu;
46af3180 1704 peer->pmtu_expires = pmtu_expires;
2c8cec5c
DM
1705
1706 atomic_inc(&__rt_peer_genid);
1707 rt->rt_peer_genid = rt_peer_genid();
1da177e4 1708 }
46af3180 1709 check_peer_pmtu(dst, peer);
1da177e4
LT
1710 }
1711}
1712
36393395
DM
1713void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1714 int oif, u32 mark, u8 protocol, int flow_flags)
1715{
1716 const struct iphdr *iph = (const struct iphdr *)skb->data;
1717 struct flowi4 fl4;
1718 struct rtable *rt;
1719
1720 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1721 protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
1722 iph->daddr, iph->saddr, 0, 0);
1723 rt = __ip_route_output_key(net, &fl4);
1724 if (!IS_ERR(rt)) {
1725 ip_rt_update_pmtu(&rt->dst, mtu);
1726 ip_rt_put(rt);
1727 }
1728}
1729EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1730
1731void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1732{
1733 const struct inet_sock *inet = inet_sk(sk);
1734
1735 return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1736 sk->sk_bound_dev_if, sk->sk_mark,
1737 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1738 inet_sk_flowi_flags(sk));
1739}
1740EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1741
de398fb8 1742static void ipv4_validate_peer(struct rtable *rt)
1da177e4 1743{
6431cbc2 1744 if (rt->rt_peer_genid != rt_peer_genid()) {
fbfe95a4 1745 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
6431cbc2 1746
fe6fe792 1747 if (peer) {
efbc368d 1748 check_peer_pmtu(&rt->dst, peer);
2c8cec5c 1749
fe6fe792 1750 if (peer->redirect_learned.a4 &&
de398fb8
DM
1751 peer->redirect_learned.a4 != rt->rt_gateway)
1752 check_peer_redir(&rt->dst, peer);
f39925db
DM
1753 }
1754
6431cbc2
DM
1755 rt->rt_peer_genid = rt_peer_genid();
1756 }
efbc368d
DM
1757}
1758
1759static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1760{
1761 struct rtable *rt = (struct rtable *) dst;
1762
1763 if (rt_is_expired(rt))
1764 return NULL;
de398fb8 1765 ipv4_validate_peer(rt);
d11a4dc1 1766 return dst;
1da177e4
LT
1767}
1768
1769static void ipv4_dst_destroy(struct dst_entry *dst)
1770{
1771 struct rtable *rt = (struct rtable *) dst;
1da177e4 1772
62fa8a84
DM
1773 if (rt->fi) {
1774 fib_info_put(rt->fi);
1775 rt->fi = NULL;
1776 }
97bab73f
DM
1777 if (rt_has_peer(rt)) {
1778 struct inet_peer *peer = rt_peer_ptr(rt);
1da177e4
LT
1779 inet_putpeer(peer);
1780 }
1da177e4
LT
1781}
1782
1da177e4
LT
1783
1784static void ipv4_link_failure(struct sk_buff *skb)
1785{
1786 struct rtable *rt;
1787
1788 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1789
511c3f92 1790 rt = skb_rtable(skb);
97bab73f
DM
1791 if (rt && rt_has_peer(rt)) {
1792 struct inet_peer *peer = rt_peer_ptr(rt);
1793 if (peer_pmtu_cleaned(peer))
1794 dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1795 }
1da177e4
LT
1796}
1797
1798static int ip_rt_bug(struct sk_buff *skb)
1799{
91df42be
JP
1800 pr_debug("%s: %pI4 -> %pI4, %s\n",
1801 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1802 skb->dev ? skb->dev->name : "?");
1da177e4 1803 kfree_skb(skb);
c378a9c0 1804 WARN_ON(1);
1da177e4
LT
1805 return 0;
1806}
1807
1808/*
1809 We do not cache source address of outgoing interface,
1810 because it is used only by IP RR, TS and SRR options,
1811 so that it out of fast path.
1812
1813 BTW remember: "addr" is allowed to be not aligned
1814 in IP options!
1815 */
1816
8e36360a 1817void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1818{
a61ced5d 1819 __be32 src;
1da177e4 1820
c7537967 1821 if (rt_is_output_route(rt))
c5be24ff 1822 src = ip_hdr(skb)->saddr;
ebc0ffae 1823 else {
8e36360a
DM
1824 struct fib_result res;
1825 struct flowi4 fl4;
1826 struct iphdr *iph;
1827
1828 iph = ip_hdr(skb);
1829
1830 memset(&fl4, 0, sizeof(fl4));
1831 fl4.daddr = iph->daddr;
1832 fl4.saddr = iph->saddr;
b0fe4a31 1833 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1834 fl4.flowi4_oif = rt->dst.dev->ifindex;
1835 fl4.flowi4_iif = skb->dev->ifindex;
1836 fl4.flowi4_mark = skb->mark;
5e2b61f7 1837
ebc0ffae 1838 rcu_read_lock();
68a5e3dd 1839 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
436c3b66 1840 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae
ED
1841 else
1842 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1da177e4 1843 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1844 rcu_read_unlock();
1845 }
1da177e4
LT
1846 memcpy(addr, &src, 4);
1847}
1848
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in whichever 16-bit half of the classid is still unset. */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1858
0dbaee3b
DM
1859static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1860{
1861 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1862
1863 if (advmss == 0) {
1864 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1865 ip_rt_min_advmss);
1866 if (advmss > 65535 - 40)
1867 advmss = 65535 - 40;
1868 }
1869 return advmss;
1870}
1871
ebb762f2 1872static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1873{
261663b0 1874 const struct rtable *rt = (const struct rtable *) dst;
618f9bc7
SK
1875 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1876
261663b0 1877 if (mtu && rt_is_output_route(rt))
618f9bc7
SK
1878 return mtu;
1879
1880 mtu = dst->dev->mtu;
d33e4553
DM
1881
1882 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
d33e4553
DM
1883
1884 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1885 mtu = 576;
1886 }
1887
1888 if (mtu > IP_MAX_MTU)
1889 mtu = IP_MAX_MTU;
1890
1891 return mtu;
1892}
1893
813b3b5d 1894static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1895 struct fib_info *fi)
a4daad6b 1896{
97bab73f 1897 struct inet_peer_base *base;
0131ba45
DM
1898 struct inet_peer *peer;
1899 int create = 0;
a4daad6b 1900
0131ba45
DM
1901 /* If a peer entry exists for this destination, we must hook
1902 * it up in order to get at cached metrics.
1903 */
813b3b5d 1904 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
0131ba45
DM
1905 create = 1;
1906
97bab73f
DM
1907 base = inetpeer_base_ptr(rt->_peer);
1908 BUG_ON(!base);
1909
1910 peer = inet_getpeer_v4(base, rt->rt_dst, create);
0131ba45 1911 if (peer) {
97bab73f 1912 __rt_set_peer(rt, peer);
3c0afdca 1913 rt->rt_peer_genid = rt_peer_genid();
a4daad6b
DM
1914 if (inet_metrics_new(peer))
1915 memcpy(peer->metrics, fi->fib_metrics,
1916 sizeof(u32) * RTAX_MAX);
1917 dst_init_metrics(&rt->dst, peer->metrics, false);
2c8cec5c 1918
fe6fe792 1919 check_peer_pmtu(&rt->dst, peer);
ac3f48de 1920
f39925db
DM
1921 if (peer->redirect_learned.a4 &&
1922 peer->redirect_learned.a4 != rt->rt_gateway) {
1923 rt->rt_gateway = peer->redirect_learned.a4;
1924 rt->rt_flags |= RTCF_REDIRECTED;
1925 }
0131ba45
DM
1926 } else {
1927 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1928 rt->fi = fi;
1929 atomic_inc(&fi->fib_clntref);
1930 }
1931 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1932 }
1933}
1934
813b3b5d 1935static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1936 const struct fib_result *res,
982721f3 1937 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1938{
defb3519 1939 struct dst_entry *dst = &rt->dst;
1da177e4
LT
1940
1941 if (fi) {
1942 if (FIB_RES_GW(*res) &&
1943 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1944 rt->rt_gateway = FIB_RES_GW(*res);
813b3b5d 1945 rt_init_metrics(rt, fl4, fi);
c7066f70 1946#ifdef CONFIG_IP_ROUTE_CLASSID
defb3519 1947 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1da177e4 1948#endif
d33e4553 1949 }
defb3519 1950
defb3519
DM
1951 if (dst_mtu(dst) > IP_MAX_MTU)
1952 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1da177e4 1953
c7066f70 1954#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1955#ifdef CONFIG_IP_MULTIPLE_TABLES
1956 set_class_tag(rt, fib_rules_tclass(res));
1957#endif
1958 set_class_tag(rt, itag);
1959#endif
1da177e4
LT
1960}
1961
5c1e6aa3
DM
1962static struct rtable *rt_dst_alloc(struct net_device *dev,
1963 bool nopolicy, bool noxfrm)
0c4dcd58 1964{
5c1e6aa3
DM
1965 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1966 DST_HOST |
1967 (nopolicy ? DST_NOPOLICY : 0) |
1968 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1969}
1970
96d36220 1971/* called in rcu_read_lock() section */
9e12bb22 1972static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1973 u8 tos, struct net_device *dev, int our)
1974{
96d36220 1975 unsigned int hash;
1da177e4 1976 struct rtable *rth;
96d36220 1977 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1978 u32 itag = 0;
b5f7e755 1979 int err;
1da177e4
LT
1980
1981 /* Primary sanity checks. */
1982
1983 if (in_dev == NULL)
1984 return -EINVAL;
1985
1e637c74 1986 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1987 skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1988 goto e_inval;
1989
d0daebc3
TG
1990 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1991 if (ipv4_is_loopback(saddr))
1992 goto e_inval;
1993
f97c1e0c
JP
1994 if (ipv4_is_zeronet(saddr)) {
1995 if (!ipv4_is_local_multicast(daddr))
1da177e4 1996 goto e_inval;
b5f7e755 1997 } else {
9e56e380
DM
1998 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1999 in_dev, &itag);
b5f7e755
ED
2000 if (err < 0)
2001 goto e_err;
2002 }
4e7b2f14 2003 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
5c1e6aa3 2004 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2005 if (!rth)
2006 goto e_nobufs;
2007
cf911662
DM
2008#ifdef CONFIG_IP_ROUTE_CLASSID
2009 rth->dst.tclassid = itag;
2010#endif
d8d1f30b 2011 rth->dst.output = ip_rt_bug;
1da177e4 2012
5e2b61f7 2013 rth->rt_key_dst = daddr;
5e2b61f7 2014 rth->rt_key_src = saddr;
cf911662
DM
2015 rth->rt_genid = rt_genid(dev_net(dev));
2016 rth->rt_flags = RTCF_MULTICAST;
2017 rth->rt_type = RTN_MULTICAST;
475949d8 2018 rth->rt_key_tos = tos;
cf911662 2019 rth->rt_dst = daddr;
1da177e4 2020 rth->rt_src = saddr;
1b86a58f 2021 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2022 rth->rt_iif = dev->ifindex;
5e2b61f7 2023 rth->rt_oif = 0;
cf911662 2024 rth->rt_mark = skb->mark;
1da177e4 2025 rth->rt_gateway = daddr;
cf911662 2026 rth->rt_peer_genid = 0;
97bab73f 2027 rt_init_peer(rth, dev_net(dev)->ipv4.peers);
cf911662 2028 rth->fi = NULL;
1da177e4 2029 if (our) {
d8d1f30b 2030 rth->dst.input= ip_local_deliver;
1da177e4
LT
2031 rth->rt_flags |= RTCF_LOCAL;
2032 }
2033
2034#ifdef CONFIG_IP_MROUTE
f97c1e0c 2035 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 2036 rth->dst.input = ip_mr_input;
1da177e4
LT
2037#endif
2038 RT_CACHE_STAT_INC(in_slow_mc);
2039
e84f84f2 2040 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
b23dd4fe 2041 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
9aa3c94c 2042 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1da177e4
LT
2043
2044e_nobufs:
1da177e4 2045 return -ENOBUFS;
1da177e4 2046e_inval:
96d36220 2047 return -EINVAL;
b5f7e755 2048e_err:
b5f7e755 2049 return err;
1da177e4
LT
2050}
2051
2052
2053static void ip_handle_martian_source(struct net_device *dev,
2054 struct in_device *in_dev,
2055 struct sk_buff *skb,
9e12bb22
AV
2056 __be32 daddr,
2057 __be32 saddr)
1da177e4
LT
2058{
2059 RT_CACHE_STAT_INC(in_martian_src);
2060#ifdef CONFIG_IP_ROUTE_VERBOSE
2061 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2062 /*
2063 * RFC1812 recommendation, if source is martian,
2064 * the only hint is MAC header.
2065 */
058bd4d2 2066 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 2067 &daddr, &saddr, dev->name);
98e399f8 2068 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
2069 print_hex_dump(KERN_WARNING, "ll header: ",
2070 DUMP_PREFIX_OFFSET, 16, 1,
2071 skb_mac_header(skb),
2072 dev->hard_header_len, true);
1da177e4
LT
2073 }
2074 }
2075#endif
2076}
2077
47360228 2078/* called in rcu_read_lock() section */
5969f71d 2079static int __mkroute_input(struct sk_buff *skb,
982721f3 2080 const struct fib_result *res,
5969f71d
SH
2081 struct in_device *in_dev,
2082 __be32 daddr, __be32 saddr, u32 tos,
2083 struct rtable **result)
1da177e4 2084{
1da177e4
LT
2085 struct rtable *rth;
2086 int err;
2087 struct in_device *out_dev;
47360228 2088 unsigned int flags = 0;
d9c9df8c 2089 u32 itag;
1da177e4
LT
2090
2091 /* get a working reference to the output device */
47360228 2092 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1da177e4 2093 if (out_dev == NULL) {
e87cc472 2094 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
2095 return -EINVAL;
2096 }
2097
2098
5c04c819 2099 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 2100 in_dev->dev, in_dev, &itag);
1da177e4 2101 if (err < 0) {
e905a9ed 2102 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 2103 saddr);
e905a9ed 2104
1da177e4
LT
2105 goto cleanup;
2106 }
2107
2108 if (err)
2109 flags |= RTCF_DIRECTSRC;
2110
51b77cae 2111 if (out_dev == in_dev && err &&
1da177e4
LT
2112 (IN_DEV_SHARED_MEDIA(out_dev) ||
2113 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2114 flags |= RTCF_DOREDIRECT;
2115
2116 if (skb->protocol != htons(ETH_P_IP)) {
2117 /* Not IP (i.e. ARP). Do not create route, if it is
2118 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
2119 *
2120 * Proxy arp feature have been extended to allow, ARP
2121 * replies back to the same interface, to support
2122 * Private VLAN switch technologies. See arp.c.
1da177e4 2123 */
65324144
JDB
2124 if (out_dev == in_dev &&
2125 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
2126 err = -EINVAL;
2127 goto cleanup;
2128 }
2129 }
2130
5c1e6aa3
DM
2131 rth = rt_dst_alloc(out_dev->dev,
2132 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2133 IN_DEV_CONF_GET(out_dev, NOXFRM));
1da177e4
LT
2134 if (!rth) {
2135 err = -ENOBUFS;
2136 goto cleanup;
2137 }
2138
5e2b61f7 2139 rth->rt_key_dst = daddr;
5e2b61f7 2140 rth->rt_key_src = saddr;
cf911662
DM
2141 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2142 rth->rt_flags = flags;
2143 rth->rt_type = res->type;
475949d8 2144 rth->rt_key_tos = tos;
cf911662 2145 rth->rt_dst = daddr;
1da177e4 2146 rth->rt_src = saddr;
1b86a58f 2147 rth->rt_route_iif = in_dev->dev->ifindex;
5e2b61f7 2148 rth->rt_iif = in_dev->dev->ifindex;
5e2b61f7 2149 rth->rt_oif = 0;
cf911662
DM
2150 rth->rt_mark = skb->mark;
2151 rth->rt_gateway = daddr;
cf911662 2152 rth->rt_peer_genid = 0;
8b96d22d 2153 rt_init_peer(rth, &res->table->tb_peers);
cf911662 2154 rth->fi = NULL;
1da177e4 2155
d8d1f30b
CG
2156 rth->dst.input = ip_forward;
2157 rth->dst.output = ip_output;
1da177e4 2158
5e2b61f7 2159 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1da177e4 2160
1da177e4
LT
2161 *result = rth;
2162 err = 0;
2163 cleanup:
1da177e4 2164 return err;
e905a9ed 2165}
1da177e4 2166
5969f71d
SH
2167static int ip_mkroute_input(struct sk_buff *skb,
2168 struct fib_result *res,
68a5e3dd 2169 const struct flowi4 *fl4,
5969f71d
SH
2170 struct in_device *in_dev,
2171 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 2172{
5e73ea1a 2173 struct rtable *rth = NULL;
1da177e4 2174 int err;
95c96174 2175 unsigned int hash;
1da177e4
LT
2176
2177#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 2178 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 2179 fib_select_multipath(res);
1da177e4
LT
2180#endif
2181
2182 /* create a routing cache entry */
2183 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2184 if (err)
2185 return err;
1da177e4
LT
2186
2187 /* put it into the cache */
68a5e3dd 2188 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
d8d1f30b 2189 rt_genid(dev_net(rth->dst.dev)));
68a5e3dd 2190 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
b23dd4fe
DM
2191 if (IS_ERR(rth))
2192 return PTR_ERR(rth);
2193 return 0;
1da177e4
LT
2194}
2195
1da177e4
LT
2196/*
2197 * NOTE. We drop all the packets that has local source
2198 * addresses, because every properly looped back packet
2199 * must have correct destination already attached by output routine.
2200 *
2201 * Such approach solves two big problems:
2202 * 1. Not simplex devices are handled properly.
2203 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 2204 * called with rcu_read_lock()
1da177e4
LT
2205 */
2206
9e12bb22 2207static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 2208 u8 tos, struct net_device *dev)
1da177e4
LT
2209{
2210 struct fib_result res;
96d36220 2211 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 2212 struct flowi4 fl4;
95c96174 2213 unsigned int flags = 0;
1da177e4 2214 u32 itag = 0;
95c96174
ED
2215 struct rtable *rth;
2216 unsigned int hash;
1da177e4 2217 int err = -EINVAL;
5e73ea1a 2218 struct net *net = dev_net(dev);
1da177e4
LT
2219
2220 /* IP on this device is disabled. */
2221
2222 if (!in_dev)
2223 goto out;
2224
2225 /* Check for the most weird martians, which can be not detected
2226 by fib_lookup.
2227 */
2228
d0daebc3 2229 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
2230 goto martian_source;
2231
27a954bd 2232 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2233 goto brd_input;
2234
2235 /* Accept zero addresses only to limited broadcast;
2236 * I even do not know to fix it or not. Waiting for complains :-)
2237 */
f97c1e0c 2238 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2239 goto martian_source;
2240
d0daebc3 2241 if (ipv4_is_zeronet(daddr))
1da177e4
LT
2242 goto martian_destination;
2243
d0daebc3
TG
2244 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2245 if (ipv4_is_loopback(daddr))
2246 goto martian_destination;
2247
2248 if (ipv4_is_loopback(saddr))
2249 goto martian_source;
2250 }
2251
1da177e4
LT
2252 /*
2253 * Now we are ready to route packet.
2254 */
68a5e3dd
DM
2255 fl4.flowi4_oif = 0;
2256 fl4.flowi4_iif = dev->ifindex;
2257 fl4.flowi4_mark = skb->mark;
2258 fl4.flowi4_tos = tos;
2259 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2260 fl4.daddr = daddr;
2261 fl4.saddr = saddr;
2262 err = fib_lookup(net, &fl4, &res);
251da413 2263 if (err != 0)
1da177e4 2264 goto no_route;
1da177e4
LT
2265
2266 RT_CACHE_STAT_INC(in_slow_tot);
2267
2268 if (res.type == RTN_BROADCAST)
2269 goto brd_input;
2270
2271 if (res.type == RTN_LOCAL) {
5c04c819 2272 err = fib_validate_source(skb, saddr, daddr, tos,
ebc0ffae 2273 net->loopback_dev->ifindex,
9e56e380 2274 dev, in_dev, &itag);
b5f7e755
ED
2275 if (err < 0)
2276 goto martian_source_keep_err;
2277 if (err)
1da177e4 2278 flags |= RTCF_DIRECTSRC;
1da177e4
LT
2279 goto local_input;
2280 }
2281
2282 if (!IN_DEV_FORWARD(in_dev))
251da413 2283 goto no_route;
1da177e4
LT
2284 if (res.type != RTN_UNICAST)
2285 goto martian_destination;
2286
68a5e3dd 2287 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
2288out: return err;
2289
2290brd_input:
2291 if (skb->protocol != htons(ETH_P_IP))
2292 goto e_inval;
2293
41347dcd 2294 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2295 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2296 in_dev, &itag);
1da177e4 2297 if (err < 0)
b5f7e755 2298 goto martian_source_keep_err;
1da177e4
LT
2299 if (err)
2300 flags |= RTCF_DIRECTSRC;
2301 }
2302 flags |= RTCF_BROADCAST;
2303 res.type = RTN_BROADCAST;
2304 RT_CACHE_STAT_INC(in_brd);
2305
2306local_input:
5c1e6aa3
DM
2307 rth = rt_dst_alloc(net->loopback_dev,
2308 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2309 if (!rth)
2310 goto e_nobufs;
2311
cf911662 2312 rth->dst.input= ip_local_deliver;
d8d1f30b 2313 rth->dst.output= ip_rt_bug;
cf911662
DM
2314#ifdef CONFIG_IP_ROUTE_CLASSID
2315 rth->dst.tclassid = itag;
2316#endif
1da177e4 2317
5e2b61f7 2318 rth->rt_key_dst = daddr;
5e2b61f7 2319 rth->rt_key_src = saddr;
cf911662
DM
2320 rth->rt_genid = rt_genid(net);
2321 rth->rt_flags = flags|RTCF_LOCAL;
2322 rth->rt_type = res.type;
475949d8 2323 rth->rt_key_tos = tos;
cf911662 2324 rth->rt_dst = daddr;
1da177e4 2325 rth->rt_src = saddr;
1b86a58f 2326 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2327 rth->rt_iif = dev->ifindex;
cf911662
DM
2328 rth->rt_oif = 0;
2329 rth->rt_mark = skb->mark;
1da177e4 2330 rth->rt_gateway = daddr;
cf911662 2331 rth->rt_peer_genid = 0;
97bab73f 2332 rt_init_peer(rth, net->ipv4.peers);
cf911662 2333 rth->fi = NULL;
1da177e4 2334 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
2335 rth->dst.input= ip_error;
2336 rth->dst.error= -err;
1da177e4
LT
2337 rth->rt_flags &= ~RTCF_LOCAL;
2338 }
68a5e3dd
DM
2339 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2340 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
b23dd4fe
DM
2341 err = 0;
2342 if (IS_ERR(rth))
2343 err = PTR_ERR(rth);
ebc0ffae 2344 goto out;
1da177e4
LT
2345
2346no_route:
2347 RT_CACHE_STAT_INC(in_no_route);
1da177e4 2348 res.type = RTN_UNREACHABLE;
7f53878d
MC
2349 if (err == -ESRCH)
2350 err = -ENETUNREACH;
1da177e4
LT
2351 goto local_input;
2352
2353 /*
2354 * Do not cache martian addresses: they should be logged (RFC1812)
2355 */
2356martian_destination:
2357 RT_CACHE_STAT_INC(in_martian_dst);
2358#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2359 if (IN_DEV_LOG_MARTIANS(in_dev))
2360 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2361 &daddr, &saddr, dev->name);
1da177e4 2362#endif
2c2910a4 2363
1da177e4
LT
2364e_inval:
2365 err = -EINVAL;
ebc0ffae 2366 goto out;
1da177e4
LT
2367
2368e_nobufs:
2369 err = -ENOBUFS;
ebc0ffae 2370 goto out;
1da177e4
LT
2371
2372martian_source:
b5f7e755
ED
2373 err = -EINVAL;
2374martian_source_keep_err:
1da177e4 2375 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2376 goto out;
1da177e4
LT
2377}
2378
407eadd9 2379int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 2380 u8 tos, struct net_device *dev, bool noref)
1da177e4 2381{
95c96174
ED
2382 struct rtable *rth;
2383 unsigned int hash;
1da177e4 2384 int iif = dev->ifindex;
b5921910 2385 struct net *net;
96d36220 2386 int res;
1da177e4 2387
c346dca1 2388 net = dev_net(dev);
1080d709 2389
96d36220
ED
2390 rcu_read_lock();
2391
1080d709
NH
2392 if (!rt_caching(net))
2393 goto skip_cache;
2394
1da177e4 2395 tos &= IPTOS_RT_MASK;
e84f84f2 2396 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4 2397
1da177e4 2398 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 2399 rth = rcu_dereference(rth->dst.rt_next)) {
5e2b61f7
DM
2400 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2401 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
97a80410 2402 (rth->rt_route_iif ^ iif) |
475949d8 2403 (rth->rt_key_tos ^ tos)) == 0 &&
5e2b61f7 2404 rth->rt_mark == skb->mark &&
d8d1f30b 2405 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2406 !rt_is_expired(rth)) {
de398fb8 2407 ipv4_validate_peer(rth);
407eadd9 2408 if (noref) {
d8d1f30b
CG
2409 dst_use_noref(&rth->dst, jiffies);
2410 skb_dst_set_noref(skb, &rth->dst);
407eadd9 2411 } else {
d8d1f30b
CG
2412 dst_use(&rth->dst, jiffies);
2413 skb_dst_set(skb, &rth->dst);
407eadd9 2414 }
1da177e4
LT
2415 RT_CACHE_STAT_INC(in_hit);
2416 rcu_read_unlock();
1da177e4
LT
2417 return 0;
2418 }
2419 RT_CACHE_STAT_INC(in_hlist_search);
2420 }
1da177e4 2421
1080d709 2422skip_cache:
1da177e4
LT
2423 /* Multicast recognition logic is moved from route cache to here.
2424 The problem was that too many Ethernet cards have broken/missing
2425 hardware multicast filters :-( As result the host on multicasting
2426 network acquires a lot of useless route cache entries, sort of
2427 SDR messages from all the world. Now we try to get rid of them.
2428 Really, provided software IP multicast filter is organized
2429 reasonably (at least, hashed), it does not result in a slowdown
2430 comparing with route cache reject entries.
2431 Note, that multicast routers are not affected, because
2432 route cache entry is created eventually.
2433 */
f97c1e0c 2434 if (ipv4_is_multicast(daddr)) {
96d36220 2435 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2436
96d36220 2437 if (in_dev) {
dbdd9a52
DM
2438 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2439 ip_hdr(skb)->protocol);
1da177e4
LT
2440 if (our
2441#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2442 ||
2443 (!ipv4_is_local_multicast(daddr) &&
2444 IN_DEV_MFORWARD(in_dev))
1da177e4 2445#endif
9d4fb27d 2446 ) {
96d36220
ED
2447 int res = ip_route_input_mc(skb, daddr, saddr,
2448 tos, dev, our);
1da177e4 2449 rcu_read_unlock();
96d36220 2450 return res;
1da177e4
LT
2451 }
2452 }
2453 rcu_read_unlock();
2454 return -EINVAL;
2455 }
c10237e0 2456 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
96d36220
ED
2457 rcu_read_unlock();
2458 return res;
1da177e4 2459}
407eadd9 2460EXPORT_SYMBOL(ip_route_input_common);
1da177e4 2461
ebc0ffae 2462/* called with rcu_read_lock() */
982721f3 2463static struct rtable *__mkroute_output(const struct fib_result *res,
68a5e3dd 2464 const struct flowi4 *fl4,
813b3b5d 2465 __be32 orig_daddr, __be32 orig_saddr,
f61759e6
JA
2466 int orig_oif, __u8 orig_rtos,
2467 struct net_device *dev_out,
5ada5527 2468 unsigned int flags)
1da177e4 2469{
982721f3 2470 struct fib_info *fi = res->fi;
5ada5527 2471 struct in_device *in_dev;
982721f3 2472 u16 type = res->type;
5ada5527 2473 struct rtable *rth;
1da177e4 2474
d0daebc3
TG
2475 in_dev = __in_dev_get_rcu(dev_out);
2476 if (!in_dev)
5ada5527 2477 return ERR_PTR(-EINVAL);
1da177e4 2478
d0daebc3
TG
2479 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2480 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2481 return ERR_PTR(-EINVAL);
2482
68a5e3dd 2483 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2484 type = RTN_BROADCAST;
68a5e3dd 2485 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2486 type = RTN_MULTICAST;
68a5e3dd 2487 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2488 return ERR_PTR(-EINVAL);
1da177e4
LT
2489
2490 if (dev_out->flags & IFF_LOOPBACK)
2491 flags |= RTCF_LOCAL;
2492
982721f3 2493 if (type == RTN_BROADCAST) {
1da177e4 2494 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2495 fi = NULL;
2496 } else if (type == RTN_MULTICAST) {
dd28d1a0 2497 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2498 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2499 fl4->flowi4_proto))
1da177e4
LT
2500 flags &= ~RTCF_LOCAL;
2501 /* If multicast route do not exist use
dd28d1a0
ED
2502 * default one, but do not gateway in this case.
2503 * Yes, it is hack.
1da177e4 2504 */
982721f3
DM
2505 if (fi && res->prefixlen < 4)
2506 fi = NULL;
1da177e4
LT
2507 }
2508
5c1e6aa3
DM
2509 rth = rt_dst_alloc(dev_out,
2510 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2511 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2512 if (!rth)
5ada5527 2513 return ERR_PTR(-ENOBUFS);
8391d07b 2514
cf911662
DM
2515 rth->dst.output = ip_output;
2516
813b3b5d
DM
2517 rth->rt_key_dst = orig_daddr;
2518 rth->rt_key_src = orig_saddr;
cf911662
DM
2519 rth->rt_genid = rt_genid(dev_net(dev_out));
2520 rth->rt_flags = flags;
2521 rth->rt_type = type;
f61759e6 2522 rth->rt_key_tos = orig_rtos;
68a5e3dd
DM
2523 rth->rt_dst = fl4->daddr;
2524 rth->rt_src = fl4->saddr;
1b86a58f 2525 rth->rt_route_iif = 0;
813b3b5d
DM
2526 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2527 rth->rt_oif = orig_oif;
2528 rth->rt_mark = fl4->flowi4_mark;
68a5e3dd 2529 rth->rt_gateway = fl4->daddr;
cf911662 2530 rth->rt_peer_genid = 0;
8b96d22d
DM
2531 rt_init_peer(rth, (res->table ?
2532 &res->table->tb_peers :
2533 dev_net(dev_out)->ipv4.peers));
cf911662 2534 rth->fi = NULL;
1da177e4
LT
2535
2536 RT_CACHE_STAT_INC(out_slow_tot);
2537
41347dcd 2538 if (flags & RTCF_LOCAL)
d8d1f30b 2539 rth->dst.input = ip_local_deliver;
1da177e4 2540 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2541 if (flags & RTCF_LOCAL &&
1da177e4 2542 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2543 rth->dst.output = ip_mc_output;
1da177e4
LT
2544 RT_CACHE_STAT_INC(out_slow_mc);
2545 }
2546#ifdef CONFIG_IP_MROUTE
982721f3 2547 if (type == RTN_MULTICAST) {
1da177e4 2548 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2549 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2550 rth->dst.input = ip_mr_input;
2551 rth->dst.output = ip_mc_output;
1da177e4
LT
2552 }
2553 }
2554#endif
2555 }
2556
813b3b5d 2557 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1da177e4 2558
7586eceb
ED
2559 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2560 rth->dst.flags |= DST_NOCACHE;
2561
5ada5527 2562 return rth;
1da177e4
LT
2563}
2564
1da177e4
LT
2565/*
2566 * Major route resolver routine.
0197aa38 2567 * called with rcu_read_lock();
1da177e4
LT
2568 */
2569
813b3b5d 2570static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2571{
1da177e4 2572 struct net_device *dev_out = NULL;
f61759e6 2573 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
2574 unsigned int flags = 0;
2575 struct fib_result res;
5ada5527 2576 struct rtable *rth;
813b3b5d
DM
2577 __be32 orig_daddr;
2578 __be32 orig_saddr;
2579 int orig_oif;
1da177e4
LT
2580
2581 res.fi = NULL;
8b96d22d 2582 res.table = NULL;
1da177e4
LT
2583#ifdef CONFIG_IP_MULTIPLE_TABLES
2584 res.r = NULL;
2585#endif
2586
813b3b5d
DM
2587 orig_daddr = fl4->daddr;
2588 orig_saddr = fl4->saddr;
2589 orig_oif = fl4->flowi4_oif;
2590
2591 fl4->flowi4_iif = net->loopback_dev->ifindex;
2592 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2593 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2594 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2595
010c2708 2596 rcu_read_lock();
813b3b5d 2597 if (fl4->saddr) {
b23dd4fe 2598 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2599 if (ipv4_is_multicast(fl4->saddr) ||
2600 ipv4_is_lbcast(fl4->saddr) ||
2601 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2602 goto out;
2603
1da177e4
LT
2604 /* I removed check for oif == dev_out->oif here.
2605 It was wrong for two reasons:
1ab35276
DL
2606 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2607 is assigned to multiple interfaces.
1da177e4
LT
2608 2. Moreover, we are allowed to send packets with saddr
2609 of another iface. --ANK
2610 */
2611
813b3b5d
DM
2612 if (fl4->flowi4_oif == 0 &&
2613 (ipv4_is_multicast(fl4->daddr) ||
2614 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2615 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2616 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2617 if (dev_out == NULL)
2618 goto out;
2619
1da177e4
LT
2620 /* Special hack: user can direct multicasts
2621 and limited broadcast via necessary interface
2622 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2623 This hack is not just for fun, it allows
2624 vic,vat and friends to work.
2625 They bind socket to loopback, set ttl to zero
2626 and expect that it will work.
2627 From the viewpoint of routing cache they are broken,
2628 because we are not allowed to build multicast path
2629 with loopback source addr (look, routing cache
2630 cannot know, that ttl is zero, so that packet
2631 will not leave this host and route is valid).
2632 Luckily, this hack is good workaround.
2633 */
2634
813b3b5d 2635 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2636 goto make_route;
2637 }
a210d01a 2638
813b3b5d 2639 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2640 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2641 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2642 goto out;
a210d01a 2643 }
1da177e4
LT
2644 }
2645
2646
813b3b5d
DM
2647 if (fl4->flowi4_oif) {
2648 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2649 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2650 if (dev_out == NULL)
2651 goto out;
e5ed6399
HX
2652
2653 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2654 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2655 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2656 goto out;
2657 }
813b3b5d
DM
2658 if (ipv4_is_local_multicast(fl4->daddr) ||
2659 ipv4_is_lbcast(fl4->daddr)) {
2660 if (!fl4->saddr)
2661 fl4->saddr = inet_select_addr(dev_out, 0,
2662 RT_SCOPE_LINK);
1da177e4
LT
2663 goto make_route;
2664 }
813b3b5d
DM
2665 if (fl4->saddr) {
2666 if (ipv4_is_multicast(fl4->daddr))
2667 fl4->saddr = inet_select_addr(dev_out, 0,
2668 fl4->flowi4_scope);
2669 else if (!fl4->daddr)
2670 fl4->saddr = inet_select_addr(dev_out, 0,
2671 RT_SCOPE_HOST);
1da177e4
LT
2672 }
2673 }
2674
813b3b5d
DM
2675 if (!fl4->daddr) {
2676 fl4->daddr = fl4->saddr;
2677 if (!fl4->daddr)
2678 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2679 dev_out = net->loopback_dev;
813b3b5d 2680 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2681 res.type = RTN_LOCAL;
2682 flags |= RTCF_LOCAL;
2683 goto make_route;
2684 }
2685
813b3b5d 2686 if (fib_lookup(net, fl4, &res)) {
1da177e4 2687 res.fi = NULL;
8b96d22d 2688 res.table = NULL;
813b3b5d 2689 if (fl4->flowi4_oif) {
1da177e4
LT
2690 /* Apparently, routing tables are wrong. Assume,
2691 that the destination is on link.
2692
2693 WHY? DW.
2694 Because we are allowed to send to iface
2695 even if it has NO routes and NO assigned
2696 addresses. When oif is specified, routing
2697 tables are looked up with only one purpose:
2698 to catch if destination is gatewayed, rather than
2699 direct. Moreover, if MSG_DONTROUTE is set,
2700 we send packet, ignoring both routing tables
2701 and ifaddr state. --ANK
2702
2703
2704 We could make it even if oif is unknown,
2705 likely IPv6, but we do not.
2706 */
2707
813b3b5d
DM
2708 if (fl4->saddr == 0)
2709 fl4->saddr = inet_select_addr(dev_out, 0,
2710 RT_SCOPE_LINK);
1da177e4
LT
2711 res.type = RTN_UNICAST;
2712 goto make_route;
2713 }
b23dd4fe 2714 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2715 goto out;
2716 }
1da177e4
LT
2717
2718 if (res.type == RTN_LOCAL) {
813b3b5d 2719 if (!fl4->saddr) {
9fc3bbb4 2720 if (res.fi->fib_prefsrc)
813b3b5d 2721 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2722 else
813b3b5d 2723 fl4->saddr = fl4->daddr;
9fc3bbb4 2724 }
b40afd0e 2725 dev_out = net->loopback_dev;
813b3b5d 2726 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2727 res.fi = NULL;
2728 flags |= RTCF_LOCAL;
2729 goto make_route;
2730 }
2731
2732#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2733 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2734 fib_select_multipath(&res);
1da177e4
LT
2735 else
2736#endif
21d8c49e
DM
2737 if (!res.prefixlen &&
2738 res.table->tb_num_default > 1 &&
813b3b5d 2739 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2740 fib_select_default(&res);
1da177e4 2741
813b3b5d
DM
2742 if (!fl4->saddr)
2743 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2744
1da177e4 2745 dev_out = FIB_RES_DEV(res);
813b3b5d 2746 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2747
2748
2749make_route:
813b3b5d 2750 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
f61759e6 2751 tos, dev_out, flags);
b23dd4fe 2752 if (!IS_ERR(rth)) {
5ada5527
DM
2753 unsigned int hash;
2754
813b3b5d 2755 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2756 rt_genid(dev_net(dev_out)));
813b3b5d 2757 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2758 }
1da177e4 2759
010c2708
DM
2760out:
2761 rcu_read_unlock();
b23dd4fe 2762 return rth;
1da177e4
LT
2763}
2764
813b3b5d 2765struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
1da177e4 2766{
1da177e4 2767 struct rtable *rth;
010c2708 2768 unsigned int hash;
1da177e4 2769
1080d709
NH
2770 if (!rt_caching(net))
2771 goto slow_output;
2772
9d6ec938 2773 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
1da177e4
LT
2774
2775 rcu_read_lock_bh();
a898def2 2776 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
d8d1f30b 2777 rth = rcu_dereference_bh(rth->dst.rt_next)) {
9d6ec938
DM
2778 if (rth->rt_key_dst == flp4->daddr &&
2779 rth->rt_key_src == flp4->saddr &&
c7537967 2780 rt_is_output_route(rth) &&
9d6ec938
DM
2781 rth->rt_oif == flp4->flowi4_oif &&
2782 rth->rt_mark == flp4->flowi4_mark &&
475949d8 2783 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
b5921910 2784 (IPTOS_RT_MASK | RTO_ONLINK)) &&
d8d1f30b 2785 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2786 !rt_is_expired(rth)) {
de398fb8 2787 ipv4_validate_peer(rth);
d8d1f30b 2788 dst_use(&rth->dst, jiffies);
1da177e4
LT
2789 RT_CACHE_STAT_INC(out_hit);
2790 rcu_read_unlock_bh();
56157872
DM
2791 if (!flp4->saddr)
2792 flp4->saddr = rth->rt_src;
2793 if (!flp4->daddr)
2794 flp4->daddr = rth->rt_dst;
b23dd4fe 2795 return rth;
1da177e4
LT
2796 }
2797 RT_CACHE_STAT_INC(out_hlist_search);
2798 }
2799 rcu_read_unlock_bh();
2800
1080d709 2801slow_output:
9d6ec938 2802 return ip_route_output_slow(net, flp4);
1da177e4 2803}
d8c97a94
ACM
2804EXPORT_SYMBOL_GPL(__ip_route_output_key);
2805
ae2688d5
JW
2806static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2807{
2808 return NULL;
2809}
2810
ebb762f2 2811static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2812{
618f9bc7
SK
2813 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2814
2815 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2816}
2817
14e50e57
DM
2818static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2819{
2820}
2821
0972ddb2
HB
2822static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2823 unsigned long old)
2824{
2825 return NULL;
2826}
2827
14e50e57
DM
2828static struct dst_ops ipv4_dst_blackhole_ops = {
2829 .family = AF_INET,
09640e63 2830 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57 2831 .destroy = ipv4_dst_destroy,
ae2688d5 2832 .check = ipv4_blackhole_dst_check,
ebb762f2 2833 .mtu = ipv4_blackhole_mtu,
214f45c9 2834 .default_advmss = ipv4_default_advmss,
14e50e57 2835 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
0972ddb2 2836 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2837 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2838};
2839
2774c131 2840struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2841{
5c1e6aa3 2842 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2774c131 2843 struct rtable *ort = (struct rtable *) dst_orig;
14e50e57
DM
2844
2845 if (rt) {
d8d1f30b 2846 struct dst_entry *new = &rt->dst;
14e50e57 2847
14e50e57 2848 new->__use = 1;
352e512c
HX
2849 new->input = dst_discard;
2850 new->output = dst_discard;
defb3519 2851 dst_copy_metrics(new, &ort->dst);
14e50e57 2852
d8d1f30b 2853 new->dev = ort->dst.dev;
14e50e57
DM
2854 if (new->dev)
2855 dev_hold(new->dev);
2856
5e2b61f7
DM
2857 rt->rt_key_dst = ort->rt_key_dst;
2858 rt->rt_key_src = ort->rt_key_src;
475949d8 2859 rt->rt_key_tos = ort->rt_key_tos;
1b86a58f 2860 rt->rt_route_iif = ort->rt_route_iif;
5e2b61f7
DM
2861 rt->rt_iif = ort->rt_iif;
2862 rt->rt_oif = ort->rt_oif;
2863 rt->rt_mark = ort->rt_mark;
14e50e57 2864
e84f84f2 2865 rt->rt_genid = rt_genid(net);
14e50e57
DM
2866 rt->rt_flags = ort->rt_flags;
2867 rt->rt_type = ort->rt_type;
2868 rt->rt_dst = ort->rt_dst;
2869 rt->rt_src = ort->rt_src;
14e50e57 2870 rt->rt_gateway = ort->rt_gateway;
97bab73f 2871 rt_transfer_peer(rt, ort);
62fa8a84
DM
2872 rt->fi = ort->fi;
2873 if (rt->fi)
2874 atomic_inc(&rt->fi->fib_clntref);
14e50e57
DM
2875
2876 dst_free(new);
2877 }
2878
2774c131
DM
2879 dst_release(dst_orig);
2880
2881 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2882}
2883
9d6ec938 2884struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2885 struct sock *sk)
1da177e4 2886{
9d6ec938 2887 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2888
b23dd4fe
DM
2889 if (IS_ERR(rt))
2890 return rt;
1da177e4 2891
56157872 2892 if (flp4->flowi4_proto)
9d6ec938
DM
2893 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2894 flowi4_to_flowi(flp4),
2895 sk, 0);
1da177e4 2896
b23dd4fe 2897 return rt;
1da177e4 2898}
d8c97a94
ACM
2899EXPORT_SYMBOL_GPL(ip_route_output_flow);
2900
4feb88e5
BT
2901static int rt_fill_info(struct net *net,
2902 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2903 int nowait, unsigned int flags)
1da177e4 2904{
511c3f92 2905 struct rtable *rt = skb_rtable(skb);
1da177e4 2906 struct rtmsg *r;
be403ea1 2907 struct nlmsghdr *nlh;
2bc8ca40 2908 unsigned long expires = 0;
e3703b3d 2909 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2910
2911 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2912 if (nlh == NULL)
26932566 2913 return -EMSGSIZE;
be403ea1
TG
2914
2915 r = nlmsg_data(nlh);
1da177e4
LT
2916 r->rtm_family = AF_INET;
2917 r->rtm_dst_len = 32;
2918 r->rtm_src_len = 0;
475949d8 2919 r->rtm_tos = rt->rt_key_tos;
1da177e4 2920 r->rtm_table = RT_TABLE_MAIN;
f3756b79
DM
2921 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2922 goto nla_put_failure;
1da177e4
LT
2923 r->rtm_type = rt->rt_type;
2924 r->rtm_scope = RT_SCOPE_UNIVERSE;
2925 r->rtm_protocol = RTPROT_UNSPEC;
2926 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2927 if (rt->rt_flags & RTCF_NOTIFY)
2928 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2929
f3756b79
DM
2930 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2931 goto nla_put_failure;
5e2b61f7 2932 if (rt->rt_key_src) {
1da177e4 2933 r->rtm_src_len = 32;
f3756b79
DM
2934 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2935 goto nla_put_failure;
1da177e4 2936 }
f3756b79
DM
2937 if (rt->dst.dev &&
2938 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2939 goto nla_put_failure;
c7066f70 2940#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2941 if (rt->dst.tclassid &&
2942 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2943 goto nla_put_failure;
1da177e4 2944#endif
41347dcd
DM
2945 if (!rt_is_input_route(rt) &&
2946 rt->rt_src != rt->rt_key_src) {
f3756b79
DM
2947 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2948 goto nla_put_failure;
2949 }
2950 if (rt->rt_dst != rt->rt_gateway &&
2951 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2952 goto nla_put_failure;
be403ea1 2953
defb3519 2954 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
be403ea1
TG
2955 goto nla_put_failure;
2956
f3756b79
DM
2957 if (rt->rt_mark &&
2958 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2959 goto nla_put_failure;
963bfeee 2960
d8d1f30b 2961 error = rt->dst.error;
97bab73f
DM
2962 if (rt_has_peer(rt)) {
2963 const struct inet_peer *peer = rt_peer_ptr(rt);
2964 inet_peer_refcheck(peer);
fe6fe792
ED
2965 id = atomic_read(&peer->ip_id_count) & 0xffff;
2966 if (peer->tcp_ts_stamp) {
2967 ts = peer->tcp_ts;
2968 tsage = get_seconds() - peer->tcp_ts_stamp;
1da177e4 2969 }
fe6fe792 2970 expires = ACCESS_ONCE(peer->pmtu_expires);
2bc8ca40
SK
2971 if (expires) {
2972 if (time_before(jiffies, expires))
2973 expires -= jiffies;
2974 else
2975 expires = 0;
2976 }
1da177e4 2977 }
be403ea1 2978
c7537967 2979 if (rt_is_input_route(rt)) {
1da177e4 2980#ifdef CONFIG_IP_MROUTE
e448515c 2981 __be32 dst = rt->rt_dst;
1da177e4 2982
f97c1e0c 2983 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5 2984 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
9a1b9496
DM
2985 int err = ipmr_get_route(net, skb,
2986 rt->rt_src, rt->rt_dst,
2987 r, nowait);
1da177e4
LT
2988 if (err <= 0) {
2989 if (!nowait) {
2990 if (err == 0)
2991 return 0;
be403ea1 2992 goto nla_put_failure;
1da177e4
LT
2993 } else {
2994 if (err == -EMSGSIZE)
be403ea1 2995 goto nla_put_failure;
e3703b3d 2996 error = err;
1da177e4
LT
2997 }
2998 }
2999 } else
3000#endif
f3756b79
DM
3001 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3002 goto nla_put_failure;
1da177e4
LT
3003 }
3004
d8d1f30b 3005 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
e3703b3d
TG
3006 expires, error) < 0)
3007 goto nla_put_failure;
be403ea1
TG
3008
3009 return nlmsg_end(skb, nlh);
1da177e4 3010
be403ea1 3011nla_put_failure:
26932566
PM
3012 nlmsg_cancel(skb, nlh);
3013 return -EMSGSIZE;
1da177e4
LT
3014}
3015
5e73ea1a 3016static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
1da177e4 3017{
3b1e0a65 3018 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
3019 struct rtmsg *rtm;
3020 struct nlattr *tb[RTA_MAX+1];
1da177e4 3021 struct rtable *rt = NULL;
9e12bb22
AV
3022 __be32 dst = 0;
3023 __be32 src = 0;
3024 u32 iif;
d889ce3b 3025 int err;
963bfeee 3026 int mark;
1da177e4
LT
3027 struct sk_buff *skb;
3028
d889ce3b
TG
3029 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3030 if (err < 0)
3031 goto errout;
3032
3033 rtm = nlmsg_data(nlh);
3034
1da177e4 3035 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
3036 if (skb == NULL) {
3037 err = -ENOBUFS;
3038 goto errout;
3039 }
1da177e4
LT
3040
3041 /* Reserve room for dummy headers, this skb can pass
3042 through good chunk of routing engine.
3043 */
459a98ed 3044 skb_reset_mac_header(skb);
c1d2bbe1 3045 skb_reset_network_header(skb);
d2c962b8
SH
3046
3047 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 3048 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
3049 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3050
17fb2c64
AV
3051 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3052 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 3053 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 3054 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4
LT
3055
3056 if (iif) {
d889ce3b
TG
3057 struct net_device *dev;
3058
1937504d 3059 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
3060 if (dev == NULL) {
3061 err = -ENODEV;
3062 goto errout_free;
3063 }
3064
1da177e4
LT
3065 skb->protocol = htons(ETH_P_IP);
3066 skb->dev = dev;
963bfeee 3067 skb->mark = mark;
1da177e4
LT
3068 local_bh_disable();
3069 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3070 local_bh_enable();
d889ce3b 3071
511c3f92 3072 rt = skb_rtable(skb);
d8d1f30b
CG
3073 if (err == 0 && rt->dst.error)
3074 err = -rt->dst.error;
1da177e4 3075 } else {
68a5e3dd
DM
3076 struct flowi4 fl4 = {
3077 .daddr = dst,
3078 .saddr = src,
3079 .flowi4_tos = rtm->rtm_tos,
3080 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3081 .flowi4_mark = mark,
d889ce3b 3082 };
9d6ec938 3083 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
3084
3085 err = 0;
3086 if (IS_ERR(rt))
3087 err = PTR_ERR(rt);
1da177e4 3088 }
d889ce3b 3089
1da177e4 3090 if (err)
d889ce3b 3091 goto errout_free;
1da177e4 3092
d8d1f30b 3093 skb_dst_set(skb, &rt->dst);
1da177e4
LT
3094 if (rtm->rtm_flags & RTM_F_NOTIFY)
3095 rt->rt_flags |= RTCF_NOTIFY;
3096
4feb88e5 3097 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 3098 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
3099 if (err <= 0)
3100 goto errout_free;
1da177e4 3101
1937504d 3102 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 3103errout:
2942e900 3104 return err;
1da177e4 3105
d889ce3b 3106errout_free:
1da177e4 3107 kfree_skb(skb);
d889ce3b 3108 goto errout;
1da177e4
LT
3109}
3110
3111int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3112{
3113 struct rtable *rt;
3114 int h, s_h;
3115 int idx, s_idx;
1937504d
DL
3116 struct net *net;
3117
3b1e0a65 3118 net = sock_net(skb->sk);
1da177e4
LT
3119
3120 s_h = cb->args[0];
d8c92830
ED
3121 if (s_h < 0)
3122 s_h = 0;
1da177e4 3123 s_idx = idx = cb->args[1];
a6272665
ED
3124 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3125 if (!rt_hash_table[h].chain)
3126 continue;
1da177e4 3127 rcu_read_lock_bh();
a898def2 3128 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
d8d1f30b
CG
3129 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3130 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
1da177e4 3131 continue;
e84f84f2 3132 if (rt_is_expired(rt))
29e75252 3133 continue;
d8d1f30b 3134 skb_dst_set_noref(skb, &rt->dst);
4feb88e5 3135 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 3136 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 3137 1, NLM_F_MULTI) <= 0) {
adf30907 3138 skb_dst_drop(skb);
1da177e4
LT
3139 rcu_read_unlock_bh();
3140 goto done;
3141 }
adf30907 3142 skb_dst_drop(skb);
1da177e4
LT
3143 }
3144 rcu_read_unlock_bh();
3145 }
3146
3147done:
3148 cb->args[0] = h;
3149 cb->args[1] = idx;
3150 return skb->len;
3151}
3152
3153void ip_rt_multicast_event(struct in_device *in_dev)
3154{
76e6ebfb 3155 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
3156}
3157
3158#ifdef CONFIG_SYSCTL
81c684d1 3159static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 3160 void __user *buffer,
1da177e4
LT
3161 size_t *lenp, loff_t *ppos)
3162{
3163 if (write) {
639e104f 3164 int flush_delay;
81c684d1 3165 ctl_table ctl;
39a23e75 3166 struct net *net;
639e104f 3167
81c684d1
DL
3168 memcpy(&ctl, __ctl, sizeof(ctl));
3169 ctl.data = &flush_delay;
8d65af78 3170 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 3171
81c684d1 3172 net = (struct net *)__ctl->extra1;
39a23e75 3173 rt_cache_flush(net, flush_delay);
1da177e4 3174 return 0;
e905a9ed 3175 }
1da177e4
LT
3176
3177 return -EINVAL;
3178}
3179
eeb61f71 3180static ctl_table ipv4_route_table[] = {
1da177e4 3181 {
1da177e4
LT
3182 .procname = "gc_thresh",
3183 .data = &ipv4_dst_ops.gc_thresh,
3184 .maxlen = sizeof(int),
3185 .mode = 0644,
6d9f239a 3186 .proc_handler = proc_dointvec,
1da177e4
LT
3187 },
3188 {
1da177e4
LT
3189 .procname = "max_size",
3190 .data = &ip_rt_max_size,
3191 .maxlen = sizeof(int),
3192 .mode = 0644,
6d9f239a 3193 .proc_handler = proc_dointvec,
1da177e4
LT
3194 },
3195 {
3196 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3197
1da177e4
LT
3198 .procname = "gc_min_interval",
3199 .data = &ip_rt_gc_min_interval,
3200 .maxlen = sizeof(int),
3201 .mode = 0644,
6d9f239a 3202 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3203 },
3204 {
1da177e4
LT
3205 .procname = "gc_min_interval_ms",
3206 .data = &ip_rt_gc_min_interval,
3207 .maxlen = sizeof(int),
3208 .mode = 0644,
6d9f239a 3209 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
3210 },
3211 {
1da177e4
LT
3212 .procname = "gc_timeout",
3213 .data = &ip_rt_gc_timeout,
3214 .maxlen = sizeof(int),
3215 .mode = 0644,
6d9f239a 3216 .proc_handler = proc_dointvec_jiffies,
1da177e4 3217 },
9f28a2fc
ED
3218 {
3219 .procname = "gc_interval",
3220 .data = &ip_rt_gc_interval,
3221 .maxlen = sizeof(int),
3222 .mode = 0644,
3223 .proc_handler = proc_dointvec_jiffies,
3224 },
1da177e4 3225 {
1da177e4
LT
3226 .procname = "redirect_load",
3227 .data = &ip_rt_redirect_load,
3228 .maxlen = sizeof(int),
3229 .mode = 0644,
6d9f239a 3230 .proc_handler = proc_dointvec,
1da177e4
LT
3231 },
3232 {
1da177e4
LT
3233 .procname = "redirect_number",
3234 .data = &ip_rt_redirect_number,
3235 .maxlen = sizeof(int),
3236 .mode = 0644,
6d9f239a 3237 .proc_handler = proc_dointvec,
1da177e4
LT
3238 },
3239 {
1da177e4
LT
3240 .procname = "redirect_silence",
3241 .data = &ip_rt_redirect_silence,
3242 .maxlen = sizeof(int),
3243 .mode = 0644,
6d9f239a 3244 .proc_handler = proc_dointvec,
1da177e4
LT
3245 },
3246 {
1da177e4
LT
3247 .procname = "error_cost",
3248 .data = &ip_rt_error_cost,
3249 .maxlen = sizeof(int),
3250 .mode = 0644,
6d9f239a 3251 .proc_handler = proc_dointvec,
1da177e4
LT
3252 },
3253 {
1da177e4
LT
3254 .procname = "error_burst",
3255 .data = &ip_rt_error_burst,
3256 .maxlen = sizeof(int),
3257 .mode = 0644,
6d9f239a 3258 .proc_handler = proc_dointvec,
1da177e4
LT
3259 },
3260 {
1da177e4
LT
3261 .procname = "gc_elasticity",
3262 .data = &ip_rt_gc_elasticity,
3263 .maxlen = sizeof(int),
3264 .mode = 0644,
6d9f239a 3265 .proc_handler = proc_dointvec,
1da177e4
LT
3266 },
3267 {
1da177e4
LT
3268 .procname = "mtu_expires",
3269 .data = &ip_rt_mtu_expires,
3270 .maxlen = sizeof(int),
3271 .mode = 0644,
6d9f239a 3272 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3273 },
3274 {
1da177e4
LT
3275 .procname = "min_pmtu",
3276 .data = &ip_rt_min_pmtu,
3277 .maxlen = sizeof(int),
3278 .mode = 0644,
6d9f239a 3279 .proc_handler = proc_dointvec,
1da177e4
LT
3280 },
3281 {
1da177e4
LT
3282 .procname = "min_adv_mss",
3283 .data = &ip_rt_min_advmss,
3284 .maxlen = sizeof(int),
3285 .mode = 0644,
6d9f239a 3286 .proc_handler = proc_dointvec,
1da177e4 3287 },
f8572d8f 3288 { }
1da177e4 3289};
39a23e75 3290
39a23e75
DL
3291static struct ctl_table ipv4_route_flush_table[] = {
3292 {
39a23e75
DL
3293 .procname = "flush",
3294 .maxlen = sizeof(int),
3295 .mode = 0200,
6d9f239a 3296 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3297 },
f8572d8f 3298 { },
39a23e75
DL
3299};
3300
3301static __net_init int sysctl_route_net_init(struct net *net)
3302{
3303 struct ctl_table *tbl;
3304
3305 tbl = ipv4_route_flush_table;
09ad9bc7 3306 if (!net_eq(net, &init_net)) {
39a23e75
DL
3307 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3308 if (tbl == NULL)
3309 goto err_dup;
3310 }
3311 tbl[0].extra1 = net;
3312
ec8f23ce 3313 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
39a23e75
DL
3314 if (net->ipv4.route_hdr == NULL)
3315 goto err_reg;
3316 return 0;
3317
3318err_reg:
3319 if (tbl != ipv4_route_flush_table)
3320 kfree(tbl);
3321err_dup:
3322 return -ENOMEM;
3323}
3324
3325static __net_exit void sysctl_route_net_exit(struct net *net)
3326{
3327 struct ctl_table *tbl;
3328
3329 tbl = net->ipv4.route_hdr->ctl_table_arg;
3330 unregister_net_sysctl_table(net->ipv4.route_hdr);
3331 BUG_ON(tbl == ipv4_route_flush_table);
3332 kfree(tbl);
3333}
3334
3335static __net_initdata struct pernet_operations sysctl_route_ops = {
3336 .init = sysctl_route_net_init,
3337 .exit = sysctl_route_net_exit,
3338};
1da177e4
LT
3339#endif
3340
3ee94372 3341static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3342{
3ee94372
NH
3343 get_random_bytes(&net->ipv4.rt_genid,
3344 sizeof(net->ipv4.rt_genid));
436c3b66
DM
3345 get_random_bytes(&net->ipv4.dev_addr_genid,
3346 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
3347 return 0;
3348}
3349
3ee94372
NH
3350static __net_initdata struct pernet_operations rt_genid_ops = {
3351 .init = rt_genid_init,
9f5e97e5
DL
3352};
3353
c3426b47
DM
3354static int __net_init ipv4_inetpeer_init(struct net *net)
3355{
3356 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3357
3358 if (!bp)
3359 return -ENOMEM;
3360 inet_peer_base_init(bp);
3361 net->ipv4.peers = bp;
3362 return 0;
3363}
3364
3365static void __net_exit ipv4_inetpeer_exit(struct net *net)
3366{
3367 struct inet_peer_base *bp = net->ipv4.peers;
3368
3369 net->ipv4.peers = NULL;
56a6b248 3370 inetpeer_invalidate_tree(bp);
c3426b47
DM
3371 kfree(bp);
3372}
3373
3374static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3375 .init = ipv4_inetpeer_init,
3376 .exit = ipv4_inetpeer_exit,
3377};
9f5e97e5 3378
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU routing-realm accounting buckets (256 entries per CPU),
 * allocated in ip_rt_init().
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3382
3383static __initdata unsigned long rhash_entries;
3384static int __init set_rhash_entries(char *str)
3385{
413c27d8
EZ
3386 ssize_t ret;
3387
1da177e4
LT
3388 if (!str)
3389 return 0;
413c27d8
EZ
3390
3391 ret = kstrtoul(str, 0, &rhash_entries);
3392 if (ret)
3393 return 0;
3394
1da177e4
LT
3395 return 1;
3396}
3397__setup("rhash_entries=", set_rhash_entries);
3398
3399int __init ip_rt_init(void)
3400{
424c4b70 3401 int rc = 0;
1da177e4 3402
c7066f70 3403#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3404 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3405 if (!ip_rt_acct)
3406 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3407#endif
3408
e5d679f3
AD
3409 ipv4_dst_ops.kmem_cachep =
3410 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3411 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3412
14e50e57
DM
3413 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3414
fc66f95c
ED
3415 if (dst_entries_init(&ipv4_dst_ops) < 0)
3416 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3417
3418 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3419 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3420
424c4b70
ED
3421 rt_hash_table = (struct rt_hash_bucket *)
3422 alloc_large_system_hash("IP route cache",
3423 sizeof(struct rt_hash_bucket),
3424 rhash_entries,
4481374c 3425 (totalram_pages >= 128 * 1024) ?
18955cfc 3426 15 : 17,
8d1502de 3427 0,
424c4b70
ED
3428 &rt_hash_log,
3429 &rt_hash_mask,
31fe62b9 3430 0,
c9503e0f 3431 rhash_entries ? 0 : 512 * 1024);
22c047cc
ED
3432 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3433 rt_hash_lock_init();
1da177e4
LT
3434
3435 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3436 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3437
1da177e4
LT
3438 devinet_init();
3439 ip_fib_init();
3440
9f28a2fc
ED
3441 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3442 expires_ljiffies = jiffies;
3443 schedule_delayed_work(&expires_work,
3444 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3445
73b38711 3446 if (ip_rt_proc_init())
058bd4d2 3447 pr_err("Unable to create route proc files\n");
1da177e4
LT
3448#ifdef CONFIG_XFRM
3449 xfrm_init();
a33bc5c1 3450 xfrm4_init(ip_rt_max_size);
1da177e4 3451#endif
c7ac8679 3452 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3453
39a23e75
DL
3454#ifdef CONFIG_SYSCTL
3455 register_pernet_subsys(&sysctl_route_ops);
3456#endif
3ee94372 3457 register_pernet_subsys(&rt_genid_ops);
c3426b47 3458 register_pernet_subsys(&ipv4_inetpeer_ops);
1da177e4
LT
3459 return rc;
3460}
3461
#ifdef CONFIG_SYSCTL
/*
 * Early registration of the static ipv4 route table for init_net.  The
 * ipv4 init order really needs sanitizing; once it is, this special
 * case can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif