ipv4: Fix warnings in ip_do_redirect() for some configurations.
[linux-2.6-block.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4 72#include <linux/mm.h>
424c4b70 73#include <linux/bootmem.h>
1da177e4
LT
74#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
39c90ece 83#include <linux/workqueue.h>
1da177e4 84#include <linux/skbuff.h>
1da177e4
LT
85#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
5a0e3ad6 94#include <linux/slab.h>
b9eda06f 95#include <linux/prefetch.h>
352e512c 96#include <net/dst.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
8d71740c 108#include <net/netevent.h>
63f3444f 109#include <net/rtnetlink.h>
1da177e4
LT
110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
7426a564 112#include <linux/kmemleak.h>
1da177e4 113#endif
6e5714ea 114#include <net/secure_seq.h>
1da177e4 115
68a5e3dd 116#define RT_FL_TOS(oldflp4) \
f61759e6 117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
118
119#define IP_MAX_MTU 0xFFF0
120
121#define RT_GC_TIMEOUT (300*HZ)
122
1da177e4 123static int ip_rt_max_size;
817bc4db 124static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
9f28a2fc 125static int ip_rt_gc_interval __read_mostly = 60 * HZ;
817bc4db
SH
126static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127static int ip_rt_redirect_number __read_mostly = 9;
128static int ip_rt_redirect_load __read_mostly = HZ / 50;
129static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130static int ip_rt_error_cost __read_mostly = HZ;
131static int ip_rt_error_burst __read_mostly = 5 * HZ;
132static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256;
1080d709 136static int rt_chain_length_max __read_mostly = 20;
1da177e4 137
9f28a2fc
ED
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140
1da177e4
LT
141/*
142 * Interface to generic destination cache.
143 */
144
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 146static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 147static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4
LT
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
e47a185b 152static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
569d3645 153static int rt_garbage_collect(struct dst_ops *ops);
1da177e4 154
72cdd1d9
ED
/* ->ifdown handler: nothing extra to do for IPv4 routes; generic dst
 * teardown already drops the device reference.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

/* Cached IPv4 routes never need copy-on-write metrics, so this callback
 * should be unreachable; warn loudly if it ever fires.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

/* Interface to the generic destination cache. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
187
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the 4-bit IP TOS field to a packet-scheduler priority band.
 * Entries alternate: TOS value without / with the ECN bit set (the ECN
 * bit does not change the cost, hence ECN_OR_COST()).
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
1da177e4
LT
209
210/*
211 * Route cache.
212 */
213
214/* The locking scheme is rather straight forward:
215 *
216 * 1) Read-Copy Update protects the buckets of the central route hash.
217 * 2) Only writers remove entries, and they hold the lock
218 * as they look at rtable reference counts.
219 * 3) Only readers acquire references to rtable entries,
220 * they do so with atomic increments and with the
221 * lock held.
222 */
223
/* One bucket of the central route-cache hash: an RCU-protected chain. */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Buckets share locks: bucket slot -> lock via the low bits. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the shared bucket-lock table; the route cache
 * cannot operate without it, hence panic() on allocation failure.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: spin_lock_bh(NULL) is a no-op, no table. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
1da177e4 273
817bc4db 274static struct rt_hash_bucket *rt_hash_table __read_mostly;
95c96174 275static unsigned int rt_hash_mask __read_mostly;
817bc4db 276static unsigned int rt_hash_log __read_mostly;
1da177e4 277
2f970d83 278static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 279#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 280
/* Hash (daddr, saddr, idx) to a route-cache bucket index.  The
 * namespace generation id is mixed in, so bumping the genid implicitly
 * moves all entries to different chains (cheap invalidation).
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

/* Current route-cache generation for this network namespace. */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
293
1da177e4
LT
294#ifdef CONFIG_PROC_FS
295struct rt_cache_iter_state {
a75e936f 296 struct seq_net_private p;
1da177e4 297 int bucket;
29e75252 298 int genid;
1da177e4
LT
299};
300
/* Find the first live cache entry for this seq_file's namespace and
 * generation, scanning buckets from the top of the table downward.
 * On success we return with rcu_read_lock_bh() held; it is released by
 * __rt_cache_get_next() when a chain runs out, or by rt_cache_seq_stop().
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

/* Advance to the next entry, walking down to lower buckets as chains
 * are exhausted.  Returns NULL with the RCU BH lock dropped when the
 * whole table has been traversed.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

/* Like __rt_cache_get_next(), but skip entries belonging to other
 * namespaces or made obsolete by a generation bump.
 */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
352
1218854a 353static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 354{
1218854a 355 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
356
357 if (r)
1218854a 358 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
359 --pos;
360 return pos ? NULL : r;
361}
362
/* seq_file ->start: at offset 0 latch the current genid (so the whole
 * walk sees one consistent generation) and emit the header token;
 * otherwise reposition to entry *pos - 1.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
371
372static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
373{
29e75252 374 struct rtable *r;
1da177e4
LT
375
376 if (v == SEQ_START_TOKEN)
1218854a 377 r = rt_cache_get_first(seq);
1da177e4 378 else
1218854a 379 r = rt_cache_get_next(seq, v);
1da177e4
LT
380 ++*pos;
381 return r;
382}
383
/* seq_file ->stop: release the RCU BH lock taken by the getters, unless
 * we never advanced past the header token.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

/* Emit one /proc/net/rt_cache line (or the header).  Lines are padded
 * to 127 characters for compatibility with fixed-width parsers; %n
 * captures the length written so far for the padding computation.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW), 0,
			   r->rt_key_tos,
			   -1, 0, 0, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
438
439
/* seq_file iterator over per-cpu rt_cache statistics.  *pos encodes
 * "next candidate cpu + 1"; offset 0 yields the header token.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

/* Advance to the next possible cpu's statistics block. */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

/* Nothing to release: this iterator takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

/* One /proc/net/stat/rt_cache line: the global entry count followed by
 * this cpu's counters, all printed in hex.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
529
#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: sum the per-cpu, per-realm byte/packet counters
 * into a temporary table and dump it as raw binary (256 records).
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
107f1634 568
/* Create the per-namespace proc entries (rt_cache, per-cpu stats and,
 * with CONFIG_IP_ROUTE_CLASSID, rt_acct).  On failure, tear down in
 * reverse order via the goto ladder and return -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

/* Mirror of ip_rt_do_proc_init(): remove every proc entry we created. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* No procfs: registration is a successful no-op. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
e905a9ed 625
/* Free a cache entry after an RCU BH grace period (readers traverse
 * chains under rcu_read_lock_bh()).
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

/* Drop our reference, then schedule the entry for RCU freeing. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggresively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

/* Worth keeping: redirected/notify-flagged routes and routes carrying
 * an expiry (e.g. a learned PMTU).
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
650
651static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
652{
653 unsigned long age;
654 int ret = 0;
655
d8d1f30b 656 if (atomic_read(&rth->dst.__refcnt))
1da177e4
LT
657 goto out;
658
d8d1f30b 659 age = jiffies - rth->dst.lastuse;
1da177e4
LT
660 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
661 (age <= tmo2 && rt_valuable(rth)))
662 goto out;
663 ret = 1;
664out: return ret;
665}
666
667/* Bits of score are:
668 * 31: very valuable
669 * 30: not quite useless
670 * 29..0: usage counter
671 */
672static inline u32 rt_score(struct rtable *rt)
673{
d8d1f30b 674 u32 score = jiffies - rt->dst.lastuse;
1da177e4
LT
675
676 score = ~score & ~(3<<30);
677
678 if (rt_valuable(rt))
679 score |= (1<<31);
680
c7537967 681 if (rt_is_output_route(rt) ||
1da177e4
LT
682 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
683 score |= (1<<30);
684
685 return score;
686}
687
/* True while the cache is enabled: emergency rebuilds disable caching
 * once the per-netns rebuild count exceeds the sysctl limit.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

/* Compare only the fields that feed rt_hash(): daddr, saddr, iif.
 * XOR/OR trick yields a single branch-free equality test.
 */
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

/* Full lookup-key comparison: mark, tos and oif must also match. */
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

/* Entries only match if they belong to the same network namespace. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

/* An entry whose genid no longer matches its namespace's is stale. */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
721
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the later case, we want to be reschedule if necessary
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek: skip empty buckets without taking the lock. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Unlink matching entries onto a private list under the
		 * bucket lock; actual freeing happens after unlock so we
		 * hold the lock as briefly as possible.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			/* net == NULL means flush every namespace. */
			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
772
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
802
/* Incrementally scan the hash table (resuming at the static 'rover'
 * bucket), freeing expired entries and collecting chain-length
 * statistics used to adapt rt_chain_length_max.  The number of buckets
 * scanned per call is proportional to the time elapsed since the last
 * call, so a full table pass takes roughly ip_rt_gc_timeout jiffies.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	/* goal = table_size * delta / ip_rt_gc_timeout, capped at one
	 * full pass.
	 */
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth) ||
			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}

			/* We only count entries on a chain with equal
			 * hash inputs once so that entries for
			 * different QOS levels, and other non-hash
			 * input attributes don't unfairly skew the
			 * length computation
			 */
			tmo >>= 1;
			rthp = &rth->dst.rt_next;
			length += has_noalias(rt_hash_table[i].chain, rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* rt_chain_length_max = max(elasticity, avg + 4*stddev),
		 * in FRACT_BITS fixed point.
		 */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
881
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0 : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

/* A hash chain grew past rt_chain_length_max: warn and invalidate the
 * whole cache (a genid bump empties every chain logically).
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
918
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

/* dst_ops ->gc callback.  Returns 0 on success (or when GC is skipped),
 * 1 only when the cache is genuinely overflowing and nothing could be
 * reclaimed.  Uses function-static state (expire/last_gc/rover/
 * equilibrium) — serialized by the caller, dst_alloc().
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			/* Per-chain timeout halves with chain depth, so
			 * entries deep in a long chain expire sooner.
			 */
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halfed.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Reclaimed enough: relax the expiration strength again. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1050
98376387
ED
1051/*
1052 * Returns number of entries in a hash chain that have different hash_inputs
1053 */
1054static int slow_chain_length(const struct rtable *head)
1055{
1056 int length = 0;
1057 const struct rtable *rth = head;
1058
1059 while (rth) {
1060 length += has_noalias(head, rth);
1c31720a 1061 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
98376387
ED
1062 }
1063 return length >> FRACT_BITS;
1064}
1065
f894cbf8
DM
1066static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1067 struct sk_buff *skb,
1068 const void *daddr)
3769cffb 1069{
d3aaeb38
DM
1070 struct net_device *dev = dst->dev;
1071 const __be32 *pkey = daddr;
39232973 1072 const struct rtable *rt;
3769cffb
DM
1073 struct neighbour *n;
1074
39232973 1075 rt = (const struct rtable *) dst;
a263b309 1076 if (rt->rt_gateway)
39232973 1077 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
1078 else if (skb)
1079 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 1080
80703d26 1081 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
1082 if (n)
1083 return n;
32092ecf 1084 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
1085}
1086
95c96174 1087static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
b23dd4fe 1088 struct sk_buff *skb, int ifindex)
1da177e4 1089{
1c31720a
ED
1090 struct rtable *rth, *cand;
1091 struct rtable __rcu **rthp, **candp;
1da177e4 1092 unsigned long now;
1da177e4
LT
1093 u32 min_score;
1094 int chain_length;
1da177e4
LT
1095
1096restart:
1097 chain_length = 0;
1098 min_score = ~(u32)0;
1099 cand = NULL;
1100 candp = NULL;
1101 now = jiffies;
1102
7586eceb 1103 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
73e42897
NH
1104 /*
1105 * If we're not caching, just tell the caller we
1106 * were successful and don't touch the route. The
1107 * caller hold the sole reference to the cache entry, and
1108 * it will be released when the caller is done with it.
1109 * If we drop it here, the callers have no way to resolve routes
1110 * when we're not caching. Instead, just point *rp at rt, so
1111 * the caller gets a single use out of the route
b6280b47
NH
1112 * Note that we do rt_free on this new route entry, so that
1113 * once its refcount hits zero, we are still able to reap it
1114 * (Thanks Alexey)
27b75c95
ED
1115 * Note: To avoid expensive rcu stuff for this uncached dst,
1116 * we set DST_NOCACHE so that dst_release() can free dst without
1117 * waiting a grace period.
73e42897 1118 */
b6280b47 1119
c7d4426a 1120 rt->dst.flags |= DST_NOCACHE;
b6280b47 1121 goto skip_hashing;
1080d709
NH
1122 }
1123
1da177e4
LT
1124 rthp = &rt_hash_table[hash].chain;
1125
22c047cc 1126 spin_lock_bh(rt_hash_lock_addr(hash));
1c31720a
ED
1127 while ((rth = rcu_dereference_protected(*rthp,
1128 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1129 if (rt_is_expired(rth)) {
d8d1f30b 1130 *rthp = rth->dst.rt_next;
29e75252
ED
1131 rt_free(rth);
1132 continue;
1133 }
5e2b61f7 1134 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1da177e4 1135 /* Put it first */
d8d1f30b 1136 *rthp = rth->dst.rt_next;
1da177e4
LT
1137 /*
1138 * Since lookup is lockfree, the deletion
1139 * must be visible to another weakly ordered CPU before
1140 * the insertion at the start of the hash chain.
1141 */
d8d1f30b 1142 rcu_assign_pointer(rth->dst.rt_next,
1da177e4
LT
1143 rt_hash_table[hash].chain);
1144 /*
1145 * Since lookup is lockfree, the update writes
1146 * must be ordered for consistency on SMP.
1147 */
1148 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149
d8d1f30b 1150 dst_use(&rth->dst, now);
22c047cc 1151 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1152
1153 rt_drop(rt);
b23dd4fe 1154 if (skb)
d8d1f30b 1155 skb_dst_set(skb, &rth->dst);
b23dd4fe 1156 return rth;
1da177e4
LT
1157 }
1158
d8d1f30b 1159 if (!atomic_read(&rth->dst.__refcnt)) {
1da177e4
LT
1160 u32 score = rt_score(rth);
1161
1162 if (score <= min_score) {
1163 cand = rth;
1164 candp = rthp;
1165 min_score = score;
1166 }
1167 }
1168
1169 chain_length++;
1170
d8d1f30b 1171 rthp = &rth->dst.rt_next;
1da177e4
LT
1172 }
1173
1174 if (cand) {
1175 /* ip_rt_gc_elasticity used to be average length of chain
1176 * length, when exceeded gc becomes really aggressive.
1177 *
1178 * The second limit is less certain. At the moment it allows
1179 * only 2 entries per bucket. We will see.
1180 */
1181 if (chain_length > ip_rt_gc_elasticity) {
d8d1f30b 1182 *candp = cand->dst.rt_next;
1da177e4
LT
1183 rt_free(cand);
1184 }
1080d709 1185 } else {
98376387
ED
1186 if (chain_length > rt_chain_length_max &&
1187 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
d8d1f30b 1188 struct net *net = dev_net(rt->dst.dev);
1080d709 1189 int num = ++net->ipv4.current_rt_cache_rebuild_count;
b35ecb5d 1190 if (!rt_caching(net)) {
058bd4d2 1191 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
d8d1f30b 1192 rt->dst.dev->name, num);
1080d709 1193 }
b35ecb5d 1194 rt_emergency_hash_rebuild(net);
6a2bad70
PE
1195 spin_unlock_bh(rt_hash_lock_addr(hash));
1196
5e2b61f7 1197 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
6a2bad70
PE
1198 ifindex, rt_genid(net));
1199 goto restart;
1080d709 1200 }
1da177e4
LT
1201 }
1202
d8d1f30b 1203 rt->dst.rt_next = rt_hash_table[hash].chain;
1080d709 1204
00269b54
ED
1205 /*
1206 * Since lookup is lockfree, we must make sure
25985edc 1207 * previous writes to rt are committed to memory
00269b54
ED
1208 * before making rt visible to other CPUS.
1209 */
1ddbcb00 1210 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1080d709 1211
22c047cc 1212 spin_unlock_bh(rt_hash_lock_addr(hash));
73e42897 1213
b6280b47 1214skip_hashing:
b23dd4fe 1215 if (skb)
d8d1f30b 1216 skb_dst_set(skb, &rt->dst);
b23dd4fe 1217 return rt;
1da177e4
LT
1218}
1219
1da177e4
LT
1220/*
1221 * Peer allocation may fail only in serious out-of-memory conditions. However
1222 * we still can generate some output.
1223 * Random ID selection looks a bit dangerous because we have no chances to
1224 * select ID being unique in a reasonable period of time.
1225 * But broken packet identifier may be better than no packet at all.
1226 */
1227static void ip_select_fb_ident(struct iphdr *iph)
1228{
1229 static DEFINE_SPINLOCK(ip_fb_id_lock);
1230 static u32 ip_fallback_id;
1231 u32 salt;
1232
1233 spin_lock_bh(&ip_fb_id_lock);
e448515c 1234 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1235 iph->id = htons(salt & 0xFFFF);
1236 ip_fallback_id = salt;
1237 spin_unlock_bh(&ip_fb_id_lock);
1238}
1239
1240void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1241{
1d861aa4
DM
1242 struct net *net = dev_net(dst->dev);
1243 struct inet_peer *peer;
1da177e4 1244
1d861aa4
DM
1245 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1246 if (peer) {
1247 iph->id = htons(inet_getid(peer, more));
1248 inet_putpeer(peer);
1249 return;
1250 }
1da177e4
LT
1251
1252 ip_select_fb_ident(iph);
1253}
4bc2f18b 1254EXPORT_SYMBOL(__ip_select_ident);
1da177e4 1255
95c96174 1256static void rt_del(unsigned int hash, struct rtable *rt)
1da177e4 1257{
1c31720a
ED
1258 struct rtable __rcu **rthp;
1259 struct rtable *aux;
1da177e4 1260
29e75252 1261 rthp = &rt_hash_table[hash].chain;
22c047cc 1262 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1263 ip_rt_put(rt);
1c31720a
ED
1264 while ((aux = rcu_dereference_protected(*rthp,
1265 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1266 if (aux == rt || rt_is_expired(aux)) {
d8d1f30b 1267 *rthp = aux->dst.rt_next;
29e75252
ED
1268 rt_free(aux);
1269 continue;
1da177e4 1270 }
d8d1f30b 1271 rthp = &aux->dst.rt_next;
29e75252 1272 }
22c047cc 1273 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1274}
1275
e47a185b 1276static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
1da177e4 1277{
e47a185b 1278 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 1279 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 1280 struct net_device *dev = skb->dev;
e47a185b
DM
1281 struct in_device *in_dev;
1282 struct neighbour *n;
1283 struct rtable *rt;
317805b8 1284 struct net *net;
1da177e4 1285
94206125
DM
1286 switch (icmp_hdr(skb)->code & 7) {
1287 case ICMP_REDIR_NET:
1288 case ICMP_REDIR_NETTOS:
1289 case ICMP_REDIR_HOST:
1290 case ICMP_REDIR_HOSTTOS:
1291 break;
1292
1293 default:
1294 return;
1295 }
1296
e47a185b
DM
1297 rt = (struct rtable *) dst;
1298 if (rt->rt_gateway != old_gw)
1299 return;
1300
1301 in_dev = __in_dev_get_rcu(dev);
1302 if (!in_dev)
1303 return;
1304
c346dca1 1305 net = dev_net(dev);
9d4fb27d
JP
1306 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1307 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1308 ipv4_is_zeronet(new_gw))
1da177e4
LT
1309 goto reject_redirect;
1310
1311 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1312 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1313 goto reject_redirect;
1314 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1315 goto reject_redirect;
1316 } else {
317805b8 1317 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
1318 goto reject_redirect;
1319 }
1320
e47a185b
DM
1321 n = ipv4_neigh_lookup(dst, NULL, &new_gw);
1322 if (n) {
1323 if (!(n->nud_state & NUD_VALID)) {
1324 neigh_event_send(n, NULL);
1325 } else {
1326 rt->rt_gateway = new_gw;
1327 rt->rt_flags |= RTCF_REDIRECTED;
1328 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1329 }
1330 neigh_release(n);
1331 }
1332 return;
1333
1334reject_redirect:
1335#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
1336 if (IN_DEV_LOG_MARTIANS(in_dev)) {
1337 const struct iphdr *iph = (const struct iphdr *) skb->data;
1338 __be32 daddr = iph->daddr;
1339 __be32 saddr = iph->saddr;
1340
e47a185b
DM
1341 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1342 " Advised path = %pI4 -> %pI4\n",
1343 &old_gw, dev->name, &new_gw,
1344 &saddr, &daddr);
99ee038d 1345 }
e47a185b
DM
1346#endif
1347 ;
1348}
1349
1da177e4
LT
1350static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1351{
ee6b9673 1352 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1353 struct dst_entry *ret = dst;
1354
1355 if (rt) {
d11a4dc1 1356 if (dst->obsolete > 0) {
1da177e4
LT
1357 ip_rt_put(rt);
1358 ret = NULL;
5943634f
DM
1359 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1360 rt->dst.expires) {
95c96174 1361 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
5e2b61f7 1362 rt->rt_oif,
e84f84f2 1363 rt_genid(dev_net(dst->dev)));
1da177e4
LT
1364 rt_del(hash, rt);
1365 ret = NULL;
1366 }
1367 }
1368 return ret;
1369}
1370
1371/*
1372 * Algorithm:
1373 * 1. The first ip_rt_redirect_number redirects are sent
1374 * with exponential backoff, then we stop sending them at all,
1375 * assuming that the host ignores our redirects.
1376 * 2. If we did not see packets requiring redirects
1377 * during ip_rt_redirect_silence, we assume that the host
1378 * forgot redirected route and start to send redirects again.
1379 *
1380 * This algorithm is much cheaper and more intelligent than dumb load limiting
1381 * in icmp.c.
1382 *
1383 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1384 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1385 */
1386
1387void ip_rt_send_redirect(struct sk_buff *skb)
1388{
511c3f92 1389 struct rtable *rt = skb_rtable(skb);
30038fc6 1390 struct in_device *in_dev;
92d86829 1391 struct inet_peer *peer;
1d861aa4 1392 struct net *net;
30038fc6 1393 int log_martians;
1da177e4 1394
30038fc6 1395 rcu_read_lock();
d8d1f30b 1396 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
1397 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1398 rcu_read_unlock();
1da177e4 1399 return;
30038fc6
ED
1400 }
1401 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1402 rcu_read_unlock();
1da177e4 1403
1d861aa4
DM
1404 net = dev_net(rt->dst.dev);
1405 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
1406 if (!peer) {
1407 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1408 return;
1409 }
1410
1da177e4
LT
1411 /* No redirected packets during ip_rt_redirect_silence;
1412 * reset the algorithm.
1413 */
92d86829
DM
1414 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1415 peer->rate_tokens = 0;
1da177e4
LT
1416
1417 /* Too many ignored redirects; do not send anything
d8d1f30b 1418 * set dst.rate_last to the last seen redirected packet.
1da177e4 1419 */
92d86829
DM
1420 if (peer->rate_tokens >= ip_rt_redirect_number) {
1421 peer->rate_last = jiffies;
1d861aa4 1422 goto out_put_peer;
1da177e4
LT
1423 }
1424
1425 /* Check for load limit; set rate_last to the latest sent
1426 * redirect.
1427 */
92d86829 1428 if (peer->rate_tokens == 0 ||
14fb8a76 1429 time_after(jiffies,
92d86829
DM
1430 (peer->rate_last +
1431 (ip_rt_redirect_load << peer->rate_tokens)))) {
1da177e4 1432 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
92d86829
DM
1433 peer->rate_last = jiffies;
1434 ++peer->rate_tokens;
1da177e4 1435#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 1436 if (log_martians &&
e87cc472
JP
1437 peer->rate_tokens == ip_rt_redirect_number)
1438 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1439 &ip_hdr(skb)->saddr, rt->rt_iif,
1440 &rt->rt_dst, &rt->rt_gateway);
1da177e4
LT
1441#endif
1442 }
1d861aa4
DM
1443out_put_peer:
1444 inet_putpeer(peer);
1da177e4
LT
1445}
1446
1447static int ip_error(struct sk_buff *skb)
1448{
251da413 1449 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 1450 struct rtable *rt = skb_rtable(skb);
92d86829 1451 struct inet_peer *peer;
1da177e4 1452 unsigned long now;
251da413 1453 struct net *net;
92d86829 1454 bool send;
1da177e4
LT
1455 int code;
1456
251da413
DM
1457 net = dev_net(rt->dst.dev);
1458 if (!IN_DEV_FORWARD(in_dev)) {
1459 switch (rt->dst.error) {
1460 case EHOSTUNREACH:
1461 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1462 break;
1463
1464 case ENETUNREACH:
1465 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1466 break;
1467 }
1468 goto out;
1469 }
1470
d8d1f30b 1471 switch (rt->dst.error) {
4500ebf8
JP
1472 case EINVAL:
1473 default:
1474 goto out;
1475 case EHOSTUNREACH:
1476 code = ICMP_HOST_UNREACH;
1477 break;
1478 case ENETUNREACH:
1479 code = ICMP_NET_UNREACH;
251da413 1480 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
1481 break;
1482 case EACCES:
1483 code = ICMP_PKT_FILTERED;
1484 break;
1da177e4
LT
1485 }
1486
1d861aa4 1487 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
1488
1489 send = true;
1490 if (peer) {
1491 now = jiffies;
1492 peer->rate_tokens += now - peer->rate_last;
1493 if (peer->rate_tokens > ip_rt_error_burst)
1494 peer->rate_tokens = ip_rt_error_burst;
1495 peer->rate_last = now;
1496 if (peer->rate_tokens >= ip_rt_error_cost)
1497 peer->rate_tokens -= ip_rt_error_cost;
1498 else
1499 send = false;
1d861aa4 1500 inet_putpeer(peer);
1da177e4 1501 }
92d86829
DM
1502 if (send)
1503 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1504
1505out: kfree_skb(skb);
1506 return 0;
e905a9ed 1507}
1da177e4 1508
1da177e4
LT
1509static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1510{
2c8cec5c 1511 struct rtable *rt = (struct rtable *) dst;
2c8cec5c
DM
1512
1513 dst_confirm(dst);
1514
5943634f
DM
1515 if (mtu < ip_rt_min_pmtu)
1516 mtu = ip_rt_min_pmtu;
2c8cec5c 1517
5943634f
DM
1518 rt->rt_pmtu = mtu;
1519 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1da177e4
LT
1520}
1521
36393395
DM
1522void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1523 int oif, u32 mark, u8 protocol, int flow_flags)
1524{
1525 const struct iphdr *iph = (const struct iphdr *)skb->data;
1526 struct flowi4 fl4;
1527 struct rtable *rt;
1528
1529 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
3e12939a 1530 protocol, flow_flags,
36393395
DM
1531 iph->daddr, iph->saddr, 0, 0);
1532 rt = __ip_route_output_key(net, &fl4);
1533 if (!IS_ERR(rt)) {
1534 ip_rt_update_pmtu(&rt->dst, mtu);
1535 ip_rt_put(rt);
1536 }
1537}
1538EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1539
1540void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1541{
1542 const struct inet_sock *inet = inet_sk(sk);
1543
1544 return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1545 sk->sk_bound_dev_if, sk->sk_mark,
1546 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1547 inet_sk_flowi_flags(sk));
1548}
1549EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1550
b42597e2
DM
1551void ipv4_redirect(struct sk_buff *skb, struct net *net,
1552 int oif, u32 mark, u8 protocol, int flow_flags)
1553{
1554 const struct iphdr *iph = (const struct iphdr *)skb->data;
1555 struct flowi4 fl4;
1556 struct rtable *rt;
1557
1558 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1559 protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
1560 rt = __ip_route_output_key(net, &fl4);
1561 if (!IS_ERR(rt)) {
1562 ip_do_redirect(&rt->dst, skb);
1563 ip_rt_put(rt);
1564 }
1565}
1566EXPORT_SYMBOL_GPL(ipv4_redirect);
1567
1568void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1569{
1570 const struct inet_sock *inet = inet_sk(sk);
1571
1572 return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
1573 sk->sk_mark,
1574 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1575 inet_sk_flowi_flags(sk));
1576}
1577EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1578
efbc368d
DM
1579static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1580{
1581 struct rtable *rt = (struct rtable *) dst;
1582
1583 if (rt_is_expired(rt))
1584 return NULL;
d11a4dc1 1585 return dst;
1da177e4
LT
1586}
1587
1588static void ipv4_dst_destroy(struct dst_entry *dst)
1589{
1590 struct rtable *rt = (struct rtable *) dst;
1da177e4 1591
62fa8a84
DM
1592 if (rt->fi) {
1593 fib_info_put(rt->fi);
1594 rt->fi = NULL;
1595 }
1da177e4
LT
1596}
1597
1da177e4
LT
1598
1599static void ipv4_link_failure(struct sk_buff *skb)
1600{
1601 struct rtable *rt;
1602
1603 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1604
511c3f92 1605 rt = skb_rtable(skb);
5943634f
DM
1606 if (rt)
1607 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1608}
1609
1610static int ip_rt_bug(struct sk_buff *skb)
1611{
91df42be
JP
1612 pr_debug("%s: %pI4 -> %pI4, %s\n",
1613 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1614 skb->dev ? skb->dev->name : "?");
1da177e4 1615 kfree_skb(skb);
c378a9c0 1616 WARN_ON(1);
1da177e4
LT
1617 return 0;
1618}
1619
1620/*
1621 We do not cache source address of outgoing interface,
1622 because it is used only by IP RR, TS and SRR options,
1623 so that it out of fast path.
1624
1625 BTW remember: "addr" is allowed to be not aligned
1626 in IP options!
1627 */
1628
8e36360a 1629void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1630{
a61ced5d 1631 __be32 src;
1da177e4 1632
c7537967 1633 if (rt_is_output_route(rt))
c5be24ff 1634 src = ip_hdr(skb)->saddr;
ebc0ffae 1635 else {
8e36360a
DM
1636 struct fib_result res;
1637 struct flowi4 fl4;
1638 struct iphdr *iph;
1639
1640 iph = ip_hdr(skb);
1641
1642 memset(&fl4, 0, sizeof(fl4));
1643 fl4.daddr = iph->daddr;
1644 fl4.saddr = iph->saddr;
b0fe4a31 1645 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1646 fl4.flowi4_oif = rt->dst.dev->ifindex;
1647 fl4.flowi4_iif = skb->dev->ifindex;
1648 fl4.flowi4_mark = skb->mark;
5e2b61f7 1649
ebc0ffae 1650 rcu_read_lock();
68a5e3dd 1651 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
436c3b66 1652 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae
ED
1653 else
1654 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1da177e4 1655 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1656 rcu_read_unlock();
1657 }
1da177e4
LT
1658 memcpy(addr, &src, 4);
1659}
1660
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill each 16-bit half of dst.tclassid from @tag, but only halves that
 * are still zero (earlier tags take precedence).
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1670
0dbaee3b
DM
1671static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1672{
1673 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1674
1675 if (advmss == 0) {
1676 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1677 ip_rt_min_advmss);
1678 if (advmss > 65535 - 40)
1679 advmss = 65535 - 40;
1680 }
1681 return advmss;
1682}
1683
ebb762f2 1684static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1685{
261663b0 1686 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1687 unsigned int mtu = rt->rt_pmtu;
1688
1689 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1690 mtu = 0;
1691
1692 if (!mtu)
1693 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1694
261663b0 1695 if (mtu && rt_is_output_route(rt))
618f9bc7
SK
1696 return mtu;
1697
1698 mtu = dst->dev->mtu;
d33e4553
DM
1699
1700 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
d33e4553
DM
1701
1702 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1703 mtu = 576;
1704 }
1705
1706 if (mtu > IP_MAX_MTU)
1707 mtu = IP_MAX_MTU;
1708
1709 return mtu;
1710}
1711
813b3b5d 1712static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1713 struct fib_info *fi)
a4daad6b 1714{
f185071d
DM
1715 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1716 rt->fi = fi;
1717 atomic_inc(&fi->fib_clntref);
a4daad6b 1718 }
f185071d 1719 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1720}
1721
813b3b5d 1722static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1723 const struct fib_result *res,
982721f3 1724 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1725{
1da177e4
LT
1726 if (fi) {
1727 if (FIB_RES_GW(*res) &&
1728 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1729 rt->rt_gateway = FIB_RES_GW(*res);
813b3b5d 1730 rt_init_metrics(rt, fl4, fi);
c7066f70 1731#ifdef CONFIG_IP_ROUTE_CLASSID
710ab6c0 1732 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1da177e4 1733#endif
d33e4553 1734 }
defb3519 1735
c7066f70 1736#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1737#ifdef CONFIG_IP_MULTIPLE_TABLES
1738 set_class_tag(rt, fib_rules_tclass(res));
1739#endif
1740 set_class_tag(rt, itag);
1741#endif
1da177e4
LT
1742}
1743
5c1e6aa3
DM
1744static struct rtable *rt_dst_alloc(struct net_device *dev,
1745 bool nopolicy, bool noxfrm)
0c4dcd58 1746{
5c1e6aa3
DM
1747 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1748 DST_HOST |
1749 (nopolicy ? DST_NOPOLICY : 0) |
1750 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1751}
1752
96d36220 1753/* called in rcu_read_lock() section */
9e12bb22 1754static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1755 u8 tos, struct net_device *dev, int our)
1756{
96d36220 1757 unsigned int hash;
1da177e4 1758 struct rtable *rth;
96d36220 1759 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1760 u32 itag = 0;
b5f7e755 1761 int err;
1da177e4
LT
1762
1763 /* Primary sanity checks. */
1764
1765 if (in_dev == NULL)
1766 return -EINVAL;
1767
1e637c74 1768 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1769 skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1770 goto e_inval;
1771
d0daebc3
TG
1772 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1773 if (ipv4_is_loopback(saddr))
1774 goto e_inval;
1775
f97c1e0c
JP
1776 if (ipv4_is_zeronet(saddr)) {
1777 if (!ipv4_is_local_multicast(daddr))
1da177e4 1778 goto e_inval;
b5f7e755 1779 } else {
9e56e380
DM
1780 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1781 in_dev, &itag);
b5f7e755
ED
1782 if (err < 0)
1783 goto e_err;
1784 }
4e7b2f14 1785 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
5c1e6aa3 1786 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
1787 if (!rth)
1788 goto e_nobufs;
1789
cf911662
DM
1790#ifdef CONFIG_IP_ROUTE_CLASSID
1791 rth->dst.tclassid = itag;
1792#endif
d8d1f30b 1793 rth->dst.output = ip_rt_bug;
1da177e4 1794
5e2b61f7 1795 rth->rt_key_dst = daddr;
5e2b61f7 1796 rth->rt_key_src = saddr;
cf911662
DM
1797 rth->rt_genid = rt_genid(dev_net(dev));
1798 rth->rt_flags = RTCF_MULTICAST;
1799 rth->rt_type = RTN_MULTICAST;
475949d8 1800 rth->rt_key_tos = tos;
cf911662 1801 rth->rt_dst = daddr;
1da177e4 1802 rth->rt_src = saddr;
1b86a58f 1803 rth->rt_route_iif = dev->ifindex;
5e2b61f7 1804 rth->rt_iif = dev->ifindex;
5e2b61f7 1805 rth->rt_oif = 0;
cf911662 1806 rth->rt_mark = skb->mark;
5943634f 1807 rth->rt_pmtu = 0;
1da177e4 1808 rth->rt_gateway = daddr;
cf911662 1809 rth->fi = NULL;
1da177e4 1810 if (our) {
d8d1f30b 1811 rth->dst.input= ip_local_deliver;
1da177e4
LT
1812 rth->rt_flags |= RTCF_LOCAL;
1813 }
1814
1815#ifdef CONFIG_IP_MROUTE
f97c1e0c 1816 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1817 rth->dst.input = ip_mr_input;
1da177e4
LT
1818#endif
1819 RT_CACHE_STAT_INC(in_slow_mc);
1820
e84f84f2 1821 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
b23dd4fe 1822 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
9aa3c94c 1823 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1da177e4
LT
1824
1825e_nobufs:
1da177e4 1826 return -ENOBUFS;
1da177e4 1827e_inval:
96d36220 1828 return -EINVAL;
b5f7e755 1829e_err:
b5f7e755 1830 return err;
1da177e4
LT
1831}
1832
1833
1834static void ip_handle_martian_source(struct net_device *dev,
1835 struct in_device *in_dev,
1836 struct sk_buff *skb,
9e12bb22
AV
1837 __be32 daddr,
1838 __be32 saddr)
1da177e4
LT
1839{
1840 RT_CACHE_STAT_INC(in_martian_src);
1841#ifdef CONFIG_IP_ROUTE_VERBOSE
1842 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1843 /*
1844 * RFC1812 recommendation, if source is martian,
1845 * the only hint is MAC header.
1846 */
058bd4d2 1847 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1848 &daddr, &saddr, dev->name);
98e399f8 1849 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1850 print_hex_dump(KERN_WARNING, "ll header: ",
1851 DUMP_PREFIX_OFFSET, 16, 1,
1852 skb_mac_header(skb),
1853 dev->hard_header_len, true);
1da177e4
LT
1854 }
1855 }
1856#endif
1857}
1858
47360228 1859/* called in rcu_read_lock() section */
5969f71d 1860static int __mkroute_input(struct sk_buff *skb,
982721f3 1861 const struct fib_result *res,
5969f71d
SH
1862 struct in_device *in_dev,
1863 __be32 daddr, __be32 saddr, u32 tos,
1864 struct rtable **result)
1da177e4 1865{
1da177e4
LT
1866 struct rtable *rth;
1867 int err;
1868 struct in_device *out_dev;
47360228 1869 unsigned int flags = 0;
d9c9df8c 1870 u32 itag;
1da177e4
LT
1871
1872 /* get a working reference to the output device */
47360228 1873 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1da177e4 1874 if (out_dev == NULL) {
e87cc472 1875 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
1876 return -EINVAL;
1877 }
1878
1879
5c04c819 1880 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1881 in_dev->dev, in_dev, &itag);
1da177e4 1882 if (err < 0) {
e905a9ed 1883 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1884 saddr);
e905a9ed 1885
1da177e4
LT
1886 goto cleanup;
1887 }
1888
1889 if (err)
1890 flags |= RTCF_DIRECTSRC;
1891
51b77cae 1892 if (out_dev == in_dev && err &&
1da177e4
LT
1893 (IN_DEV_SHARED_MEDIA(out_dev) ||
1894 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1895 flags |= RTCF_DOREDIRECT;
1896
1897 if (skb->protocol != htons(ETH_P_IP)) {
1898 /* Not IP (i.e. ARP). Do not create route, if it is
1899 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1900 *
1901 * Proxy arp feature have been extended to allow, ARP
1902 * replies back to the same interface, to support
1903 * Private VLAN switch technologies. See arp.c.
1da177e4 1904 */
65324144
JDB
1905 if (out_dev == in_dev &&
1906 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1907 err = -EINVAL;
1908 goto cleanup;
1909 }
1910 }
1911
5c1e6aa3
DM
1912 rth = rt_dst_alloc(out_dev->dev,
1913 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 1914 IN_DEV_CONF_GET(out_dev, NOXFRM));
1da177e4
LT
1915 if (!rth) {
1916 err = -ENOBUFS;
1917 goto cleanup;
1918 }
1919
5e2b61f7 1920 rth->rt_key_dst = daddr;
5e2b61f7 1921 rth->rt_key_src = saddr;
cf911662
DM
1922 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1923 rth->rt_flags = flags;
1924 rth->rt_type = res->type;
475949d8 1925 rth->rt_key_tos = tos;
cf911662 1926 rth->rt_dst = daddr;
1da177e4 1927 rth->rt_src = saddr;
1b86a58f 1928 rth->rt_route_iif = in_dev->dev->ifindex;
5e2b61f7 1929 rth->rt_iif = in_dev->dev->ifindex;
5e2b61f7 1930 rth->rt_oif = 0;
cf911662 1931 rth->rt_mark = skb->mark;
5943634f 1932 rth->rt_pmtu = 0;
cf911662 1933 rth->rt_gateway = daddr;
cf911662 1934 rth->fi = NULL;
1da177e4 1935
d8d1f30b
CG
1936 rth->dst.input = ip_forward;
1937 rth->dst.output = ip_output;
1da177e4 1938
5e2b61f7 1939 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1da177e4 1940
1da177e4
LT
1941 *result = rth;
1942 err = 0;
1943 cleanup:
1da177e4 1944 return err;
e905a9ed 1945}
1da177e4 1946
5969f71d
SH
1947static int ip_mkroute_input(struct sk_buff *skb,
1948 struct fib_result *res,
68a5e3dd 1949 const struct flowi4 *fl4,
5969f71d
SH
1950 struct in_device *in_dev,
1951 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1952{
5e73ea1a 1953 struct rtable *rth = NULL;
1da177e4 1954 int err;
95c96174 1955 unsigned int hash;
1da177e4
LT
1956
1957#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 1958 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 1959 fib_select_multipath(res);
1da177e4
LT
1960#endif
1961
1962 /* create a routing cache entry */
1963 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1964 if (err)
1965 return err;
1da177e4
LT
1966
1967 /* put it into the cache */
68a5e3dd 1968 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
d8d1f30b 1969 rt_genid(dev_net(rth->dst.dev)));
68a5e3dd 1970 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
b23dd4fe
DM
1971 if (IS_ERR(rth))
1972 return PTR_ERR(rth);
1973 return 0;
1da177e4
LT
1974}
1975
1da177e4
LT
1976/*
1977 * NOTE. We drop all the packets that has local source
1978 * addresses, because every properly looped back packet
1979 * must have correct destination already attached by output routine.
1980 *
1981 * Such approach solves two big problems:
1982 * 1. Not simplex devices are handled properly.
1983 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1984 * called with rcu_read_lock()
1da177e4
LT
1985 */
1986
9e12bb22 1987static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 1988 u8 tos, struct net_device *dev)
1da177e4
LT
1989{
1990 struct fib_result res;
96d36220 1991 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 1992 struct flowi4 fl4;
95c96174 1993 unsigned int flags = 0;
1da177e4 1994 u32 itag = 0;
95c96174
ED
1995 struct rtable *rth;
1996 unsigned int hash;
1da177e4 1997 int err = -EINVAL;
5e73ea1a 1998 struct net *net = dev_net(dev);
1da177e4
LT
1999
2000 /* IP on this device is disabled. */
2001
2002 if (!in_dev)
2003 goto out;
2004
2005 /* Check for the most weird martians, which can be not detected
2006 by fib_lookup.
2007 */
2008
d0daebc3 2009 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
2010 goto martian_source;
2011
27a954bd 2012 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2013 goto brd_input;
2014
2015 /* Accept zero addresses only to limited broadcast;
2016 * I even do not know to fix it or not. Waiting for complains :-)
2017 */
f97c1e0c 2018 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2019 goto martian_source;
2020
d0daebc3 2021 if (ipv4_is_zeronet(daddr))
1da177e4
LT
2022 goto martian_destination;
2023
d0daebc3
TG
2024 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2025 if (ipv4_is_loopback(daddr))
2026 goto martian_destination;
2027
2028 if (ipv4_is_loopback(saddr))
2029 goto martian_source;
2030 }
2031
1da177e4
LT
2032 /*
2033 * Now we are ready to route packet.
2034 */
68a5e3dd
DM
2035 fl4.flowi4_oif = 0;
2036 fl4.flowi4_iif = dev->ifindex;
2037 fl4.flowi4_mark = skb->mark;
2038 fl4.flowi4_tos = tos;
2039 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2040 fl4.daddr = daddr;
2041 fl4.saddr = saddr;
2042 err = fib_lookup(net, &fl4, &res);
251da413 2043 if (err != 0)
1da177e4 2044 goto no_route;
1da177e4
LT
2045
2046 RT_CACHE_STAT_INC(in_slow_tot);
2047
2048 if (res.type == RTN_BROADCAST)
2049 goto brd_input;
2050
2051 if (res.type == RTN_LOCAL) {
5c04c819 2052 err = fib_validate_source(skb, saddr, daddr, tos,
ebc0ffae 2053 net->loopback_dev->ifindex,
9e56e380 2054 dev, in_dev, &itag);
b5f7e755
ED
2055 if (err < 0)
2056 goto martian_source_keep_err;
2057 if (err)
1da177e4 2058 flags |= RTCF_DIRECTSRC;
1da177e4
LT
2059 goto local_input;
2060 }
2061
2062 if (!IN_DEV_FORWARD(in_dev))
251da413 2063 goto no_route;
1da177e4
LT
2064 if (res.type != RTN_UNICAST)
2065 goto martian_destination;
2066
68a5e3dd 2067 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
2068out: return err;
2069
2070brd_input:
2071 if (skb->protocol != htons(ETH_P_IP))
2072 goto e_inval;
2073
41347dcd 2074 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2075 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2076 in_dev, &itag);
1da177e4 2077 if (err < 0)
b5f7e755 2078 goto martian_source_keep_err;
1da177e4
LT
2079 if (err)
2080 flags |= RTCF_DIRECTSRC;
2081 }
2082 flags |= RTCF_BROADCAST;
2083 res.type = RTN_BROADCAST;
2084 RT_CACHE_STAT_INC(in_brd);
2085
2086local_input:
5c1e6aa3
DM
2087 rth = rt_dst_alloc(net->loopback_dev,
2088 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2089 if (!rth)
2090 goto e_nobufs;
2091
cf911662 2092 rth->dst.input= ip_local_deliver;
d8d1f30b 2093 rth->dst.output= ip_rt_bug;
cf911662
DM
2094#ifdef CONFIG_IP_ROUTE_CLASSID
2095 rth->dst.tclassid = itag;
2096#endif
1da177e4 2097
5e2b61f7 2098 rth->rt_key_dst = daddr;
5e2b61f7 2099 rth->rt_key_src = saddr;
cf911662
DM
2100 rth->rt_genid = rt_genid(net);
2101 rth->rt_flags = flags|RTCF_LOCAL;
2102 rth->rt_type = res.type;
475949d8 2103 rth->rt_key_tos = tos;
cf911662 2104 rth->rt_dst = daddr;
1da177e4 2105 rth->rt_src = saddr;
1b86a58f 2106 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2107 rth->rt_iif = dev->ifindex;
cf911662
DM
2108 rth->rt_oif = 0;
2109 rth->rt_mark = skb->mark;
5943634f 2110 rth->rt_pmtu = 0;
1da177e4 2111 rth->rt_gateway = daddr;
cf911662 2112 rth->fi = NULL;
1da177e4 2113 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
2114 rth->dst.input= ip_error;
2115 rth->dst.error= -err;
1da177e4
LT
2116 rth->rt_flags &= ~RTCF_LOCAL;
2117 }
68a5e3dd
DM
2118 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2119 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
b23dd4fe
DM
2120 err = 0;
2121 if (IS_ERR(rth))
2122 err = PTR_ERR(rth);
ebc0ffae 2123 goto out;
1da177e4
LT
2124
2125no_route:
2126 RT_CACHE_STAT_INC(in_no_route);
1da177e4 2127 res.type = RTN_UNREACHABLE;
7f53878d
MC
2128 if (err == -ESRCH)
2129 err = -ENETUNREACH;
1da177e4
LT
2130 goto local_input;
2131
2132 /*
2133 * Do not cache martian addresses: they should be logged (RFC1812)
2134 */
2135martian_destination:
2136 RT_CACHE_STAT_INC(in_martian_dst);
2137#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2138 if (IN_DEV_LOG_MARTIANS(in_dev))
2139 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2140 &daddr, &saddr, dev->name);
1da177e4 2141#endif
2c2910a4 2142
1da177e4
LT
2143e_inval:
2144 err = -EINVAL;
ebc0ffae 2145 goto out;
1da177e4
LT
2146
2147e_nobufs:
2148 err = -ENOBUFS;
ebc0ffae 2149 goto out;
1da177e4
LT
2150
2151martian_source:
b5f7e755
ED
2152 err = -EINVAL;
2153martian_source_keep_err:
1da177e4 2154 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2155 goto out;
1da177e4
LT
2156}
2157
407eadd9 2158int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 2159 u8 tos, struct net_device *dev, bool noref)
1da177e4 2160{
95c96174
ED
2161 struct rtable *rth;
2162 unsigned int hash;
1da177e4 2163 int iif = dev->ifindex;
b5921910 2164 struct net *net;
96d36220 2165 int res;
1da177e4 2166
c346dca1 2167 net = dev_net(dev);
1080d709 2168
96d36220
ED
2169 rcu_read_lock();
2170
1080d709
NH
2171 if (!rt_caching(net))
2172 goto skip_cache;
2173
1da177e4 2174 tos &= IPTOS_RT_MASK;
e84f84f2 2175 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4 2176
1da177e4 2177 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 2178 rth = rcu_dereference(rth->dst.rt_next)) {
5e2b61f7
DM
2179 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2180 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
97a80410 2181 (rth->rt_route_iif ^ iif) |
475949d8 2182 (rth->rt_key_tos ^ tos)) == 0 &&
5e2b61f7 2183 rth->rt_mark == skb->mark &&
d8d1f30b 2184 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2185 !rt_is_expired(rth)) {
407eadd9 2186 if (noref) {
d8d1f30b
CG
2187 dst_use_noref(&rth->dst, jiffies);
2188 skb_dst_set_noref(skb, &rth->dst);
407eadd9 2189 } else {
d8d1f30b
CG
2190 dst_use(&rth->dst, jiffies);
2191 skb_dst_set(skb, &rth->dst);
407eadd9 2192 }
1da177e4
LT
2193 RT_CACHE_STAT_INC(in_hit);
2194 rcu_read_unlock();
1da177e4
LT
2195 return 0;
2196 }
2197 RT_CACHE_STAT_INC(in_hlist_search);
2198 }
1da177e4 2199
1080d709 2200skip_cache:
1da177e4
LT
2201 /* Multicast recognition logic is moved from route cache to here.
2202 The problem was that too many Ethernet cards have broken/missing
2203 hardware multicast filters :-( As result the host on multicasting
2204 network acquires a lot of useless route cache entries, sort of
2205 SDR messages from all the world. Now we try to get rid of them.
2206 Really, provided software IP multicast filter is organized
2207 reasonably (at least, hashed), it does not result in a slowdown
2208 comparing with route cache reject entries.
2209 Note, that multicast routers are not affected, because
2210 route cache entry is created eventually.
2211 */
f97c1e0c 2212 if (ipv4_is_multicast(daddr)) {
96d36220 2213 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2214
96d36220 2215 if (in_dev) {
dbdd9a52
DM
2216 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2217 ip_hdr(skb)->protocol);
1da177e4
LT
2218 if (our
2219#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2220 ||
2221 (!ipv4_is_local_multicast(daddr) &&
2222 IN_DEV_MFORWARD(in_dev))
1da177e4 2223#endif
9d4fb27d 2224 ) {
96d36220
ED
2225 int res = ip_route_input_mc(skb, daddr, saddr,
2226 tos, dev, our);
1da177e4 2227 rcu_read_unlock();
96d36220 2228 return res;
1da177e4
LT
2229 }
2230 }
2231 rcu_read_unlock();
2232 return -EINVAL;
2233 }
c10237e0 2234 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
96d36220
ED
2235 rcu_read_unlock();
2236 return res;
1da177e4 2237}
407eadd9 2238EXPORT_SYMBOL(ip_route_input_common);
1da177e4 2239
ebc0ffae 2240/* called with rcu_read_lock() */
982721f3 2241static struct rtable *__mkroute_output(const struct fib_result *res,
68a5e3dd 2242 const struct flowi4 *fl4,
813b3b5d 2243 __be32 orig_daddr, __be32 orig_saddr,
f61759e6
JA
2244 int orig_oif, __u8 orig_rtos,
2245 struct net_device *dev_out,
5ada5527 2246 unsigned int flags)
1da177e4 2247{
982721f3 2248 struct fib_info *fi = res->fi;
5ada5527 2249 struct in_device *in_dev;
982721f3 2250 u16 type = res->type;
5ada5527 2251 struct rtable *rth;
1da177e4 2252
d0daebc3
TG
2253 in_dev = __in_dev_get_rcu(dev_out);
2254 if (!in_dev)
5ada5527 2255 return ERR_PTR(-EINVAL);
1da177e4 2256
d0daebc3
TG
2257 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2258 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2259 return ERR_PTR(-EINVAL);
2260
68a5e3dd 2261 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2262 type = RTN_BROADCAST;
68a5e3dd 2263 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2264 type = RTN_MULTICAST;
68a5e3dd 2265 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2266 return ERR_PTR(-EINVAL);
1da177e4
LT
2267
2268 if (dev_out->flags & IFF_LOOPBACK)
2269 flags |= RTCF_LOCAL;
2270
982721f3 2271 if (type == RTN_BROADCAST) {
1da177e4 2272 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2273 fi = NULL;
2274 } else if (type == RTN_MULTICAST) {
dd28d1a0 2275 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2276 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2277 fl4->flowi4_proto))
1da177e4
LT
2278 flags &= ~RTCF_LOCAL;
2279 /* If multicast route do not exist use
dd28d1a0
ED
2280 * default one, but do not gateway in this case.
2281 * Yes, it is hack.
1da177e4 2282 */
982721f3
DM
2283 if (fi && res->prefixlen < 4)
2284 fi = NULL;
1da177e4
LT
2285 }
2286
5c1e6aa3
DM
2287 rth = rt_dst_alloc(dev_out,
2288 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2289 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2290 if (!rth)
5ada5527 2291 return ERR_PTR(-ENOBUFS);
8391d07b 2292
cf911662
DM
2293 rth->dst.output = ip_output;
2294
813b3b5d
DM
2295 rth->rt_key_dst = orig_daddr;
2296 rth->rt_key_src = orig_saddr;
cf911662
DM
2297 rth->rt_genid = rt_genid(dev_net(dev_out));
2298 rth->rt_flags = flags;
2299 rth->rt_type = type;
f61759e6 2300 rth->rt_key_tos = orig_rtos;
68a5e3dd
DM
2301 rth->rt_dst = fl4->daddr;
2302 rth->rt_src = fl4->saddr;
1b86a58f 2303 rth->rt_route_iif = 0;
813b3b5d
DM
2304 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2305 rth->rt_oif = orig_oif;
2306 rth->rt_mark = fl4->flowi4_mark;
5943634f 2307 rth->rt_pmtu = 0;
68a5e3dd 2308 rth->rt_gateway = fl4->daddr;
cf911662 2309 rth->fi = NULL;
1da177e4
LT
2310
2311 RT_CACHE_STAT_INC(out_slow_tot);
2312
41347dcd 2313 if (flags & RTCF_LOCAL)
d8d1f30b 2314 rth->dst.input = ip_local_deliver;
1da177e4 2315 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2316 if (flags & RTCF_LOCAL &&
1da177e4 2317 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2318 rth->dst.output = ip_mc_output;
1da177e4
LT
2319 RT_CACHE_STAT_INC(out_slow_mc);
2320 }
2321#ifdef CONFIG_IP_MROUTE
982721f3 2322 if (type == RTN_MULTICAST) {
1da177e4 2323 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2324 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2325 rth->dst.input = ip_mr_input;
2326 rth->dst.output = ip_mc_output;
1da177e4
LT
2327 }
2328 }
2329#endif
2330 }
2331
813b3b5d 2332 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1da177e4 2333
7586eceb
ED
2334 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2335 rth->dst.flags |= DST_NOCACHE;
2336
5ada5527 2337 return rth;
1da177e4
LT
2338}
2339
1da177e4
LT
2340/*
2341 * Major route resolver routine.
0197aa38 2342 * called with rcu_read_lock();
1da177e4
LT
2343 */
2344
813b3b5d 2345static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2346{
1da177e4 2347 struct net_device *dev_out = NULL;
f61759e6 2348 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
2349 unsigned int flags = 0;
2350 struct fib_result res;
5ada5527 2351 struct rtable *rth;
813b3b5d
DM
2352 __be32 orig_daddr;
2353 __be32 orig_saddr;
2354 int orig_oif;
1da177e4
LT
2355
2356 res.fi = NULL;
8b96d22d 2357 res.table = NULL;
1da177e4
LT
2358#ifdef CONFIG_IP_MULTIPLE_TABLES
2359 res.r = NULL;
2360#endif
2361
813b3b5d
DM
2362 orig_daddr = fl4->daddr;
2363 orig_saddr = fl4->saddr;
2364 orig_oif = fl4->flowi4_oif;
2365
2366 fl4->flowi4_iif = net->loopback_dev->ifindex;
2367 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2368 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2369 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2370
010c2708 2371 rcu_read_lock();
813b3b5d 2372 if (fl4->saddr) {
b23dd4fe 2373 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2374 if (ipv4_is_multicast(fl4->saddr) ||
2375 ipv4_is_lbcast(fl4->saddr) ||
2376 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2377 goto out;
2378
1da177e4
LT
2379 /* I removed check for oif == dev_out->oif here.
2380 It was wrong for two reasons:
1ab35276
DL
2381 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2382 is assigned to multiple interfaces.
1da177e4
LT
2383 2. Moreover, we are allowed to send packets with saddr
2384 of another iface. --ANK
2385 */
2386
813b3b5d
DM
2387 if (fl4->flowi4_oif == 0 &&
2388 (ipv4_is_multicast(fl4->daddr) ||
2389 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2390 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2391 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2392 if (dev_out == NULL)
2393 goto out;
2394
1da177e4
LT
2395 /* Special hack: user can direct multicasts
2396 and limited broadcast via necessary interface
2397 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2398 This hack is not just for fun, it allows
2399 vic,vat and friends to work.
2400 They bind socket to loopback, set ttl to zero
2401 and expect that it will work.
2402 From the viewpoint of routing cache they are broken,
2403 because we are not allowed to build multicast path
2404 with loopback source addr (look, routing cache
2405 cannot know, that ttl is zero, so that packet
2406 will not leave this host and route is valid).
2407 Luckily, this hack is good workaround.
2408 */
2409
813b3b5d 2410 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2411 goto make_route;
2412 }
a210d01a 2413
813b3b5d 2414 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2415 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2416 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2417 goto out;
a210d01a 2418 }
1da177e4
LT
2419 }
2420
2421
813b3b5d
DM
2422 if (fl4->flowi4_oif) {
2423 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2424 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2425 if (dev_out == NULL)
2426 goto out;
e5ed6399
HX
2427
2428 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2429 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2430 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2431 goto out;
2432 }
813b3b5d
DM
2433 if (ipv4_is_local_multicast(fl4->daddr) ||
2434 ipv4_is_lbcast(fl4->daddr)) {
2435 if (!fl4->saddr)
2436 fl4->saddr = inet_select_addr(dev_out, 0,
2437 RT_SCOPE_LINK);
1da177e4
LT
2438 goto make_route;
2439 }
813b3b5d
DM
2440 if (fl4->saddr) {
2441 if (ipv4_is_multicast(fl4->daddr))
2442 fl4->saddr = inet_select_addr(dev_out, 0,
2443 fl4->flowi4_scope);
2444 else if (!fl4->daddr)
2445 fl4->saddr = inet_select_addr(dev_out, 0,
2446 RT_SCOPE_HOST);
1da177e4
LT
2447 }
2448 }
2449
813b3b5d
DM
2450 if (!fl4->daddr) {
2451 fl4->daddr = fl4->saddr;
2452 if (!fl4->daddr)
2453 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2454 dev_out = net->loopback_dev;
813b3b5d 2455 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2456 res.type = RTN_LOCAL;
2457 flags |= RTCF_LOCAL;
2458 goto make_route;
2459 }
2460
813b3b5d 2461 if (fib_lookup(net, fl4, &res)) {
1da177e4 2462 res.fi = NULL;
8b96d22d 2463 res.table = NULL;
813b3b5d 2464 if (fl4->flowi4_oif) {
1da177e4
LT
2465 /* Apparently, routing tables are wrong. Assume,
2466 that the destination is on link.
2467
2468 WHY? DW.
2469 Because we are allowed to send to iface
2470 even if it has NO routes and NO assigned
2471 addresses. When oif is specified, routing
2472 tables are looked up with only one purpose:
2473 to catch if destination is gatewayed, rather than
2474 direct. Moreover, if MSG_DONTROUTE is set,
2475 we send packet, ignoring both routing tables
2476 and ifaddr state. --ANK
2477
2478
2479 We could make it even if oif is unknown,
2480 likely IPv6, but we do not.
2481 */
2482
813b3b5d
DM
2483 if (fl4->saddr == 0)
2484 fl4->saddr = inet_select_addr(dev_out, 0,
2485 RT_SCOPE_LINK);
1da177e4
LT
2486 res.type = RTN_UNICAST;
2487 goto make_route;
2488 }
b23dd4fe 2489 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2490 goto out;
2491 }
1da177e4
LT
2492
2493 if (res.type == RTN_LOCAL) {
813b3b5d 2494 if (!fl4->saddr) {
9fc3bbb4 2495 if (res.fi->fib_prefsrc)
813b3b5d 2496 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2497 else
813b3b5d 2498 fl4->saddr = fl4->daddr;
9fc3bbb4 2499 }
b40afd0e 2500 dev_out = net->loopback_dev;
813b3b5d 2501 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2502 res.fi = NULL;
2503 flags |= RTCF_LOCAL;
2504 goto make_route;
2505 }
2506
2507#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2508 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2509 fib_select_multipath(&res);
1da177e4
LT
2510 else
2511#endif
21d8c49e
DM
2512 if (!res.prefixlen &&
2513 res.table->tb_num_default > 1 &&
813b3b5d 2514 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2515 fib_select_default(&res);
1da177e4 2516
813b3b5d
DM
2517 if (!fl4->saddr)
2518 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2519
1da177e4 2520 dev_out = FIB_RES_DEV(res);
813b3b5d 2521 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2522
2523
2524make_route:
813b3b5d 2525 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
f61759e6 2526 tos, dev_out, flags);
b23dd4fe 2527 if (!IS_ERR(rth)) {
5ada5527
DM
2528 unsigned int hash;
2529
813b3b5d 2530 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2531 rt_genid(dev_net(dev_out)));
813b3b5d 2532 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2533 }
1da177e4 2534
010c2708
DM
2535out:
2536 rcu_read_unlock();
b23dd4fe 2537 return rth;
1da177e4
LT
2538}
2539
813b3b5d 2540struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
1da177e4 2541{
1da177e4 2542 struct rtable *rth;
010c2708 2543 unsigned int hash;
1da177e4 2544
1080d709
NH
2545 if (!rt_caching(net))
2546 goto slow_output;
2547
9d6ec938 2548 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
1da177e4
LT
2549
2550 rcu_read_lock_bh();
a898def2 2551 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
d8d1f30b 2552 rth = rcu_dereference_bh(rth->dst.rt_next)) {
9d6ec938
DM
2553 if (rth->rt_key_dst == flp4->daddr &&
2554 rth->rt_key_src == flp4->saddr &&
c7537967 2555 rt_is_output_route(rth) &&
9d6ec938
DM
2556 rth->rt_oif == flp4->flowi4_oif &&
2557 rth->rt_mark == flp4->flowi4_mark &&
475949d8 2558 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
b5921910 2559 (IPTOS_RT_MASK | RTO_ONLINK)) &&
d8d1f30b 2560 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2561 !rt_is_expired(rth)) {
d8d1f30b 2562 dst_use(&rth->dst, jiffies);
1da177e4
LT
2563 RT_CACHE_STAT_INC(out_hit);
2564 rcu_read_unlock_bh();
56157872
DM
2565 if (!flp4->saddr)
2566 flp4->saddr = rth->rt_src;
2567 if (!flp4->daddr)
2568 flp4->daddr = rth->rt_dst;
b23dd4fe 2569 return rth;
1da177e4
LT
2570 }
2571 RT_CACHE_STAT_INC(out_hlist_search);
2572 }
2573 rcu_read_unlock_bh();
2574
1080d709 2575slow_output:
9d6ec938 2576 return ip_route_output_slow(net, flp4);
1da177e4 2577}
d8c97a94
ACM
2578EXPORT_SYMBOL_GPL(__ip_route_output_key);
2579
ae2688d5
JW
2580static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2581{
2582 return NULL;
2583}
2584
ebb762f2 2585static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2586{
618f9bc7
SK
2587 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2588
2589 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2590}
2591
14e50e57
DM
2592static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2593{
2594}
2595
b587ee3b
DM
2596static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb)
2597{
2598}
2599
0972ddb2
HB
2600static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2601 unsigned long old)
2602{
2603 return NULL;
2604}
2605
14e50e57
DM
2606static struct dst_ops ipv4_dst_blackhole_ops = {
2607 .family = AF_INET,
09640e63 2608 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57 2609 .destroy = ipv4_dst_destroy,
ae2688d5 2610 .check = ipv4_blackhole_dst_check,
ebb762f2 2611 .mtu = ipv4_blackhole_mtu,
214f45c9 2612 .default_advmss = ipv4_default_advmss,
14e50e57 2613 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2614 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2615 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2616 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2617};
2618
2774c131 2619struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2620{
5c1e6aa3 2621 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2774c131 2622 struct rtable *ort = (struct rtable *) dst_orig;
14e50e57
DM
2623
2624 if (rt) {
d8d1f30b 2625 struct dst_entry *new = &rt->dst;
14e50e57 2626
14e50e57 2627 new->__use = 1;
352e512c
HX
2628 new->input = dst_discard;
2629 new->output = dst_discard;
14e50e57 2630
d8d1f30b 2631 new->dev = ort->dst.dev;
14e50e57
DM
2632 if (new->dev)
2633 dev_hold(new->dev);
2634
5e2b61f7
DM
2635 rt->rt_key_dst = ort->rt_key_dst;
2636 rt->rt_key_src = ort->rt_key_src;
475949d8 2637 rt->rt_key_tos = ort->rt_key_tos;
1b86a58f 2638 rt->rt_route_iif = ort->rt_route_iif;
5e2b61f7
DM
2639 rt->rt_iif = ort->rt_iif;
2640 rt->rt_oif = ort->rt_oif;
2641 rt->rt_mark = ort->rt_mark;
5943634f 2642 rt->rt_pmtu = ort->rt_pmtu;
14e50e57 2643
e84f84f2 2644 rt->rt_genid = rt_genid(net);
14e50e57
DM
2645 rt->rt_flags = ort->rt_flags;
2646 rt->rt_type = ort->rt_type;
2647 rt->rt_dst = ort->rt_dst;
2648 rt->rt_src = ort->rt_src;
14e50e57 2649 rt->rt_gateway = ort->rt_gateway;
62fa8a84
DM
2650 rt->fi = ort->fi;
2651 if (rt->fi)
2652 atomic_inc(&rt->fi->fib_clntref);
14e50e57
DM
2653
2654 dst_free(new);
2655 }
2656
2774c131
DM
2657 dst_release(dst_orig);
2658
2659 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2660}
2661
9d6ec938 2662struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2663 struct sock *sk)
1da177e4 2664{
9d6ec938 2665 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2666
b23dd4fe
DM
2667 if (IS_ERR(rt))
2668 return rt;
1da177e4 2669
56157872 2670 if (flp4->flowi4_proto)
9d6ec938
DM
2671 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2672 flowi4_to_flowi(flp4),
2673 sk, 0);
1da177e4 2674
b23dd4fe 2675 return rt;
1da177e4 2676}
d8c97a94
ACM
2677EXPORT_SYMBOL_GPL(ip_route_output_flow);
2678
4feb88e5
BT
2679static int rt_fill_info(struct net *net,
2680 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2681 int nowait, unsigned int flags)
1da177e4 2682{
511c3f92 2683 struct rtable *rt = skb_rtable(skb);
1da177e4 2684 struct rtmsg *r;
be403ea1 2685 struct nlmsghdr *nlh;
2bc8ca40 2686 unsigned long expires = 0;
f185071d 2687 u32 error;
be403ea1
TG
2688
2689 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2690 if (nlh == NULL)
26932566 2691 return -EMSGSIZE;
be403ea1
TG
2692
2693 r = nlmsg_data(nlh);
1da177e4
LT
2694 r->rtm_family = AF_INET;
2695 r->rtm_dst_len = 32;
2696 r->rtm_src_len = 0;
475949d8 2697 r->rtm_tos = rt->rt_key_tos;
1da177e4 2698 r->rtm_table = RT_TABLE_MAIN;
f3756b79
DM
2699 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2700 goto nla_put_failure;
1da177e4
LT
2701 r->rtm_type = rt->rt_type;
2702 r->rtm_scope = RT_SCOPE_UNIVERSE;
2703 r->rtm_protocol = RTPROT_UNSPEC;
2704 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2705 if (rt->rt_flags & RTCF_NOTIFY)
2706 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2707
f3756b79
DM
2708 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2709 goto nla_put_failure;
5e2b61f7 2710 if (rt->rt_key_src) {
1da177e4 2711 r->rtm_src_len = 32;
f3756b79
DM
2712 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2713 goto nla_put_failure;
1da177e4 2714 }
f3756b79
DM
2715 if (rt->dst.dev &&
2716 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2717 goto nla_put_failure;
c7066f70 2718#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2719 if (rt->dst.tclassid &&
2720 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2721 goto nla_put_failure;
1da177e4 2722#endif
41347dcd
DM
2723 if (!rt_is_input_route(rt) &&
2724 rt->rt_src != rt->rt_key_src) {
f3756b79
DM
2725 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2726 goto nla_put_failure;
2727 }
2728 if (rt->rt_dst != rt->rt_gateway &&
2729 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2730 goto nla_put_failure;
be403ea1 2731
defb3519 2732 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
be403ea1
TG
2733 goto nla_put_failure;
2734
f3756b79
DM
2735 if (rt->rt_mark &&
2736 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2737 goto nla_put_failure;
963bfeee 2738
d8d1f30b 2739 error = rt->dst.error;
5943634f
DM
2740 expires = rt->dst.expires;
2741 if (expires) {
2742 if (time_before(jiffies, expires))
2743 expires -= jiffies;
2744 else
2745 expires = 0;
1da177e4 2746 }
be403ea1 2747
c7537967 2748 if (rt_is_input_route(rt)) {
1da177e4 2749#ifdef CONFIG_IP_MROUTE
e448515c 2750 __be32 dst = rt->rt_dst;
1da177e4 2751
f97c1e0c 2752 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5 2753 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
9a1b9496
DM
2754 int err = ipmr_get_route(net, skb,
2755 rt->rt_src, rt->rt_dst,
2756 r, nowait);
1da177e4
LT
2757 if (err <= 0) {
2758 if (!nowait) {
2759 if (err == 0)
2760 return 0;
be403ea1 2761 goto nla_put_failure;
1da177e4
LT
2762 } else {
2763 if (err == -EMSGSIZE)
be403ea1 2764 goto nla_put_failure;
e3703b3d 2765 error = err;
1da177e4
LT
2766 }
2767 }
2768 } else
2769#endif
f3756b79
DM
2770 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2771 goto nla_put_failure;
1da177e4
LT
2772 }
2773
f185071d 2774 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2775 goto nla_put_failure;
be403ea1
TG
2776
2777 return nlmsg_end(skb, nlh);
1da177e4 2778
be403ea1 2779nla_put_failure:
26932566
PM
2780 nlmsg_cancel(skb, nlh);
2781 return -EMSGSIZE;
1da177e4
LT
2782}
2783
5e73ea1a 2784static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
1da177e4 2785{
3b1e0a65 2786 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2787 struct rtmsg *rtm;
2788 struct nlattr *tb[RTA_MAX+1];
1da177e4 2789 struct rtable *rt = NULL;
9e12bb22
AV
2790 __be32 dst = 0;
2791 __be32 src = 0;
2792 u32 iif;
d889ce3b 2793 int err;
963bfeee 2794 int mark;
1da177e4
LT
2795 struct sk_buff *skb;
2796
d889ce3b
TG
2797 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2798 if (err < 0)
2799 goto errout;
2800
2801 rtm = nlmsg_data(nlh);
2802
1da177e4 2803 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2804 if (skb == NULL) {
2805 err = -ENOBUFS;
2806 goto errout;
2807 }
1da177e4
LT
2808
2809 /* Reserve room for dummy headers, this skb can pass
2810 through good chunk of routing engine.
2811 */
459a98ed 2812 skb_reset_mac_header(skb);
c1d2bbe1 2813 skb_reset_network_header(skb);
d2c962b8
SH
2814
2815 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2816 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2817 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2818
17fb2c64
AV
2819 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2820 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2821 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2822 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4
LT
2823
2824 if (iif) {
d889ce3b
TG
2825 struct net_device *dev;
2826
1937504d 2827 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2828 if (dev == NULL) {
2829 err = -ENODEV;
2830 goto errout_free;
2831 }
2832
1da177e4
LT
2833 skb->protocol = htons(ETH_P_IP);
2834 skb->dev = dev;
963bfeee 2835 skb->mark = mark;
1da177e4
LT
2836 local_bh_disable();
2837 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2838 local_bh_enable();
d889ce3b 2839
511c3f92 2840 rt = skb_rtable(skb);
d8d1f30b
CG
2841 if (err == 0 && rt->dst.error)
2842 err = -rt->dst.error;
1da177e4 2843 } else {
68a5e3dd
DM
2844 struct flowi4 fl4 = {
2845 .daddr = dst,
2846 .saddr = src,
2847 .flowi4_tos = rtm->rtm_tos,
2848 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2849 .flowi4_mark = mark,
d889ce3b 2850 };
9d6ec938 2851 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
2852
2853 err = 0;
2854 if (IS_ERR(rt))
2855 err = PTR_ERR(rt);
1da177e4 2856 }
d889ce3b 2857
1da177e4 2858 if (err)
d889ce3b 2859 goto errout_free;
1da177e4 2860
d8d1f30b 2861 skb_dst_set(skb, &rt->dst);
1da177e4
LT
2862 if (rtm->rtm_flags & RTM_F_NOTIFY)
2863 rt->rt_flags |= RTCF_NOTIFY;
2864
4feb88e5 2865 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 2866 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2867 if (err <= 0)
2868 goto errout_free;
1da177e4 2869
1937504d 2870 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 2871errout:
2942e900 2872 return err;
1da177e4 2873
d889ce3b 2874errout_free:
1da177e4 2875 kfree_skb(skb);
d889ce3b 2876 goto errout;
1da177e4
LT
2877}
2878
2879int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2880{
2881 struct rtable *rt;
2882 int h, s_h;
2883 int idx, s_idx;
1937504d
DL
2884 struct net *net;
2885
3b1e0a65 2886 net = sock_net(skb->sk);
1da177e4
LT
2887
2888 s_h = cb->args[0];
d8c92830
ED
2889 if (s_h < 0)
2890 s_h = 0;
1da177e4 2891 s_idx = idx = cb->args[1];
a6272665
ED
2892 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2893 if (!rt_hash_table[h].chain)
2894 continue;
1da177e4 2895 rcu_read_lock_bh();
a898def2 2896 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
d8d1f30b
CG
2897 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2898 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
1da177e4 2899 continue;
e84f84f2 2900 if (rt_is_expired(rt))
29e75252 2901 continue;
d8d1f30b 2902 skb_dst_set_noref(skb, &rt->dst);
4feb88e5 2903 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2904 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2905 1, NLM_F_MULTI) <= 0) {
adf30907 2906 skb_dst_drop(skb);
1da177e4
LT
2907 rcu_read_unlock_bh();
2908 goto done;
2909 }
adf30907 2910 skb_dst_drop(skb);
1da177e4
LT
2911 }
2912 rcu_read_unlock_bh();
2913 }
2914
2915done:
2916 cb->args[0] = h;
2917 cb->args[1] = idx;
2918 return skb->len;
2919}
2920
2921void ip_rt_multicast_event(struct in_device *in_dev)
2922{
76e6ebfb 2923 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
2924}
2925
2926#ifdef CONFIG_SYSCTL
81c684d1 2927static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 2928 void __user *buffer,
1da177e4
LT
2929 size_t *lenp, loff_t *ppos)
2930{
2931 if (write) {
639e104f 2932 int flush_delay;
81c684d1 2933 ctl_table ctl;
39a23e75 2934 struct net *net;
639e104f 2935
81c684d1
DL
2936 memcpy(&ctl, __ctl, sizeof(ctl));
2937 ctl.data = &flush_delay;
8d65af78 2938 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 2939
81c684d1 2940 net = (struct net *)__ctl->extra1;
39a23e75 2941 rt_cache_flush(net, flush_delay);
1da177e4 2942 return 0;
e905a9ed 2943 }
1da177e4
LT
2944
2945 return -EINVAL;
2946}
2947
/* Global (not per-namespace) route sysctls, registered once for init_net
 * by ip_static_sysctl_init().  Handlers ending in _jiffies / _ms_jiffies
 * convert between userspace seconds/milliseconds and kernel jiffies. */
static ctl_table ipv4_route_table[] = {
	{
		/* dst-cache GC threshold (ipv4_dst_ops.gc_thresh) */
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* hard cap on route cache size (ip_rt_max_size) */
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* same variable as gc_min_interval, millisecond units */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* ICMP redirect rate limiting knobs */
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* ICMP error rate limiting: cost per error / burst budget */
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* learned-PMTU lifetime (jiffies-backed) */
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* sentinel */
};
39a23e75 3058
39a23e75
DL
/* Per-namespace "flush" knob; duplicated per-net (except for init_net) by
 * sysctl_route_net_init(), which points ->extra1 at the owning net.
 * No ->data: the handler parses through a private stack copy. */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,	/* write-only */
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },	/* sentinel */
};
3068
3069static __net_init int sysctl_route_net_init(struct net *net)
3070{
3071 struct ctl_table *tbl;
3072
3073 tbl = ipv4_route_flush_table;
09ad9bc7 3074 if (!net_eq(net, &init_net)) {
39a23e75
DL
3075 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3076 if (tbl == NULL)
3077 goto err_dup;
3078 }
3079 tbl[0].extra1 = net;
3080
ec8f23ce 3081 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
39a23e75
DL
3082 if (net->ipv4.route_hdr == NULL)
3083 goto err_reg;
3084 return 0;
3085
3086err_reg:
3087 if (tbl != ipv4_route_flush_table)
3088 kfree(tbl);
3089err_dup:
3090 return -ENOMEM;
3091}
3092
3093static __net_exit void sysctl_route_net_exit(struct net *net)
3094{
3095 struct ctl_table *tbl;
3096
3097 tbl = net->ipv4.route_hdr->ctl_table_arg;
3098 unregister_net_sysctl_table(net->ipv4.route_hdr);
3099 BUG_ON(tbl == ipv4_route_flush_table);
3100 kfree(tbl);
3101}
3102
/* Hooks the flush sysctl into namespace creation/destruction. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
3107#endif
3108
/* Seed the per-namespace route-cache generation counters with random
 * values so cache identifiers are unpredictable across namespaces. */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3117
3ee94372
NH
/* genid seeding runs at namespace init only; no exit handler needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3121
c3426b47
DM
3122static int __net_init ipv4_inetpeer_init(struct net *net)
3123{
3124 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3125
3126 if (!bp)
3127 return -ENOMEM;
3128 inet_peer_base_init(bp);
3129 net->ipv4.peers = bp;
3130 return 0;
3131}
3132
3133static void __net_exit ipv4_inetpeer_exit(struct net *net)
3134{
3135 struct inet_peer_base *bp = net->ipv4.peers;
3136
3137 net->ipv4.peers = NULL;
56a6b248 3138 inetpeer_invalidate_tree(bp);
c3426b47
DM
3139 kfree(bp);
3140}
3141
/* Per-namespace lifetime management for the inetpeer base. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 3146
c7066f70 3147#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3148struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3149#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3150
3151static __initdata unsigned long rhash_entries;
3152static int __init set_rhash_entries(char *str)
3153{
413c27d8
EZ
3154 ssize_t ret;
3155
1da177e4
LT
3156 if (!str)
3157 return 0;
413c27d8
EZ
3158
3159 ret = kstrtoul(str, 0, &rhash_entries);
3160 if (ret)
3161 return 0;
3162
1da177e4
LT
3163 return 1;
3164}
3165__setup("rhash_entries=", set_rhash_entries);
3166
/*
 * Boot-time initialization of the IPv4 routing layer: slab caches,
 * dst-entry counters, the route-cache hash table, devinet/fib setup,
 * the periodic GC worker, proc files, xfrm hooks, the RTM_GETROUTE
 * handler and the pernet subsystems.  Called once from ip_init();
 * statement order here is load-bearing — later steps use earlier state
 * (e.g. ip_rt_max_size derives from rt_hash_mask).
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* per-cpu accounting array; fatal if unavailable */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* blackhole dsts share the same slab cache */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the route-cache hash from total RAM unless overridden by
	 * the rhash_entries= boot parameter; fills rt_hash_log/mask. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC threshold and max size scale with the hash table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Periodic cache GC, first run at a randomized offset so many
	 * machines booted together don't flush in lockstep. */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");	/* non-fatal */
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
3228}
3229
a1bc6eb4 3230#ifdef CONFIG_SYSCTL
eeb61f71
AV
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * Registers the global (init_net-only) route sysctls early; the
 * per-namespace "flush" entry is handled separately by sysctl_route_ops.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
a1bc6eb4 3239#endif