ipv4: Don't create nh exeption when the device mtu is smaller than the reported pmtu
[linux-2.6-block.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
352e512c 92#include <net/dst.h>
457c4cbc 93#include <net/net_namespace.h>
1da177e4
LT
94#include <net/protocol.h>
95#include <net/ip.h>
96#include <net/route.h>
97#include <net/inetpeer.h>
98#include <net/sock.h>
99#include <net/ip_fib.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
8d71740c 104#include <net/netevent.h>
63f3444f 105#include <net/rtnetlink.h>
1da177e4
LT
106#ifdef CONFIG_SYSCTL
107#include <linux/sysctl.h>
7426a564 108#include <linux/kmemleak.h>
1da177e4 109#endif
6e5714ea 110#include <net/secure_seq.h>
1da177e4 111
68a5e3dd 112#define RT_FL_TOS(oldflp4) \
f61759e6 113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
1da177e4 119static int ip_rt_max_size;
817bc4db 120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
9f28a2fc 121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
817bc4db
SH
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 132
1da177e4
LT
133/*
134 * Interface to generic destination cache.
135 */
136
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 138static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 139static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
142static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu);
144static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
caacf05e 146static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 147
/* dst_ops->ifdown hook: intentionally a no-op.  IPv4 dsts carry no
 * per-device state that needs tearing down here; stale routes are
 * caught later via the generation counter (see rt_is_expired()).
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
1da177e4 152
/* dst_ops->cow_metrics hook: must never be reached for IPv4 routes,
 * whose metrics come from the FIB.  Warn loudly if it ever fires.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
158
f894cbf8
DM
159static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 struct sk_buff *skb,
161 const void *daddr);
d3aaeb38 162
/* Interface to the generic destination cache: the dst_ops vtable that
 * every IPv4 rtable is created with.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
179
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Lookup table mapping a 4-bit TOS-derived index to a traffic-control
 * priority band; odd entries share the band of the preceding class.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
1da177e4 201
2f970d83 202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 204
1da177e4 205#ifdef CONFIG_PROC_FS
/* /proc/net/rt_cache: the iterator yields only SEQ_START_TOKEN, so
 * the file shows just the column header — no per-flow entries are
 * walked here.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	/* Nothing follows the header line. */
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
252
253
/* /proc/net/stat/rt_cache: one line of per-CPU routing statistics for
 * each possible CPU, preceded by a header line.
 *
 * NOTE(review): the format strings below are reproduced as displayed;
 * the extraction may have collapsed multiple spaces — verify against
 * the canonical tree before relying on exact column alignment.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	/* *pos is 1-based: entry N corresponds to CPU id N-1. */
	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
	);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
343
c7066f70 344#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: sum the per-CPU ip_rt_acct counters into one
 * 256-entry table and emit it as raw binary via seq_write().
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	/* Scratch accumulator, zeroed so the += sums start clean. */
	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
78c686e9 381#endif
107f1634 382
/* Create the per-netns proc entries (rt_cache, stat/rt_cache and,
 * with CONFIG_IP_ROUTE_CLASSID, rt_acct), unwinding on failure.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

	/* Error unwind: remove entries in reverse creation order. */
#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

/* Tear down everything ip_rt_do_proc_init() created. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* !CONFIG_PROC_FS: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 438#endif /* CONFIG_PROC_FS */
e905a9ed 439
4331debc 440static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 441{
d8d1f30b 442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
e84f84f2
DL
443}
444
/* Invalidate all cached routes in @net at once by bumping the
 * namespace generation counter; rt_is_expired() then rejects them.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
449
/* Resolve the neighbour for a dst: prefer the route's gateway, else
 * the packet's destination address, else the caller-supplied daddr.
 * Creates an ARP entry if none exists yet.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
470
1da177e4
LT
471/*
472 * Peer allocation may fail only in serious out-of-memory conditions. However
473 * we still can generate some output.
474 * Random ID selection looks a bit dangerous because we have no chances to
475 * select ID being unique in a reasonable period of time.
476 * But broken packet identifier may be better than no packet at all.
477 */
/* Fallback IP-ID generator used when no inet_peer could be allocated:
 * mixes the previous salt with the destination through secure_ip_id()
 * under a private spinlock.  See the comment above on its limits.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
490
/* Pick the IPv4 identification field for an outgoing header: use the
 * per-destination inet_peer counter when available, otherwise fall
 * back to the global salted generator above.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
1da177e4 506
/* Build a flow key from header-derived values.  When a socket is
 * supplied its bound device, mark, TOS and protocol override the
 * packet-derived arguments, so the lookup matches the socket's output.
 */
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}

/* Flow key from a received packet (optionally refined by its socket). */
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

/* Flow key from a connected socket alone; honours a source-route
 * option's first-hop address (faddr) when one is set.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}

/* Dispatch: prefer the packet when we have one, else the socket. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
564
/* Free a route after the current RCU grace period ends. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

/* Serializes all writers of the per-nexthop exception tables. */
static DEFINE_SPINLOCK(fnhe_lock);
4895c771 571
/* Find the least-recently-stamped exception in a bucket so it can be
 * recycled, dropping its cached route first.  Caller holds fnhe_lock
 * and guarantees the chain is non-empty.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	/* Detach and RCU-free the route cached on the victim entry. */
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}
590
d3a25c98
DM
591static inline u32 fnhe_hashfun(__be32 daddr)
592{
593 u32 hval;
594
595 hval = (__force u32) daddr;
596 hval ^= (hval >> 11) ^ (hval >> 22);
597
598 return hval & (FNHE_HASH_SIZE - 1);
599}
600
/* Record learned per-destination state (redirect gateway and/or PMTU)
 * on a nexthop, creating the exception entry — and, lazily, the hash
 * table itself — if needed.  All updates run under fnhe_lock; readers
 * walk the chains under RCU.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		/* First exception for this nexthop: allocate the table. */
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		/* Existing entry: only overwrite the fields supplied. */
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		/* New entry: recycle the oldest when the chain is long. */
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}
659
/* Handle an ICMP redirect: validate the advised gateway, record it as
 * a nexthop exception, and (when @kill_route) mark the current route
 * obsolete so the next lookup picks up the new path.  Bogus or
 * disallowed redirects are logged (if verbose) and dropped.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	/* Only the four defined redirect codes are acted upon. */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* The redirect must come from our current gateway. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* New gateway must be directly reachable on this link. */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Kick off resolution; apply once it's valid. */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
739
/* dst_ops->redirect hook: rebuild the flow key and apply the redirect,
 * killing the current route so it is re-looked-up.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;

	rt = (struct rtable *) dst;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_do_redirect(rt, skb, &fl4, true);
}
750
/* dst_ops->negative_advice hook: drop routes that are obsolete,
 * redirected, or carry an expiry, returning NULL so the caller
 * re-resolves; otherwise keep the dst as-is.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
768
769/*
770 * Algorithm:
771 * 1. The first ip_rt_redirect_number redirects are sent
772 * with exponential backoff, then we stop sending them at all,
773 * assuming that the host ignores our redirects.
774 * 2. If we did not see packets requiring redirects
775 * during ip_rt_redirect_silence, we assume that the host
776 * forgot redirected route and start to send redirects again.
777 *
778 * This algorithm is much cheaper and more intelligent than dumb load limiting
779 * in icmp.c.
780 *
781 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782 * and "frag. need" (breaks PMTU discovery) in icmp.c.
783 */
784
/* Emit an ICMP redirect toward the packet's source, rate-limited per
 * peer by the exponential-backoff scheme described above.  Without a
 * peer entry the redirect is sent unconditionally (best effort).
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 *
	 * NOTE(review): rate_tokens doubles as both backoff exponent and
	 * "peer ignores us" counter here — later kernels split these to
	 * avoid the silence window never triggering; confirm if backporting.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
844
845static int ip_error(struct sk_buff *skb)
846{
251da413 847 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 848 struct rtable *rt = skb_rtable(skb);
92d86829 849 struct inet_peer *peer;
1da177e4 850 unsigned long now;
251da413 851 struct net *net;
92d86829 852 bool send;
1da177e4
LT
853 int code;
854
251da413
DM
855 net = dev_net(rt->dst.dev);
856 if (!IN_DEV_FORWARD(in_dev)) {
857 switch (rt->dst.error) {
858 case EHOSTUNREACH:
859 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
860 break;
861
862 case ENETUNREACH:
863 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
864 break;
865 }
866 goto out;
867 }
868
d8d1f30b 869 switch (rt->dst.error) {
4500ebf8
JP
870 case EINVAL:
871 default:
872 goto out;
873 case EHOSTUNREACH:
874 code = ICMP_HOST_UNREACH;
875 break;
876 case ENETUNREACH:
877 code = ICMP_NET_UNREACH;
251da413 878 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
879 break;
880 case EACCES:
881 code = ICMP_PKT_FILTERED;
882 break;
1da177e4
LT
883 }
884
1d861aa4 885 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
886
887 send = true;
888 if (peer) {
889 now = jiffies;
890 peer->rate_tokens += now - peer->rate_last;
891 if (peer->rate_tokens > ip_rt_error_burst)
892 peer->rate_tokens = ip_rt_error_burst;
893 peer->rate_last = now;
894 if (peer->rate_tokens >= ip_rt_error_cost)
895 peer->rate_tokens -= ip_rt_error_cost;
896 else
897 send = false;
1d861aa4 898 inet_putpeer(peer);
1da177e4 899 }
92d86829
DM
900 if (send)
901 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
902
903out: kfree_skb(skb);
904 return 0;
e905a9ed 905}
1da177e4 906
/* Apply a learned path MTU to a route and persist it as a nexthop
 * exception with an expiry.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	/* Ignore a reported PMTU larger than the device MTU: it cannot
	 * be the bottleneck, so don't create a nexthop exception for it.
	 */
	if (dst->dev->mtu < mtu)
		return;

	/* Clamp to the sysctl floor to resist PMTU-shrinking attacks. */
	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		/* Route carries no PMTU slot; force a fresh lookup. */
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
934
/* dst_ops->update_pmtu hook: derive the flow key and update the PMTU. */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}
944
/* Exported wrappers: each builds a flow key (from explicit parameters
 * or from a socket), resolves the output route, applies the PMTU or
 * redirect update, and releases the route.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

/* As above, but all flow parameters come from @sk. */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

/* Redirect variant: kill_route is false — the transient route obtained
 * here is dropped immediately, so there is nothing to invalidate.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

/* As above, keyed off a socket. */
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1008
/* dst_ops->check hook: report whether a cached dst is still usable. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a
	 * route, this is indicated by setting obsolete to
	 * DST_OBSOLETE_KILL.
	 */
	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
		return NULL;
	return dst;
}
1025
1da177e4
LT
/* dst_ops->link_failure hook: tell the sender the host is unreachable
 * and immediately expire the attached route so it is not reused.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		/* expires = 0 marks the route as immediately stale */
		dst_set_expires(&rt->dst, 0);
}
1036
/* Catch-all dst input/output handler for routes that must never be
 * used to transmit (e.g. multicast/local routes wired to ip_rt_bug as
 * their ->output).  Logs the offending flow, drops the skb and warns.
 */
static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
1046
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */
1055
8e36360a 1056void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1057{
a61ced5d 1058 __be32 src;
1da177e4 1059
c7537967 1060 if (rt_is_output_route(rt))
c5be24ff 1061 src = ip_hdr(skb)->saddr;
ebc0ffae 1062 else {
8e36360a
DM
1063 struct fib_result res;
1064 struct flowi4 fl4;
1065 struct iphdr *iph;
1066
1067 iph = ip_hdr(skb);
1068
1069 memset(&fl4, 0, sizeof(fl4));
1070 fl4.daddr = iph->daddr;
1071 fl4.saddr = iph->saddr;
b0fe4a31 1072 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1073 fl4.flowi4_oif = rt->dst.dev->ifindex;
1074 fl4.flowi4_iif = skb->dev->ifindex;
1075 fl4.flowi4_mark = skb->mark;
5e2b61f7 1076
ebc0ffae 1077 rcu_read_lock();
68a5e3dd 1078 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
436c3b66 1079 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae 1080 else
f8126f1d
DM
1081 src = inet_select_addr(rt->dst.dev,
1082 rt_nexthop(rt, iph->daddr),
1083 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1084 rcu_read_unlock();
1085 }
1da177e4
LT
1086 memcpy(addr, &src, 4);
1087}
1088
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge a routing-classifier tag into the dst: each 16-bit half of
 * tclassid is only filled in from "tag" if it is still zero, so an
 * already-set half is never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 classid = rt->dst.tclassid;

	if ((classid & 0xFFFF) == 0)
		classid |= tag & 0xFFFF;
	if ((classid & 0xFFFF0000) == 0)
		classid |= tag & 0xFFFF0000;

	rt->dst.tclassid = classid;
}
#endif
1098
0dbaee3b
DM
/* dst_ops->default_advmss hook: advertised MSS for this route.  Uses
 * the RTAX_ADVMSS metric when set; otherwise derives it from the
 * device MTU minus 40 (IPv4 + TCP header), clamped between
 * ip_rt_min_advmss and 65535 - 40.
 */
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
1111
ebb762f2 1112static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1113{
261663b0 1114 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1115 unsigned int mtu = rt->rt_pmtu;
1116
98d75c37 1117 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1118 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1119
261663b0 1120 if (mtu && rt_is_output_route(rt))
618f9bc7
SK
1121 return mtu;
1122
1123 mtu = dst->dev->mtu;
d33e4553
DM
1124
1125 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
f8126f1d 1126 if (rt->rt_gateway && mtu > 576)
d33e4553
DM
1127 mtu = 576;
1128 }
1129
1130 if (mtu > IP_MAX_MTU)
1131 mtu = IP_MAX_MTU;
1132
1133 return mtu;
1134}
1135
f2bb4bed 1136static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771
DM
1137{
1138 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1139 struct fib_nh_exception *fnhe;
1140 u32 hval;
1141
f2bb4bed
DM
1142 if (!hash)
1143 return NULL;
1144
d3a25c98 1145 hval = fnhe_hashfun(daddr);
4895c771
DM
1146
1147 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1148 fnhe = rcu_dereference(fnhe->fnhe_next)) {
f2bb4bed
DM
1149 if (fnhe->fnhe_daddr == daddr)
1150 return fnhe;
1151 }
1152 return NULL;
1153}
aee06da6 1154
caacf05e 1155static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
f2bb4bed
DM
1156 __be32 daddr)
1157{
caacf05e
DM
1158 bool ret = false;
1159
c5038a83 1160 spin_lock_bh(&fnhe_lock);
f2bb4bed 1161
c5038a83
DM
1162 if (daddr == fnhe->fnhe_daddr) {
1163 struct rtable *orig;
f2bb4bed 1164
c5038a83
DM
1165 if (fnhe->fnhe_pmtu) {
1166 unsigned long expires = fnhe->fnhe_expires;
1167 unsigned long diff = expires - jiffies;
1168
1169 if (time_before(jiffies, expires)) {
1170 rt->rt_pmtu = fnhe->fnhe_pmtu;
1171 dst_set_expires(&rt->dst, diff);
1172 }
1173 }
1174 if (fnhe->fnhe_gw) {
1175 rt->rt_flags |= RTCF_REDIRECTED;
1176 rt->rt_gateway = fnhe->fnhe_gw;
ceb33206 1177 }
f2bb4bed 1178
c5038a83
DM
1179 orig = rcu_dereference(fnhe->fnhe_rth);
1180 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1181 if (orig)
1182 rt_free(orig);
1183
1184 fnhe->fnhe_stamp = jiffies;
caacf05e 1185 ret = true;
c5038a83
DM
1186 } else {
1187 /* Routes we intend to cache in nexthop exception have
1188 * the DST_NOCACHE bit clear. However, if we are
1189 * unsuccessful at storing this route into the cache
1190 * we really need to set it.
1191 */
1192 rt->dst.flags |= DST_NOCACHE;
1193 }
1194 spin_unlock_bh(&fnhe_lock);
caacf05e
DM
1195
1196 return ret;
54764bb6
ED
1197}
1198
caacf05e 1199static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
f2bb4bed 1200{
d26b3a7c 1201 struct rtable *orig, *prev, **p;
caacf05e 1202 bool ret = true;
f2bb4bed 1203
d26b3a7c 1204 if (rt_is_input_route(rt)) {
54764bb6 1205 p = (struct rtable **)&nh->nh_rth_input;
d26b3a7c
ED
1206 } else {
1207 if (!nh->nh_pcpu_rth_output)
1208 goto nocache;
1209 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1210 }
f2bb4bed
DM
1211 orig = *p;
1212
1213 prev = cmpxchg(p, orig, rt);
1214 if (prev == orig) {
f2bb4bed 1215 if (orig)
54764bb6 1216 rt_free(orig);
c6cffba4 1217 } else {
54764bb6
ED
1218 /* Routes we intend to cache in the FIB nexthop have
1219 * the DST_NOCACHE bit clear. However, if we are
1220 * unsuccessful at storing this route into the cache
1221 * we really need to set it.
1222 */
d26b3a7c 1223nocache:
54764bb6 1224 rt->dst.flags |= DST_NOCACHE;
caacf05e
DM
1225 ret = false;
1226 }
1227
1228 return ret;
1229}
1230
/* Global list of routes that could not be cached in a nexthop.  They
 * are tracked here so rt_flush_dev() can re-home them when their
 * device goes away; protected by rt_uncached_lock (BH-safe).
 */
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

/* Append an uncached route to the global tracking list. */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1240
/* dst_ops->destroy hook: if the route was placed on the uncached list,
 * unlink it before the dst memory is released.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1251
/* Called when "dev" is going away: repoint every uncached route that
 * still references it at the netns loopback device, moving the device
 * reference (dev_hold on loopback, dev_put on the dying device) so the
 * unregister can complete.
 */
void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1269
4331debc 1270static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1271{
4331debc
ED
1272 return rt &&
1273 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1274 !rt_is_expired(rt);
d2d68ba9
DM
1275}
1276
f2bb4bed 1277static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1278 const struct fib_result *res,
f2bb4bed 1279 struct fib_nh_exception *fnhe,
982721f3 1280 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1281{
caacf05e
DM
1282 bool cached = false;
1283
1da177e4 1284 if (fi) {
4895c771
DM
1285 struct fib_nh *nh = &FIB_RES_NH(*res);
1286
1287 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1288 rt->rt_gateway = nh->nh_gw;
2860583f 1289 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
c7066f70 1290#ifdef CONFIG_IP_ROUTE_CLASSID
f2bb4bed 1291 rt->dst.tclassid = nh->nh_tclassid;
1da177e4 1292#endif
c5038a83 1293 if (unlikely(fnhe))
caacf05e 1294 cached = rt_bind_exception(rt, fnhe, daddr);
c5038a83 1295 else if (!(rt->dst.flags & DST_NOCACHE))
caacf05e 1296 cached = rt_cache_route(nh, rt);
d33e4553 1297 }
caacf05e
DM
1298 if (unlikely(!cached))
1299 rt_add_uncached_list(rt);
defb3519 1300
c7066f70 1301#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1302#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1303 set_class_tag(rt, res->tclassid);
1da177e4
LT
1304#endif
1305 set_class_tag(rt, itag);
1306#endif
1da177e4
LT
1307}
1308
5c1e6aa3 1309static struct rtable *rt_dst_alloc(struct net_device *dev,
f2bb4bed 1310 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1311{
f5b0a874 1312 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
c6cffba4 1313 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
5c1e6aa3
DM
1314 (nopolicy ? DST_NOPOLICY : 0) |
1315 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1316}
1317
96d36220 1318/* called in rcu_read_lock() section */
9e12bb22 1319static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1320 u8 tos, struct net_device *dev, int our)
1321{
1da177e4 1322 struct rtable *rth;
96d36220 1323 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1324 u32 itag = 0;
b5f7e755 1325 int err;
1da177e4
LT
1326
1327 /* Primary sanity checks. */
1328
1329 if (in_dev == NULL)
1330 return -EINVAL;
1331
1e637c74 1332 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1333 skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1334 goto e_inval;
1335
d0daebc3
TG
1336 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1337 if (ipv4_is_loopback(saddr))
1338 goto e_inval;
1339
f97c1e0c
JP
1340 if (ipv4_is_zeronet(saddr)) {
1341 if (!ipv4_is_local_multicast(daddr))
1da177e4 1342 goto e_inval;
b5f7e755 1343 } else {
9e56e380
DM
1344 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1345 in_dev, &itag);
b5f7e755
ED
1346 if (err < 0)
1347 goto e_err;
1348 }
4e7b2f14 1349 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
f2bb4bed 1350 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1da177e4
LT
1351 if (!rth)
1352 goto e_nobufs;
1353
cf911662
DM
1354#ifdef CONFIG_IP_ROUTE_CLASSID
1355 rth->dst.tclassid = itag;
1356#endif
d8d1f30b 1357 rth->dst.output = ip_rt_bug;
1da177e4 1358
cf911662
DM
1359 rth->rt_genid = rt_genid(dev_net(dev));
1360 rth->rt_flags = RTCF_MULTICAST;
1361 rth->rt_type = RTN_MULTICAST;
9917e1e8 1362 rth->rt_is_input= 1;
13378cad 1363 rth->rt_iif = 0;
5943634f 1364 rth->rt_pmtu = 0;
f8126f1d 1365 rth->rt_gateway = 0;
caacf05e 1366 INIT_LIST_HEAD(&rth->rt_uncached);
1da177e4 1367 if (our) {
d8d1f30b 1368 rth->dst.input= ip_local_deliver;
1da177e4
LT
1369 rth->rt_flags |= RTCF_LOCAL;
1370 }
1371
1372#ifdef CONFIG_IP_MROUTE
f97c1e0c 1373 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1374 rth->dst.input = ip_mr_input;
1da177e4
LT
1375#endif
1376 RT_CACHE_STAT_INC(in_slow_mc);
1377
89aef892
DM
1378 skb_dst_set(skb, &rth->dst);
1379 return 0;
1da177e4
LT
1380
1381e_nobufs:
1da177e4 1382 return -ENOBUFS;
1da177e4 1383e_inval:
96d36220 1384 return -EINVAL;
b5f7e755 1385e_err:
b5f7e755 1386 return err;
1da177e4
LT
1387}
1388
1389
/* Account and (optionally) log a packet with a martian source address.
 * With CONFIG_IP_ROUTE_VERBOSE and log_martians enabled, the link-layer
 * header is hex-dumped as well -- per RFC 1812 it is the only hint to
 * where the bogus packet came from.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1414
/* called in rcu_read_lock() section */
/* Build (or reuse) a forwarding route for a unicast input packet whose
 * FIB lookup already succeeded.  Validates the source (possibly setting
 * RTCF_DOREDIRECT when in == out on shared media), rejects proxy-ARP
 * abuse for non-IP frames, tries the nexthop's cached input route, and
 * otherwise allocates a fresh ip_forward route and caches it via
 * rt_set_nexthop().  Returns 0 on success, -errno on failure.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 here means the source matched via a broader rule;
	 * hairpin forwarding on shared media earns a redirect. */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Fast path: reuse the nexthop's cached input route when the
	 * lookup produced no classid tag. */
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif 	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1da177e4 1504
/* Thin wrapper over __mkroute_input(): when multipath routing is built
 * in and the matched fib_info has several nexthops, pick one first via
 * fib_select_multipath(), then build the input route.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1519
1da177e4
LT
1520/*
1521 * NOTE. We drop all the packets that has local source
1522 * addresses, because every properly looped back packet
1523 * must have correct destination already attached by output routine.
1524 *
1525 * Such approach solves two big problems:
1526 * 1. Not simplex devices are handled properly.
1527 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1528 * called with rcu_read_lock()
1da177e4
LT
1529 */
1530
9e12bb22 1531static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 1532 u8 tos, struct net_device *dev)
1da177e4
LT
1533{
1534 struct fib_result res;
96d36220 1535 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 1536 struct flowi4 fl4;
95c96174 1537 unsigned int flags = 0;
1da177e4 1538 u32 itag = 0;
95c96174 1539 struct rtable *rth;
1da177e4 1540 int err = -EINVAL;
5e73ea1a 1541 struct net *net = dev_net(dev);
d2d68ba9 1542 bool do_cache;
1da177e4
LT
1543
1544 /* IP on this device is disabled. */
1545
1546 if (!in_dev)
1547 goto out;
1548
1549 /* Check for the most weird martians, which can be not detected
1550 by fib_lookup.
1551 */
1552
d0daebc3 1553 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1554 goto martian_source;
1555
d2d68ba9 1556 res.fi = NULL;
27a954bd 1557 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1558 goto brd_input;
1559
1560 /* Accept zero addresses only to limited broadcast;
1561 * I even do not know to fix it or not. Waiting for complains :-)
1562 */
f97c1e0c 1563 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1564 goto martian_source;
1565
d0daebc3 1566 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1567 goto martian_destination;
1568
9eb43e76
ED
1569 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1570 * and call it once if daddr or/and saddr are loopback addresses
1571 */
1572 if (ipv4_is_loopback(daddr)) {
1573 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1574 goto martian_destination;
9eb43e76
ED
1575 } else if (ipv4_is_loopback(saddr)) {
1576 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1577 goto martian_source;
1578 }
1579
1da177e4
LT
1580 /*
1581 * Now we are ready to route packet.
1582 */
68a5e3dd
DM
1583 fl4.flowi4_oif = 0;
1584 fl4.flowi4_iif = dev->ifindex;
1585 fl4.flowi4_mark = skb->mark;
1586 fl4.flowi4_tos = tos;
1587 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1588 fl4.daddr = daddr;
1589 fl4.saddr = saddr;
1590 err = fib_lookup(net, &fl4, &res);
251da413 1591 if (err != 0)
1da177e4 1592 goto no_route;
1da177e4
LT
1593
1594 RT_CACHE_STAT_INC(in_slow_tot);
1595
1596 if (res.type == RTN_BROADCAST)
1597 goto brd_input;
1598
1599 if (res.type == RTN_LOCAL) {
5c04c819 1600 err = fib_validate_source(skb, saddr, daddr, tos,
1fb9489b 1601 LOOPBACK_IFINDEX,
9e56e380 1602 dev, in_dev, &itag);
b5f7e755
ED
1603 if (err < 0)
1604 goto martian_source_keep_err;
1da177e4
LT
1605 goto local_input;
1606 }
1607
1608 if (!IN_DEV_FORWARD(in_dev))
251da413 1609 goto no_route;
1da177e4
LT
1610 if (res.type != RTN_UNICAST)
1611 goto martian_destination;
1612
68a5e3dd 1613 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
1614out: return err;
1615
1616brd_input:
1617 if (skb->protocol != htons(ETH_P_IP))
1618 goto e_inval;
1619
41347dcd 1620 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
1621 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1622 in_dev, &itag);
1da177e4 1623 if (err < 0)
b5f7e755 1624 goto martian_source_keep_err;
1da177e4
LT
1625 }
1626 flags |= RTCF_BROADCAST;
1627 res.type = RTN_BROADCAST;
1628 RT_CACHE_STAT_INC(in_brd);
1629
1630local_input:
d2d68ba9
DM
1631 do_cache = false;
1632 if (res.fi) {
fe3edf45 1633 if (!itag) {
54764bb6 1634 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
d2d68ba9 1635 if (rt_cache_valid(rth)) {
c6cffba4
DM
1636 skb_dst_set_noref(skb, &rth->dst);
1637 err = 0;
1638 goto out;
d2d68ba9
DM
1639 }
1640 do_cache = true;
1641 }
1642 }
1643
5c1e6aa3 1644 rth = rt_dst_alloc(net->loopback_dev,
d2d68ba9 1645 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
1646 if (!rth)
1647 goto e_nobufs;
1648
cf911662 1649 rth->dst.input= ip_local_deliver;
d8d1f30b 1650 rth->dst.output= ip_rt_bug;
cf911662
DM
1651#ifdef CONFIG_IP_ROUTE_CLASSID
1652 rth->dst.tclassid = itag;
1653#endif
1da177e4 1654
cf911662
DM
1655 rth->rt_genid = rt_genid(net);
1656 rth->rt_flags = flags|RTCF_LOCAL;
1657 rth->rt_type = res.type;
9917e1e8 1658 rth->rt_is_input = 1;
13378cad 1659 rth->rt_iif = 0;
5943634f 1660 rth->rt_pmtu = 0;
f8126f1d 1661 rth->rt_gateway = 0;
caacf05e 1662 INIT_LIST_HEAD(&rth->rt_uncached);
1da177e4 1663 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
1664 rth->dst.input= ip_error;
1665 rth->dst.error= -err;
1da177e4
LT
1666 rth->rt_flags &= ~RTCF_LOCAL;
1667 }
d2d68ba9
DM
1668 if (do_cache)
1669 rt_cache_route(&FIB_RES_NH(res), rth);
89aef892 1670 skb_dst_set(skb, &rth->dst);
b23dd4fe 1671 err = 0;
ebc0ffae 1672 goto out;
1da177e4
LT
1673
1674no_route:
1675 RT_CACHE_STAT_INC(in_no_route);
1da177e4 1676 res.type = RTN_UNREACHABLE;
7f53878d
MC
1677 if (err == -ESRCH)
1678 err = -ENETUNREACH;
1da177e4
LT
1679 goto local_input;
1680
1681 /*
1682 * Do not cache martian addresses: they should be logged (RFC1812)
1683 */
1684martian_destination:
1685 RT_CACHE_STAT_INC(in_martian_dst);
1686#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
1687 if (IN_DEV_LOG_MARTIANS(in_dev))
1688 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1689 &daddr, &saddr, dev->name);
1da177e4 1690#endif
2c2910a4 1691
1da177e4
LT
1692e_inval:
1693 err = -EINVAL;
ebc0ffae 1694 goto out;
1da177e4
LT
1695
1696e_nobufs:
1697 err = -ENOBUFS;
ebc0ffae 1698 goto out;
1da177e4
LT
1699
1700martian_source:
b5f7e755
ED
1701 err = -EINVAL;
1702martian_source_keep_err:
1da177e4 1703 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 1704 goto out;
1da177e4
LT
1705}
1706
c6cffba4
DM
1707int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1708 u8 tos, struct net_device *dev)
1da177e4 1709{
96d36220 1710 int res;
1da177e4 1711
96d36220
ED
1712 rcu_read_lock();
1713
1da177e4
LT
1714 /* Multicast recognition logic is moved from route cache to here.
1715 The problem was that too many Ethernet cards have broken/missing
1716 hardware multicast filters :-( As result the host on multicasting
1717 network acquires a lot of useless route cache entries, sort of
1718 SDR messages from all the world. Now we try to get rid of them.
1719 Really, provided software IP multicast filter is organized
1720 reasonably (at least, hashed), it does not result in a slowdown
1721 comparing with route cache reject entries.
1722 Note, that multicast routers are not affected, because
1723 route cache entry is created eventually.
1724 */
f97c1e0c 1725 if (ipv4_is_multicast(daddr)) {
96d36220 1726 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1727
96d36220 1728 if (in_dev) {
dbdd9a52
DM
1729 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1730 ip_hdr(skb)->protocol);
1da177e4
LT
1731 if (our
1732#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
1733 ||
1734 (!ipv4_is_local_multicast(daddr) &&
1735 IN_DEV_MFORWARD(in_dev))
1da177e4 1736#endif
9d4fb27d 1737 ) {
96d36220
ED
1738 int res = ip_route_input_mc(skb, daddr, saddr,
1739 tos, dev, our);
1da177e4 1740 rcu_read_unlock();
96d36220 1741 return res;
1da177e4
LT
1742 }
1743 }
1744 rcu_read_unlock();
1745 return -EINVAL;
1746 }
c10237e0 1747 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
96d36220
ED
1748 rcu_read_unlock();
1749 return res;
1da177e4 1750}
c6cffba4 1751EXPORT_SYMBOL(ip_route_input_noref);
1da177e4 1752
ebc0ffae 1753/* called with rcu_read_lock() */
982721f3 1754static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 1755 const struct flowi4 *fl4, int orig_oif,
f61759e6 1756 struct net_device *dev_out,
5ada5527 1757 unsigned int flags)
1da177e4 1758{
982721f3 1759 struct fib_info *fi = res->fi;
f2bb4bed 1760 struct fib_nh_exception *fnhe;
5ada5527 1761 struct in_device *in_dev;
982721f3 1762 u16 type = res->type;
5ada5527 1763 struct rtable *rth;
1da177e4 1764
d0daebc3
TG
1765 in_dev = __in_dev_get_rcu(dev_out);
1766 if (!in_dev)
5ada5527 1767 return ERR_PTR(-EINVAL);
1da177e4 1768
d0daebc3
TG
1769 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1770 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1771 return ERR_PTR(-EINVAL);
1772
68a5e3dd 1773 if (ipv4_is_lbcast(fl4->daddr))
982721f3 1774 type = RTN_BROADCAST;
68a5e3dd 1775 else if (ipv4_is_multicast(fl4->daddr))
982721f3 1776 type = RTN_MULTICAST;
68a5e3dd 1777 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 1778 return ERR_PTR(-EINVAL);
1da177e4
LT
1779
1780 if (dev_out->flags & IFF_LOOPBACK)
1781 flags |= RTCF_LOCAL;
1782
982721f3 1783 if (type == RTN_BROADCAST) {
1da177e4 1784 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
1785 fi = NULL;
1786 } else if (type == RTN_MULTICAST) {
dd28d1a0 1787 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
1788 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1789 fl4->flowi4_proto))
1da177e4
LT
1790 flags &= ~RTCF_LOCAL;
1791 /* If multicast route do not exist use
dd28d1a0
ED
1792 * default one, but do not gateway in this case.
1793 * Yes, it is hack.
1da177e4 1794 */
982721f3
DM
1795 if (fi && res->prefixlen < 4)
1796 fi = NULL;
1da177e4
LT
1797 }
1798
f2bb4bed
DM
1799 fnhe = NULL;
1800 if (fi) {
c5038a83 1801 struct rtable __rcu **prth;
d26b3a7c 1802
c5038a83
DM
1803 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1804 if (fnhe)
1805 prth = &fnhe->fnhe_rth;
1806 else
d26b3a7c 1807 prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
c5038a83
DM
1808 rth = rcu_dereference(*prth);
1809 if (rt_cache_valid(rth)) {
1810 dst_hold(&rth->dst);
1811 return rth;
f2bb4bed
DM
1812 }
1813 }
5c1e6aa3
DM
1814 rth = rt_dst_alloc(dev_out,
1815 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 1816 IN_DEV_CONF_GET(in_dev, NOXFRM),
c5038a83 1817 fi);
8391d07b 1818 if (!rth)
5ada5527 1819 return ERR_PTR(-ENOBUFS);
8391d07b 1820
cf911662
DM
1821 rth->dst.output = ip_output;
1822
cf911662
DM
1823 rth->rt_genid = rt_genid(dev_net(dev_out));
1824 rth->rt_flags = flags;
1825 rth->rt_type = type;
9917e1e8 1826 rth->rt_is_input = 0;
13378cad 1827 rth->rt_iif = orig_oif ? : 0;
5943634f 1828 rth->rt_pmtu = 0;
f8126f1d 1829 rth->rt_gateway = 0;
caacf05e 1830 INIT_LIST_HEAD(&rth->rt_uncached);
1da177e4
LT
1831
1832 RT_CACHE_STAT_INC(out_slow_tot);
1833
41347dcd 1834 if (flags & RTCF_LOCAL)
d8d1f30b 1835 rth->dst.input = ip_local_deliver;
1da177e4 1836 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 1837 if (flags & RTCF_LOCAL &&
1da177e4 1838 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 1839 rth->dst.output = ip_mc_output;
1da177e4
LT
1840 RT_CACHE_STAT_INC(out_slow_mc);
1841 }
1842#ifdef CONFIG_IP_MROUTE
982721f3 1843 if (type == RTN_MULTICAST) {
1da177e4 1844 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 1845 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
1846 rth->dst.input = ip_mr_input;
1847 rth->dst.output = ip_mc_output;
1da177e4
LT
1848 }
1849 }
1850#endif
1851 }
1852
f2bb4bed 1853 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1da177e4 1854
5ada5527 1855 return rth;
1da177e4
LT
1856}
1857
1da177e4
LT
1858/*
1859 * Major route resolver routine.
1860 */
1861
89aef892 1862struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1da177e4 1863{
1da177e4 1864 struct net_device *dev_out = NULL;
f61759e6 1865 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
1866 unsigned int flags = 0;
1867 struct fib_result res;
5ada5527 1868 struct rtable *rth;
813b3b5d 1869 int orig_oif;
1da177e4 1870
85b91b03 1871 res.tclassid = 0;
1da177e4 1872 res.fi = NULL;
8b96d22d 1873 res.table = NULL;
1da177e4 1874
813b3b5d
DM
1875 orig_oif = fl4->flowi4_oif;
1876
1fb9489b 1877 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
1878 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1879 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1880 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 1881
010c2708 1882 rcu_read_lock();
813b3b5d 1883 if (fl4->saddr) {
b23dd4fe 1884 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
1885 if (ipv4_is_multicast(fl4->saddr) ||
1886 ipv4_is_lbcast(fl4->saddr) ||
1887 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
1888 goto out;
1889
1da177e4
LT
1890 /* I removed check for oif == dev_out->oif here.
1891 It was wrong for two reasons:
1ab35276
DL
1892 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1893 is assigned to multiple interfaces.
1da177e4
LT
1894 2. Moreover, we are allowed to send packets with saddr
1895 of another iface. --ANK
1896 */
1897
813b3b5d
DM
1898 if (fl4->flowi4_oif == 0 &&
1899 (ipv4_is_multicast(fl4->daddr) ||
1900 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 1901 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 1902 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
1903 if (dev_out == NULL)
1904 goto out;
1905
1da177e4
LT
1906 /* Special hack: user can direct multicasts
1907 and limited broadcast via necessary interface
1908 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1909 This hack is not just for fun, it allows
1910 vic,vat and friends to work.
1911 They bind socket to loopback, set ttl to zero
1912 and expect that it will work.
1913 From the viewpoint of routing cache they are broken,
1914 because we are not allowed to build multicast path
1915 with loopback source addr (look, routing cache
1916 cannot know, that ttl is zero, so that packet
1917 will not leave this host and route is valid).
1918 Luckily, this hack is good workaround.
1919 */
1920
813b3b5d 1921 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
1922 goto make_route;
1923 }
a210d01a 1924
813b3b5d 1925 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 1926 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 1927 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 1928 goto out;
a210d01a 1929 }
1da177e4
LT
1930 }
1931
1932
813b3b5d
DM
1933 if (fl4->flowi4_oif) {
1934 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 1935 rth = ERR_PTR(-ENODEV);
1da177e4
LT
1936 if (dev_out == NULL)
1937 goto out;
e5ed6399
HX
1938
1939 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 1940 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 1941 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
1942 goto out;
1943 }
813b3b5d
DM
1944 if (ipv4_is_local_multicast(fl4->daddr) ||
1945 ipv4_is_lbcast(fl4->daddr)) {
1946 if (!fl4->saddr)
1947 fl4->saddr = inet_select_addr(dev_out, 0,
1948 RT_SCOPE_LINK);
1da177e4
LT
1949 goto make_route;
1950 }
813b3b5d
DM
1951 if (fl4->saddr) {
1952 if (ipv4_is_multicast(fl4->daddr))
1953 fl4->saddr = inet_select_addr(dev_out, 0,
1954 fl4->flowi4_scope);
1955 else if (!fl4->daddr)
1956 fl4->saddr = inet_select_addr(dev_out, 0,
1957 RT_SCOPE_HOST);
1da177e4
LT
1958 }
1959 }
1960
813b3b5d
DM
1961 if (!fl4->daddr) {
1962 fl4->daddr = fl4->saddr;
1963 if (!fl4->daddr)
1964 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 1965 dev_out = net->loopback_dev;
1fb9489b 1966 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1da177e4
LT
1967 res.type = RTN_LOCAL;
1968 flags |= RTCF_LOCAL;
1969 goto make_route;
1970 }
1971
813b3b5d 1972 if (fib_lookup(net, fl4, &res)) {
1da177e4 1973 res.fi = NULL;
8b96d22d 1974 res.table = NULL;
813b3b5d 1975 if (fl4->flowi4_oif) {
1da177e4
LT
1976 /* Apparently, routing tables are wrong. Assume,
1977 that the destination is on link.
1978
1979 WHY? DW.
1980 Because we are allowed to send to iface
1981 even if it has NO routes and NO assigned
1982 addresses. When oif is specified, routing
1983 tables are looked up with only one purpose:
1984 to catch if destination is gatewayed, rather than
1985 direct. Moreover, if MSG_DONTROUTE is set,
1986 we send packet, ignoring both routing tables
1987 and ifaddr state. --ANK
1988
1989
1990 We could make it even if oif is unknown,
1991 likely IPv6, but we do not.
1992 */
1993
813b3b5d
DM
1994 if (fl4->saddr == 0)
1995 fl4->saddr = inet_select_addr(dev_out, 0,
1996 RT_SCOPE_LINK);
1da177e4
LT
1997 res.type = RTN_UNICAST;
1998 goto make_route;
1999 }
b23dd4fe 2000 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2001 goto out;
2002 }
1da177e4
LT
2003
2004 if (res.type == RTN_LOCAL) {
813b3b5d 2005 if (!fl4->saddr) {
9fc3bbb4 2006 if (res.fi->fib_prefsrc)
813b3b5d 2007 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2008 else
813b3b5d 2009 fl4->saddr = fl4->daddr;
9fc3bbb4 2010 }
b40afd0e 2011 dev_out = net->loopback_dev;
813b3b5d 2012 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2013 flags |= RTCF_LOCAL;
2014 goto make_route;
2015 }
2016
2017#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2018 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2019 fib_select_multipath(&res);
1da177e4
LT
2020 else
2021#endif
21d8c49e
DM
2022 if (!res.prefixlen &&
2023 res.table->tb_num_default > 1 &&
813b3b5d 2024 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2025 fib_select_default(&res);
1da177e4 2026
813b3b5d
DM
2027 if (!fl4->saddr)
2028 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2029
1da177e4 2030 dev_out = FIB_RES_DEV(res);
813b3b5d 2031 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2032
2033
2034make_route:
1a00fee4 2035 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
1da177e4 2036
010c2708
DM
2037out:
2038 rcu_read_unlock();
b23dd4fe 2039 return rth;
1da177e4 2040}
d8c97a94
ACM
2041EXPORT_SYMBOL_GPL(__ip_route_output_key);
2042
/* dst_ops->check for blackhole routes: always return NULL, i.e. the
 * cached entry is never considered valid for reuse.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2047
ebb762f2 2048static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2049{
618f9bc7
SK
2050 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2051
2052 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2053}
2054
/* dst_ops->update_pmtu for blackhole routes: PMTU reports are
 * deliberately ignored.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2059
/* dst_ops->redirect for blackhole routes: ICMP redirects are
 * deliberately ignored.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2064
/* dst_ops->cow_metrics for blackhole routes: never hand out a
 * writable metrics block.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2070
/* dst_ops used by routes built in ipv4_blackhole_route(): validity
 * checks always fail, metrics are never COWed, and PMTU/redirect
 * events are no-ops, while mtu/advmss reporting and neighbour lookup
 * still behave like a normal IPv4 route.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
2082
2774c131 2083struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2084{
2774c131 2085 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2086 struct rtable *rt;
14e50e57 2087
f5b0a874 2088 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
14e50e57 2089 if (rt) {
d8d1f30b 2090 struct dst_entry *new = &rt->dst;
14e50e57 2091
14e50e57 2092 new->__use = 1;
352e512c
HX
2093 new->input = dst_discard;
2094 new->output = dst_discard;
14e50e57 2095
d8d1f30b 2096 new->dev = ort->dst.dev;
14e50e57
DM
2097 if (new->dev)
2098 dev_hold(new->dev);
2099
9917e1e8 2100 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2101 rt->rt_iif = ort->rt_iif;
5943634f 2102 rt->rt_pmtu = ort->rt_pmtu;
14e50e57 2103
e84f84f2 2104 rt->rt_genid = rt_genid(net);
14e50e57
DM
2105 rt->rt_flags = ort->rt_flags;
2106 rt->rt_type = ort->rt_type;
14e50e57 2107 rt->rt_gateway = ort->rt_gateway;
14e50e57 2108
caacf05e
DM
2109 INIT_LIST_HEAD(&rt->rt_uncached);
2110
14e50e57
DM
2111 dst_free(new);
2112 }
2113
2774c131
DM
2114 dst_release(dst_orig);
2115
2116 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2117}
2118
9d6ec938 2119struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2120 struct sock *sk)
1da177e4 2121{
9d6ec938 2122 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2123
b23dd4fe
DM
2124 if (IS_ERR(rt))
2125 return rt;
1da177e4 2126
56157872 2127 if (flp4->flowi4_proto)
9d6ec938
DM
2128 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2129 flowi4_to_flowi(flp4),
2130 sk, 0);
1da177e4 2131
b23dd4fe 2132 return rt;
1da177e4 2133}
d8c97a94
ACM
2134EXPORT_SYMBOL_GPL(ip_route_output_flow);
2135
f1ce3062 2136static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
15e47304 2137 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
f1ce3062 2138 u32 seq, int event, int nowait, unsigned int flags)
1da177e4 2139{
511c3f92 2140 struct rtable *rt = skb_rtable(skb);
1da177e4 2141 struct rtmsg *r;
be403ea1 2142 struct nlmsghdr *nlh;
2bc8ca40 2143 unsigned long expires = 0;
f185071d 2144 u32 error;
521f5490 2145 u32 metrics[RTAX_MAX];
be403ea1 2146
15e47304 2147 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
be403ea1 2148 if (nlh == NULL)
26932566 2149 return -EMSGSIZE;
be403ea1
TG
2150
2151 r = nlmsg_data(nlh);
1da177e4
LT
2152 r->rtm_family = AF_INET;
2153 r->rtm_dst_len = 32;
2154 r->rtm_src_len = 0;
d6c0a4f6 2155 r->rtm_tos = fl4->flowi4_tos;
1da177e4 2156 r->rtm_table = RT_TABLE_MAIN;
f3756b79
DM
2157 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2158 goto nla_put_failure;
1da177e4
LT
2159 r->rtm_type = rt->rt_type;
2160 r->rtm_scope = RT_SCOPE_UNIVERSE;
2161 r->rtm_protocol = RTPROT_UNSPEC;
2162 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2163 if (rt->rt_flags & RTCF_NOTIFY)
2164 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2165
f1ce3062 2166 if (nla_put_be32(skb, RTA_DST, dst))
f3756b79 2167 goto nla_put_failure;
1a00fee4 2168 if (src) {
1da177e4 2169 r->rtm_src_len = 32;
1a00fee4 2170 if (nla_put_be32(skb, RTA_SRC, src))
f3756b79 2171 goto nla_put_failure;
1da177e4 2172 }
f3756b79
DM
2173 if (rt->dst.dev &&
2174 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2175 goto nla_put_failure;
c7066f70 2176#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2177 if (rt->dst.tclassid &&
2178 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2179 goto nla_put_failure;
1da177e4 2180#endif
41347dcd 2181 if (!rt_is_input_route(rt) &&
d6c0a4f6
DM
2182 fl4->saddr != src) {
2183 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2184 goto nla_put_failure;
2185 }
f8126f1d 2186 if (rt->rt_gateway &&
f3756b79
DM
2187 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2188 goto nla_put_failure;
be403ea1 2189
521f5490
JA
2190 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2191 if (rt->rt_pmtu)
2192 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2193 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2194 goto nla_put_failure;
2195
b4869889
DM
2196 if (fl4->flowi4_mark &&
2197 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2198 goto nla_put_failure;
963bfeee 2199
d8d1f30b 2200 error = rt->dst.error;
5943634f
DM
2201 expires = rt->dst.expires;
2202 if (expires) {
2203 if (time_before(jiffies, expires))
2204 expires -= jiffies;
2205 else
2206 expires = 0;
1da177e4 2207 }
be403ea1 2208
c7537967 2209 if (rt_is_input_route(rt)) {
f1ce3062
DM
2210 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2211 goto nla_put_failure;
1da177e4
LT
2212 }
2213
f185071d 2214 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2215 goto nla_put_failure;
be403ea1
TG
2216
2217 return nlmsg_end(skb, nlh);
1da177e4 2218
be403ea1 2219nla_put_failure:
26932566
PM
2220 nlmsg_cancel(skb, nlh);
2221 return -EMSGSIZE;
1da177e4
LT
2222}
2223
5e73ea1a 2224static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
1da177e4 2225{
3b1e0a65 2226 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2227 struct rtmsg *rtm;
2228 struct nlattr *tb[RTA_MAX+1];
1da177e4 2229 struct rtable *rt = NULL;
d6c0a4f6 2230 struct flowi4 fl4;
9e12bb22
AV
2231 __be32 dst = 0;
2232 __be32 src = 0;
2233 u32 iif;
d889ce3b 2234 int err;
963bfeee 2235 int mark;
1da177e4
LT
2236 struct sk_buff *skb;
2237
d889ce3b
TG
2238 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2239 if (err < 0)
2240 goto errout;
2241
2242 rtm = nlmsg_data(nlh);
2243
1da177e4 2244 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2245 if (skb == NULL) {
2246 err = -ENOBUFS;
2247 goto errout;
2248 }
1da177e4
LT
2249
2250 /* Reserve room for dummy headers, this skb can pass
2251 through good chunk of routing engine.
2252 */
459a98ed 2253 skb_reset_mac_header(skb);
c1d2bbe1 2254 skb_reset_network_header(skb);
d2c962b8
SH
2255
2256 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2257 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2258 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2259
17fb2c64
AV
2260 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2261 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2262 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2263 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4 2264
d6c0a4f6
DM
2265 memset(&fl4, 0, sizeof(fl4));
2266 fl4.daddr = dst;
2267 fl4.saddr = src;
2268 fl4.flowi4_tos = rtm->rtm_tos;
2269 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2270 fl4.flowi4_mark = mark;
2271
1da177e4 2272 if (iif) {
d889ce3b
TG
2273 struct net_device *dev;
2274
1937504d 2275 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2276 if (dev == NULL) {
2277 err = -ENODEV;
2278 goto errout_free;
2279 }
2280
1da177e4
LT
2281 skb->protocol = htons(ETH_P_IP);
2282 skb->dev = dev;
963bfeee 2283 skb->mark = mark;
1da177e4
LT
2284 local_bh_disable();
2285 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2286 local_bh_enable();
d889ce3b 2287
511c3f92 2288 rt = skb_rtable(skb);
d8d1f30b
CG
2289 if (err == 0 && rt->dst.error)
2290 err = -rt->dst.error;
1da177e4 2291 } else {
9d6ec938 2292 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
2293
2294 err = 0;
2295 if (IS_ERR(rt))
2296 err = PTR_ERR(rt);
1da177e4 2297 }
d889ce3b 2298
1da177e4 2299 if (err)
d889ce3b 2300 goto errout_free;
1da177e4 2301
d8d1f30b 2302 skb_dst_set(skb, &rt->dst);
1da177e4
LT
2303 if (rtm->rtm_flags & RTM_F_NOTIFY)
2304 rt->rt_flags |= RTCF_NOTIFY;
2305
f1ce3062 2306 err = rt_fill_info(net, dst, src, &fl4, skb,
15e47304 2307 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
1937504d 2308 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2309 if (err <= 0)
2310 goto errout_free;
1da177e4 2311
15e47304 2312 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2313errout:
2942e900 2314 return err;
1da177e4 2315
d889ce3b 2316errout_free:
1da177e4 2317 kfree_skb(skb);
d889ce3b 2318 goto errout;
1da177e4
LT
2319}
2320
/* RTM_GETROUTE dump callback: emits no entries; returning skb->len
 * unchanged tells the netlink dump machinery we are done.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	return skb->len;
}
2325
/* A device's multicast configuration changed: flush the cached routes
 * of the namespace owning that device.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2330
2331#ifdef CONFIG_SYSCTL
81c684d1 2332static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 2333 void __user *buffer,
1da177e4
LT
2334 size_t *lenp, loff_t *ppos)
2335{
2336 if (write) {
4ccfe6d4 2337 rt_cache_flush((struct net *)__ctl->extra1);
1da177e4 2338 return 0;
e905a9ed 2339 }
1da177e4
LT
2340
2341 return -EINVAL;
2342}
2343
/* Global routing tunables exported under /proc/sys/net/ipv4/route/,
 * registered once for init_net by ip_static_sysctl_init().  Interval
 * and timeout values are stored in jiffies and converted by their
 * proc handlers.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 2454
/* Per-namespace, write-only trigger: writing net/ipv4/route/flush
 * invalidates that namespace's cached routes.  ->extra1 (the owning
 * struct net) is filled in by sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
2464
/* Per-netns sysctl setup: register net/ipv4/route/flush for @net.
 * Non-init namespaces get their own kmemdup'd copy of the table so
 * each copy can carry its own ->extra1 back-pointer to the namespace.
 *
 * Returns 0 on success, -ENOMEM on allocation or registration failure.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	/* Only free the duplicate, never the shared static template. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
2488
/* Per-netns sysctl teardown: unregister the header and free the table
 * copy made by sysctl_route_net_init().  The shared static template
 * must never reach kfree() — init_net is never torn down, hence the
 * BUG_ON.
 */
static void __net_exit sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
2498
/* Hook the per-namespace "flush" sysctl into netns lifetime. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
2503#endif
2504
3ee94372 2505static __net_init int rt_genid_init(struct net *net)
9f5e97e5 2506{
b42664f8 2507 atomic_set(&net->rt_genid, 0);
436c3b66
DM
2508 get_random_bytes(&net->ipv4.dev_addr_genid,
2509 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
2510 return 0;
2511}
2512
/* Genid state needs no teardown, so no .exit handler. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2516
c3426b47
DM
2517static int __net_init ipv4_inetpeer_init(struct net *net)
2518{
2519 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2520
2521 if (!bp)
2522 return -ENOMEM;
2523 inet_peer_base_init(bp);
2524 net->ipv4.peers = bp;
2525 return 0;
2526}
2527
/* Per-netns teardown: unpublish the inet_peer base, invalidate any
 * remaining peer entries, then free it.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
2536
/* Tie the IPv4 inetpeer base to netns creation/destruction. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 2541
c7066f70 2542#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 2543struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 2544#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 2545
1da177e4
LT
/* One-time boot initialization of the IPv4 routing subsystem:
 * per-CPU classid accounting (when configured), the dst slab cache
 * and entry counters, devinet/fib init, procfs files, xfrm hooks,
 * the RTM_GETROUTE netlink handler and the per-netns subsystems.
 *
 * Returns 0; unrecoverable allocation failures panic instead.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the regular rtable slab. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Effectively disable dst garbage-collection limits by default;
	 * both remain tunable via the route sysctls above.
	 */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
2589
a1bc6eb4 2590#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * Registers the global route tunables (ipv4_route_table) for init_net
 * under net/ipv4/route.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
a1bc6eb4 2599#endif