ipv4: reset rt_iif for recirculated mcast/bcast out pkts
[linux-2.6-block.git] / net / ipv4 / route.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
02c30a84 9 * Authors: Ross Biro
1da177e4
LT
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
e905a9ed 22 * Alan Cox : Super /proc >4K
1da177e4
LT
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
e905a9ed 40 *
1da177e4
LT
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
59 */
60
afd46503
JP
61#define pr_fmt(fmt) "IPv4: " fmt
62
1da177e4 63#include <linux/module.h>
7c0f6ba6 64#include <linux/uaccess.h>
1da177e4
LT
65#include <linux/bitops.h>
66#include <linux/types.h>
67#include <linux/kernel.h>
1da177e4
LT
68#include <linux/mm.h>
69#include <linux/string.h>
70#include <linux/socket.h>
71#include <linux/sockios.h>
72#include <linux/errno.h>
73#include <linux/in.h>
74#include <linux/inet.h>
75#include <linux/netdevice.h>
76#include <linux/proc_fs.h>
77#include <linux/init.h>
78#include <linux/skbuff.h>
1da177e4
LT
79#include <linux/inetdevice.h>
80#include <linux/igmp.h>
81#include <linux/pkt_sched.h>
82#include <linux/mroute.h>
83#include <linux/netfilter_ipv4.h>
84#include <linux/random.h>
1da177e4
LT
85#include <linux/rcupdate.h>
86#include <linux/times.h>
5a0e3ad6 87#include <linux/slab.h>
73f156a6 88#include <linux/jhash.h>
352e512c 89#include <net/dst.h>
1b7179d3 90#include <net/dst_metadata.h>
457c4cbc 91#include <net/net_namespace.h>
1da177e4
LT
92#include <net/protocol.h>
93#include <net/ip.h>
94#include <net/route.h>
95#include <net/inetpeer.h>
96#include <net/sock.h>
97#include <net/ip_fib.h>
98#include <net/arp.h>
99#include <net/tcp.h>
100#include <net/icmp.h>
101#include <net/xfrm.h>
571e7226 102#include <net/lwtunnel.h>
8d71740c 103#include <net/netevent.h>
63f3444f 104#include <net/rtnetlink.h>
1da177e4
LT
105#ifdef CONFIG_SYSCTL
106#include <linux/sysctl.h>
107#endif
6e5714ea 108#include <net/secure_seq.h>
1b7179d3 109#include <net/ip_tunnels.h>
385add90 110#include <net/l3mdev.h>
1da177e4 111
b6179813
RP
112#include "fib_lookup.h"
113
68a5e3dd 114#define RT_FL_TOS(oldflp4) \
f61759e6 115 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4 116
1da177e4
LT
117#define RT_GC_TIMEOUT (300*HZ)
118
1da177e4 119static int ip_rt_max_size;
817bc4db
SH
120static int ip_rt_redirect_number __read_mostly = 9;
121static int ip_rt_redirect_load __read_mostly = HZ / 50;
122static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
123static int ip_rt_error_cost __read_mostly = HZ;
124static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db 125static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
c7272c2f 126static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
817bc4db 127static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 128
deed49df 129static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
c7272c2f 130
1da177e4
LT
131/*
132 * Interface to generic destination cache.
133 */
134
135static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 136static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 137static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
138static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
139static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
140static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
141 struct sk_buff *skb, u32 mtu);
142static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb);
caacf05e 144static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 145
62fa8a84
DM
146static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
147{
31248731
DM
148 WARN_ON(1);
149 return NULL;
62fa8a84
DM
150}
151
f894cbf8
DM
152static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
153 struct sk_buff *skb,
154 const void *daddr);
63fca65d 155static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 156
1da177e4
LT
157static struct dst_ops ipv4_dst_ops = {
158 .family = AF_INET,
1da177e4 159 .check = ipv4_dst_check,
0dbaee3b 160 .default_advmss = ipv4_default_advmss,
ebb762f2 161 .mtu = ipv4_mtu,
62fa8a84 162 .cow_metrics = ipv4_cow_metrics,
caacf05e 163 .destroy = ipv4_dst_destroy,
1da177e4
LT
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
e47a185b 167 .redirect = ip_do_redirect,
b92dacd4 168 .local_out = __ip_local_out,
d3aaeb38 169 .neigh_lookup = ipv4_neigh_lookup,
63fca65d 170 .confirm_neigh = ipv4_confirm_neigh,
1da177e4
LT
171};
172
173#define ECN_OR_COST(class) TC_PRIO_##class
174
4839c52b 175const __u8 ip_tos2prio[16] = {
1da177e4 176 TC_PRIO_BESTEFFORT,
4a2b9c37 177 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
178 TC_PRIO_BESTEFFORT,
179 ECN_OR_COST(BESTEFFORT),
180 TC_PRIO_BULK,
181 ECN_OR_COST(BULK),
182 TC_PRIO_BULK,
183 ECN_OR_COST(BULK),
184 TC_PRIO_INTERACTIVE,
185 ECN_OR_COST(INTERACTIVE),
186 TC_PRIO_INTERACTIVE,
187 ECN_OR_COST(INTERACTIVE),
188 TC_PRIO_INTERACTIVE_BULK,
189 ECN_OR_COST(INTERACTIVE_BULK),
190 TC_PRIO_INTERACTIVE_BULK,
191 ECN_OR_COST(INTERACTIVE_BULK)
192};
d4a96865 193EXPORT_SYMBOL(ip_tos2prio);
1da177e4 194
2f970d83 195static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 196#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 197
1da177e4 198#ifdef CONFIG_PROC_FS
1da177e4
LT
199static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
200{
29e75252 201 if (*pos)
89aef892 202 return NULL;
29e75252 203 return SEQ_START_TOKEN;
1da177e4
LT
204}
205
206static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
207{
1da177e4 208 ++*pos;
89aef892 209 return NULL;
1da177e4
LT
210}
211
212static void rt_cache_seq_stop(struct seq_file *seq, void *v)
213{
1da177e4
LT
214}
215
216static int rt_cache_seq_show(struct seq_file *seq, void *v)
217{
218 if (v == SEQ_START_TOKEN)
219 seq_printf(seq, "%-127s\n",
220 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
222 "HHUptod\tSpecDst");
e905a9ed 223 return 0;
1da177e4
LT
224}
225
f690808e 226static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
227 .start = rt_cache_seq_start,
228 .next = rt_cache_seq_next,
229 .stop = rt_cache_seq_stop,
230 .show = rt_cache_seq_show,
231};
232
233static int rt_cache_seq_open(struct inode *inode, struct file *file)
234{
89aef892 235 return seq_open(file, &rt_cache_seq_ops);
1da177e4
LT
236}
237
9a32144e 238static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
239 .open = rt_cache_seq_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
89aef892 242 .release = seq_release,
1da177e4
LT
243};
244
245
246static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
247{
248 int cpu;
249
250 if (*pos == 0)
251 return SEQ_START_TOKEN;
252
0f23174a 253 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
254 if (!cpu_possible(cpu))
255 continue;
256 *pos = cpu+1;
2f970d83 257 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
258 }
259 return NULL;
260}
261
262static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
263{
264 int cpu;
265
0f23174a 266 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
267 if (!cpu_possible(cpu))
268 continue;
269 *pos = cpu+1;
2f970d83 270 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
271 }
272 return NULL;
e905a9ed 273
1da177e4
LT
274}
275
276static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
277{
278
279}
280
281static int rt_cpu_seq_show(struct seq_file *seq, void *v)
282{
283 struct rt_cache_stat *st = v;
284
285 if (v == SEQ_START_TOKEN) {
5bec0039 286 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
287 return 0;
288 }
e905a9ed 289
1da177e4
LT
290 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
291 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 292 dst_entries_get_slow(&ipv4_dst_ops),
0baf2b35 293 0, /* st->in_hit */
1da177e4
LT
294 st->in_slow_tot,
295 st->in_slow_mc,
296 st->in_no_route,
297 st->in_brd,
298 st->in_martian_dst,
299 st->in_martian_src,
300
0baf2b35 301 0, /* st->out_hit */
1da177e4 302 st->out_slow_tot,
e905a9ed 303 st->out_slow_mc,
1da177e4 304
0baf2b35
ED
305 0, /* st->gc_total */
306 0, /* st->gc_ignored */
307 0, /* st->gc_goal_miss */
308 0, /* st->gc_dst_overflow */
309 0, /* st->in_hlist_search */
310 0 /* st->out_hlist_search */
1da177e4
LT
311 );
312 return 0;
313}
314
f690808e 315static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
316 .start = rt_cpu_seq_start,
317 .next = rt_cpu_seq_next,
318 .stop = rt_cpu_seq_stop,
319 .show = rt_cpu_seq_show,
320};
321
322
323static int rt_cpu_seq_open(struct inode *inode, struct file *file)
324{
325 return seq_open(file, &rt_cpu_seq_ops);
326}
327
9a32144e 328static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
329 .open = rt_cpu_seq_open,
330 .read = seq_read,
331 .llseek = seq_lseek,
332 .release = seq_release,
333};
334
c7066f70 335#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 336static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 337{
a661c419
AD
338 struct ip_rt_acct *dst, *src;
339 unsigned int i, j;
340
341 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
342 if (!dst)
343 return -ENOMEM;
344
345 for_each_possible_cpu(i) {
346 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
347 for (j = 0; j < 256; j++) {
348 dst[j].o_bytes += src[j].o_bytes;
349 dst[j].o_packets += src[j].o_packets;
350 dst[j].i_bytes += src[j].i_bytes;
351 dst[j].i_packets += src[j].i_packets;
352 }
78c686e9
PE
353 }
354
a661c419
AD
355 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
356 kfree(dst);
357 return 0;
358}
78c686e9 359#endif
107f1634 360
73b38711 361static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
362{
363 struct proc_dir_entry *pde;
364
d6444062 365 pde = proc_create("rt_cache", 0444, net->proc_net,
d4beaa66 366 &rt_cache_seq_fops);
107f1634
PE
367 if (!pde)
368 goto err1;
369
d6444062 370 pde = proc_create("rt_cache", 0444,
77020720 371 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
372 if (!pde)
373 goto err2;
374
c7066f70 375#ifdef CONFIG_IP_ROUTE_CLASSID
3f3942ac
CH
376 pde = proc_create_single("rt_acct", 0, net->proc_net,
377 rt_acct_proc_show);
107f1634
PE
378 if (!pde)
379 goto err3;
380#endif
381 return 0;
382
c7066f70 383#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
384err3:
385 remove_proc_entry("rt_cache", net->proc_net_stat);
386#endif
387err2:
388 remove_proc_entry("rt_cache", net->proc_net);
389err1:
390 return -ENOMEM;
391}
73b38711
DL
392
393static void __net_exit ip_rt_do_proc_exit(struct net *net)
394{
395 remove_proc_entry("rt_cache", net->proc_net_stat);
396 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 397#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 398 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 399#endif
73b38711
DL
400}
401
402static struct pernet_operations ip_rt_proc_ops __net_initdata = {
403 .init = ip_rt_do_proc_init,
404 .exit = ip_rt_do_proc_exit,
405};
406
407static int __init ip_rt_proc_init(void)
408{
409 return register_pernet_subsys(&ip_rt_proc_ops);
410}
411
107f1634 412#else
73b38711 413static inline int ip_rt_proc_init(void)
107f1634
PE
414{
415 return 0;
416}
1da177e4 417#endif /* CONFIG_PROC_FS */
e905a9ed 418
4331debc 419static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 420{
ca4c3fc2 421 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
422}
423
4ccfe6d4 424void rt_cache_flush(struct net *net)
1da177e4 425{
ca4c3fc2 426 rt_genid_bump_ipv4(net);
98376387
ED
427}
428
f894cbf8
DM
429static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
430 struct sk_buff *skb,
431 const void *daddr)
3769cffb 432{
1550c171 433 const struct rtable *rt = container_of(dst, struct rtable, dst);
d3aaeb38 434 struct net_device *dev = dst->dev;
3769cffb
DM
435 struct neighbour *n;
436
5c9f7c1d
DA
437 rcu_read_lock_bh();
438
439 if (likely(rt->rt_gw_family == AF_INET)) {
440 n = ip_neigh_gw4(dev, rt->rt_gw4);
441 } else if (rt->rt_gw_family == AF_INET6) {
442 n = ip_neigh_gw6(dev, &rt->rt_gw6);
443 } else {
444 __be32 pkey;
445
446 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
447 n = ip_neigh_gw4(dev, pkey);
448 }
449
450 if (n && !refcount_inc_not_zero(&n->refcnt))
451 n = NULL;
452
453 rcu_read_unlock_bh();
454
455 return n;
d3aaeb38
DM
456}
457
63fca65d
JA
458static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
459{
1550c171 460 const struct rtable *rt = container_of(dst, struct rtable, dst);
63fca65d
JA
461 struct net_device *dev = dst->dev;
462 const __be32 *pkey = daddr;
63fca65d 463
6de9c055 464 if (rt->rt_gw_family == AF_INET) {
1550c171 465 pkey = (const __be32 *)&rt->rt_gw4;
6de9c055
DA
466 } else if (rt->rt_gw_family == AF_INET6) {
467 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
468 } else if (!daddr ||
63fca65d 469 (rt->rt_flags &
6de9c055 470 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
63fca65d 471 return;
6de9c055 472 }
63fca65d
JA
473 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
474}
475
04ca6973 476#define IP_IDENTS_SZ 2048u
04ca6973 477
355b590c
ED
478static atomic_t *ip_idents __read_mostly;
479static u32 *ip_tstamps __read_mostly;
04ca6973
ED
480
481/* In order to protect privacy, we add a perturbation to identifiers
482 * if one generator is seldom used. This makes hard for an attacker
483 * to infer how many packets were sent between two points in time.
484 */
485u32 ip_idents_reserve(u32 hash, int segs)
486{
355b590c
ED
487 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
488 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
6aa7de05 489 u32 old = READ_ONCE(*p_tstamp);
04ca6973 490 u32 now = (u32)jiffies;
adb03115 491 u32 new, delta = 0;
04ca6973 492
355b590c 493 if (old != now && cmpxchg(p_tstamp, old, now) == old)
04ca6973
ED
494 delta = prandom_u32_max(now - old);
495
adb03115
ED
496 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
497 do {
498 old = (u32)atomic_read(p_id);
499 new = old + delta + segs;
500 } while (atomic_cmpxchg(p_id, old, new) != old);
501
502 return new - segs;
04ca6973
ED
503}
504EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 505
b6a7719a 506void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
1da177e4 507{
73f156a6 508 u32 hash, id;
1da177e4 509
df453700
ED
510 /* Note the following code is not safe, but this is okay. */
511 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
512 get_random_bytes(&net->ipv4.ip_id_key,
513 sizeof(net->ipv4.ip_id_key));
1da177e4 514
df453700 515 hash = siphash_3u32((__force u32)iph->daddr,
04ca6973 516 (__force u32)iph->saddr,
df453700
ED
517 iph->protocol,
518 &net->ipv4.ip_id_key);
73f156a6
ED
519 id = ip_idents_reserve(hash, segs);
520 iph->id = htons(id);
1da177e4 521}
4bc2f18b 522EXPORT_SYMBOL(__ip_select_ident);
1da177e4 523
e2d118a1
LC
524static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
525 const struct sock *sk,
4895c771
DM
526 const struct iphdr *iph,
527 int oif, u8 tos,
528 u8 prot, u32 mark, int flow_flags)
529{
530 if (sk) {
531 const struct inet_sock *inet = inet_sk(sk);
532
533 oif = sk->sk_bound_dev_if;
534 mark = sk->sk_mark;
535 tos = RT_CONN_FLAGS(sk);
536 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
537 }
538 flowi4_init_output(fl4, oif, mark, tos,
539 RT_SCOPE_UNIVERSE, prot,
540 flow_flags,
e2d118a1
LC
541 iph->daddr, iph->saddr, 0, 0,
542 sock_net_uid(net, sk));
4895c771
DM
543}
544
5abf7f7e
ED
545static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
546 const struct sock *sk)
4895c771 547{
d109e61b 548 const struct net *net = dev_net(skb->dev);
4895c771
DM
549 const struct iphdr *iph = ip_hdr(skb);
550 int oif = skb->dev->ifindex;
551 u8 tos = RT_TOS(iph->tos);
552 u8 prot = iph->protocol;
553 u32 mark = skb->mark;
554
d109e61b 555 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
4895c771
DM
556}
557
5abf7f7e 558static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
559{
560 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 561 const struct ip_options_rcu *inet_opt;
4895c771
DM
562 __be32 daddr = inet->inet_daddr;
563
564 rcu_read_lock();
565 inet_opt = rcu_dereference(inet->inet_opt);
566 if (inet_opt && inet_opt->opt.srr)
567 daddr = inet_opt->opt.faddr;
568 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
569 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
570 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
571 inet_sk_flowi_flags(sk),
e2d118a1 572 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
4895c771
DM
573 rcu_read_unlock();
574}
575
5abf7f7e
ED
/* Build a flow key from the skb when one is available, otherwise from
 * the socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
584
c5038a83 585static DEFINE_SPINLOCK(fnhe_lock);
4895c771 586
2ffae99d
TT
587static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
588{
589 struct rtable *rt;
590
591 rt = rcu_dereference(fnhe->fnhe_rth_input);
592 if (rt) {
593 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
95c47f9c 594 dst_dev_put(&rt->dst);
0830106c 595 dst_release(&rt->dst);
2ffae99d
TT
596 }
597 rt = rcu_dereference(fnhe->fnhe_rth_output);
598 if (rt) {
599 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
95c47f9c 600 dst_dev_put(&rt->dst);
0830106c 601 dst_release(&rt->dst);
2ffae99d
TT
602 }
603}
604
aee06da6 605static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
606{
607 struct fib_nh_exception *fnhe, *oldest;
608
609 oldest = rcu_dereference(hash->chain);
610 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
611 fnhe = rcu_dereference(fnhe->fnhe_next)) {
612 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
613 oldest = fnhe;
614 }
2ffae99d 615 fnhe_flush_routes(oldest);
4895c771
DM
616 return oldest;
617}
618
d3a25c98
DM
619static inline u32 fnhe_hashfun(__be32 daddr)
620{
d546c621 621 static u32 fnhe_hashrnd __read_mostly;
d3a25c98
DM
622 u32 hval;
623
d546c621
ED
624 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
625 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
626 return hash_32(hval, FNHE_HASH_SHIFT);
d3a25c98
DM
627}
628
387aa65a
TT
629static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
630{
631 rt->rt_pmtu = fnhe->fnhe_pmtu;
d52e5a7e 632 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
387aa65a
TT
633 rt->dst.expires = fnhe->fnhe_expires;
634
635 if (fnhe->fnhe_gw) {
636 rt->rt_flags |= RTCF_REDIRECTED;
1550c171
DA
637 rt->rt_gw_family = AF_INET;
638 rt->rt_gw4 = fnhe->fnhe_gw;
387aa65a
TT
639 }
640}
641
a5995e71
DA
642static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
643 __be32 gw, u32 pmtu, bool lock,
644 unsigned long expires)
4895c771 645{
aee06da6 646 struct fnhe_hash_bucket *hash;
4895c771 647 struct fib_nh_exception *fnhe;
387aa65a 648 struct rtable *rt;
cebe84c6 649 u32 genid, hval;
387aa65a 650 unsigned int i;
4895c771 651 int depth;
cebe84c6 652
a5995e71 653 genid = fnhe_genid(dev_net(nhc->nhc_dev));
cebe84c6 654 hval = fnhe_hashfun(daddr);
aee06da6 655
c5038a83 656 spin_lock_bh(&fnhe_lock);
4895c771 657
a5995e71 658 hash = rcu_dereference(nhc->nhc_exceptions);
4895c771 659 if (!hash) {
6396bb22 660 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
4895c771 661 if (!hash)
aee06da6 662 goto out_unlock;
a5995e71 663 rcu_assign_pointer(nhc->nhc_exceptions, hash);
4895c771
DM
664 }
665
4895c771
DM
666 hash += hval;
667
668 depth = 0;
669 for (fnhe = rcu_dereference(hash->chain); fnhe;
670 fnhe = rcu_dereference(fnhe->fnhe_next)) {
671 if (fnhe->fnhe_daddr == daddr)
aee06da6 672 break;
4895c771
DM
673 depth++;
674 }
675
aee06da6 676 if (fnhe) {
cebe84c6
XL
677 if (fnhe->fnhe_genid != genid)
678 fnhe->fnhe_genid = genid;
aee06da6
JA
679 if (gw)
680 fnhe->fnhe_gw = gw;
d52e5a7e 681 if (pmtu) {
aee06da6 682 fnhe->fnhe_pmtu = pmtu;
d52e5a7e
SD
683 fnhe->fnhe_mtu_locked = lock;
684 }
e39d5246 685 fnhe->fnhe_expires = max(1UL, expires);
387aa65a 686 /* Update all cached dsts too */
2ffae99d
TT
687 rt = rcu_dereference(fnhe->fnhe_rth_input);
688 if (rt)
689 fill_route_from_fnhe(rt, fnhe);
690 rt = rcu_dereference(fnhe->fnhe_rth_output);
387aa65a
TT
691 if (rt)
692 fill_route_from_fnhe(rt, fnhe);
aee06da6
JA
693 } else {
694 if (depth > FNHE_RECLAIM_DEPTH)
695 fnhe = fnhe_oldest(hash);
696 else {
697 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
698 if (!fnhe)
699 goto out_unlock;
700
701 fnhe->fnhe_next = hash->chain;
702 rcu_assign_pointer(hash->chain, fnhe);
703 }
cebe84c6 704 fnhe->fnhe_genid = genid;
aee06da6
JA
705 fnhe->fnhe_daddr = daddr;
706 fnhe->fnhe_gw = gw;
707 fnhe->fnhe_pmtu = pmtu;
d52e5a7e 708 fnhe->fnhe_mtu_locked = lock;
94720e3a 709 fnhe->fnhe_expires = max(1UL, expires);
387aa65a
TT
710
711 /* Exception created; mark the cached routes for the nexthop
712 * stale, so anyone caching it rechecks if this exception
713 * applies to them.
714 */
0f457a36 715 rt = rcu_dereference(nhc->nhc_rth_input);
2ffae99d
TT
716 if (rt)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718
387aa65a
TT
719 for_each_possible_cpu(i) {
720 struct rtable __rcu **prt;
0f457a36 721 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
387aa65a
TT
722 rt = rcu_dereference(*prt);
723 if (rt)
724 rt->dst.obsolete = DST_OBSOLETE_KILL;
725 }
4895c771 726 }
4895c771 727
4895c771 728 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
729
730out_unlock:
c5038a83 731 spin_unlock_bh(&fnhe_lock);
4895c771
DM
732}
733
ceb33206
DM
734static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
735 bool kill_route)
1da177e4 736{
e47a185b 737 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 738 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 739 struct net_device *dev = skb->dev;
e47a185b 740 struct in_device *in_dev;
4895c771 741 struct fib_result res;
e47a185b 742 struct neighbour *n;
317805b8 743 struct net *net;
1da177e4 744
94206125
DM
745 switch (icmp_hdr(skb)->code & 7) {
746 case ICMP_REDIR_NET:
747 case ICMP_REDIR_NETTOS:
748 case ICMP_REDIR_HOST:
749 case ICMP_REDIR_HOSTTOS:
750 break;
751
752 default:
753 return;
754 }
755
1550c171 756 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
e47a185b
DM
757 return;
758
759 in_dev = __in_dev_get_rcu(dev);
760 if (!in_dev)
761 return;
762
c346dca1 763 net = dev_net(dev);
9d4fb27d
JP
764 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
765 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
766 ipv4_is_zeronet(new_gw))
1da177e4
LT
767 goto reject_redirect;
768
769 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
770 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
771 goto reject_redirect;
772 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
773 goto reject_redirect;
774 } else {
317805b8 775 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
776 goto reject_redirect;
777 }
778
969447f2
SSL
779 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
780 if (!n)
781 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
2c1a4311 782 if (!IS_ERR(n)) {
e47a185b
DM
783 if (!(n->nud_state & NUD_VALID)) {
784 neigh_event_send(n, NULL);
785 } else {
0eeb075f 786 if (fib_lookup(net, fl4, &res, 0) == 0) {
eba618ab 787 struct fib_nh_common *nhc = FIB_RES_NHC(res);
4895c771 788
a5995e71 789 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
d52e5a7e
SD
790 0, false,
791 jiffies + ip_rt_gc_timeout);
4895c771 792 }
ceb33206
DM
793 if (kill_route)
794 rt->dst.obsolete = DST_OBSOLETE_KILL;
e47a185b
DM
795 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
796 }
797 neigh_release(n);
798 }
799 return;
800
801reject_redirect:
802#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
803 if (IN_DEV_LOG_MARTIANS(in_dev)) {
804 const struct iphdr *iph = (const struct iphdr *) skb->data;
805 __be32 daddr = iph->daddr;
806 __be32 saddr = iph->saddr;
807
e47a185b
DM
808 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
809 " Advised path = %pI4 -> %pI4\n",
810 &old_gw, dev->name, &new_gw,
811 &saddr, &daddr);
99ee038d 812 }
e47a185b
DM
813#endif
814 ;
815}
816
4895c771
DM
817static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
818{
819 struct rtable *rt;
820 struct flowi4 fl4;
f96ef988 821 const struct iphdr *iph = (const struct iphdr *) skb->data;
7d995694 822 struct net *net = dev_net(skb->dev);
f96ef988
MK
823 int oif = skb->dev->ifindex;
824 u8 tos = RT_TOS(iph->tos);
825 u8 prot = iph->protocol;
826 u32 mark = skb->mark;
4895c771
DM
827
828 rt = (struct rtable *) dst;
829
7d995694 830 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
ceb33206 831 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
832}
833
1da177e4
LT
834static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
835{
ee6b9673 836 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
837 struct dst_entry *ret = dst;
838
839 if (rt) {
d11a4dc1 840 if (dst->obsolete > 0) {
1da177e4
LT
841 ip_rt_put(rt);
842 ret = NULL;
5943634f
DM
843 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
844 rt->dst.expires) {
89aef892 845 ip_rt_put(rt);
1da177e4
LT
846 ret = NULL;
847 }
848 }
849 return ret;
850}
851
852/*
853 * Algorithm:
854 * 1. The first ip_rt_redirect_number redirects are sent
855 * with exponential backoff, then we stop sending them at all,
856 * assuming that the host ignores our redirects.
857 * 2. If we did not see packets requiring redirects
858 * during ip_rt_redirect_silence, we assume that the host
859 * forgot redirected route and start to send redirects again.
860 *
861 * This algorithm is much cheaper and more intelligent than dumb load limiting
862 * in icmp.c.
863 *
864 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
865 * and "frag. need" (breaks PMTU discovery) in icmp.c.
866 */
867
868void ip_rt_send_redirect(struct sk_buff *skb)
869{
511c3f92 870 struct rtable *rt = skb_rtable(skb);
30038fc6 871 struct in_device *in_dev;
92d86829 872 struct inet_peer *peer;
1d861aa4 873 struct net *net;
30038fc6 874 int log_martians;
192132b9 875 int vif;
1da177e4 876
30038fc6 877 rcu_read_lock();
d8d1f30b 878 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
879 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
880 rcu_read_unlock();
1da177e4 881 return;
30038fc6
ED
882 }
883 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
385add90 884 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
30038fc6 885 rcu_read_unlock();
1da177e4 886
1d861aa4 887 net = dev_net(rt->dst.dev);
192132b9 888 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
92d86829 889 if (!peer) {
e81da0e1
JA
890 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
891 rt_nexthop(rt, ip_hdr(skb)->daddr));
92d86829
DM
892 return;
893 }
894
1da177e4
LT
895 /* No redirected packets during ip_rt_redirect_silence;
896 * reset the algorithm.
897 */
c09551c6 898 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
92d86829 899 peer->rate_tokens = 0;
c09551c6
LB
900 peer->n_redirects = 0;
901 }
1da177e4
LT
902
903 /* Too many ignored redirects; do not send anything
d8d1f30b 904 * set dst.rate_last to the last seen redirected packet.
1da177e4 905 */
c09551c6 906 if (peer->n_redirects >= ip_rt_redirect_number) {
92d86829 907 peer->rate_last = jiffies;
1d861aa4 908 goto out_put_peer;
1da177e4
LT
909 }
910
911 /* Check for load limit; set rate_last to the latest sent
912 * redirect.
913 */
92d86829 914 if (peer->rate_tokens == 0 ||
14fb8a76 915 time_after(jiffies,
92d86829
DM
916 (peer->rate_last +
917 (ip_rt_redirect_load << peer->rate_tokens)))) {
e81da0e1
JA
918 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
919
920 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
92d86829
DM
921 peer->rate_last = jiffies;
922 ++peer->rate_tokens;
c09551c6 923 ++peer->n_redirects;
1da177e4 924#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 925 if (log_martians &&
e87cc472
JP
926 peer->rate_tokens == ip_rt_redirect_number)
927 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
92101b3b 928 &ip_hdr(skb)->saddr, inet_iif(skb),
e81da0e1 929 &ip_hdr(skb)->daddr, &gw);
1da177e4
LT
930#endif
931 }
1d861aa4
DM
932out_put_peer:
933 inet_putpeer(peer);
1da177e4
LT
934}
935
/* dst.input handler for error routes: convert the route's dst.error into
 * an ICMP_DEST_UNREACH (rate limited per source via the inet_peer cache),
 * bump the relevant SNMP counters, and drop the packet.
 * Always consumes @skb; always returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* For L3 master (VRF) devices, resolve the real ingress device
	 * recorded in the control block.
	 */
	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* Not forwarding: only account the error, never send ICMP. */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	/* Map the route error to an ICMP code. */
	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		/* Token-bucket rate limit: tokens accumulate with elapsed
		 * jiffies, capped at ip_rt_error_burst; each ICMP costs
		 * ip_rt_error_cost tokens.
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1da177e4 1011
/* Record a learned path MTU for the flow @fl4 on route @rt as a nexthop
 * exception.  MTUs below ip_rt_min_pmtu are clamped and the exception is
 * marked locked so later probes cannot lower it further.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	/* An administratively locked MTU must never be changed. */
	if (ip_mtu_locked(dst))
		return;

	/* Only accept reductions of the current path MTU. */
	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	/* Skip the FIB lookup when nothing changes and the existing
	 * exception still has at least half its lifetime left.
	 */
	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);

		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
1043
4895c771
DM
1044static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045 struct sk_buff *skb, u32 mtu)
1046{
1047 struct rtable *rt = (struct rtable *) dst;
1048 struct flowi4 fl4;
1049
1050 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1051 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1052}
1053
/* Update the path MTU towards the destination of the packet in @skb
 * (used by tunnel drivers and ICMP handling).  Builds a flow from the
 * packet's IP header, resolves the output route and records @mtu on it.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1071
/* Socket-context PMTU update helper: like ipv4_update_pmtu() but builds
 * the flow key from @sk, falling back to the skb's reply mark when the
 * socket carries no fwmark.
 */
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
9cb3a50c
SK
1089
/* Apply a learned path MTU to a connected socket's cached route.
 *
 * Fast path: if the socket is owned by user context (or has no cached
 * dst) just do the routing-table update.  Otherwise operate on the
 * socket's cached dst directly, re-resolving the route when the cached
 * one is obsolete, and install any freshly resolved route back into the
 * socket.  Runs under bh_lock_sock() to serialize against the socket.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;		/* true once rt holds a new route we must manage */
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);		/* takes a reference; released at out */

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		/* Cached route is stale — resolve a fresh one. */
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* Update the PMTU on the path dst (unwraps any xfrm bundle). */
	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		/* The update invalidated the route — resolve again. */
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* sk_dst_set() consumes the reference on the new route. */
	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1143
/* Process an ICMP redirect for the destination of the packet in @skb:
 * resolve the matching output route and update its next hop.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);
1160
/* Socket variant of ipv4_redirect(): the flow key is derived from @sk
 * instead of explicit oif/protocol arguments.
 */
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1176
efbc368d
DM
/* dst_ops->check callback: returns @dst if it is still valid, NULL if
 * the caller must re-resolve the route.  @cookie is unused for IPv4.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
1193
/* Send an ICMP_DEST_UNREACH/ICMP_HOST_UNREACH back to the sender of
 * @skb, after validating the IPv4 header and recompiling its options
 * (the IPCB option state may have been clobbered by earlier layers).
 * Silently gives up on malformed headers or uncompilable options.
 */
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		/* Header carries options; make them linear and parse them. */
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}
1221
1222static void ipv4_link_failure(struct sk_buff *skb)
1223{
1224 struct rtable *rt;
1225
1226 ipv4_send_dest_unreach(skb);
1da177e4 1227
511c3f92 1228 rt = skb_rtable(skb);
5943634f
DM
1229 if (rt)
1230 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1231}
1232
ede2059d 1233static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1234{
91df42be
JP
1235 pr_debug("%s: %pI4 -> %pI4, %s\n",
1236 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1237 skb->dev ? skb->dev->name : "?");
1da177e4 1238 kfree_skb(skb);
c378a9c0 1239 WARN_ON(1);
1da177e4
LT
1240 return 0;
1241}
1242
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options, so it
 * is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned in IP options!
 */
1251
/* Copy into @addr the 4-byte source address this host would use on the
 * route @rt for the packet @skb (consumed by IP RR/TS/SRR option
 * processing).  @addr may be unaligned, hence the memcpy.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		/* Input route: ask the FIB which source address would be
		 * preferred for the reverse flow.
		 */
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			/* No FIB entry: pick any universe-scope address on
			 * the output device towards the next hop.
			 */
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1281
c7066f70 1282#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1283static void set_class_tag(struct rtable *rt, u32 tag)
1284{
d8d1f30b
CG
1285 if (!(rt->dst.tclassid & 0xFFFF))
1286 rt->dst.tclassid |= tag & 0xFFFF;
1287 if (!(rt->dst.tclassid & 0xFFFF0000))
1288 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1289}
1290#endif
1291
0dbaee3b
DM
1292static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293{
7ed14d97 1294 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
164a5e7a 1295 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
7ed14d97 1296 ip_rt_min_advmss);
0dbaee3b 1297
7ed14d97 1298 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1299}
1300
/* dst_ops->mtu callback.  Preference order: unexpired learned PMTU,
 * then the RTAX_MTU metric, then the device MTU (clamped to 576 for
 * locked gatewayed routes), capped at IP_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* Ignore a learned PMTU once it has expired. */
	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_gw_family && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
1323
/* Unlink and free the nexthop exception for @daddr on @nhc, if one
 * exists.  Takes fnhe_lock; the entry itself is freed via RCU after its
 * cached routes are flushed.
 */
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	/* Walk the bucket chain keeping a pointer to the link so the
	 * matching entry can be spliced out in place.
	 */
	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
1357
a5995e71
DA
/* Look up the nexthop exception for @daddr on @nhc.  Returns the live
 * entry, or NULL when none exists; an entry whose expiry has passed is
 * deleted on the spot and NULL is returned.  Caller must hold the RCU
 * read lock.
 */
static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			/* Expired entries are garbage-collected lazily here. */
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
aee06da6 1383
50d889b1
DA
1384/* MTU selection:
1385 * 1. mtu on route is locked - use it
1386 * 2. mtu from nexthop exception
1387 * 3. mtu from egress device
1388 */
1389
/* Compute the MTU towards @daddr from a FIB lookup result, less any
 * lwtunnel encapsulation headroom.  Selection order is documented in
 * the comment block above this function.
 */
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	/* Route-locked MTU (or forwarding configured to honor PMTU). */
	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		/* Unexpired per-destination PMTU exception, if any. */
		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
1414
/* Bind route @rt to nexthop exception @fnhe: copy the exception's
 * gw/PMTU state into the route and, when @do_cache, publish @rt as the
 * exception's cached input or output route.  Returns true iff the route
 * was cached.  Serialized by fnhe_lock.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	/* fnhe_daddr may have been zeroed by ip_del_fnhe() racing with
	 * us, so re-check under the lock.
	 */
	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		/* Stale generation: wipe the exception's learned state. */
		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			/* Take a reference for the cache slot, then drop
			 * the one held by the route we displace.
			 */
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1464
/* Try to install @rt as the cached route of @nhc — the shared input
 * slot for input routes, the per-CPU output slot otherwise.  Returns
 * true on success; false if another CPU won the cmpxchg race.
 */
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		/* We replaced orig — release its cache reference. */
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		/* Lost the race — undo our hold. */
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
1494
5055c371
ED
/* Per-CPU list of routes that are not held in any nexthop cache; they
 * are tracked here so rt_flush_dev() can re-home them when their device
 * goes away.
 */
struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e 1501
/* Add @rt to this CPU's uncached-routes list and remember which list it
 * joined (the route may later be removed from a different CPU).
 */
void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
1512
/* Remove @rt from the uncached list it was added to, if any.  Safe to
 * call for routes that were never added (empty list node).
 */
void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
1523
510c321b
XL
/* dst_ops->destroy callback: drop the metrics reference and take the
 * route off the uncached list before the dst memory is freed.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}
1531
caacf05e
DM
/* Called when @dev is going away: walk every CPU's uncached-routes list
 * and re-point any route still referencing @dev at the netns loopback
 * device, transferring the device reference accordingly.
 */
void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
1552
4331debc 1553static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1554{
4331debc
ED
1555 return rt &&
1556 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557 !rt_is_expired(rt);
d2d68ba9
DM
1558}
1559
/* Finish constructing route @rt from FIB result @res: copy gateway,
 * metrics, classid and lwtunnel state from the nexthop, then either
 * cache the route (in @fnhe or the nexthop slots) or put it on the
 * uncached list so it stays reachable for device teardown.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		{
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rt->dst.tclassid = nh->nh_tclassid;
		}
#endif
		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nhc, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1617
9ab179d8
DA
/* Allocate and minimally initialize an IPv4 rtable dst for @dev.
 * Output defaults to ip_output; RTCF_LOCAL routes get ip_local_deliver
 * as input.  Returns NULL on allocation failure.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1649
5b18f128
SS
/* Allocate a new rtable on @dev that copies the routing state of @rt
 * (flags, type, iif, PMTU, gateway, dst handlers, lwtunnel state).
 * Returns NULL on allocation failure.
 */
struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
	struct rtable *new_rt;

	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			   rt->dst.flags);

	if (new_rt) {
		/* Genid comes from the new device's netns, not the old rt. */
		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		new_rt->rt_flags = rt->rt_flags;
		new_rt->rt_type = rt->rt_type;
		new_rt->rt_is_input = rt->rt_is_input;
		new_rt->rt_iif = rt->rt_iif;
		new_rt->rt_pmtu = rt->rt_pmtu;
		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
		new_rt->rt_gw_family = rt->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			new_rt->rt_gw4 = rt->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			new_rt->rt_gw6 = rt->rt_gw6;
		INIT_LIST_HEAD(&new_rt->rt_uncached);

		new_rt->dst.flags |= DST_HOST;
		new_rt->dst.input = rt->dst.input;
		new_rt->dst.output = rt->dst.output;
		new_rt->dst.error = rt->dst.error;
		new_rt->dst.lastuse = jiffies;
		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
	}
	return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);
1682
96d36220 1683/* called in rcu_read_lock() section */
bc044e8d
PA
/* Validate the source address of a received multicast packet.
 * Returns 0 if acceptable (filling *@itag from the FIB source check),
 * -EINVAL for martian sources, or a negative error from
 * fib_validate_source().  Caller holds the RCU read lock.
 */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	/* Source must not be multicast, limited broadcast, or non-IP. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are only allowed for IGMP or
		 * link-local multicast destinations.
		 */
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}
1713
/* called in rcu_read_lock() section */
/* Build and attach an input route for a received multicast packet.
 * @our selects local delivery (RTCF_LOCAL) in addition to RTCF_MULTICAST.
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input= 1;

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups go through the multicast router. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}
1751
1752
/* Account (and, with CONFIG_IP_ROUTE_VERBOSE, rate-limited-log) a packet
 * whose source address failed validation.  The MAC header is dumped as
 * the only forensic hint available.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation, if source is martian,
		 * the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}
1777
47360228 1778/* called in rcu_read_lock() section */
/* Build (or reuse a cached) forwarding route for a received packet that
 * matched FIB result @res, validate its source, decide whether an ICMP
 * redirect should be suggested, and attach the route to @skb.
 * Returns 0 on success, negative errno otherwise.
 * Called in an rcu_read_lock() section.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	/* err > 0 here means the source is directly reachable on the
	 * (same) interface — a classic redirect situation.
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Reuse a still-valid cached route (exception slot preferred). */
	fnhe = find_exception(nhc, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1da177e4 1869
79a13159 1870#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1871/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1872 * calculated from the inner IP addresses.
79a13159 1873 */
bf4e0a3d
NA
/* Fill @hash_keys with the L3 addresses to hash for multipath routing.
 * For ICMP error messages carrying an embedded IP header, hash the
 * inner (original flow) addresses so errors follow the same path as
 * the flow they refer to; otherwise use the outer header.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	/* Non-first fragments have no ICMP header to look at. */
	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMP error types embed the original datagram. */
	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
79a13159 1912
/* if skb is set it will be used and fl4 can be NULL */
/* Compute the 31-bit multipath hash for a flow, per the netns policy:
 * policy 0 hashes L3 addresses only, policy 1 hashes the L4 5-tuple.
 * An ICMP-derived multipath hash carried in @fl4 is mixed in at the end.
 * NOTE(review): the switch has no default arm; hash_keys/mhash rely on
 * the sysctl restricting the policy to the handled values — confirm if
 * new policies are added.
 */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		/* L3 only: src/dst addresses (inner ones for ICMP errors). */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
79a13159
PN
1973#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1974
5969f71d
SH
1975static int ip_mkroute_input(struct sk_buff *skb,
1976 struct fib_result *res,
5969f71d 1977 struct in_device *in_dev,
e37b1e97
RP
1978 __be32 daddr, __be32 saddr, u32 tos,
1979 struct flow_keys *hkeys)
1da177e4 1980{
1da177e4 1981#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1982 if (res->fi && res->fi->fib_nhs > 1) {
7efc0b6b 1983 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
0e884c78 1984
0e884c78
PN
1985 fib_select_multipath(res, h);
1986 }
1da177e4
LT
1987#endif
1988
1989 /* create a routing cache entry */
c6cffba4 1990 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1991}
1992
1da177e4
LT
1993/*
1994 * NOTE. We drop all the packets that has local source
1995 * addresses, because every properly looped back packet
1996 * must have correct destination already attached by output routine.
1997 *
1998 * Such approach solves two big problems:
1999 * 1. Not simplex devices are handled properly.
2000 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 2001 * called with rcu_read_lock()
1da177e4
LT
2002 */
2003
9e12bb22 2004static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
2005 u8 tos, struct net_device *dev,
2006 struct fib_result *res)
1da177e4 2007{
96d36220 2008 struct in_device *in_dev = __in_dev_get_rcu(dev);
e37b1e97
RP
2009 struct flow_keys *flkeys = NULL, _flkeys;
2010 struct net *net = dev_net(dev);
1b7179d3 2011 struct ip_tunnel_info *tun_info;
e37b1e97 2012 int err = -EINVAL;
95c96174 2013 unsigned int flags = 0;
1da177e4 2014 u32 itag = 0;
95c96174 2015 struct rtable *rth;
e37b1e97 2016 struct flowi4 fl4;
0a90478b 2017 bool do_cache = true;
1da177e4
LT
2018
2019 /* IP on this device is disabled. */
2020
2021 if (!in_dev)
2022 goto out;
2023
2024 /* Check for the most weird martians, which can be not detected
2025 by fib_lookup.
2026 */
2027
61adedf3 2028 tun_info = skb_tunnel_info(skb);
46fa062a 2029 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
2030 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2031 else
2032 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
2033 skb_dst_drop(skb);
2034
d0daebc3 2035 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
2036 goto martian_source;
2037
5510cdf7
DA
2038 res->fi = NULL;
2039 res->table = NULL;
27a954bd 2040 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2041 goto brd_input;
2042
2043 /* Accept zero addresses only to limited broadcast;
2044 * I even do not know to fix it or not. Waiting for complains :-)
2045 */
f97c1e0c 2046 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2047 goto martian_source;
2048
d0daebc3 2049 if (ipv4_is_zeronet(daddr))
1da177e4
LT
2050 goto martian_destination;
2051
9eb43e76
ED
2052 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2053 * and call it once if daddr or/and saddr are loopback addresses
2054 */
2055 if (ipv4_is_loopback(daddr)) {
2056 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 2057 goto martian_destination;
9eb43e76
ED
2058 } else if (ipv4_is_loopback(saddr)) {
2059 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
2060 goto martian_source;
2061 }
2062
1da177e4
LT
2063 /*
2064 * Now we are ready to route packet.
2065 */
68a5e3dd 2066 fl4.flowi4_oif = 0;
e0d56fdd 2067 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
2068 fl4.flowi4_mark = skb->mark;
2069 fl4.flowi4_tos = tos;
2070 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 2071 fl4.flowi4_flags = 0;
68a5e3dd
DM
2072 fl4.daddr = daddr;
2073 fl4.saddr = saddr;
8bcfd092 2074 fl4.flowi4_uid = sock_net_uid(net, NULL);
e37b1e97 2075
5a847a6e 2076 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
e37b1e97 2077 flkeys = &_flkeys;
5a847a6e
DA
2078 } else {
2079 fl4.flowi4_proto = 0;
2080 fl4.fl4_sport = 0;
2081 fl4.fl4_dport = 0;
2082 }
e37b1e97 2083
5510cdf7 2084 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
2085 if (err != 0) {
2086 if (!IN_DEV_FORWARD(in_dev))
2087 err = -EHOSTUNREACH;
1da177e4 2088 goto no_route;
cd0f0b95 2089 }
1da177e4 2090
5cbf777c
XL
2091 if (res->type == RTN_BROADCAST) {
2092 if (IN_DEV_BFORWARD(in_dev))
2093 goto make_route;
0a90478b
XL
2094 /* not do cache if bc_forwarding is enabled */
2095 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2096 do_cache = false;
1da177e4 2097 goto brd_input;
5cbf777c 2098 }
1da177e4 2099
5510cdf7 2100 if (res->type == RTN_LOCAL) {
5c04c819 2101 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 2102 0, dev, in_dev, &itag);
b5f7e755 2103 if (err < 0)
0d753960 2104 goto martian_source;
1da177e4
LT
2105 goto local_input;
2106 }
2107
cd0f0b95
DJ
2108 if (!IN_DEV_FORWARD(in_dev)) {
2109 err = -EHOSTUNREACH;
251da413 2110 goto no_route;
cd0f0b95 2111 }
5510cdf7 2112 if (res->type != RTN_UNICAST)
1da177e4
LT
2113 goto martian_destination;
2114
5cbf777c 2115make_route:
e37b1e97 2116 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1da177e4
LT
2117out: return err;
2118
2119brd_input:
2120 if (skb->protocol != htons(ETH_P_IP))
2121 goto e_inval;
2122
41347dcd 2123 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2124 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2125 in_dev, &itag);
1da177e4 2126 if (err < 0)
0d753960 2127 goto martian_source;
1da177e4
LT
2128 }
2129 flags |= RTCF_BROADCAST;
5510cdf7 2130 res->type = RTN_BROADCAST;
1da177e4
LT
2131 RT_CACHE_STAT_INC(in_brd);
2132
2133local_input:
0a90478b
XL
2134 do_cache &= res->fi && !itag;
2135 if (do_cache) {
2136 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
eba618ab 2137
0a90478b
XL
2138 rth = rcu_dereference(nhc->nhc_rth_input);
2139 if (rt_cache_valid(rth)) {
2140 skb_dst_set_noref(skb, &rth->dst);
2141 err = 0;
2142 goto out;
d2d68ba9
DM
2143 }
2144 }
2145
f5a0aab8 2146 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2147 flags | RTCF_LOCAL, res->type,
d2d68ba9 2148 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2149 if (!rth)
2150 goto e_nobufs;
2151
d8d1f30b 2152 rth->dst.output= ip_rt_bug;
cf911662
DM
2153#ifdef CONFIG_IP_ROUTE_CLASSID
2154 rth->dst.tclassid = itag;
2155#endif
9917e1e8 2156 rth->rt_is_input = 1;
571e7226 2157
a6254864 2158 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2159 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2160 rth->dst.input= ip_error;
2161 rth->dst.error= -err;
1da177e4
LT
2162 rth->rt_flags &= ~RTCF_LOCAL;
2163 }
efd85700 2164
dcdfdf56 2165 if (do_cache) {
eba618ab 2166 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
efd85700 2167
eba618ab 2168 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
efd85700
TG
2169 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2170 WARN_ON(rth->dst.input == lwtunnel_input);
2171 rth->dst.lwtstate->orig_input = rth->dst.input;
2172 rth->dst.input = lwtunnel_input;
2173 }
2174
87063a1f 2175 if (unlikely(!rt_cache_route(nhc, rth)))
dcdfdf56 2176 rt_add_uncached_list(rth);
dcdfdf56 2177 }
89aef892 2178 skb_dst_set(skb, &rth->dst);
b23dd4fe 2179 err = 0;
ebc0ffae 2180 goto out;
1da177e4
LT
2181
2182no_route:
2183 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2184 res->type = RTN_UNREACHABLE;
2185 res->fi = NULL;
2186 res->table = NULL;
1da177e4
LT
2187 goto local_input;
2188
2189 /*
2190 * Do not cache martian addresses: they should be logged (RFC1812)
2191 */
2192martian_destination:
2193 RT_CACHE_STAT_INC(in_martian_dst);
2194#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2195 if (IN_DEV_LOG_MARTIANS(in_dev))
2196 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2197 &daddr, &saddr, dev->name);
1da177e4 2198#endif
2c2910a4 2199
1da177e4
LT
2200e_inval:
2201 err = -EINVAL;
ebc0ffae 2202 goto out;
1da177e4
LT
2203
2204e_nobufs:
2205 err = -ENOBUFS;
ebc0ffae 2206 goto out;
1da177e4
LT
2207
2208martian_source:
2209 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2210 goto out;
1da177e4
LT
2211}
2212
c6cffba4
DM
2213int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2214 u8 tos, struct net_device *dev)
1da177e4 2215{
5510cdf7
DA
2216 struct fib_result res;
2217 int err;
1da177e4 2218
6e28099d 2219 tos &= IPTOS_RT_MASK;
96d36220 2220 rcu_read_lock();
5510cdf7
DA
2221 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2222 rcu_read_unlock();
96d36220 2223
5510cdf7
DA
2224 return err;
2225}
2226EXPORT_SYMBOL(ip_route_input_noref);
2227
2228/* called with rcu_read_lock held */
2229int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2230 u8 tos, struct net_device *dev, struct fib_result *res)
2231{
1da177e4
LT
2232 /* Multicast recognition logic is moved from route cache to here.
2233 The problem was that too many Ethernet cards have broken/missing
2234 hardware multicast filters :-( As result the host on multicasting
2235 network acquires a lot of useless route cache entries, sort of
2236 SDR messages from all the world. Now we try to get rid of them.
2237 Really, provided software IP multicast filter is organized
2238 reasonably (at least, hashed), it does not result in a slowdown
2239 comparing with route cache reject entries.
2240 Note, that multicast routers are not affected, because
2241 route cache entry is created eventually.
2242 */
f97c1e0c 2243 if (ipv4_is_multicast(daddr)) {
96d36220 2244 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2245 int our = 0;
5510cdf7 2246 int err = -EINVAL;
1da177e4 2247
22c74764
PA
2248 if (!in_dev)
2249 return err;
2250 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2251 ip_hdr(skb)->protocol);
e58e4159
DA
2252
2253 /* check l3 master if no match yet */
22c74764 2254 if (!our && netif_is_l3_slave(dev)) {
e58e4159
DA
2255 struct in_device *l3_in_dev;
2256
2257 l3_in_dev = __in_dev_get_rcu(skb->dev);
2258 if (l3_in_dev)
2259 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2260 ip_hdr(skb)->protocol);
2261 }
2262
e58e4159 2263 if (our
1da177e4 2264#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2265 ||
2266 (!ipv4_is_local_multicast(daddr) &&
2267 IN_DEV_MFORWARD(in_dev))
1da177e4 2268#endif
e58e4159 2269 ) {
5510cdf7 2270 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2271 tos, dev, our);
1da177e4 2272 }
5510cdf7 2273 return err;
1da177e4 2274 }
5510cdf7
DA
2275
2276 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2277}
2278
ebc0ffae 2279/* called with rcu_read_lock() */
982721f3 2280static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2281 const struct flowi4 *fl4, int orig_oif,
f61759e6 2282 struct net_device *dev_out,
5ada5527 2283 unsigned int flags)
1da177e4 2284{
982721f3 2285 struct fib_info *fi = res->fi;
f2bb4bed 2286 struct fib_nh_exception *fnhe;
5ada5527 2287 struct in_device *in_dev;
982721f3 2288 u16 type = res->type;
5ada5527 2289 struct rtable *rth;
c92b9655 2290 bool do_cache;
1da177e4 2291
d0daebc3
TG
2292 in_dev = __in_dev_get_rcu(dev_out);
2293 if (!in_dev)
5ada5527 2294 return ERR_PTR(-EINVAL);
1da177e4 2295
d0daebc3 2296 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2297 if (ipv4_is_loopback(fl4->saddr) &&
2298 !(dev_out->flags & IFF_LOOPBACK) &&
2299 !netif_is_l3_master(dev_out))
d0daebc3
TG
2300 return ERR_PTR(-EINVAL);
2301
68a5e3dd 2302 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2303 type = RTN_BROADCAST;
68a5e3dd 2304 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2305 type = RTN_MULTICAST;
68a5e3dd 2306 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2307 return ERR_PTR(-EINVAL);
1da177e4
LT
2308
2309 if (dev_out->flags & IFF_LOOPBACK)
2310 flags |= RTCF_LOCAL;
2311
63617421 2312 do_cache = true;
982721f3 2313 if (type == RTN_BROADCAST) {
1da177e4 2314 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2315 fi = NULL;
2316 } else if (type == RTN_MULTICAST) {
dd28d1a0 2317 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2318 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2319 fl4->flowi4_proto))
1da177e4 2320 flags &= ~RTCF_LOCAL;
63617421
JA
2321 else
2322 do_cache = false;
1da177e4 2323 /* If multicast route do not exist use
dd28d1a0
ED
2324 * default one, but do not gateway in this case.
2325 * Yes, it is hack.
1da177e4 2326 */
982721f3
DM
2327 if (fi && res->prefixlen < 4)
2328 fi = NULL;
d6d5e999
CF
2329 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2330 (orig_oif != dev_out->ifindex)) {
2331 /* For local routes that require a particular output interface
2332 * we do not want to cache the result. Caching the result
2333 * causes incorrect behaviour when there are multiple source
2334 * addresses on the interface, the end result being that if the
2335 * intended recipient is waiting on that interface for the
2336 * packet he won't receive it because it will be delivered on
2337 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2338 * be set to the loopback interface as well.
2339 */
94720e3a 2340 do_cache = false;
1da177e4
LT
2341 }
2342
f2bb4bed 2343 fnhe = NULL;
63617421 2344 do_cache &= fi != NULL;
94720e3a 2345 if (fi) {
eba618ab 2346 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
c5038a83 2347 struct rtable __rcu **prth;
d26b3a7c 2348
a5995e71 2349 fnhe = find_exception(nhc, fl4->daddr);
94720e3a
JA
2350 if (!do_cache)
2351 goto add;
deed49df 2352 if (fnhe) {
2ffae99d 2353 prth = &fnhe->fnhe_rth_output;
94720e3a
JA
2354 } else {
2355 if (unlikely(fl4->flowi4_flags &
2356 FLOWI_FLAG_KNOWN_NH &&
bdf00467 2357 !(nhc->nhc_gw_family &&
eba618ab 2358 nhc->nhc_scope == RT_SCOPE_LINK))) {
94720e3a
JA
2359 do_cache = false;
2360 goto add;
c92b9655 2361 }
0f457a36 2362 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
c92b9655 2363 }
c5038a83 2364 rth = rcu_dereference(*prth);
9df16efa 2365 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2366 return rth;
f2bb4bed 2367 }
c92b9655
JA
2368
2369add:
d08c4f35 2370 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2371 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2372 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2373 do_cache);
8391d07b 2374 if (!rth)
5ada5527 2375 return ERR_PTR(-ENOBUFS);
8391d07b 2376
9438c871 2377 rth->rt_iif = orig_oif;
b7503e0c 2378
1da177e4
LT
2379 RT_CACHE_STAT_INC(out_slow_tot);
2380
1da177e4 2381 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2382 if (flags & RTCF_LOCAL &&
1da177e4 2383 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2384 rth->dst.output = ip_mc_output;
1da177e4
LT
2385 RT_CACHE_STAT_INC(out_slow_mc);
2386 }
2387#ifdef CONFIG_IP_MROUTE
982721f3 2388 if (type == RTN_MULTICAST) {
1da177e4 2389 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2390 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2391 rth->dst.input = ip_mr_input;
2392 rth->dst.output = ip_mc_output;
1da177e4
LT
2393 }
2394 }
2395#endif
2396 }
2397
a4c2fd7f 2398 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
9942895b 2399 lwtunnel_set_redirect(&rth->dst);
1da177e4 2400
5ada5527 2401 return rth;
1da177e4
LT
2402}
2403
1da177e4
LT
2404/*
2405 * Major route resolver routine.
2406 */
2407
3abd1ade
DA
2408struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2409 const struct sk_buff *skb)
1da177e4 2410{
f61759e6 2411 __u8 tos = RT_FL_TOS(fl4);
d0ea2b12
ED
2412 struct fib_result res = {
2413 .type = RTN_UNSPEC,
2414 .fi = NULL,
2415 .table = NULL,
2416 .tclassid = 0,
2417 };
5ada5527 2418 struct rtable *rth;
1da177e4 2419
1fb9489b 2420 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2421 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2422 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2423 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2424
010c2708 2425 rcu_read_lock();
3abd1ade
DA
2426 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2427 rcu_read_unlock();
2428
2429 return rth;
2430}
2431EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2432
2433struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2434 struct fib_result *res,
2435 const struct sk_buff *skb)
2436{
2437 struct net_device *dev_out = NULL;
2438 int orig_oif = fl4->flowi4_oif;
2439 unsigned int flags = 0;
2440 struct rtable *rth;
2441 int err = -ENETUNREACH;
2442
813b3b5d 2443 if (fl4->saddr) {
b23dd4fe 2444 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2445 if (ipv4_is_multicast(fl4->saddr) ||
2446 ipv4_is_lbcast(fl4->saddr) ||
2447 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2448 goto out;
2449
1da177e4
LT
2450 /* I removed check for oif == dev_out->oif here.
2451 It was wrong for two reasons:
1ab35276
DL
2452 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2453 is assigned to multiple interfaces.
1da177e4
LT
2454 2. Moreover, we are allowed to send packets with saddr
2455 of another iface. --ANK
2456 */
2457
813b3b5d
DM
2458 if (fl4->flowi4_oif == 0 &&
2459 (ipv4_is_multicast(fl4->daddr) ||
2460 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2461 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2462 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2463 if (!dev_out)
a210d01a
JA
2464 goto out;
2465
1da177e4
LT
2466 /* Special hack: user can direct multicasts
2467 and limited broadcast via necessary interface
2468 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2469 This hack is not just for fun, it allows
2470 vic,vat and friends to work.
2471 They bind socket to loopback, set ttl to zero
2472 and expect that it will work.
2473 From the viewpoint of routing cache they are broken,
2474 because we are not allowed to build multicast path
2475 with loopback source addr (look, routing cache
2476 cannot know, that ttl is zero, so that packet
2477 will not leave this host and route is valid).
2478 Luckily, this hack is good workaround.
2479 */
2480
813b3b5d 2481 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2482 goto make_route;
2483 }
a210d01a 2484
813b3b5d 2485 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2486 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2487 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2488 goto out;
a210d01a 2489 }
1da177e4
LT
2490 }
2491
2492
813b3b5d
DM
2493 if (fl4->flowi4_oif) {
2494 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2495 rth = ERR_PTR(-ENODEV);
51456b29 2496 if (!dev_out)
1da177e4 2497 goto out;
e5ed6399
HX
2498
2499 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2500 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2501 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2502 goto out;
2503 }
813b3b5d 2504 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2505 ipv4_is_lbcast(fl4->daddr) ||
2506 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2507 if (!fl4->saddr)
2508 fl4->saddr = inet_select_addr(dev_out, 0,
2509 RT_SCOPE_LINK);
1da177e4
LT
2510 goto make_route;
2511 }
0a7e2260 2512 if (!fl4->saddr) {
813b3b5d
DM
2513 if (ipv4_is_multicast(fl4->daddr))
2514 fl4->saddr = inet_select_addr(dev_out, 0,
2515 fl4->flowi4_scope);
2516 else if (!fl4->daddr)
2517 fl4->saddr = inet_select_addr(dev_out, 0,
2518 RT_SCOPE_HOST);
1da177e4
LT
2519 }
2520 }
2521
813b3b5d
DM
2522 if (!fl4->daddr) {
2523 fl4->daddr = fl4->saddr;
2524 if (!fl4->daddr)
2525 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2526 dev_out = net->loopback_dev;
1fb9489b 2527 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2528 res->type = RTN_LOCAL;
1da177e4
LT
2529 flags |= RTCF_LOCAL;
2530 goto make_route;
2531 }
2532
3abd1ade 2533 err = fib_lookup(net, fl4, res, 0);
0315e382 2534 if (err) {
3abd1ade
DA
2535 res->fi = NULL;
2536 res->table = NULL;
6104e112 2537 if (fl4->flowi4_oif &&
e58e4159
DA
2538 (ipv4_is_multicast(fl4->daddr) ||
2539 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2540 /* Apparently, routing tables are wrong. Assume,
2541 that the destination is on link.
2542
2543 WHY? DW.
2544 Because we are allowed to send to iface
2545 even if it has NO routes and NO assigned
2546 addresses. When oif is specified, routing
2547 tables are looked up with only one purpose:
2548 to catch if destination is gatewayed, rather than
2549 direct. Moreover, if MSG_DONTROUTE is set,
2550 we send packet, ignoring both routing tables
2551 and ifaddr state. --ANK
2552
2553
2554 We could make it even if oif is unknown,
2555 likely IPv6, but we do not.
2556 */
2557
813b3b5d
DM
2558 if (fl4->saddr == 0)
2559 fl4->saddr = inet_select_addr(dev_out, 0,
2560 RT_SCOPE_LINK);
3abd1ade 2561 res->type = RTN_UNICAST;
1da177e4
LT
2562 goto make_route;
2563 }
0315e382 2564 rth = ERR_PTR(err);
1da177e4
LT
2565 goto out;
2566 }
1da177e4 2567
3abd1ade 2568 if (res->type == RTN_LOCAL) {
813b3b5d 2569 if (!fl4->saddr) {
3abd1ade
DA
2570 if (res->fi->fib_prefsrc)
2571 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2572 else
813b3b5d 2573 fl4->saddr = fl4->daddr;
9fc3bbb4 2574 }
5f02ce24
DA
2575
2576 /* L3 master device is the loopback for that domain */
3abd1ade 2577 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2578 net->loopback_dev;
839da4d9
DA
2579
2580 /* make sure orig_oif points to fib result device even
2581 * though packet rx/tx happens over loopback or l3mdev
2582 */
2583 orig_oif = FIB_RES_OIF(*res);
2584
813b3b5d 2585 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2586 flags |= RTCF_LOCAL;
2587 goto make_route;
2588 }
2589
3abd1ade 2590 fib_select_path(net, res, fl4, skb);
1da177e4 2591
3abd1ade 2592 dev_out = FIB_RES_DEV(*res);
813b3b5d 2593 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2594
2595
2596make_route:
3abd1ade 2597 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2598
010c2708 2599out:
b23dd4fe 2600 return rth;
1da177e4 2601}
d8c97a94 2602
ae2688d5
JW
2603static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2604{
2605 return NULL;
2606}
2607
ebb762f2 2608static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2609{
618f9bc7
SK
2610 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2611
2612 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2613}
2614
6700c270
DM
2615static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2616 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2617{
2618}
2619
6700c270
DM
2620static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2621 struct sk_buff *skb)
b587ee3b
DM
2622{
2623}
2624
0972ddb2
HB
2625static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2626 unsigned long old)
2627{
2628 return NULL;
2629}
2630
14e50e57
DM
2631static struct dst_ops ipv4_dst_blackhole_ops = {
2632 .family = AF_INET,
ae2688d5 2633 .check = ipv4_blackhole_dst_check,
ebb762f2 2634 .mtu = ipv4_blackhole_mtu,
214f45c9 2635 .default_advmss = ipv4_default_advmss,
14e50e57 2636 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2637 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2638 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2639 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2640};
2641
2774c131 2642struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2643{
2774c131 2644 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2645 struct rtable *rt;
14e50e57 2646
6c0e7284 2647 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
14e50e57 2648 if (rt) {
d8d1f30b 2649 struct dst_entry *new = &rt->dst;
14e50e57 2650
14e50e57 2651 new->__use = 1;
352e512c 2652 new->input = dst_discard;
ede2059d 2653 new->output = dst_discard_out;
14e50e57 2654
1dbe3252 2655 new->dev = net->loopback_dev;
14e50e57
DM
2656 if (new->dev)
2657 dev_hold(new->dev);
2658
9917e1e8 2659 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2660 rt->rt_iif = ort->rt_iif;
5943634f 2661 rt->rt_pmtu = ort->rt_pmtu;
d52e5a7e 2662 rt->rt_mtu_locked = ort->rt_mtu_locked;
14e50e57 2663
ca4c3fc2 2664 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2665 rt->rt_flags = ort->rt_flags;
2666 rt->rt_type = ort->rt_type;
1550c171
DA
2667 rt->rt_gw_family = ort->rt_gw_family;
2668 if (rt->rt_gw_family == AF_INET)
2669 rt->rt_gw4 = ort->rt_gw4;
0f5f7d7b
DA
2670 else if (rt->rt_gw_family == AF_INET6)
2671 rt->rt_gw6 = ort->rt_gw6;
14e50e57 2672
caacf05e 2673 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2674 }
2675
2774c131
DM
2676 dst_release(dst_orig);
2677
2678 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2679}
2680
9d6ec938 2681struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2682 const struct sock *sk)
1da177e4 2683{
9d6ec938 2684 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2685
b23dd4fe
DM
2686 if (IS_ERR(rt))
2687 return rt;
1da177e4 2688
56157872 2689 if (flp4->flowi4_proto)
f92ee619
SK
2690 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2691 flowi4_to_flowi(flp4),
2692 sk, 0);
1da177e4 2693
b23dd4fe 2694 return rt;
1da177e4 2695}
d8c97a94
ACM
2696EXPORT_SYMBOL_GPL(ip_route_output_flow);
2697
3765d35e 2698/* called with rcu_read_lock held */
404eb77e
RP
2699static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2700 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2701 struct sk_buff *skb, u32 portid, u32 seq)
1da177e4 2702{
1da177e4 2703 struct rtmsg *r;
be403ea1 2704 struct nlmsghdr *nlh;
2bc8ca40 2705 unsigned long expires = 0;
f185071d 2706 u32 error;
521f5490 2707 u32 metrics[RTAX_MAX];
be403ea1 2708
d3166e0c 2709 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2710 if (!nlh)
26932566 2711 return -EMSGSIZE;
be403ea1
TG
2712
2713 r = nlmsg_data(nlh);
1da177e4
LT
2714 r->rtm_family = AF_INET;
2715 r->rtm_dst_len = 32;
2716 r->rtm_src_len = 0;
d6c0a4f6 2717 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2718 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2719 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2720 goto nla_put_failure;
1da177e4
LT
2721 r->rtm_type = rt->rt_type;
2722 r->rtm_scope = RT_SCOPE_UNIVERSE;
2723 r->rtm_protocol = RTPROT_UNSPEC;
2724 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2725 if (rt->rt_flags & RTCF_NOTIFY)
2726 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2727 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2728 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2729
930345ea 2730 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2731 goto nla_put_failure;
1a00fee4 2732 if (src) {
1da177e4 2733 r->rtm_src_len = 32;
930345ea 2734 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2735 goto nla_put_failure;
1da177e4 2736 }
f3756b79
DM
2737 if (rt->dst.dev &&
2738 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2739 goto nla_put_failure;
c7066f70 2740#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2741 if (rt->dst.tclassid &&
2742 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2743 goto nla_put_failure;
1da177e4 2744#endif
41347dcd 2745 if (!rt_is_input_route(rt) &&
d6c0a4f6 2746 fl4->saddr != src) {
930345ea 2747 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2748 goto nla_put_failure;
2749 }
1550c171 2750 if (rt->rt_gw_family == AF_INET &&
0f5f7d7b 2751 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
f3756b79 2752 goto nla_put_failure;
0f5f7d7b
DA
2753 } else if (rt->rt_gw_family == AF_INET6) {
2754 int alen = sizeof(struct in6_addr);
2755 struct nlattr *nla;
2756 struct rtvia *via;
2757
2758 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2759 if (!nla)
2760 goto nla_put_failure;
2761
2762 via = nla_data(nla);
2763 via->rtvia_family = AF_INET6;
2764 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2765 }
be403ea1 2766
ee9a8f7a
SK
2767 expires = rt->dst.expires;
2768 if (expires) {
2769 unsigned long now = jiffies;
2770
2771 if (time_before(now, expires))
2772 expires -= now;
2773 else
2774 expires = 0;
2775 }
2776
521f5490 2777 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2778 if (rt->rt_pmtu && expires)
521f5490 2779 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
d52e5a7e
SD
2780 if (rt->rt_mtu_locked && expires)
2781 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
521f5490 2782 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2783 goto nla_put_failure;
2784
b4869889 2785 if (fl4->flowi4_mark &&
68aaed54 2786 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2787 goto nla_put_failure;
963bfeee 2788
622ec2c9
LC
2789 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2790 nla_put_u32(skb, RTA_UID,
2791 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2792 goto nla_put_failure;
2793
d8d1f30b 2794 error = rt->dst.error;
be403ea1 2795
c7537967 2796 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2797#ifdef CONFIG_IP_MROUTE
2798 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2799 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2800 int err = ipmr_get_route(net, skb,
2801 fl4->saddr, fl4->daddr,
9f09eaea 2802 r, portid);
2cf75070 2803
8caaf7b6 2804 if (err <= 0) {
0c8d803f
DA
2805 if (err == 0)
2806 return 0;
2807 goto nla_put_failure;
8caaf7b6
ND
2808 }
2809 } else
2810#endif
404eb77e 2811 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
8caaf7b6 2812 goto nla_put_failure;
1da177e4
LT
2813 }
2814
f185071d 2815 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2816 goto nla_put_failure;
be403ea1 2817
053c095a
JB
2818 nlmsg_end(skb, nlh);
2819 return 0;
1da177e4 2820
be403ea1 2821nla_put_failure:
26932566
PM
2822 nlmsg_cancel(skb, nlh);
2823 return -EMSGSIZE;
1da177e4
LT
2824}
2825
404eb77e
RP
2826static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2827 u8 ip_proto, __be16 sport,
2828 __be16 dport)
2829{
2830 struct sk_buff *skb;
2831 struct iphdr *iph;
2832
2833 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2834 if (!skb)
2835 return NULL;
2836
2837 /* Reserve room for dummy headers, this skb can pass
2838 * through good chunk of routing engine.
2839 */
2840 skb_reset_mac_header(skb);
2841 skb_reset_network_header(skb);
2842 skb->protocol = htons(ETH_P_IP);
2843 iph = skb_put(skb, sizeof(struct iphdr));
2844 iph->protocol = ip_proto;
2845 iph->saddr = src;
2846 iph->daddr = dst;
2847 iph->version = 0x4;
2848 iph->frag_off = 0;
2849 iph->ihl = 0x5;
2850 skb_set_transport_header(skb, skb->len);
2851
2852 switch (iph->protocol) {
2853 case IPPROTO_UDP: {
2854 struct udphdr *udph;
2855
2856 udph = skb_put_zero(skb, sizeof(struct udphdr));
2857 udph->source = sport;
2858 udph->dest = dport;
2859 udph->len = sizeof(struct udphdr);
2860 udph->check = 0;
2861 break;
2862 }
2863 case IPPROTO_TCP: {
2864 struct tcphdr *tcph;
2865
2866 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2867 tcph->source = sport;
2868 tcph->dest = dport;
2869 tcph->doff = sizeof(struct tcphdr) / 4;
2870 tcph->rst = 1;
2871 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2872 src, dst, 0);
2873 break;
2874 }
2875 case IPPROTO_ICMP: {
2876 struct icmphdr *icmph;
2877
2878 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2879 icmph->type = ICMP_ECHO;
2880 icmph->code = 0;
2881 }
2882 }
2883
2884 return skb;
2885}
2886
/* Validate an RTM_GETROUTE request before acting on it.
 *
 * Sockets that have not opted into strict checking get the legacy,
 * permissive parse and nothing more.  Strict sockets must additionally
 * send a header whose unused fields are zero, whose prefix lengths are
 * either 0 or a full /32, whose flags are limited to the supported set,
 * and whose attributes all belong to the whitelist below.
 *
 * On success 0 is returned and @tb holds the parsed attributes;
 * otherwise a negative errno is returned with details in @extack.
 */
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	/* Message must at least carry a struct rtmsg header. */
	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	/* Legacy userspace: parse leniently and skip all strict checks. */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	/* Prefix lengths may only be 0 (absent) or 32 (a full IPv4 host);
	 * every other header field must be left zero by the requester.
	 */
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	/* Only these three flags are meaningful for a route get. */
	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	/* Strict parse: unknown attributes and trailing garbage are errors. */
	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	/* An address attribute requires the matching prefix length. */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	/* Whitelist: reject any present attribute we do not handle. */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}
2955
/* RTM_GETROUTE doit handler: resolve the route described by the request
 * and unicast the result back to the caller.
 *
 * A synthetic packet is built from the request so the lookup can take
 * sport/dport/protocol into account.  With RTA_IIF set, the lookup runs
 * through the input path (ip_route_input_rcu); otherwise through the
 * output path.  The reply is either a full FIB match dump
 * (RTM_F_FIB_MATCH) or the classic rt_fill_info message.
 *
 * Returns 0 on success or a negative errno.  The whole lookup and reply
 * construction runs under rcu_read_lock(); error paths drop both the
 * lock and the skb via errout_rcu.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	/* No explicit UID: use the caller's for output lookups, none for
	 * input-path (iif) lookups.
	 */
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	/* Build a minimal packet matching the request; also reused below
	 * as the reply skb after being trimmed.
	 */
	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev = dev;
		skb->mark = mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		/* Input routing attaches the dst to the skb; surface any
		 * error recorded there even if the call itself succeeded.
		 */
		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		/* res.fi may be absent for special route types; map that
		 * to the type's canonical error (or EHOSTUNREACH).
		 */
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	/* rtnl_unicast consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}
3094
1da177e4
LT
3095void ip_rt_multicast_event(struct in_device *in_dev)
3096{
4ccfe6d4 3097 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
3098}
3099
3100#ifdef CONFIG_SYSCTL
/* Tunables exposed through /proc/sys/net/ipv4/route (table below). */
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
/* Lower bound enforced on min_pmtu writes (see .extra1 in the table). */
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
082c7ca4 3105
fe2c6338 3106static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 3107 void __user *buffer,
1da177e4
LT
3108 size_t *lenp, loff_t *ppos)
3109{
5aad1de5
TT
3110 struct net *net = (struct net *)__ctl->extra1;
3111
1da177e4 3112 if (write) {
5aad1de5
TT
3113 rt_cache_flush(net);
3114 fnhe_genid_bump(net);
1da177e4 3115 return 0;
e905a9ed 3116 }
1da177e4
LT
3117
3118 return -EINVAL;
3119}
3120
/* Global (init_net-registered) /proc/sys/net/ipv4/route tunables.
 * All entries are plain ints; jiffies-valued ones use the *_jiffies
 * handlers so userspace reads/writes seconds (or ms).
 */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Clamped from below by ip_min_valid_pmtu via extra1. */
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 3232
/* Per-netns "flush" trigger; duplicated for each non-init namespace in
 * sysctl_route_net_init() so .extra1 can point at the owning net.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,		/* write-only */
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3242
/* Register the per-netns net/ipv4/route sysctl directory.
 *
 * init_net uses the static ipv4_route_flush_table directly; every other
 * namespace gets a private kmemdup'd copy so its extra1 (and optionally
 * a hidden procname) can be set without touching the template.
 * Returns 0 or -ENOMEM.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	/* Let the flush handler find its namespace. */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	/* Only free a duplicated table, never the static template. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
3270
3271static __net_exit void sysctl_route_net_exit(struct net *net)
3272{
3273 struct ctl_table *tbl;
3274
3275 tbl = net->ipv4.route_hdr->ctl_table_arg;
3276 unregister_net_sysctl_table(net->ipv4.route_hdr);
3277 BUG_ON(tbl == ipv4_route_flush_table);
3278 kfree(tbl);
3279}
3280
/* Pernet hooks for the route sysctl directory. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
3285#endif
3286
/* Per-netns init of the generation counters: route and fnhe genids
 * start at zero; dev_addr_genid is seeded with a random value.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}
3294
/* Pernet hook for generation-counter setup; no exit needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3298
c3426b47
DM
3299static int __net_init ipv4_inetpeer_init(struct net *net)
3300{
3301 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3302
3303 if (!bp)
3304 return -ENOMEM;
3305 inet_peer_base_init(bp);
3306 net->ipv4.peers = bp;
3307 return 0;
3308}
3309
3310static void __net_exit ipv4_inetpeer_exit(struct net *net)
3311{
3312 struct inet_peer_base *bp = net->ipv4.peers;
3313
3314 net->ipv4.peers = NULL;
56a6b248 3315 inetpeer_invalidate_tree(bp);
c3426b47
DM
3316 kfree(bp);
3317}
3318
/* Pernet hooks for the inetpeer base lifetime. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 3323
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classid accounting; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3327
/* Boot-time initialization of the IPv4 routing subsystem.
 *
 * Allocates the IP-ID generator arrays and per-cpu uncached-route
 * lists, creates the dst slab caches and entry counters, initializes
 * devinet/fib/proc/xfrm, registers the RTM_GETROUTE handler and the
 * pernet subsystems.  Allocation failures here are fatal (panic), as
 * the stack cannot operate without them.  Always returns 0.
 */
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	/* Randomize initial IP IDs. */
	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the same slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Effectively disable gc; sysctls above can tune these later. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}
3389
a1bc6eb4 3390#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the global (init_net) route sysctl tunables early. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
a1bc6eb4 3399#endif