Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/ide
[linux-2.6-block.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
6e5714ea 112#include <net/secure_seq.h>
1b7179d3 113#include <net/ip_tunnels.h>
385add90 114#include <net/l3mdev.h>
1da177e4 115
b6179813
RP
116#include "fib_lookup.h"
117
68a5e3dd 118#define RT_FL_TOS(oldflp4) \
f61759e6 119 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4 120
1da177e4
LT
121#define RT_GC_TIMEOUT (300*HZ)
122
1da177e4 123static int ip_rt_max_size;
817bc4db
SH
124static int ip_rt_redirect_number __read_mostly = 9;
125static int ip_rt_redirect_load __read_mostly = HZ / 50;
126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost __read_mostly = HZ;
128static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db 129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
c7272c2f 130static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
817bc4db 131static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 132
deed49df 133static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
c7272c2f 134
1da177e4
LT
135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 141static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
144static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb, u32 mtu);
146static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
caacf05e 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 149
62fa8a84
DM
150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151{
31248731
DM
152 WARN_ON(1);
153 return NULL;
62fa8a84
DM
154}
155
f894cbf8
DM
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
63fca65d 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 160
1da177e4
LT
161static struct dst_ops ipv4_dst_ops = {
162 .family = AF_INET,
1da177e4 163 .check = ipv4_dst_check,
0dbaee3b 164 .default_advmss = ipv4_default_advmss,
ebb762f2 165 .mtu = ipv4_mtu,
62fa8a84 166 .cow_metrics = ipv4_cow_metrics,
caacf05e 167 .destroy = ipv4_dst_destroy,
1da177e4
LT
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
e47a185b 171 .redirect = ip_do_redirect,
b92dacd4 172 .local_out = __ip_local_out,
d3aaeb38 173 .neigh_lookup = ipv4_neigh_lookup,
63fca65d 174 .confirm_neigh = ipv4_confirm_neigh,
1da177e4
LT
175};
176
177#define ECN_OR_COST(class) TC_PRIO_##class
178
4839c52b 179const __u8 ip_tos2prio[16] = {
1da177e4 180 TC_PRIO_BESTEFFORT,
4a2b9c37 181 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
182 TC_PRIO_BESTEFFORT,
183 ECN_OR_COST(BESTEFFORT),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_BULK,
187 ECN_OR_COST(BULK),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE,
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
196};
d4a96865 197EXPORT_SYMBOL(ip_tos2prio);
1da177e4 198
2f970d83 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 201
1da177e4 202#ifdef CONFIG_PROC_FS
1da177e4
LT
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
29e75252 205 if (*pos)
89aef892 206 return NULL;
29e75252 207 return SEQ_START_TOKEN;
1da177e4
LT
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
1da177e4 212 ++*pos;
89aef892 213 return NULL;
1da177e4
LT
214}
215
/* No iteration state was taken in ->start, so nothing to release here. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
219
220static int rt_cache_seq_show(struct seq_file *seq, void *v)
221{
222 if (v == SEQ_START_TOKEN)
223 seq_printf(seq, "%-127s\n",
224 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 "HHUptod\tSpecDst");
e905a9ed 227 return 0;
1da177e4
LT
228}
229
f690808e 230static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
235};
236
237static int rt_cache_seq_open(struct inode *inode, struct file *file)
238{
89aef892 239 return seq_open(file, &rt_cache_seq_ops);
1da177e4
LT
240}
241
9a32144e 242static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
243 .open = rt_cache_seq_open,
244 .read = seq_read,
245 .llseek = seq_lseek,
89aef892 246 .release = seq_release,
1da177e4
LT
247};
248
249
250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251{
252 int cpu;
253
254 if (*pos == 0)
255 return SEQ_START_TOKEN;
256
0f23174a 257 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
258 if (!cpu_possible(cpu))
259 continue;
260 *pos = cpu+1;
2f970d83 261 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
262 }
263 return NULL;
264}
265
266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267{
268 int cpu;
269
0f23174a 270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
2f970d83 274 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
275 }
276 return NULL;
e905a9ed 277
1da177e4
LT
278}
279
/* Per-CPU stats need no locking for this read-only walk; nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
284
285static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286{
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
5bec0039 290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
291 return 0;
292 }
e905a9ed 293
1da177e4
LT
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 296 dst_entries_get_slow(&ipv4_dst_ops),
0baf2b35 297 0, /* st->in_hit */
1da177e4
LT
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
0baf2b35 305 0, /* st->out_hit */
1da177e4 306 st->out_slow_tot,
e905a9ed 307 st->out_slow_mc,
1da177e4 308
0baf2b35
ED
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
1da177e4
LT
315 );
316 return 0;
317}
318
f690808e 319static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324};
325
326
327static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328{
329 return seq_open(file, &rt_cpu_seq_ops);
330}
331
9a32144e 332static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
333 .open = rt_cpu_seq_open,
334 .read = seq_read,
335 .llseek = seq_lseek,
336 .release = seq_release,
337};
338
c7066f70 339#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 340static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 341{
a661c419
AD
342 struct ip_rt_acct *dst, *src;
343 unsigned int i, j;
344
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 if (!dst)
347 return -ENOMEM;
348
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
356 }
78c686e9
PE
357 }
358
a661c419
AD
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 kfree(dst);
361 return 0;
362}
78c686e9 363#endif
107f1634 364
73b38711 365static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
366{
367 struct proc_dir_entry *pde;
368
d6444062 369 pde = proc_create("rt_cache", 0444, net->proc_net,
d4beaa66 370 &rt_cache_seq_fops);
107f1634
PE
371 if (!pde)
372 goto err1;
373
d6444062 374 pde = proc_create("rt_cache", 0444,
77020720 375 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
376 if (!pde)
377 goto err2;
378
c7066f70 379#ifdef CONFIG_IP_ROUTE_CLASSID
3f3942ac
CH
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
381 rt_acct_proc_show);
107f1634
PE
382 if (!pde)
383 goto err3;
384#endif
385 return 0;
386
c7066f70 387#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
388err3:
389 remove_proc_entry("rt_cache", net->proc_net_stat);
390#endif
391err2:
392 remove_proc_entry("rt_cache", net->proc_net);
393err1:
394 return -ENOMEM;
395}
73b38711
DL
396
397static void __net_exit ip_rt_do_proc_exit(struct net *net)
398{
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 401#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 402 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 403#endif
73b38711
DL
404}
405
/* Per-network-namespace hooks: create/remove the routing /proc entries. */
406static struct pernet_operations ip_rt_proc_ops __net_initdata = {
 407 .init = ip_rt_do_proc_init,
 408 .exit = ip_rt_do_proc_exit,
 409};
410
411static int __init ip_rt_proc_init(void)
412{
413 return register_pernet_subsys(&ip_rt_proc_ops);
414}
415
107f1634 416#else
73b38711 417static inline int ip_rt_proc_init(void)
107f1634
PE
418{
419 return 0;
420}
1da177e4 421#endif /* CONFIG_PROC_FS */
e905a9ed 422
4331debc 423static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 424{
ca4c3fc2 425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
426}
427
4ccfe6d4 428void rt_cache_flush(struct net *net)
1da177e4 429{
ca4c3fc2 430 rt_genid_bump_ipv4(net);
98376387
ED
431}
432
f894cbf8
DM
433static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 struct sk_buff *skb,
435 const void *daddr)
3769cffb 436{
d3aaeb38
DM
437 struct net_device *dev = dst->dev;
438 const __be32 *pkey = daddr;
39232973 439 const struct rtable *rt;
3769cffb
DM
440 struct neighbour *n;
441
39232973 442 rt = (const struct rtable *) dst;
a263b309 443 if (rt->rt_gateway)
39232973 444 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
445 else if (skb)
446 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 447
80703d26 448 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
449 if (n)
450 return n;
32092ecf 451 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
452}
453
63fca65d
JA
454static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
455{
456 struct net_device *dev = dst->dev;
457 const __be32 *pkey = daddr;
458 const struct rtable *rt;
459
460 rt = (const struct rtable *)dst;
461 if (rt->rt_gateway)
462 pkey = (const __be32 *)&rt->rt_gateway;
463 else if (!daddr ||
464 (rt->rt_flags &
465 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
466 return;
467
468 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
469}
470
04ca6973 471#define IP_IDENTS_SZ 2048u
04ca6973 472
355b590c
ED
473static atomic_t *ip_idents __read_mostly;
474static u32 *ip_tstamps __read_mostly;
04ca6973
ED
475
476/* In order to protect privacy, we add a perturbation to identifiers
477 * if one generator is seldom used. This makes hard for an attacker
478 * to infer how many packets were sent between two points in time.
479 */
480u32 ip_idents_reserve(u32 hash, int segs)
481{
355b590c
ED
482 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
483 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
6aa7de05 484 u32 old = READ_ONCE(*p_tstamp);
04ca6973 485 u32 now = (u32)jiffies;
adb03115 486 u32 new, delta = 0;
04ca6973 487
355b590c 488 if (old != now && cmpxchg(p_tstamp, old, now) == old)
04ca6973
ED
489 delta = prandom_u32_max(now - old);
490
adb03115
ED
491 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
492 do {
493 old = (u32)atomic_read(p_id);
494 new = old + delta + segs;
495 } while (atomic_cmpxchg(p_id, old, new) != old);
496
497 return new - segs;
04ca6973
ED
498}
499EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 500
b6a7719a 501void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
1da177e4 502{
73f156a6
ED
503 static u32 ip_idents_hashrnd __read_mostly;
504 u32 hash, id;
1da177e4 505
73f156a6 506 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1da177e4 507
04ca6973
ED
508 hash = jhash_3words((__force u32)iph->daddr,
509 (__force u32)iph->saddr,
b6a7719a 510 iph->protocol ^ net_hash_mix(net),
04ca6973 511 ip_idents_hashrnd);
73f156a6
ED
512 id = ip_idents_reserve(hash, segs);
513 iph->id = htons(id);
1da177e4 514}
4bc2f18b 515EXPORT_SYMBOL(__ip_select_ident);
1da177e4 516
e2d118a1
LC
517static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
518 const struct sock *sk,
4895c771
DM
519 const struct iphdr *iph,
520 int oif, u8 tos,
521 u8 prot, u32 mark, int flow_flags)
522{
523 if (sk) {
524 const struct inet_sock *inet = inet_sk(sk);
525
526 oif = sk->sk_bound_dev_if;
527 mark = sk->sk_mark;
528 tos = RT_CONN_FLAGS(sk);
529 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
530 }
531 flowi4_init_output(fl4, oif, mark, tos,
532 RT_SCOPE_UNIVERSE, prot,
533 flow_flags,
e2d118a1
LC
534 iph->daddr, iph->saddr, 0, 0,
535 sock_net_uid(net, sk));
4895c771
DM
536}
537
5abf7f7e
ED
538static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
539 const struct sock *sk)
4895c771 540{
d109e61b 541 const struct net *net = dev_net(skb->dev);
4895c771
DM
542 const struct iphdr *iph = ip_hdr(skb);
543 int oif = skb->dev->ifindex;
544 u8 tos = RT_TOS(iph->tos);
545 u8 prot = iph->protocol;
546 u32 mark = skb->mark;
547
d109e61b 548 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
4895c771
DM
549}
550
5abf7f7e 551static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
552{
553 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 554 const struct ip_options_rcu *inet_opt;
4895c771
DM
555 __be32 daddr = inet->inet_daddr;
556
557 rcu_read_lock();
558 inet_opt = rcu_dereference(inet->inet_opt);
559 if (inet_opt && inet_opt->opt.srr)
560 daddr = inet_opt->opt.faddr;
561 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
562 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
563 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
564 inet_sk_flowi_flags(sk),
e2d118a1 565 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
4895c771
DM
566 rcu_read_unlock();
567}
568
5abf7f7e
ED
/* Build a flow key from the skb when one is available, else the socket. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb)
                build_sk_flow_key(fl4, sk);
        else
                build_skb_flow_key(fl4, skb, sk);
}
577
c5038a83 578static DEFINE_SPINLOCK(fnhe_lock);
4895c771 579
2ffae99d
TT
580static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
581{
582 struct rtable *rt;
583
584 rt = rcu_dereference(fnhe->fnhe_rth_input);
585 if (rt) {
586 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
95c47f9c 587 dst_dev_put(&rt->dst);
0830106c 588 dst_release(&rt->dst);
2ffae99d
TT
589 }
590 rt = rcu_dereference(fnhe->fnhe_rth_output);
591 if (rt) {
592 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
95c47f9c 593 dst_dev_put(&rt->dst);
0830106c 594 dst_release(&rt->dst);
2ffae99d
TT
595 }
596}
597
aee06da6 598static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
599{
600 struct fib_nh_exception *fnhe, *oldest;
601
602 oldest = rcu_dereference(hash->chain);
603 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
604 fnhe = rcu_dereference(fnhe->fnhe_next)) {
605 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
606 oldest = fnhe;
607 }
2ffae99d 608 fnhe_flush_routes(oldest);
4895c771
DM
609 return oldest;
610}
611
d3a25c98
DM
612static inline u32 fnhe_hashfun(__be32 daddr)
613{
d546c621 614 static u32 fnhe_hashrnd __read_mostly;
d3a25c98
DM
615 u32 hval;
616
d546c621
ED
617 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
618 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
619 return hash_32(hval, FNHE_HASH_SHIFT);
d3a25c98
DM
620}
621
387aa65a
TT
622static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
623{
624 rt->rt_pmtu = fnhe->fnhe_pmtu;
d52e5a7e 625 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
387aa65a
TT
626 rt->dst.expires = fnhe->fnhe_expires;
627
628 if (fnhe->fnhe_gw) {
629 rt->rt_flags |= RTCF_REDIRECTED;
630 rt->rt_gateway = fnhe->fnhe_gw;
631 rt->rt_uses_gateway = 1;
632 }
633}
634
aee06da6 635static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
d52e5a7e 636 u32 pmtu, bool lock, unsigned long expires)
4895c771 637{
aee06da6 638 struct fnhe_hash_bucket *hash;
4895c771 639 struct fib_nh_exception *fnhe;
387aa65a 640 struct rtable *rt;
cebe84c6 641 u32 genid, hval;
387aa65a 642 unsigned int i;
4895c771 643 int depth;
cebe84c6
XL
644
645 genid = fnhe_genid(dev_net(nh->nh_dev));
646 hval = fnhe_hashfun(daddr);
aee06da6 647
c5038a83 648 spin_lock_bh(&fnhe_lock);
4895c771 649
caa41527 650 hash = rcu_dereference(nh->nh_exceptions);
4895c771 651 if (!hash) {
6396bb22 652 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
4895c771 653 if (!hash)
aee06da6 654 goto out_unlock;
caa41527 655 rcu_assign_pointer(nh->nh_exceptions, hash);
4895c771
DM
656 }
657
4895c771
DM
658 hash += hval;
659
660 depth = 0;
661 for (fnhe = rcu_dereference(hash->chain); fnhe;
662 fnhe = rcu_dereference(fnhe->fnhe_next)) {
663 if (fnhe->fnhe_daddr == daddr)
aee06da6 664 break;
4895c771
DM
665 depth++;
666 }
667
aee06da6 668 if (fnhe) {
cebe84c6
XL
669 if (fnhe->fnhe_genid != genid)
670 fnhe->fnhe_genid = genid;
aee06da6
JA
671 if (gw)
672 fnhe->fnhe_gw = gw;
d52e5a7e 673 if (pmtu) {
aee06da6 674 fnhe->fnhe_pmtu = pmtu;
d52e5a7e
SD
675 fnhe->fnhe_mtu_locked = lock;
676 }
e39d5246 677 fnhe->fnhe_expires = max(1UL, expires);
387aa65a 678 /* Update all cached dsts too */
2ffae99d
TT
679 rt = rcu_dereference(fnhe->fnhe_rth_input);
680 if (rt)
681 fill_route_from_fnhe(rt, fnhe);
682 rt = rcu_dereference(fnhe->fnhe_rth_output);
387aa65a
TT
683 if (rt)
684 fill_route_from_fnhe(rt, fnhe);
aee06da6
JA
685 } else {
686 if (depth > FNHE_RECLAIM_DEPTH)
687 fnhe = fnhe_oldest(hash);
688 else {
689 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
690 if (!fnhe)
691 goto out_unlock;
692
693 fnhe->fnhe_next = hash->chain;
694 rcu_assign_pointer(hash->chain, fnhe);
695 }
cebe84c6 696 fnhe->fnhe_genid = genid;
aee06da6
JA
697 fnhe->fnhe_daddr = daddr;
698 fnhe->fnhe_gw = gw;
699 fnhe->fnhe_pmtu = pmtu;
d52e5a7e 700 fnhe->fnhe_mtu_locked = lock;
94720e3a 701 fnhe->fnhe_expires = max(1UL, expires);
387aa65a
TT
702
703 /* Exception created; mark the cached routes for the nexthop
704 * stale, so anyone caching it rechecks if this exception
705 * applies to them.
706 */
2ffae99d
TT
707 rt = rcu_dereference(nh->nh_rth_input);
708 if (rt)
709 rt->dst.obsolete = DST_OBSOLETE_KILL;
710
387aa65a
TT
711 for_each_possible_cpu(i) {
712 struct rtable __rcu **prt;
713 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
714 rt = rcu_dereference(*prt);
715 if (rt)
716 rt->dst.obsolete = DST_OBSOLETE_KILL;
717 }
4895c771 718 }
4895c771 719
4895c771 720 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
721
722out_unlock:
c5038a83 723 spin_unlock_bh(&fnhe_lock);
4895c771
DM
724}
725
ceb33206
DM
726static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
727 bool kill_route)
1da177e4 728{
e47a185b 729 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 730 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 731 struct net_device *dev = skb->dev;
e47a185b 732 struct in_device *in_dev;
4895c771 733 struct fib_result res;
e47a185b 734 struct neighbour *n;
317805b8 735 struct net *net;
1da177e4 736
94206125
DM
737 switch (icmp_hdr(skb)->code & 7) {
738 case ICMP_REDIR_NET:
739 case ICMP_REDIR_NETTOS:
740 case ICMP_REDIR_HOST:
741 case ICMP_REDIR_HOSTTOS:
742 break;
743
744 default:
745 return;
746 }
747
e47a185b
DM
748 if (rt->rt_gateway != old_gw)
749 return;
750
751 in_dev = __in_dev_get_rcu(dev);
752 if (!in_dev)
753 return;
754
c346dca1 755 net = dev_net(dev);
9d4fb27d
JP
756 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
757 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
758 ipv4_is_zeronet(new_gw))
1da177e4
LT
759 goto reject_redirect;
760
761 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
762 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
763 goto reject_redirect;
764 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
765 goto reject_redirect;
766 } else {
317805b8 767 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
768 goto reject_redirect;
769 }
770
969447f2
SSL
771 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
772 if (!n)
773 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
2c1a4311 774 if (!IS_ERR(n)) {
e47a185b
DM
775 if (!(n->nud_state & NUD_VALID)) {
776 neigh_event_send(n, NULL);
777 } else {
0eeb075f 778 if (fib_lookup(net, fl4, &res, 0) == 0) {
4895c771 779 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 780
aee06da6 781 update_or_create_fnhe(nh, fl4->daddr, new_gw,
d52e5a7e
SD
782 0, false,
783 jiffies + ip_rt_gc_timeout);
4895c771 784 }
ceb33206
DM
785 if (kill_route)
786 rt->dst.obsolete = DST_OBSOLETE_KILL;
e47a185b
DM
787 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
788 }
789 neigh_release(n);
790 }
791 return;
792
793reject_redirect:
794#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
795 if (IN_DEV_LOG_MARTIANS(in_dev)) {
796 const struct iphdr *iph = (const struct iphdr *) skb->data;
797 __be32 daddr = iph->daddr;
798 __be32 saddr = iph->saddr;
799
e47a185b
DM
800 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
801 " Advised path = %pI4 -> %pI4\n",
802 &old_gw, dev->name, &new_gw,
803 &saddr, &daddr);
99ee038d 804 }
e47a185b
DM
805#endif
806 ;
807}
808
4895c771
DM
809static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
810{
811 struct rtable *rt;
812 struct flowi4 fl4;
f96ef988 813 const struct iphdr *iph = (const struct iphdr *) skb->data;
7d995694 814 struct net *net = dev_net(skb->dev);
f96ef988
MK
815 int oif = skb->dev->ifindex;
816 u8 tos = RT_TOS(iph->tos);
817 u8 prot = iph->protocol;
818 u32 mark = skb->mark;
4895c771
DM
819
820 rt = (struct rtable *) dst;
821
7d995694 822 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
ceb33206 823 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
824}
825
1da177e4
LT
826static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
827{
ee6b9673 828 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
829 struct dst_entry *ret = dst;
830
831 if (rt) {
d11a4dc1 832 if (dst->obsolete > 0) {
1da177e4
LT
833 ip_rt_put(rt);
834 ret = NULL;
5943634f
DM
835 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
836 rt->dst.expires) {
89aef892 837 ip_rt_put(rt);
1da177e4
LT
838 ret = NULL;
839 }
840 }
841 return ret;
842}
843
844/*
845 * Algorithm:
846 * 1. The first ip_rt_redirect_number redirects are sent
847 * with exponential backoff, then we stop sending them at all,
848 * assuming that the host ignores our redirects.
849 * 2. If we did not see packets requiring redirects
850 * during ip_rt_redirect_silence, we assume that the host
851 * forgot redirected route and start to send redirects again.
852 *
853 * This algorithm is much cheaper and more intelligent than dumb load limiting
854 * in icmp.c.
855 *
856 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
857 * and "frag. need" (breaks PMTU discovery) in icmp.c.
858 */
859
860void ip_rt_send_redirect(struct sk_buff *skb)
861{
511c3f92 862 struct rtable *rt = skb_rtable(skb);
30038fc6 863 struct in_device *in_dev;
92d86829 864 struct inet_peer *peer;
1d861aa4 865 struct net *net;
30038fc6 866 int log_martians;
192132b9 867 int vif;
1da177e4 868
30038fc6 869 rcu_read_lock();
d8d1f30b 870 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
871 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
872 rcu_read_unlock();
1da177e4 873 return;
30038fc6
ED
874 }
875 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
385add90 876 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
30038fc6 877 rcu_read_unlock();
1da177e4 878
1d861aa4 879 net = dev_net(rt->dst.dev);
192132b9 880 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
92d86829 881 if (!peer) {
e81da0e1
JA
882 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
883 rt_nexthop(rt, ip_hdr(skb)->daddr));
92d86829
DM
884 return;
885 }
886
1da177e4
LT
887 /* No redirected packets during ip_rt_redirect_silence;
888 * reset the algorithm.
889 */
c09551c6 890 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
92d86829 891 peer->rate_tokens = 0;
c09551c6
LB
892 peer->n_redirects = 0;
893 }
1da177e4
LT
894
895 /* Too many ignored redirects; do not send anything
d8d1f30b 896 * set dst.rate_last to the last seen redirected packet.
1da177e4 897 */
c09551c6 898 if (peer->n_redirects >= ip_rt_redirect_number) {
92d86829 899 peer->rate_last = jiffies;
1d861aa4 900 goto out_put_peer;
1da177e4
LT
901 }
902
903 /* Check for load limit; set rate_last to the latest sent
904 * redirect.
905 */
92d86829 906 if (peer->rate_tokens == 0 ||
14fb8a76 907 time_after(jiffies,
92d86829
DM
908 (peer->rate_last +
909 (ip_rt_redirect_load << peer->rate_tokens)))) {
e81da0e1
JA
910 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
911
912 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
92d86829
DM
913 peer->rate_last = jiffies;
914 ++peer->rate_tokens;
c09551c6 915 ++peer->n_redirects;
1da177e4 916#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 917 if (log_martians &&
e87cc472
JP
918 peer->rate_tokens == ip_rt_redirect_number)
919 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
92101b3b 920 &ip_hdr(skb)->saddr, inet_iif(skb),
e81da0e1 921 &ip_hdr(skb)->daddr, &gw);
1da177e4
LT
922#endif
923 }
1d861aa4
DM
924out_put_peer:
925 inet_putpeer(peer);
1da177e4
LT
926}
927
/* Input handler for routes whose dst.error is set: account the error,
 * rate-limit per source peer, and reply with the matching ICMP
 * destination-unreachable code.  Always consumes the skb and returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* On an L3 master (VRF) device, resolve the real ingress device
	 * recorded in the control block.
	 */
	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* Not forwarding: only bump the input-error counters,
		 * never generate ICMP.
		 */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	/* Map the route error to an ICMP code; unknown errors are dropped
	 * silently.
	 */
	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		/* Token-bucket rate limit keyed on the source peer:
		 * tokens accrue with elapsed jiffies, capped at
		 * ip_rt_error_burst; each ICMP costs ip_rt_error_cost.
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1da177e4 1003
/* Record a learned path MTU for @rt in a nexthop exception.  Ignores
 * updates on locked-MTU routes and MTU increases; values below
 * ip_rt_min_pmtu are clamped and the exception is marked locked.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	/* PMTU discovery only ever shrinks the path MTU. */
	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	/* Skip the FIB lookup if nothing changed and the existing entry
	 * still has at least half its expiry time left.
	 */
	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
1035
4895c771
DM
1036static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1037 struct sk_buff *skb, u32 mtu)
1038{
1039 struct rtable *rt = (struct rtable *) dst;
1040 struct flowi4 fl4;
1041
1042 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1043 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1044}
1045
36393395 1046void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
d888f396 1047 int oif, u8 protocol)
36393395 1048{
4895c771 1049 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1050 struct flowi4 fl4;
1051 struct rtable *rt;
d888f396 1052 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1b3c61dc 1053
e2d118a1 1054 __build_flow_key(net, &fl4, NULL, iph, oif,
d888f396 1055 RT_TOS(iph->tos), protocol, mark, 0);
36393395
DM
1056 rt = __ip_route_output_key(net, &fl4);
1057 if (!IS_ERR(rt)) {
4895c771 1058 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1059 ip_rt_put(rt);
1060 }
1061}
1062EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1063
9cb3a50c 1064static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1065{
4895c771
DM
1066 const struct iphdr *iph = (const struct iphdr *) skb->data;
1067 struct flowi4 fl4;
1068 struct rtable *rt;
36393395 1069
e2d118a1 1070 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1071
1072 if (!fl4.flowi4_mark)
1073 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1074
4895c771
DM
1075 rt = __ip_route_output_key(sock_net(sk), &fl4);
1076 if (!IS_ERR(rt)) {
1077 __ip_rt_update_pmtu(rt, &fl4, mtu);
1078 ip_rt_put(rt);
1079 }
36393395 1080}
9cb3a50c
SK
1081
/* Socket-aware PMTU update: try to refresh the socket's cached dst in
 * place, re-routing (and re-caching via sk_dst_set) when the cached dst
 * is obsolete or fails validation.  Falls back to the slow path when the
 * socket is owned by user context or has no cached dst.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;		/* true once rt is a fresh route we own */
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);		/* takes a reference; released at out */

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* Update the innermost (pre-xfrm) route of the dst chain. */
	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	/* The update may have invalidated the route; re-route if so. */
	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1135
b42597e2 1136void ipv4_redirect(struct sk_buff *skb, struct net *net,
1042caa7 1137 int oif, u8 protocol)
b42597e2 1138{
4895c771 1139 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1140 struct flowi4 fl4;
1141 struct rtable *rt;
1142
e2d118a1 1143 __build_flow_key(net, &fl4, NULL, iph, oif,
1042caa7 1144 RT_TOS(iph->tos), protocol, 0, 0);
b42597e2
DM
1145 rt = __ip_route_output_key(net, &fl4);
1146 if (!IS_ERR(rt)) {
ceb33206 1147 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1148 ip_rt_put(rt);
1149 }
1150}
1151EXPORT_SYMBOL_GPL(ipv4_redirect);
1152
1153void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1154{
4895c771
DM
1155 const struct iphdr *iph = (const struct iphdr *) skb->data;
1156 struct flowi4 fl4;
1157 struct rtable *rt;
e2d118a1 1158 struct net *net = sock_net(sk);
b42597e2 1159
e2d118a1
LC
1160 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1161 rt = __ip_route_output_key(net, &fl4);
4895c771 1162 if (!IS_ERR(rt)) {
ceb33206 1163 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1164 ip_rt_put(rt);
1165 }
b42597e2
DM
1166}
1167EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1168
efbc368d
DM
1169static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1170{
1171 struct rtable *rt = (struct rtable *) dst;
1172
ceb33206
DM
1173 /* All IPV4 dsts are created with ->obsolete set to the value
1174 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1175 * into this function always.
1176 *
387aa65a
TT
1177 * When a PMTU/redirect information update invalidates a route,
1178 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1179 * DST_OBSOLETE_DEAD by dst_free().
ceb33206 1180 */
387aa65a 1181 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
efbc368d 1182 return NULL;
d11a4dc1 1183 return dst;
1da177e4
LT
1184}
1185
/* dst_ops ->link_failure hook: notify the sender that the host is
 * unreachable and force an immediate expiry of the attached route so
 * the next lookup re-resolves it.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}
1196
ede2059d 1197static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1198{
91df42be
JP
1199 pr_debug("%s: %pI4 -> %pI4, %s\n",
1200 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1201 skb->dev ? skb->dev->name : "?");
1da177e4 1202 kfree_skb(skb);
c378a9c0 1203 WARN_ON(1);
1da177e4
LT
1204 return 0;
1205}
1206
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */
1215
/* Copy the source address of rt's outgoing interface into @addr
 * (unaligned-safe via memcpy).  For input routes the address is
 * recomputed with a FIB lookup on the packet's reversed flow, falling
 * back to an address selected for the nexthop.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			/* No matching route: pick any universe-scope
			 * address on the output device.
			 */
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1245
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge @tag into the route's traffic-class id, filling only the
 * halves (low/high 16 bits) that are still zero so an earlier tag
 * always wins.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1255
0dbaee3b
DM
1256static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1257{
7ed14d97 1258 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
164a5e7a 1259 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
7ed14d97 1260 ip_rt_min_advmss);
0dbaee3b 1261
7ed14d97 1262 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1263}
1264
/* Effective MTU for this route, in precedence order: unexpired learned
 * PMTU, then the RTAX_MTU metric, then the device MTU (possibly clamped
 * to 576 for locked-MTU gateway routes), minus any lwtunnel headroom.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		/* Historic minimum for locked routes via a gateway. */
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
1287
/* Remove the nexthop exception for @daddr from @nh's hash chain, if
 * present.  Runs under fnhe_lock; the entry is freed via kfree_rcu so
 * concurrent RCU readers stay safe.
 */
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	/* Walk the chain keeping a pointer to the previous link so the
	 * match can be unlinked in place.
	 */
	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
1321
/* Look up the nexthop exception for @daddr on @nh.  Expired entries are
 * deleted on the fly and treated as a miss.  Caller must hold the RCU
 * read lock; returns NULL when no live exception exists.
 */
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			/* Lazily reap an expired entry instead of
			 * returning stale data.
			 */
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
aee06da6 1346
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_info *fi = res->fi;
	struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
	struct net_device *dev = nh->nh_dev;
	u32 mtu = 0;

	/* The route MTU wins when forwarding honours PMTU or the MTU
	 * metric is locked on the route.
	 */
	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nh, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
}
1377
/* Bind @rt to the nexthop exception @fnhe: copy the exception's learned
 * state into the route and, when @do_cache, publish the route as the
 * exception's cached input/output route (replacing and releasing any
 * previous one).  Returns true only when the route was cached.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	/* Re-check under the lock: the exception may have been deleted
	 * (daddr reset to 0) or rehashed since the lookup.
	 */
	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		/* A generation bump invalidates all learned state. */
		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1425
/* Install @rt as the cached route on @nh (the shared input slot, or
 * this CPU's per-cpu output slot).  Lock-free via cmpxchg; on success
 * the previous occupant is released.  Returns false when a concurrent
 * update won the race and @rt was not cached.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		/* Lost the race: drop the reference taken above. */
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
1455
/* Per-cpu list of routes that could not be (or are not) cached on a
 * nexthop; rt_flush_dev() walks these to detach routes from a device.
 */
struct uncached_list {
	spinlock_t lock;		/* protects head */
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e 1462
/* Link @rt onto this CPU's uncached-routes list and remember which list
 * it joined so rt_del_uncached_list() can unlink it from any CPU.
 */
void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
1473
/* Unlink @rt from the uncached list it was added to, if any.  Safe to
 * call on routes that were never added (empty list_head).
 */
void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
1484
/* dst_ops ->destroy hook: drop the metrics reference and remove the
 * route from the uncached list before the dst memory is freed.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}
1492
/* Detach every uncached route still pointing at @dev (which is going
 * away) by retargeting it to the netns loopback device, transferring
 * the device reference accordingly.
 */
void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			/* hold the new device, release the old one */
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
1513
4331debc 1514static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1515{
4331debc
ED
1516 return rt &&
1517 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1518 !rt_is_expired(rt);
d2d68ba9
DM
1519}
1520
/* Finalize a freshly built route from the FIB result: copy gateway,
 * metrics, classid and lwtunnel state from the nexthop, cache the route
 * (exception slot or nexthop slot) when requested, and fall back to the
 * uncached list when caching is off or fails.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		ip_dst_metrics_put(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1566
/* Allocate and zero-initialize an IPv4 rtable with the requested dst
 * flags.  Output defaults to ip_output; RTCF_LOCAL routes deliver
 * locally.  Returns NULL on allocation failure.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1598
/* called in rcu_read_lock() section */
/* Sanity-check the source address of a multicast packet received on
 * @dev.  Returns 0 when acceptable (filling *itag from the source
 * validation), -EINVAL for martians, or a fib_validate_source() error.
 */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are only valid for IGMP or link-local
		 * multicast (e.g. hosts still acquiring an address).
		 */
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}
1629
/* called in rcu_read_lock() section */
/* Build and attach an input route for a multicast packet.  @our selects
 * local delivery (RTCF_LOCAL) in addition to RTCF_MULTICAST.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast routes must never be used for output. */
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input= 1;

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local multicast goes through the multicast router. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}
1667
1668
/* Account a packet with a martian (impossible) source address and,
 * with CONFIG_IP_ROUTE_VERBOSE and martian logging enabled, rate-limit
 * a warning including a hex dump of the link-layer header.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}
1693
/* called in rcu_read_lock() section */
/* Build the forwarding route for an input packet from the FIB result:
 * validate the source, decide whether to emit an ICMP redirect, reuse a
 * cached nexthop route when possible, otherwise allocate and cache a
 * fresh one.  Attaches the route to @skb; returns 0 or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Only cache when the FIB gave a real nexthop and the source
	 * validation produced no class tag.
	 */
	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		/* Fast path: reuse the cached route held by the
		 * exception entry or the nexthop itself.
		 */
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1da177e4 1779
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	/* Non-ICMP (or fragmented / malformed ICMP) packets hash on the
	 * outer header; any bail-out below falls through to "out".
	 */
	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMP errors quote an offending packet whose addresses
	 * identify the original flow.
	 */
	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
79a13159 1822
/* if skb is set it will be used and fl4 can be NULL */
/* Compute the multipath selection hash for a flow, per the netns
 * fib_multipath_hash_policy sysctl: policy 0 hashes L3 addresses only,
 * policy 1 adds L4 ports and protocol.  Any fl4-supplied multipath hash
 * is folded in; the result is right-shifted so the top bit is free.
 */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		/* L3-only hash. */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* L4 hash. */
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1884
/* Select a nexthop for a forwarded packet (hashing over paths when the
 * FIB entry is multipath) and create the input routing cache entry.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1902
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */
1913
9e12bb22 1914static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1915 u8 tos, struct net_device *dev,
1916 struct fib_result *res)
1da177e4 1917{
96d36220 1918 struct in_device *in_dev = __in_dev_get_rcu(dev);
e37b1e97
RP
1919 struct flow_keys *flkeys = NULL, _flkeys;
1920 struct net *net = dev_net(dev);
1b7179d3 1921 struct ip_tunnel_info *tun_info;
e37b1e97 1922 int err = -EINVAL;
95c96174 1923 unsigned int flags = 0;
1da177e4 1924 u32 itag = 0;
95c96174 1925 struct rtable *rth;
e37b1e97 1926 struct flowi4 fl4;
d2d68ba9 1927 bool do_cache;
1da177e4
LT
1928
1929 /* IP on this device is disabled. */
1930
1931 if (!in_dev)
1932 goto out;
1933
1934 /* Check for the most weird martians, which can be not detected
1935 by fib_lookup.
1936 */
1937
61adedf3 1938 tun_info = skb_tunnel_info(skb);
46fa062a 1939 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1940 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1941 else
1942 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1943 skb_dst_drop(skb);
1944
d0daebc3 1945 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1946 goto martian_source;
1947
5510cdf7
DA
1948 res->fi = NULL;
1949 res->table = NULL;
27a954bd 1950 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1951 goto brd_input;
1952
1953 /* Accept zero addresses only to limited broadcast;
1954 * I even do not know to fix it or not. Waiting for complains :-)
1955 */
f97c1e0c 1956 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1957 goto martian_source;
1958
d0daebc3 1959 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1960 goto martian_destination;
1961
9eb43e76
ED
1962 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1963 * and call it once if daddr or/and saddr are loopback addresses
1964 */
1965 if (ipv4_is_loopback(daddr)) {
1966 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1967 goto martian_destination;
9eb43e76
ED
1968 } else if (ipv4_is_loopback(saddr)) {
1969 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1970 goto martian_source;
1971 }
1972
1da177e4
LT
1973 /*
1974 * Now we are ready to route packet.
1975 */
68a5e3dd 1976 fl4.flowi4_oif = 0;
e0d56fdd 1977 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1978 fl4.flowi4_mark = skb->mark;
1979 fl4.flowi4_tos = tos;
1980 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1981 fl4.flowi4_flags = 0;
68a5e3dd
DM
1982 fl4.daddr = daddr;
1983 fl4.saddr = saddr;
8bcfd092 1984 fl4.flowi4_uid = sock_net_uid(net, NULL);
e37b1e97 1985
5a847a6e 1986 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
e37b1e97 1987 flkeys = &_flkeys;
5a847a6e
DA
1988 } else {
1989 fl4.flowi4_proto = 0;
1990 fl4.fl4_sport = 0;
1991 fl4.fl4_dport = 0;
1992 }
e37b1e97 1993
5510cdf7 1994 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1995 if (err != 0) {
1996 if (!IN_DEV_FORWARD(in_dev))
1997 err = -EHOSTUNREACH;
1da177e4 1998 goto no_route;
cd0f0b95 1999 }
1da177e4 2000
5cbf777c
XL
2001 if (res->type == RTN_BROADCAST) {
2002 if (IN_DEV_BFORWARD(in_dev))
2003 goto make_route;
1da177e4 2004 goto brd_input;
5cbf777c 2005 }
1da177e4 2006
5510cdf7 2007 if (res->type == RTN_LOCAL) {
5c04c819 2008 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 2009 0, dev, in_dev, &itag);
b5f7e755 2010 if (err < 0)
0d753960 2011 goto martian_source;
1da177e4
LT
2012 goto local_input;
2013 }
2014
cd0f0b95
DJ
2015 if (!IN_DEV_FORWARD(in_dev)) {
2016 err = -EHOSTUNREACH;
251da413 2017 goto no_route;
cd0f0b95 2018 }
5510cdf7 2019 if (res->type != RTN_UNICAST)
1da177e4
LT
2020 goto martian_destination;
2021
5cbf777c 2022make_route:
e37b1e97 2023 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1da177e4
LT
2024out: return err;
2025
2026brd_input:
2027 if (skb->protocol != htons(ETH_P_IP))
2028 goto e_inval;
2029
41347dcd 2030 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2031 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2032 in_dev, &itag);
1da177e4 2033 if (err < 0)
0d753960 2034 goto martian_source;
1da177e4
LT
2035 }
2036 flags |= RTCF_BROADCAST;
5510cdf7 2037 res->type = RTN_BROADCAST;
1da177e4
LT
2038 RT_CACHE_STAT_INC(in_brd);
2039
2040local_input:
d2d68ba9 2041 do_cache = false;
5510cdf7 2042 if (res->fi) {
fe3edf45 2043 if (!itag) {
5510cdf7 2044 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 2045 if (rt_cache_valid(rth)) {
c6cffba4
DM
2046 skb_dst_set_noref(skb, &rth->dst);
2047 err = 0;
2048 goto out;
d2d68ba9
DM
2049 }
2050 do_cache = true;
2051 }
2052 }
2053
f5a0aab8 2054 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2055 flags | RTCF_LOCAL, res->type,
d2d68ba9 2056 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2057 if (!rth)
2058 goto e_nobufs;
2059
d8d1f30b 2060 rth->dst.output= ip_rt_bug;
cf911662
DM
2061#ifdef CONFIG_IP_ROUTE_CLASSID
2062 rth->dst.tclassid = itag;
2063#endif
9917e1e8 2064 rth->rt_is_input = 1;
571e7226 2065
a6254864 2066 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2067 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2068 rth->dst.input= ip_error;
2069 rth->dst.error= -err;
1da177e4
LT
2070 rth->rt_flags &= ~RTCF_LOCAL;
2071 }
efd85700 2072
dcdfdf56 2073 if (do_cache) {
5510cdf7 2074 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2075
2076 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2077 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2078 WARN_ON(rth->dst.input == lwtunnel_input);
2079 rth->dst.lwtstate->orig_input = rth->dst.input;
2080 rth->dst.input = lwtunnel_input;
2081 }
2082
a4c2fd7f 2083 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2084 rt_add_uncached_list(rth);
dcdfdf56 2085 }
89aef892 2086 skb_dst_set(skb, &rth->dst);
b23dd4fe 2087 err = 0;
ebc0ffae 2088 goto out;
1da177e4
LT
2089
2090no_route:
2091 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2092 res->type = RTN_UNREACHABLE;
2093 res->fi = NULL;
2094 res->table = NULL;
1da177e4
LT
2095 goto local_input;
2096
2097 /*
2098 * Do not cache martian addresses: they should be logged (RFC1812)
2099 */
2100martian_destination:
2101 RT_CACHE_STAT_INC(in_martian_dst);
2102#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2103 if (IN_DEV_LOG_MARTIANS(in_dev))
2104 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2105 &daddr, &saddr, dev->name);
1da177e4 2106#endif
2c2910a4 2107
1da177e4
LT
2108e_inval:
2109 err = -EINVAL;
ebc0ffae 2110 goto out;
1da177e4
LT
2111
2112e_nobufs:
2113 err = -ENOBUFS;
ebc0ffae 2114 goto out;
1da177e4
LT
2115
2116martian_source:
2117 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2118 goto out;
1da177e4
LT
2119}
2120
c6cffba4
DM
2121int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2122 u8 tos, struct net_device *dev)
1da177e4 2123{
5510cdf7
DA
2124 struct fib_result res;
2125 int err;
1da177e4 2126
6e28099d 2127 tos &= IPTOS_RT_MASK;
96d36220 2128 rcu_read_lock();
5510cdf7
DA
2129 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2130 rcu_read_unlock();
96d36220 2131
5510cdf7
DA
2132 return err;
2133}
2134EXPORT_SYMBOL(ip_route_input_noref);
2135
2136/* called with rcu_read_lock held */
2137int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2138 u8 tos, struct net_device *dev, struct fib_result *res)
2139{
1da177e4
LT
2140 /* Multicast recognition logic is moved from route cache to here.
2141 The problem was that too many Ethernet cards have broken/missing
2142 hardware multicast filters :-( As result the host on multicasting
2143 network acquires a lot of useless route cache entries, sort of
2144 SDR messages from all the world. Now we try to get rid of them.
2145 Really, provided software IP multicast filter is organized
2146 reasonably (at least, hashed), it does not result in a slowdown
2147 comparing with route cache reject entries.
2148 Note, that multicast routers are not affected, because
2149 route cache entry is created eventually.
2150 */
f97c1e0c 2151 if (ipv4_is_multicast(daddr)) {
96d36220 2152 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2153 int our = 0;
5510cdf7 2154 int err = -EINVAL;
1da177e4 2155
22c74764
PA
2156 if (!in_dev)
2157 return err;
2158 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2159 ip_hdr(skb)->protocol);
e58e4159
DA
2160
2161 /* check l3 master if no match yet */
22c74764 2162 if (!our && netif_is_l3_slave(dev)) {
e58e4159
DA
2163 struct in_device *l3_in_dev;
2164
2165 l3_in_dev = __in_dev_get_rcu(skb->dev);
2166 if (l3_in_dev)
2167 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2168 ip_hdr(skb)->protocol);
2169 }
2170
e58e4159 2171 if (our
1da177e4 2172#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2173 ||
2174 (!ipv4_is_local_multicast(daddr) &&
2175 IN_DEV_MFORWARD(in_dev))
1da177e4 2176#endif
e58e4159 2177 ) {
5510cdf7 2178 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2179 tos, dev, our);
1da177e4 2180 }
5510cdf7 2181 return err;
1da177e4 2182 }
5510cdf7
DA
2183
2184 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2185}
2186
ebc0ffae 2187/* called with rcu_read_lock() */
982721f3 2188static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2189 const struct flowi4 *fl4, int orig_oif,
f61759e6 2190 struct net_device *dev_out,
5ada5527 2191 unsigned int flags)
1da177e4 2192{
982721f3 2193 struct fib_info *fi = res->fi;
f2bb4bed 2194 struct fib_nh_exception *fnhe;
5ada5527 2195 struct in_device *in_dev;
982721f3 2196 u16 type = res->type;
5ada5527 2197 struct rtable *rth;
c92b9655 2198 bool do_cache;
1da177e4 2199
d0daebc3
TG
2200 in_dev = __in_dev_get_rcu(dev_out);
2201 if (!in_dev)
5ada5527 2202 return ERR_PTR(-EINVAL);
1da177e4 2203
d0daebc3 2204 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2205 if (ipv4_is_loopback(fl4->saddr) &&
2206 !(dev_out->flags & IFF_LOOPBACK) &&
2207 !netif_is_l3_master(dev_out))
d0daebc3
TG
2208 return ERR_PTR(-EINVAL);
2209
68a5e3dd 2210 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2211 type = RTN_BROADCAST;
68a5e3dd 2212 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2213 type = RTN_MULTICAST;
68a5e3dd 2214 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2215 return ERR_PTR(-EINVAL);
1da177e4
LT
2216
2217 if (dev_out->flags & IFF_LOOPBACK)
2218 flags |= RTCF_LOCAL;
2219
63617421 2220 do_cache = true;
982721f3 2221 if (type == RTN_BROADCAST) {
1da177e4 2222 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2223 fi = NULL;
2224 } else if (type == RTN_MULTICAST) {
dd28d1a0 2225 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2226 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2227 fl4->flowi4_proto))
1da177e4 2228 flags &= ~RTCF_LOCAL;
63617421
JA
2229 else
2230 do_cache = false;
1da177e4 2231 /* If multicast route do not exist use
dd28d1a0
ED
2232 * default one, but do not gateway in this case.
2233 * Yes, it is hack.
1da177e4 2234 */
982721f3
DM
2235 if (fi && res->prefixlen < 4)
2236 fi = NULL;
d6d5e999
CF
2237 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2238 (orig_oif != dev_out->ifindex)) {
2239 /* For local routes that require a particular output interface
2240 * we do not want to cache the result. Caching the result
2241 * causes incorrect behaviour when there are multiple source
2242 * addresses on the interface, the end result being that if the
2243 * intended recipient is waiting on that interface for the
2244 * packet he won't receive it because it will be delivered on
2245 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2246 * be set to the loopback interface as well.
2247 */
94720e3a 2248 do_cache = false;
1da177e4
LT
2249 }
2250
f2bb4bed 2251 fnhe = NULL;
63617421 2252 do_cache &= fi != NULL;
94720e3a 2253 if (fi) {
c5038a83 2254 struct rtable __rcu **prth;
c92b9655 2255 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 2256
c92b9655 2257 fnhe = find_exception(nh, fl4->daddr);
94720e3a
JA
2258 if (!do_cache)
2259 goto add;
deed49df 2260 if (fnhe) {
2ffae99d 2261 prth = &fnhe->fnhe_rth_output;
94720e3a
JA
2262 } else {
2263 if (unlikely(fl4->flowi4_flags &
2264 FLOWI_FLAG_KNOWN_NH &&
2265 !(nh->nh_gw &&
2266 nh->nh_scope == RT_SCOPE_LINK))) {
2267 do_cache = false;
2268 goto add;
c92b9655 2269 }
94720e3a 2270 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
c92b9655 2271 }
c5038a83 2272 rth = rcu_dereference(*prth);
9df16efa 2273 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2274 return rth;
f2bb4bed 2275 }
c92b9655
JA
2276
2277add:
d08c4f35 2278 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2279 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2280 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2281 do_cache);
8391d07b 2282 if (!rth)
5ada5527 2283 return ERR_PTR(-ENOBUFS);
8391d07b 2284
9438c871 2285 rth->rt_iif = orig_oif;
b7503e0c 2286
1da177e4
LT
2287 RT_CACHE_STAT_INC(out_slow_tot);
2288
1da177e4 2289 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2290 if (flags & RTCF_LOCAL &&
1da177e4 2291 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2292 rth->dst.output = ip_mc_output;
1da177e4
LT
2293 RT_CACHE_STAT_INC(out_slow_mc);
2294 }
2295#ifdef CONFIG_IP_MROUTE
982721f3 2296 if (type == RTN_MULTICAST) {
1da177e4 2297 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2298 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2299 rth->dst.input = ip_mr_input;
2300 rth->dst.output = ip_mc_output;
1da177e4
LT
2301 }
2302 }
2303#endif
2304 }
2305
a4c2fd7f 2306 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
9942895b 2307 lwtunnel_set_redirect(&rth->dst);
1da177e4 2308
5ada5527 2309 return rth;
1da177e4
LT
2310}
2311
1da177e4
LT
2312/*
2313 * Major route resolver routine.
2314 */
2315
3abd1ade
DA
2316struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2317 const struct sk_buff *skb)
1da177e4 2318{
f61759e6 2319 __u8 tos = RT_FL_TOS(fl4);
d0ea2b12
ED
2320 struct fib_result res = {
2321 .type = RTN_UNSPEC,
2322 .fi = NULL,
2323 .table = NULL,
2324 .tclassid = 0,
2325 };
5ada5527 2326 struct rtable *rth;
1da177e4 2327
1fb9489b 2328 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2329 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2330 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2331 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2332
010c2708 2333 rcu_read_lock();
3abd1ade
DA
2334 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2335 rcu_read_unlock();
2336
2337 return rth;
2338}
2339EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2340
2341struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2342 struct fib_result *res,
2343 const struct sk_buff *skb)
2344{
2345 struct net_device *dev_out = NULL;
2346 int orig_oif = fl4->flowi4_oif;
2347 unsigned int flags = 0;
2348 struct rtable *rth;
2349 int err = -ENETUNREACH;
2350
813b3b5d 2351 if (fl4->saddr) {
b23dd4fe 2352 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2353 if (ipv4_is_multicast(fl4->saddr) ||
2354 ipv4_is_lbcast(fl4->saddr) ||
2355 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2356 goto out;
2357
1da177e4
LT
2358 /* I removed check for oif == dev_out->oif here.
2359 It was wrong for two reasons:
1ab35276
DL
2360 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2361 is assigned to multiple interfaces.
1da177e4
LT
2362 2. Moreover, we are allowed to send packets with saddr
2363 of another iface. --ANK
2364 */
2365
813b3b5d
DM
2366 if (fl4->flowi4_oif == 0 &&
2367 (ipv4_is_multicast(fl4->daddr) ||
2368 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2369 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2370 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2371 if (!dev_out)
a210d01a
JA
2372 goto out;
2373
1da177e4
LT
2374 /* Special hack: user can direct multicasts
2375 and limited broadcast via necessary interface
2376 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2377 This hack is not just for fun, it allows
2378 vic,vat and friends to work.
2379 They bind socket to loopback, set ttl to zero
2380 and expect that it will work.
2381 From the viewpoint of routing cache they are broken,
2382 because we are not allowed to build multicast path
2383 with loopback source addr (look, routing cache
2384 cannot know, that ttl is zero, so that packet
2385 will not leave this host and route is valid).
2386 Luckily, this hack is good workaround.
2387 */
2388
813b3b5d 2389 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2390 goto make_route;
2391 }
a210d01a 2392
813b3b5d 2393 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2394 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2395 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2396 goto out;
a210d01a 2397 }
1da177e4
LT
2398 }
2399
2400
813b3b5d
DM
2401 if (fl4->flowi4_oif) {
2402 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2403 rth = ERR_PTR(-ENODEV);
51456b29 2404 if (!dev_out)
1da177e4 2405 goto out;
e5ed6399
HX
2406
2407 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2408 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2409 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2410 goto out;
2411 }
813b3b5d 2412 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2413 ipv4_is_lbcast(fl4->daddr) ||
2414 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2415 if (!fl4->saddr)
2416 fl4->saddr = inet_select_addr(dev_out, 0,
2417 RT_SCOPE_LINK);
1da177e4
LT
2418 goto make_route;
2419 }
0a7e2260 2420 if (!fl4->saddr) {
813b3b5d
DM
2421 if (ipv4_is_multicast(fl4->daddr))
2422 fl4->saddr = inet_select_addr(dev_out, 0,
2423 fl4->flowi4_scope);
2424 else if (!fl4->daddr)
2425 fl4->saddr = inet_select_addr(dev_out, 0,
2426 RT_SCOPE_HOST);
1da177e4
LT
2427 }
2428 }
2429
813b3b5d
DM
2430 if (!fl4->daddr) {
2431 fl4->daddr = fl4->saddr;
2432 if (!fl4->daddr)
2433 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2434 dev_out = net->loopback_dev;
1fb9489b 2435 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2436 res->type = RTN_LOCAL;
1da177e4
LT
2437 flags |= RTCF_LOCAL;
2438 goto make_route;
2439 }
2440
3abd1ade 2441 err = fib_lookup(net, fl4, res, 0);
0315e382 2442 if (err) {
3abd1ade
DA
2443 res->fi = NULL;
2444 res->table = NULL;
6104e112 2445 if (fl4->flowi4_oif &&
e58e4159
DA
2446 (ipv4_is_multicast(fl4->daddr) ||
2447 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2448 /* Apparently, routing tables are wrong. Assume,
2449 that the destination is on link.
2450
2451 WHY? DW.
2452 Because we are allowed to send to iface
2453 even if it has NO routes and NO assigned
2454 addresses. When oif is specified, routing
2455 tables are looked up with only one purpose:
2456 to catch if destination is gatewayed, rather than
2457 direct. Moreover, if MSG_DONTROUTE is set,
2458 we send packet, ignoring both routing tables
2459 and ifaddr state. --ANK
2460
2461
2462 We could make it even if oif is unknown,
2463 likely IPv6, but we do not.
2464 */
2465
813b3b5d
DM
2466 if (fl4->saddr == 0)
2467 fl4->saddr = inet_select_addr(dev_out, 0,
2468 RT_SCOPE_LINK);
3abd1ade 2469 res->type = RTN_UNICAST;
1da177e4
LT
2470 goto make_route;
2471 }
0315e382 2472 rth = ERR_PTR(err);
1da177e4
LT
2473 goto out;
2474 }
1da177e4 2475
3abd1ade 2476 if (res->type == RTN_LOCAL) {
813b3b5d 2477 if (!fl4->saddr) {
3abd1ade
DA
2478 if (res->fi->fib_prefsrc)
2479 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2480 else
813b3b5d 2481 fl4->saddr = fl4->daddr;
9fc3bbb4 2482 }
5f02ce24
DA
2483
2484 /* L3 master device is the loopback for that domain */
3abd1ade 2485 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2486 net->loopback_dev;
839da4d9
DA
2487
2488 /* make sure orig_oif points to fib result device even
2489 * though packet rx/tx happens over loopback or l3mdev
2490 */
2491 orig_oif = FIB_RES_OIF(*res);
2492
813b3b5d 2493 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2494 flags |= RTCF_LOCAL;
2495 goto make_route;
2496 }
2497
3abd1ade 2498 fib_select_path(net, res, fl4, skb);
1da177e4 2499
3abd1ade 2500 dev_out = FIB_RES_DEV(*res);
813b3b5d 2501 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2502
2503
2504make_route:
3abd1ade 2505 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2506
010c2708 2507out:
b23dd4fe 2508 return rth;
1da177e4 2509}
d8c97a94 2510
ae2688d5
JW
2511static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2512{
2513 return NULL;
2514}
2515
ebb762f2 2516static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2517{
618f9bc7
SK
2518 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2519
2520 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2521}
2522
6700c270
DM
2523static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2524 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2525{
2526}
2527
/* Blackhole dst_ops .redirect: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2532
0972ddb2
HB
2533static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2534 unsigned long old)
2535{
2536 return NULL;
2537}
2538
14e50e57
DM
2539static struct dst_ops ipv4_dst_blackhole_ops = {
2540 .family = AF_INET,
ae2688d5 2541 .check = ipv4_blackhole_dst_check,
ebb762f2 2542 .mtu = ipv4_blackhole_mtu,
214f45c9 2543 .default_advmss = ipv4_default_advmss,
14e50e57 2544 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2545 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2546 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2547 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2548};
2549
2774c131 2550struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2551{
2774c131 2552 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2553 struct rtable *rt;
14e50e57 2554
6c0e7284 2555 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
14e50e57 2556 if (rt) {
d8d1f30b 2557 struct dst_entry *new = &rt->dst;
14e50e57 2558
14e50e57 2559 new->__use = 1;
352e512c 2560 new->input = dst_discard;
ede2059d 2561 new->output = dst_discard_out;
14e50e57 2562
1dbe3252 2563 new->dev = net->loopback_dev;
14e50e57
DM
2564 if (new->dev)
2565 dev_hold(new->dev);
2566
9917e1e8 2567 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2568 rt->rt_iif = ort->rt_iif;
5943634f 2569 rt->rt_pmtu = ort->rt_pmtu;
d52e5a7e 2570 rt->rt_mtu_locked = ort->rt_mtu_locked;
14e50e57 2571
ca4c3fc2 2572 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2573 rt->rt_flags = ort->rt_flags;
2574 rt->rt_type = ort->rt_type;
14e50e57 2575 rt->rt_gateway = ort->rt_gateway;
155e8336 2576 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2577
caacf05e 2578 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2579 }
2580
2774c131
DM
2581 dst_release(dst_orig);
2582
2583 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2584}
2585
9d6ec938 2586struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2587 const struct sock *sk)
1da177e4 2588{
9d6ec938 2589 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2590
b23dd4fe
DM
2591 if (IS_ERR(rt))
2592 return rt;
1da177e4 2593
56157872 2594 if (flp4->flowi4_proto)
f92ee619
SK
2595 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2596 flowi4_to_flowi(flp4),
2597 sk, 0);
1da177e4 2598
b23dd4fe 2599 return rt;
1da177e4 2600}
d8c97a94
ACM
2601EXPORT_SYMBOL_GPL(ip_route_output_flow);
2602
3765d35e 2603/* called with rcu_read_lock held */
404eb77e
RP
2604static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2605 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2606 struct sk_buff *skb, u32 portid, u32 seq)
1da177e4 2607{
1da177e4 2608 struct rtmsg *r;
be403ea1 2609 struct nlmsghdr *nlh;
2bc8ca40 2610 unsigned long expires = 0;
f185071d 2611 u32 error;
521f5490 2612 u32 metrics[RTAX_MAX];
be403ea1 2613
d3166e0c 2614 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2615 if (!nlh)
26932566 2616 return -EMSGSIZE;
be403ea1
TG
2617
2618 r = nlmsg_data(nlh);
1da177e4
LT
2619 r->rtm_family = AF_INET;
2620 r->rtm_dst_len = 32;
2621 r->rtm_src_len = 0;
d6c0a4f6 2622 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2623 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2624 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2625 goto nla_put_failure;
1da177e4
LT
2626 r->rtm_type = rt->rt_type;
2627 r->rtm_scope = RT_SCOPE_UNIVERSE;
2628 r->rtm_protocol = RTPROT_UNSPEC;
2629 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2630 if (rt->rt_flags & RTCF_NOTIFY)
2631 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2632 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2633 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2634
930345ea 2635 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2636 goto nla_put_failure;
1a00fee4 2637 if (src) {
1da177e4 2638 r->rtm_src_len = 32;
930345ea 2639 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2640 goto nla_put_failure;
1da177e4 2641 }
f3756b79
DM
2642 if (rt->dst.dev &&
2643 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2644 goto nla_put_failure;
c7066f70 2645#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2646 if (rt->dst.tclassid &&
2647 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2648 goto nla_put_failure;
1da177e4 2649#endif
41347dcd 2650 if (!rt_is_input_route(rt) &&
d6c0a4f6 2651 fl4->saddr != src) {
930345ea 2652 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2653 goto nla_put_failure;
2654 }
155e8336 2655 if (rt->rt_uses_gateway &&
930345ea 2656 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
f3756b79 2657 goto nla_put_failure;
be403ea1 2658
ee9a8f7a
SK
2659 expires = rt->dst.expires;
2660 if (expires) {
2661 unsigned long now = jiffies;
2662
2663 if (time_before(now, expires))
2664 expires -= now;
2665 else
2666 expires = 0;
2667 }
2668
521f5490 2669 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2670 if (rt->rt_pmtu && expires)
521f5490 2671 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
d52e5a7e
SD
2672 if (rt->rt_mtu_locked && expires)
2673 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
521f5490 2674 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2675 goto nla_put_failure;
2676
b4869889 2677 if (fl4->flowi4_mark &&
68aaed54 2678 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2679 goto nla_put_failure;
963bfeee 2680
622ec2c9
LC
2681 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2682 nla_put_u32(skb, RTA_UID,
2683 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2684 goto nla_put_failure;
2685
d8d1f30b 2686 error = rt->dst.error;
be403ea1 2687
c7537967 2688 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2689#ifdef CONFIG_IP_MROUTE
2690 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2691 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2692 int err = ipmr_get_route(net, skb,
2693 fl4->saddr, fl4->daddr,
9f09eaea 2694 r, portid);
2cf75070 2695
8caaf7b6 2696 if (err <= 0) {
0c8d803f
DA
2697 if (err == 0)
2698 return 0;
2699 goto nla_put_failure;
8caaf7b6
ND
2700 }
2701 } else
2702#endif
404eb77e 2703 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
8caaf7b6 2704 goto nla_put_failure;
1da177e4
LT
2705 }
2706
f185071d 2707 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2708 goto nla_put_failure;
be403ea1 2709
053c095a
JB
2710 nlmsg_end(skb, nlh);
2711 return 0;
1da177e4 2712
be403ea1 2713nla_put_failure:
26932566
PM
2714 nlmsg_cancel(skb, nlh);
2715 return -EMSGSIZE;
1da177e4
LT
2716}
2717
404eb77e
RP
2718static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2719 u8 ip_proto, __be16 sport,
2720 __be16 dport)
2721{
2722 struct sk_buff *skb;
2723 struct iphdr *iph;
2724
2725 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2726 if (!skb)
2727 return NULL;
2728
2729 /* Reserve room for dummy headers, this skb can pass
2730 * through good chunk of routing engine.
2731 */
2732 skb_reset_mac_header(skb);
2733 skb_reset_network_header(skb);
2734 skb->protocol = htons(ETH_P_IP);
2735 iph = skb_put(skb, sizeof(struct iphdr));
2736 iph->protocol = ip_proto;
2737 iph->saddr = src;
2738 iph->daddr = dst;
2739 iph->version = 0x4;
2740 iph->frag_off = 0;
2741 iph->ihl = 0x5;
2742 skb_set_transport_header(skb, skb->len);
2743
2744 switch (iph->protocol) {
2745 case IPPROTO_UDP: {
2746 struct udphdr *udph;
2747
2748 udph = skb_put_zero(skb, sizeof(struct udphdr));
2749 udph->source = sport;
2750 udph->dest = dport;
2751 udph->len = sizeof(struct udphdr);
2752 udph->check = 0;
2753 break;
2754 }
2755 case IPPROTO_TCP: {
2756 struct tcphdr *tcph;
2757
2758 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2759 tcph->source = sport;
2760 tcph->dest = dport;
2761 tcph->doff = sizeof(struct tcphdr) / 4;
2762 tcph->rst = 1;
2763 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2764 src, dst, 0);
2765 break;
2766 }
2767 case IPPROTO_ICMP: {
2768 struct icmphdr *icmph;
2769
2770 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2771 icmph->type = ICMP_ECHO;
2772 icmph->code = 0;
2773 }
2774 }
2775
2776 return skb;
2777}
2778
a00302b6
JK
2779static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2780 const struct nlmsghdr *nlh,
2781 struct nlattr **tb,
2782 struct netlink_ext_ack *extack)
2783{
2784 struct rtmsg *rtm;
2785 int i, err;
2786
2787 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2788 NL_SET_ERR_MSG(extack,
2789 "ipv4: Invalid header for route get request");
2790 return -EINVAL;
2791 }
2792
2793 if (!netlink_strict_get_check(skb))
2794 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
2795 rtm_ipv4_policy, extack);
2796
2797 rtm = nlmsg_data(nlh);
2798 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2799 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2800 rtm->rtm_table || rtm->rtm_protocol ||
2801 rtm->rtm_scope || rtm->rtm_type) {
2802 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2803 return -EINVAL;
2804 }
2805
2806 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2807 RTM_F_LOOKUP_TABLE |
2808 RTM_F_FIB_MATCH)) {
2809 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2810 return -EINVAL;
2811 }
2812
2813 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2814 rtm_ipv4_policy, extack);
2815 if (err)
2816 return err;
2817
2818 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2819 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2820 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2821 return -EINVAL;
2822 }
2823
2824 for (i = 0; i <= RTA_MAX; i++) {
2825 if (!tb[i])
2826 continue;
2827
2828 switch (i) {
2829 case RTA_IIF:
2830 case RTA_OIF:
2831 case RTA_SRC:
2832 case RTA_DST:
2833 case RTA_IP_PROTO:
2834 case RTA_SPORT:
2835 case RTA_DPORT:
2836 case RTA_MARK:
2837 case RTA_UID:
2838 break;
2839 default:
2840 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2841 return -EINVAL;
2842 }
2843 }
2844
2845 return 0;
2846}
2847
c21ef3e3
DA
2848static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2849 struct netlink_ext_ack *extack)
1da177e4 2850{
3b1e0a65 2851 struct net *net = sock_net(in_skb->sk);
d889ce3b 2852 struct nlattr *tb[RTA_MAX+1];
404eb77e
RP
2853 u32 table_id = RT_TABLE_MAIN;
2854 __be16 sport = 0, dport = 0;
3765d35e 2855 struct fib_result res = {};
404eb77e 2856 u8 ip_proto = IPPROTO_UDP;
1da177e4 2857 struct rtable *rt = NULL;
404eb77e
RP
2858 struct sk_buff *skb;
2859 struct rtmsg *rtm;
e8e3fbe9 2860 struct flowi4 fl4 = {};
9e12bb22
AV
2861 __be32 dst = 0;
2862 __be32 src = 0;
404eb77e 2863 kuid_t uid;
9e12bb22 2864 u32 iif;
d889ce3b 2865 int err;
963bfeee 2866 int mark;
1da177e4 2867
a00302b6 2868 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
d889ce3b 2869 if (err < 0)
404eb77e 2870 return err;
d889ce3b
TG
2871
2872 rtm = nlmsg_data(nlh);
67b61f6c
JB
2873 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2874 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 2875 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2876 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
622ec2c9
LC
2877 if (tb[RTA_UID])
2878 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2879 else
2880 uid = (iif ? INVALID_UID : current_uid());
1da177e4 2881
404eb77e
RP
2882 if (tb[RTA_IP_PROTO]) {
2883 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5e1a99ea 2884 &ip_proto, AF_INET, extack);
404eb77e
RP
2885 if (err)
2886 return err;
2887 }
bbadb9a2 2888
404eb77e
RP
2889 if (tb[RTA_SPORT])
2890 sport = nla_get_be16(tb[RTA_SPORT]);
bbadb9a2 2891
404eb77e
RP
2892 if (tb[RTA_DPORT])
2893 dport = nla_get_be16(tb[RTA_DPORT]);
2894
2895 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2896 if (!skb)
2897 return -ENOBUFS;
bbadb9a2 2898
d6c0a4f6
DM
2899 fl4.daddr = dst;
2900 fl4.saddr = src;
2901 fl4.flowi4_tos = rtm->rtm_tos;
2902 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2903 fl4.flowi4_mark = mark;
622ec2c9 2904 fl4.flowi4_uid = uid;
404eb77e
RP
2905 if (sport)
2906 fl4.fl4_sport = sport;
2907 if (dport)
2908 fl4.fl4_dport = dport;
2909 fl4.flowi4_proto = ip_proto;
d6c0a4f6 2910
3765d35e
DA
2911 rcu_read_lock();
2912
1da177e4 2913 if (iif) {
d889ce3b
TG
2914 struct net_device *dev;
2915
3765d35e 2916 dev = dev_get_by_index_rcu(net, iif);
51456b29 2917 if (!dev) {
d889ce3b 2918 err = -ENODEV;
404eb77e 2919 goto errout_rcu;
d889ce3b
TG
2920 }
2921
404eb77e 2922 fl4.flowi4_iif = iif; /* for rt_fill_info */
1da177e4 2923 skb->dev = dev;
963bfeee 2924 skb->mark = mark;
3765d35e
DA
2925 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2926 dev, &res);
d889ce3b 2927
511c3f92 2928 rt = skb_rtable(skb);
d8d1f30b
CG
2929 if (err == 0 && rt->dst.error)
2930 err = -rt->dst.error;
1da177e4 2931 } else {
6503a304 2932 fl4.flowi4_iif = LOOPBACK_IFINDEX;
21f94775 2933 skb->dev = net->loopback_dev;
3765d35e 2934 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
b23dd4fe
DM
2935 err = 0;
2936 if (IS_ERR(rt))
2937 err = PTR_ERR(rt);
2c87d63a
FW
2938 else
2939 skb_dst_set(skb, &rt->dst);
1da177e4 2940 }
d889ce3b 2941
1da177e4 2942 if (err)
404eb77e 2943 goto errout_rcu;
1da177e4 2944
1da177e4
LT
2945 if (rtm->rtm_flags & RTM_F_NOTIFY)
2946 rt->rt_flags |= RTCF_NOTIFY;
2947
c36ba660 2948 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
68e813aa 2949 table_id = res.table ? res.table->tb_id : 0;
c36ba660 2950
404eb77e
RP
2951 /* reset skb for netlink reply msg */
2952 skb_trim(skb, 0);
2953 skb_reset_network_header(skb);
2954 skb_reset_transport_header(skb);
2955 skb_reset_mac_header(skb);
2956
bc3aae2b
RP
2957 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2958 if (!res.fi) {
2959 err = fib_props[res.type].error;
2960 if (!err)
2961 err = -EHOSTUNREACH;
404eb77e 2962 goto errout_rcu;
bc3aae2b 2963 }
b6179813
RP
2964 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2965 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2966 rt->rt_type, res.prefix, res.prefixlen,
2967 fl4.flowi4_tos, res.fi, 0);
bc3aae2b 2968 } else {
404eb77e 2969 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
ba52d61e 2970 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
bc3aae2b 2971 }
7b46a644 2972 if (err < 0)
404eb77e 2973 goto errout_rcu;
1da177e4 2974
3765d35e
DA
2975 rcu_read_unlock();
2976
15e47304 2977 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1da177e4 2978
d889ce3b 2979errout_free:
404eb77e
RP
2980 return err;
2981errout_rcu:
3765d35e 2982 rcu_read_unlock();
1da177e4 2983 kfree_skb(skb);
404eb77e 2984 goto errout_free;
1da177e4
LT
2985}
2986
1da177e4
LT
2987void ip_rt_multicast_event(struct in_device *in_dev)
2988{
4ccfe6d4 2989 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2990}
2991
#ifdef CONFIG_SYSCTL
/* Backing storage for the ipv4_route_table sysctl entries below. */
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
/* Lower clamp for the "min_pmtu" sysctl (see .extra1 below). */
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
082c7ca4 2997
fe2c6338 2998static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2999 void __user *buffer,
1da177e4
LT
3000 size_t *lenp, loff_t *ppos)
3001{
5aad1de5
TT
3002 struct net *net = (struct net *)__ctl->extra1;
3003
1da177e4 3004 if (write) {
5aad1de5
TT
3005 rt_cache_flush(net);
3006 fnhe_genid_bump(net);
1da177e4 3007 return 0;
e905a9ed 3008 }
1da177e4
LT
3009
3010 return -EINVAL;
3011}
3012
/* Global tunables under /proc/sys/net/ipv4/route/, registered once at
 * boot by ip_static_sysctl_init() for init_net.
 */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Shares ip_rt_gc_min_interval with the entry above,
		 * exposed in milliseconds instead of seconds.
		 */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Clamped from below by ip_min_valid_pmtu. */
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 3124
/* Per-netns table holding only the write-only "flush" entry; its
 * extra1 is pointed at the owning netns in sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3134
3135static __net_init int sysctl_route_net_init(struct net *net)
3136{
3137 struct ctl_table *tbl;
3138
3139 tbl = ipv4_route_flush_table;
09ad9bc7 3140 if (!net_eq(net, &init_net)) {
39a23e75 3141 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 3142 if (!tbl)
39a23e75 3143 goto err_dup;
464dc801
EB
3144
3145 /* Don't export sysctls to unprivileged users */
3146 if (net->user_ns != &init_user_ns)
3147 tbl[0].procname = NULL;
39a23e75
DL
3148 }
3149 tbl[0].extra1 = net;
3150
ec8f23ce 3151 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 3152 if (!net->ipv4.route_hdr)
39a23e75
DL
3153 goto err_reg;
3154 return 0;
3155
3156err_reg:
3157 if (tbl != ipv4_route_flush_table)
3158 kfree(tbl);
3159err_dup:
3160 return -ENOMEM;
3161}
3162
3163static __net_exit void sysctl_route_net_exit(struct net *net)
3164{
3165 struct ctl_table *tbl;
3166
3167 tbl = net->ipv4.route_hdr->ctl_table_arg;
3168 unregister_net_sysctl_table(net->ipv4.route_hdr);
3169 BUG_ON(tbl == ipv4_route_flush_table);
3170 kfree(tbl);
3171}
3172
/* Pernet hooks for the per-namespace route sysctl table. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
3177#endif
3178
/* Seed the per-netns generation counters; dev_addr_genid starts from a
 * random value, the others from zero.  Never fails.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}
3186
/* Pernet hook seeding the generation counters; no exit needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3190
c3426b47
DM
3191static int __net_init ipv4_inetpeer_init(struct net *net)
3192{
3193 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3194
3195 if (!bp)
3196 return -ENOMEM;
3197 inet_peer_base_init(bp);
3198 net->ipv4.peers = bp;
3199 return 0;
3200}
3201
3202static void __net_exit ipv4_inetpeer_exit(struct net *net)
3203{
3204 struct inet_peer_base *bp = net->ipv4.peers;
3205
3206 net->ipv4.peers = NULL;
56a6b248 3207 inetpeer_invalidate_tree(bp);
c3426b47
DM
3208 kfree(bp);
3209}
3210
/* Pernet hooks for the per-namespace inet_peer base. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 3215
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route accounting array, allocated in ip_rt_init() below. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3219
/*
 * Boot-time initialization of the IPv4 routing subsystem: IP-identity
 * generator state, per-cpu uncached route lists, dst slab cache and
 * entry counters, /proc files, the RTM_GETROUTE netlink handler and
 * the pernet subsystems defined above.  Allocation failures here are
 * fatal (panic).  Always returns 0.
 */
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	/* Start the IP-ID slots from random values. */
	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	/* Empty list + lock for each possible cpu's uncached rtables. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts come from the same slab as regular ones. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Effectively disable gc limits by default. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	/* proc files are best-effort: warn but keep booting. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}
3281
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the global ipv4_route_table for init_net early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif