/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

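/* Defaults for the ip_rt_* tunables exposed under /proc/sys/net/ipv4.
 * Note that ip_rt_redirect_silence is ip_rt_redirect_load shifted by
 * ip_rt_redirect_number + 1, i.e. (HZ / 50) << (9 + 1): the backoff
 * ceiling used by ip_rt_send_redirect() below.
 */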
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

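/* Map the four legacy TOS bits to a packet scheduler priority band.
 * The table is indexed with rt_tos2priority(), i.e. IPTOS_TOS(tos) >> 1,
 * so bit 0 of the index is the old "minimise monetary cost" bit, which
 * overlaps the ECN field in the modern interpretation (hence
 * ECN_OR_COST, which keeps both variants in the same band).  For
 * example, IPTOS_LOWDELAY (0x10) lands on TC_PRIO_INTERACTIVE.
 */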
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
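/* The IPv4 routing cache was removed in Linux 3.6, but /proc/net/rt_cache
 * is kept for ABI compatibility: the seq_file below emits only the header
 * line and never any entries.
 */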
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

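/* /proc/net/stat/rt_cache: one line of per-CPU counters per possible CPU.
 * The iterator encodes "next CPU to visit + 1" in *pos, with *pos == 0
 * reserved for the SEQ_START_TOKEN header line.
 */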
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
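/* /proc/net/rt_acct: binary dump of 256 struct ip_rt_acct records (one
 * per routing realm), summed over all possible CPUs at read time.
 */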
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

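/* Register the per-netns proc entries.  rt_acct uses proc_create_single(),
 * which supplies the single-show seq_file boilerplate that the two
 * rt_cache entries still spell out by hand above.
 */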
static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

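/* Pick an IP ID for a packet (or a train of segs GSO segments): hash
 * (daddr, saddr, protocol) into one of the IP_IDENTS_SZ shared
 * generators and reserve a contiguous block of IDs from it.
 */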
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

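/* Fill a flowi4 key from an IP header.  When a socket is supplied, its
 * bound device, mark, TOS and protocol override the caller's values,
 * so the lookup matches what the socket itself would route with.
 */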
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

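/* Next-hop exceptions (learned PMTU values and ICMP redirects) live in a
 * small per-nexthop hash keyed by destination.  Readers walk the chains
 * under RCU; all updates are serialized by fnhe_lock.
 */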
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

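/* Update the exception entry for daddr, or create one, evicting the
 * stalest entry once a chain exceeds FNHE_RECLAIM_DEPTH.  The callers in
 * this file pass a zero gw for PMTU updates and a zero pmtu for
 * redirects.
 */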
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, bool lock, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nh->nh_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching them rechecks whether this
                 * exception applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

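/* Handle an ICMP redirect: sanity-check the advertised gateway (it must
 * be a usable unicast address on the receiving link), resolve it in the
 * neighbour table, and record it as a next-hop exception so subsequent
 * lookups use the new gateway.
 */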
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them entirely,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

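/* Rate-limit ICMP errors per source using the inet_peer cache as a token
 * bucket: tokens accrue one per jiffy up to ip_rt_error_burst (5 * HZ)
 * and each error sent costs ip_rt_error_cost (HZ), i.e. at most one
 * error per second per peer in steady state, with bursts of up to five.
 */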
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

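/* Record a learned path MTU as a next-hop exception.  Values below
 * ip_rt_min_pmtu (552 by default) are clamped to it and the entry is
 * marked locked, after which further PMTU updates for the route are
 * ignored; the exception expires after ip_rt_mtu_expires (10 minutes
 * by default).
 */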
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = ip_rt_min_pmtu;
        }

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

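/* Socket-aware PMTU update.  If the socket is owned by user context we
 * cannot touch its dst, so fall back to a plain routing lookup;
 * otherwise update the socket's cached route in place, re-resolving it
 * when the old dst fails validation, and install any fresh route back
 * on the socket.
 */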
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which forces all validation calls
         * down into this function.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

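/* Effective MTU of a route: the cached, unexpired PMTU if we have one,
 * then the RTAX_MTU metric, then the device MTU (clamped to 576 for
 * locked routes via a gateway), minus any lwtunnel encapsulation
 * headroom.
 */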
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nh->nh_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nh, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

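/* Cache rt as the nexthop's input route or this CPU's output route.  The
 * cmpxchg() lets concurrent CPUs race without a lock: the loser simply
 * drops its reference and reports failure, so the caller falls back to
 * the uncached list.
 */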
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        dst_dev_put(&orig->dst);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
        struct rtable *rt = (struct rtable *)dst;

        if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
                kfree(p);

        rt_del_uncached_list(rt);
}

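/* On device unregister, re-parent any uncached routes still referencing
 * dev to the netns loopback device so the refcount on dev can drop.
 */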
void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag,
                           const bool do_cache)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
                if (fi->fib_metrics != &dst_default_metrics) {
                        rt->dst._metrics |= DST_METRICS_REFCOUNTED;
                        refcount_inc(&fi->fib_metrics->refcnt);
                }
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
                else if (do_cache)
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in the nexthop exception
                         * or FIB nexthop are not put on the uncached list.
                         * However, if storing the route in the cache fails,
                         * we must track it there so rt_flush_dev() can still
                         * find it when its device goes away.
                         */
1524                         if (!rt->rt_gateway)
1525                                 rt->rt_gateway = daddr;
1526                         rt_add_uncached_list(rt);
1527                 }
1528         } else
1529                 rt_add_uncached_list(rt);
1530
1531 #ifdef CONFIG_IP_ROUTE_CLASSID
1532 #ifdef CONFIG_IP_MULTIPLE_TABLES
1533         set_class_tag(rt, res->tclassid);
1534 #endif
1535         set_class_tag(rt, itag);
1536 #endif
1537 }
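
/*
 * rt_set_nexthop() tries to park the new route in the nexthop's per-cpu
 * cache or, when an exception exists, in the fib_nh_exception; any route
 * that cannot be cached there goes on the per-cpu uncached list instead,
 * so rt_flush_dev() can still find it.
 */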
1538
1539 struct rtable *rt_dst_alloc(struct net_device *dev,
1540                             unsigned int flags, u16 type,
1541                             bool nopolicy, bool noxfrm, bool will_cache)
1542 {
1543         struct rtable *rt;
1544
1545         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1546                        (will_cache ? 0 : DST_HOST) |
1547                        (nopolicy ? DST_NOPOLICY : 0) |
1548                        (noxfrm ? DST_NOXFRM : 0));
1549
1550         if (rt) {
1551                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1552                 rt->rt_flags = flags;
1553                 rt->rt_type = type;
1554                 rt->rt_is_input = 0;
1555                 rt->rt_iif = 0;
1556                 rt->rt_pmtu = 0;
1557                 rt->rt_mtu_locked = 0;
1558                 rt->rt_gateway = 0;
1559                 rt->rt_uses_gateway = 0;
1560                 INIT_LIST_HEAD(&rt->rt_uncached);
1561
1562                 rt->dst.output = ip_output;
1563                 if (flags & RTCF_LOCAL)
1564                         rt->dst.input = ip_local_deliver;
1565         }
1566
1567         return rt;
1568 }
1569 EXPORT_SYMBOL(rt_dst_alloc);
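
/*
 * A minimal usage sketch (illustrative only, hypothetical caller):
 * callers allocate the rtable here and then finish wiring it up
 * themselves, much as ip_route_input_mc() below does:
 *
 *	struct rtable *rt;
 *
 *	rt = rt_dst_alloc(dev, RTCF_MULTICAST, RTN_MULTICAST,
 *			  false, false, false);
 *	if (!rt)
 *		return -ENOBUFS;
 *	rt->dst.output = ip_rt_bug;
 *	rt->rt_is_input = 1;
 *	skb_dst_set(skb, &rt->dst);
 */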
1570
1571 /* called in rcu_read_lock() section */
1572 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1573                           u8 tos, struct net_device *dev,
1574                           struct in_device *in_dev, u32 *itag)
1575 {
1576         int err;
1577
1578         /* Primary sanity checks. */
1579         if (!in_dev)
1580                 return -EINVAL;
1581
1582         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1583             skb->protocol != htons(ETH_P_IP))
1584                 return -EINVAL;
1585
1586         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1587                 return -EINVAL;
1588
1589         if (ipv4_is_zeronet(saddr)) {
1590                 if (!ipv4_is_local_multicast(daddr))
1591                         return -EINVAL;
1592         } else {
1593                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1594                                           in_dev, itag);
1595                 if (err < 0)
1596                         return err;
1597         }
1598         return 0;
1599 }
1600
1601 /* called in rcu_read_lock() section */
1602 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1603                              u8 tos, struct net_device *dev, int our)
1604 {
1605         struct in_device *in_dev = __in_dev_get_rcu(dev);
1606         unsigned int flags = RTCF_MULTICAST;
1607         struct rtable *rth;
1608         u32 itag = 0;
1609         int err;
1610
1611         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1612         if (err)
1613                 return err;
1614
1615         if (our)
1616                 flags |= RTCF_LOCAL;
1617
1618         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1619                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1620         if (!rth)
1621                 return -ENOBUFS;
1622
1623 #ifdef CONFIG_IP_ROUTE_CLASSID
1624         rth->dst.tclassid = itag;
1625 #endif
1626         rth->dst.output = ip_rt_bug;
1627                 rth->rt_is_input = 1;
1628
1629 #ifdef CONFIG_IP_MROUTE
1630         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1631                 rth->dst.input = ip_mr_input;
1632 #endif
1633         RT_CACHE_STAT_INC(in_slow_mc);
1634
1635         skb_dst_set(skb, &rth->dst);
1636         return 0;
1637 }
1638
1639
1640 static void ip_handle_martian_source(struct net_device *dev,
1641                                      struct in_device *in_dev,
1642                                      struct sk_buff *skb,
1643                                      __be32 daddr,
1644                                      __be32 saddr)
1645 {
1646         RT_CACHE_STAT_INC(in_martian_src);
1647 #ifdef CONFIG_IP_ROUTE_VERBOSE
1648         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1649                 /*
1650                  *      RFC1812 recommendation: if the source is martian,
1651                  *      the only hint we can give is the MAC header.
1652                  */
1653                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1654                         &daddr, &saddr, dev->name);
1655                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1656                         print_hex_dump(KERN_WARNING, "ll header: ",
1657                                        DUMP_PREFIX_OFFSET, 16, 1,
1658                                        skb_mac_header(skb),
1659                                        dev->hard_header_len, true);
1660                 }
1661         }
1662 #endif
1663 }
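
/*
 * Illustrative output (addresses made up): with the log_martians sysctl
 * enabled this logs, rate-limited, something like
 *
 *	martian source 203.0.113.9 from 10.0.0.1, on dev eth0
 *	ll header: 00000000: ff ff ff ff ff ff 52 54 00 12 34 56 08 06
 */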
1664
1665 /* called in rcu_read_lock() section */
1666 static int __mkroute_input(struct sk_buff *skb,
1667                            const struct fib_result *res,
1668                            struct in_device *in_dev,
1669                            __be32 daddr, __be32 saddr, u32 tos)
1670 {
1671         struct fib_nh_exception *fnhe;
1672         struct rtable *rth;
1673         int err;
1674         struct in_device *out_dev;
1675         bool do_cache;
1676         u32 itag = 0;
1677
1678         /* get a working reference to the output device */
1679         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1680         if (!out_dev) {
1681                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1682                 return -EINVAL;
1683         }
1684
1685         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1686                                   in_dev->dev, in_dev, &itag);
1687         if (err < 0) {
1688                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1689                                          saddr);
1690
1691                 goto cleanup;
1692         }
1693
1694         do_cache = res->fi && !itag;
1695         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1696             skb->protocol == htons(ETH_P_IP) &&
1697             (IN_DEV_SHARED_MEDIA(out_dev) ||
1698              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1699                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1700
1701         if (skb->protocol != htons(ETH_P_IP)) {
1702                 /* Not IP (i.e. ARP). Do not create a route if it is
1703                  * invalid for proxy arp. DNAT routes are always valid.
1704                  *
1705                  * The proxy arp feature has been extended to allow ARP
1706                  * replies back out the same interface, to support
1707                  * Private VLAN switch technologies. See arp.c.
1708                  */
1709                 if (out_dev == in_dev &&
1710                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1711                         err = -EINVAL;
1712                         goto cleanup;
1713                 }
1714         }
1715
1716         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1717         if (do_cache) {
1718                 if (fnhe)
1719                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1720                 else
1721                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1722                 if (rt_cache_valid(rth)) {
1723                         skb_dst_set_noref(skb, &rth->dst);
1724                         goto out;
1725                 }
1726         }
1727
1728         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1729                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1730                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1731         if (!rth) {
1732                 err = -ENOBUFS;
1733                 goto cleanup;
1734         }
1735
1736         rth->rt_is_input = 1;
1737         RT_CACHE_STAT_INC(in_slow_tot);
1738
1739         rth->dst.input = ip_forward;
1740
1741         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1742                        do_cache);
1743         lwtunnel_set_redirect(&rth->dst);
1744         skb_dst_set(skb, &rth->dst);
1745 out:
1746         err = 0;
1747  cleanup:
1748         return err;
1749 }
1750
1751 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1752 /* To make ICMP packets follow the right flow, the multipath hash is
1753  * calculated from the inner IP addresses.
1754  */
1755 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1756                                  struct flow_keys *hash_keys)
1757 {
1758         const struct iphdr *outer_iph = ip_hdr(skb);
1759         const struct iphdr *key_iph = outer_iph;
1760         const struct iphdr *inner_iph;
1761         const struct icmphdr *icmph;
1762         struct iphdr _inner_iph;
1763         struct icmphdr _icmph;
1764
1765         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1766                 goto out;
1767
1768         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1769                 goto out;
1770
1771         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1772                                    &_icmph);
1773         if (!icmph)
1774                 goto out;
1775
1776         if (icmph->type != ICMP_DEST_UNREACH &&
1777             icmph->type != ICMP_REDIRECT &&
1778             icmph->type != ICMP_TIME_EXCEEDED &&
1779             icmph->type != ICMP_PARAMETERPROB)
1780                 goto out;
1781
1782         inner_iph = skb_header_pointer(skb,
1783                                        outer_iph->ihl * 4 + sizeof(_icmph),
1784                                        sizeof(_inner_iph), &_inner_iph);
1785         if (!inner_iph)
1786                 goto out;
1787
1788         key_iph = inner_iph;
1789 out:
1790         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1791         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1792 }
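
/*
 * Worked example: a router forwards A -> B across two equal-cost paths
 * and a downstream hop replies with ICMP_TIME_EXCEEDED.  The error is
 * addressed hop -> A, but the quoted inner header still reads A -> B,
 * so hashing on the inner addresses steers the ICMP error onto the same
 * path as the flow that triggered it.
 */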
1793
1794 /* If skb is set it will be used; fl4 can then be NULL */
1795 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1796                        const struct sk_buff *skb, struct flow_keys *flkeys)
1797 {
1798         struct flow_keys hash_keys;
1799         u32 mhash;
1800
1801         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1802         case 0:
1803                 memset(&hash_keys, 0, sizeof(hash_keys));
1804                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1805                 if (skb) {
1806                         ip_multipath_l3_keys(skb, &hash_keys);
1807                 } else {
1808                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1809                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1810                 }
1811                 break;
1812         case 1:
1813                 /* skb is currently provided only when forwarding */
1814                 if (skb) {
1815                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1816                         struct flow_keys keys;
1817
1818                         /* short-circuit if we already have L4 hash present */
1819                         if (skb->l4_hash)
1820                                 return skb_get_hash_raw(skb) >> 1;
1821
1822                         memset(&hash_keys, 0, sizeof(hash_keys));
1823
1824                         if (!flkeys) {
1825                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1826                                 flkeys = &keys;
1827                         }
1828
1829                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1830                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1831                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1832                         hash_keys.ports.src = flkeys->ports.src;
1833                         hash_keys.ports.dst = flkeys->ports.dst;
1834                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1835                 } else {
1836                         memset(&hash_keys, 0, sizeof(hash_keys));
1837                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1838                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1839                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1840                         hash_keys.ports.src = fl4->fl4_sport;
1841                         hash_keys.ports.dst = fl4->fl4_dport;
1842                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1843                 }
1844                 break;
1845         }
1846         mhash = flow_hash_from_keys(&hash_keys);
1847
1848         return mhash >> 1;
1849 }
1850 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
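
/*
 * The switch above follows the net.ipv4.fib_multipath_hash_policy
 * sysctl: 0 hashes on L3 (source/destination address) only, 1 on the
 * L4 five-tuple, e.g. (illustrative):
 *
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 *
 * The final ">> 1" keeps the result within 31 bits, so callers can
 * store it in a plain int without it going negative.
 */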
1851
1852 static int ip_mkroute_input(struct sk_buff *skb,
1853                             struct fib_result *res,
1854                             struct in_device *in_dev,
1855                             __be32 daddr, __be32 saddr, u32 tos,
1856                             struct flow_keys *hkeys)
1857 {
1858 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1859         if (res->fi && res->fi->fib_nhs > 1) {
1860                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1861
1862                 fib_select_multipath(res, h);
1863         }
1864 #endif
1865
1866         /* create a routing cache entry */
1867         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1868 }
1869
1870 /*
1871  *      NOTE. We drop all packets that have a local source
1872  *      address, because every properly looped-back packet must
1873  *      already have the correct destination attached by the output routine.
1874  *
1875  *      This approach solves two big problems:
1876  *      1. Non-simplex devices are handled properly.
1877  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1878  *      Called with rcu_read_lock().
1879  */
1880
1881 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1882                                u8 tos, struct net_device *dev,
1883                                struct fib_result *res)
1884 {
1885         struct in_device *in_dev = __in_dev_get_rcu(dev);
1886         struct flow_keys *flkeys = NULL, _flkeys;
1887         struct net    *net = dev_net(dev);
1888         struct ip_tunnel_info *tun_info;
1889         int             err = -EINVAL;
1890         unsigned int    flags = 0;
1891         u32             itag = 0;
1892         struct rtable   *rth;
1893         struct flowi4   fl4;
1894         bool do_cache;
1895
1896         /* IP on this device is disabled. */
1897
1898         if (!in_dev)
1899                 goto out;
1900
1901         /* Check for the weirdest martians, which cannot be detected
1902            by fib_lookup.
1903          */
1904
1905         tun_info = skb_tunnel_info(skb);
1906         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1907                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1908         else
1909                 fl4.flowi4_tun_key.tun_id = 0;
1910         skb_dst_drop(skb);
1911
1912         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1913                 goto martian_source;
1914
1915         res->fi = NULL;
1916         res->table = NULL;
1917         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1918                 goto brd_input;
1919
1920         /* Accept zero addresses only for limited broadcast;
1921          * I do not even know whether to fix this or not. Waiting for complaints :-)
1922          */
1923         if (ipv4_is_zeronet(saddr))
1924                 goto martian_source;
1925
1926         if (ipv4_is_zeronet(daddr))
1927                 goto martian_destination;
1928
1929         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1930          * twice, calling it at most once if daddr and/or saddr is loopback.
1931          */
1932         if (ipv4_is_loopback(daddr)) {
1933                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1934                         goto martian_destination;
1935         } else if (ipv4_is_loopback(saddr)) {
1936                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1937                         goto martian_source;
1938         }
1939
1940         /*
1941          *      Now we are ready to route packet.
1942          */
1943         fl4.flowi4_oif = 0;
1944         fl4.flowi4_iif = dev->ifindex;
1945         fl4.flowi4_mark = skb->mark;
1946         fl4.flowi4_tos = tos;
1947         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1948         fl4.flowi4_flags = 0;
1949         fl4.daddr = daddr;
1950         fl4.saddr = saddr;
1951         fl4.flowi4_uid = sock_net_uid(net, NULL);
1952
1953         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
1954                 flkeys = &_flkeys;
1955
1956         err = fib_lookup(net, &fl4, res, 0);
1957         if (err != 0) {
1958                 if (!IN_DEV_FORWARD(in_dev))
1959                         err = -EHOSTUNREACH;
1960                 goto no_route;
1961         }
1962
1963         if (res->type == RTN_BROADCAST)
1964                 goto brd_input;
1965
1966         if (res->type == RTN_LOCAL) {
1967                 err = fib_validate_source(skb, saddr, daddr, tos,
1968                                           0, dev, in_dev, &itag);
1969                 if (err < 0)
1970                         goto martian_source;
1971                 goto local_input;
1972         }
1973
1974         if (!IN_DEV_FORWARD(in_dev)) {
1975                 err = -EHOSTUNREACH;
1976                 goto no_route;
1977         }
1978         if (res->type != RTN_UNICAST)
1979                 goto martian_destination;
1980
1981         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1982 out:    return err;
1983
1984 brd_input:
1985         if (skb->protocol != htons(ETH_P_IP))
1986                 goto e_inval;
1987
1988         if (!ipv4_is_zeronet(saddr)) {
1989                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1990                                           in_dev, &itag);
1991                 if (err < 0)
1992                         goto martian_source;
1993         }
1994         flags |= RTCF_BROADCAST;
1995         res->type = RTN_BROADCAST;
1996         RT_CACHE_STAT_INC(in_brd);
1997
1998 local_input:
1999         do_cache = false;
2000         if (res->fi) {
2001                 if (!itag) {
2002                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2003                         if (rt_cache_valid(rth)) {
2004                                 skb_dst_set_noref(skb, &rth->dst);
2005                                 err = 0;
2006                                 goto out;
2007                         }
2008                         do_cache = true;
2009                 }
2010         }
2011
2012         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2013                            flags | RTCF_LOCAL, res->type,
2014                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2015         if (!rth)
2016                 goto e_nobufs;
2017
2018         rth->dst.output = ip_rt_bug;
2019 #ifdef CONFIG_IP_ROUTE_CLASSID
2020         rth->dst.tclassid = itag;
2021 #endif
2022         rth->rt_is_input = 1;
2023
2024         RT_CACHE_STAT_INC(in_slow_tot);
2025         if (res->type == RTN_UNREACHABLE) {
2026                 rth->dst.input = ip_error;
2027                 rth->dst.error = -err;
2028                 rth->rt_flags &= ~RTCF_LOCAL;
2029         }
2030
2031         if (do_cache) {
2032                 struct fib_nh *nh = &FIB_RES_NH(*res);
2033
2034                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2035                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2036                         WARN_ON(rth->dst.input == lwtunnel_input);
2037                         rth->dst.lwtstate->orig_input = rth->dst.input;
2038                         rth->dst.input = lwtunnel_input;
2039                 }
2040
2041                 if (unlikely(!rt_cache_route(nh, rth)))
2042                         rt_add_uncached_list(rth);
2043         }
2044         skb_dst_set(skb, &rth->dst);
2045         err = 0;
2046         goto out;
2047
2048 no_route:
2049         RT_CACHE_STAT_INC(in_no_route);
2050         res->type = RTN_UNREACHABLE;
2051         res->fi = NULL;
2052         res->table = NULL;
2053         goto local_input;
2054
2055         /*
2056          *      Do not cache martian addresses: they should be logged (RFC1812)
2057          */
2058 martian_destination:
2059         RT_CACHE_STAT_INC(in_martian_dst);
2060 #ifdef CONFIG_IP_ROUTE_VERBOSE
2061         if (IN_DEV_LOG_MARTIANS(in_dev))
2062                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2063                                      &daddr, &saddr, dev->name);
2064 #endif
2065
2066 e_inval:
2067         err = -EINVAL;
2068         goto out;
2069
2070 e_nobufs:
2071         err = -ENOBUFS;
2072         goto out;
2073
2074 martian_source:
2075         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2076         goto out;
2077 }
2078
2079 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2080                          u8 tos, struct net_device *dev)
2081 {
2082         struct fib_result res;
2083         int err;
2084
2085         tos &= IPTOS_RT_MASK;
2086         rcu_read_lock();
2087         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2088         rcu_read_unlock();
2089
2090         return err;
2091 }
2092 EXPORT_SYMBOL(ip_route_input_noref);
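
/*
 * A minimal usage sketch (illustrative, hypothetical caller): the
 * receive path routes the skb by its IP header and then lets the
 * attached dst drive delivery, roughly as ip_rcv()'s finish step does:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);
 */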
2093
2094 /* called with rcu_read_lock held */
2095 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096                        u8 tos, struct net_device *dev, struct fib_result *res)
2097 {
2098         /* Multicast recognition logic was moved from the route cache to here.
2099            The problem was that too many Ethernet cards have broken/missing
2100            hardware multicast filters :-( As a result, a host on a multicast
2101            network acquires a lot of useless route cache entries, e.g. for
2102            SDR messages from all over the world. Now we try to get rid of them.
2103            Really, provided the software IP multicast filter is organized
2104            reasonably (at least, hashed), it does not result in a slowdown
2105            compared with route cache reject entries.
2106            Note that multicast routers are not affected, because a
2107            route cache entry is created eventually.
2108          */
2109         if (ipv4_is_multicast(daddr)) {
2110                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2111                 int our = 0;
2112                 int err = -EINVAL;
2113
2114                 if (in_dev)
2115                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2116                                               ip_hdr(skb)->protocol);
2117
2118                 /* check l3 master if no match yet */
2119                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2120                         struct in_device *l3_in_dev;
2121
2122                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2123                         if (l3_in_dev)
2124                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2125                                                       ip_hdr(skb)->protocol);
2126                 }
2127
2128                 if (our
2129 #ifdef CONFIG_IP_MROUTE
2130                         ||
2131                     (!ipv4_is_local_multicast(daddr) &&
2132                      IN_DEV_MFORWARD(in_dev))
2133 #endif
2134                    ) {
2135                         err = ip_route_input_mc(skb, daddr, saddr,
2136                                                 tos, dev, our);
2137                 }
2138                 return err;
2139         }
2140
2141         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2142 }
2143
2144 /* called with rcu_read_lock() */
2145 static struct rtable *__mkroute_output(const struct fib_result *res,
2146                                        const struct flowi4 *fl4, int orig_oif,
2147                                        struct net_device *dev_out,
2148                                        unsigned int flags)
2149 {
2150         struct fib_info *fi = res->fi;
2151         struct fib_nh_exception *fnhe;
2152         struct in_device *in_dev;
2153         u16 type = res->type;
2154         struct rtable *rth;
2155         bool do_cache;
2156
2157         in_dev = __in_dev_get_rcu(dev_out);
2158         if (!in_dev)
2159                 return ERR_PTR(-EINVAL);
2160
2161         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2162                 if (ipv4_is_loopback(fl4->saddr) &&
2163                     !(dev_out->flags & IFF_LOOPBACK) &&
2164                     !netif_is_l3_master(dev_out))
2165                         return ERR_PTR(-EINVAL);
2166
2167         if (ipv4_is_lbcast(fl4->daddr))
2168                 type = RTN_BROADCAST;
2169         else if (ipv4_is_multicast(fl4->daddr))
2170                 type = RTN_MULTICAST;
2171         else if (ipv4_is_zeronet(fl4->daddr))
2172                 return ERR_PTR(-EINVAL);
2173
2174         if (dev_out->flags & IFF_LOOPBACK)
2175                 flags |= RTCF_LOCAL;
2176
2177         do_cache = true;
2178         if (type == RTN_BROADCAST) {
2179                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2180                 fi = NULL;
2181         } else if (type == RTN_MULTICAST) {
2182                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2183                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2184                                      fl4->flowi4_proto))
2185                         flags &= ~RTCF_LOCAL;
2186                 else
2187                         do_cache = false;
2188                 /* If a multicast route does not exist, use the
2189                  * default one, but do not use a gateway in this case.
2190                  * Yes, it is a hack.
2191                  */
2192                 if (fi && res->prefixlen < 4)
2193                         fi = NULL;
2194         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2195                    (orig_oif != dev_out->ifindex)) {
2196                 /* For local routes that require a particular output interface
2197                  * we do not want to cache the result.  Caching the result
2198                  * causes incorrect behaviour when there are multiple source
2199                  * addresses on the interface: the intended recipient, waiting
2200                  * on that interface for the packet, would never receive it,
2201                  * because it would be delivered on the loopback interface
2202                  * instead, with the IP_PKTINFO ipi_ifindex set to the
2203                  * loopback interface as well.
2204                  */
2205                 do_cache = false;
2206         }
2207
2208         fnhe = NULL;
2209         do_cache &= fi != NULL;
2210         if (fi) {
2211                 struct rtable __rcu **prth;
2212                 struct fib_nh *nh = &FIB_RES_NH(*res);
2213
2214                 fnhe = find_exception(nh, fl4->daddr);
2215                 if (!do_cache)
2216                         goto add;
2217                 if (fnhe) {
2218                         prth = &fnhe->fnhe_rth_output;
2219                 } else {
2220                         if (unlikely(fl4->flowi4_flags &
2221                                      FLOWI_FLAG_KNOWN_NH &&
2222                                      !(nh->nh_gw &&
2223                                        nh->nh_scope == RT_SCOPE_LINK))) {
2224                                 do_cache = false;
2225                                 goto add;
2226                         }
2227                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2228                 }
2229                 rth = rcu_dereference(*prth);
2230                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2231                         return rth;
2232         }
2233
2234 add:
2235         rth = rt_dst_alloc(dev_out, flags, type,
2236                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2237                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2238                            do_cache);
2239         if (!rth)
2240                 return ERR_PTR(-ENOBUFS);
2241
2242         rth->rt_iif = orig_oif;
2243
2244         RT_CACHE_STAT_INC(out_slow_tot);
2245
2246         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2247                 if (flags & RTCF_LOCAL &&
2248                     !(dev_out->flags & IFF_LOOPBACK)) {
2249                         rth->dst.output = ip_mc_output;
2250                         RT_CACHE_STAT_INC(out_slow_mc);
2251                 }
2252 #ifdef CONFIG_IP_MROUTE
2253                 if (type == RTN_MULTICAST) {
2254                         if (IN_DEV_MFORWARD(in_dev) &&
2255                             !ipv4_is_local_multicast(fl4->daddr)) {
2256                                 rth->dst.input = ip_mr_input;
2257                                 rth->dst.output = ip_mc_output;
2258                         }
2259                 }
2260 #endif
2261         }
2262
2263         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2264         lwtunnel_set_redirect(&rth->dst);
2265
2266         return rth;
2267 }
2268
2269 /*
2270  * Major route resolver routine.
2271  */
2272
2273 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2274                                         const struct sk_buff *skb)
2275 {
2276         __u8 tos = RT_FL_TOS(fl4);
2277         struct fib_result res = {
2278                 .type           = RTN_UNSPEC,
2279                 .fi             = NULL,
2280                 .table          = NULL,
2281                 .tclassid       = 0,
2282         };
2283         struct rtable *rth;
2284
2285         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2286         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2287         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2288                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2289
2290         rcu_read_lock();
2291         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2292         rcu_read_unlock();
2293
2294         return rth;
2295 }
2296 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2297
2298 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2299                                             struct fib_result *res,
2300                                             const struct sk_buff *skb)
2301 {
2302         struct net_device *dev_out = NULL;
2303         int orig_oif = fl4->flowi4_oif;
2304         unsigned int flags = 0;
2305         struct rtable *rth;
2306         int err = -ENETUNREACH;
2307
2308         if (fl4->saddr) {
2309                 rth = ERR_PTR(-EINVAL);
2310                 if (ipv4_is_multicast(fl4->saddr) ||
2311                     ipv4_is_lbcast(fl4->saddr) ||
2312                     ipv4_is_zeronet(fl4->saddr))
2313                         goto out;
2314
2315                 /* I removed the check for oif == dev_out->oif here.
2316                    It was wrong for two reasons:
2317                    1. ip_dev_find(net, saddr) can return the wrong iface,
2318                       if saddr is assigned to multiple interfaces.
2319                    2. Moreover, we are allowed to send packets with the
2320                       saddr of another iface. --ANK
2321                  */
2322
2323                 if (fl4->flowi4_oif == 0 &&
2324                     (ipv4_is_multicast(fl4->daddr) ||
2325                      ipv4_is_lbcast(fl4->daddr))) {
2326                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2327                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2328                         if (!dev_out)
2329                                 goto out;
2330
2331                         /* Special hack: the user can direct multicasts
2332                            and limited broadcast via the necessary interface
2333                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2334                            This hack is not just for fun, it allows
2335                            vic, vat and friends to work.
2336                            They bind a socket to loopback, set the ttl to zero
2337                            and expect that it will work.
2338                            From the viewpoint of the routing cache they are
2339                            broken, because we are not allowed to build a
2340                            multicast path with a loopback source addr (the
2341                            routing cache cannot know that the ttl is zero, so
2342                            the packet will not leave this host and the route is valid).
2343                            Luckily, this hack is a good workaround.
2344                          */
2345
2346                         fl4->flowi4_oif = dev_out->ifindex;
2347                         goto make_route;
2348                 }
2349
2350                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2351                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2352                         if (!__ip_dev_find(net, fl4->saddr, false))
2353                                 goto out;
2354                 }
2355         }
2356
2357
2358         if (fl4->flowi4_oif) {
2359                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2360                 rth = ERR_PTR(-ENODEV);
2361                 if (!dev_out)
2362                         goto out;
2363
2364                 /* RACE: Check return value of inet_select_addr instead. */
2365                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2366                         rth = ERR_PTR(-ENETUNREACH);
2367                         goto out;
2368                 }
2369                 if (ipv4_is_local_multicast(fl4->daddr) ||
2370                     ipv4_is_lbcast(fl4->daddr) ||
2371                     fl4->flowi4_proto == IPPROTO_IGMP) {
2372                         if (!fl4->saddr)
2373                                 fl4->saddr = inet_select_addr(dev_out, 0,
2374                                                               RT_SCOPE_LINK);
2375                         goto make_route;
2376                 }
2377                 if (!fl4->saddr) {
2378                         if (ipv4_is_multicast(fl4->daddr))
2379                                 fl4->saddr = inet_select_addr(dev_out, 0,
2380                                                               fl4->flowi4_scope);
2381                         else if (!fl4->daddr)
2382                                 fl4->saddr = inet_select_addr(dev_out, 0,
2383                                                               RT_SCOPE_HOST);
2384                 }
2385         }
2386
2387         if (!fl4->daddr) {
2388                 fl4->daddr = fl4->saddr;
2389                 if (!fl4->daddr)
2390                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2391                 dev_out = net->loopback_dev;
2392                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2393                 res->type = RTN_LOCAL;
2394                 flags |= RTCF_LOCAL;
2395                 goto make_route;
2396         }
2397
2398         err = fib_lookup(net, fl4, res, 0);
2399         if (err) {
2400                 res->fi = NULL;
2401                 res->table = NULL;
2402                 if (fl4->flowi4_oif &&
2403                     (ipv4_is_multicast(fl4->daddr) ||
2404                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2405                         /* Apparently, the routing tables are wrong. Assume
2406                            that the destination is on link.
2407
2408                            WHY? DW.
2409                            Because we are allowed to send to an iface
2410                            even if it has NO routes and NO assigned
2411                            addresses. When oif is specified, the routing
2412                            tables are looked up with only one purpose:
2413                            to catch whether the destination is gatewayed,
2414                            rather than direct. Moreover, if MSG_DONTROUTE
2415                            is set, we send the packet, ignoring both the
2416                            routing tables and the ifaddr state. --ANK
2417
2418
2419                            We could do the same even if oif is unknown
2420                            (likely IPv6 does), but we do not.
2421                          */
2422
2423                         if (fl4->saddr == 0)
2424                                 fl4->saddr = inet_select_addr(dev_out, 0,
2425                                                               RT_SCOPE_LINK);
2426                         res->type = RTN_UNICAST;
2427                         goto make_route;
2428                 }
2429                 rth = ERR_PTR(err);
2430                 goto out;
2431         }
2432
2433         if (res->type == RTN_LOCAL) {
2434                 if (!fl4->saddr) {
2435                         if (res->fi->fib_prefsrc)
2436                                 fl4->saddr = res->fi->fib_prefsrc;
2437                         else
2438                                 fl4->saddr = fl4->daddr;
2439                 }
2440
2441                 /* L3 master device is the loopback for that domain */
2442                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2443                         net->loopback_dev;
2444
2445                 /* make sure orig_oif points to fib result device even
2446                  * though packet rx/tx happens over loopback or l3mdev
2447                  */
2448                 orig_oif = FIB_RES_OIF(*res);
2449
2450                 fl4->flowi4_oif = dev_out->ifindex;
2451                 flags |= RTCF_LOCAL;
2452                 goto make_route;
2453         }
2454
2455         fib_select_path(net, res, fl4, skb);
2456
2457         dev_out = FIB_RES_DEV(*res);
2458         fl4->flowi4_oif = dev_out->ifindex;
2459
2460
2461 make_route:
2462         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2463
2464 out:
2465         return rth;
2466 }
2467
2468 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2469 {
2470         return NULL;
2471 }
2472
2473 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2474 {
2475         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2476
2477         return mtu ? : dst->dev->mtu;
2478 }
2479
2480 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2481                                           struct sk_buff *skb, u32 mtu)
2482 {
2483 }
2484
2485 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2486                                        struct sk_buff *skb)
2487 {
2488 }
2489
2490 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2491                                           unsigned long old)
2492 {
2493         return NULL;
2494 }
2495
2496 static struct dst_ops ipv4_dst_blackhole_ops = {
2497         .family                 =       AF_INET,
2498         .check                  =       ipv4_blackhole_dst_check,
2499         .mtu                    =       ipv4_blackhole_mtu,
2500         .default_advmss         =       ipv4_default_advmss,
2501         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2502         .redirect               =       ipv4_rt_blackhole_redirect,
2503         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2504         .neigh_lookup           =       ipv4_neigh_lookup,
2505 };
2506
2507 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2508 {
2509         struct rtable *ort = (struct rtable *) dst_orig;
2510         struct rtable *rt;
2511
2512         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2513         if (rt) {
2514                 struct dst_entry *new = &rt->dst;
2515
2516                 new->__use = 1;
2517                 new->input = dst_discard;
2518                 new->output = dst_discard_out;
2519
2520                 new->dev = net->loopback_dev;
2521                 if (new->dev)
2522                         dev_hold(new->dev);
2523
2524                 rt->rt_is_input = ort->rt_is_input;
2525                 rt->rt_iif = ort->rt_iif;
2526                 rt->rt_pmtu = ort->rt_pmtu;
2527                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2528
2529                 rt->rt_genid = rt_genid_ipv4(net);
2530                 rt->rt_flags = ort->rt_flags;
2531                 rt->rt_type = ort->rt_type;
2532                 rt->rt_gateway = ort->rt_gateway;
2533                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2534
2535                 INIT_LIST_HEAD(&rt->rt_uncached);
2536         }
2537
2538         dst_release(dst_orig);
2539
2540         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2541 }
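
/*
 * The main consumer is the xfrm code: while IPsec states are still being
 * negotiated it can swap the real route for a blackhole copy, so packets
 * are silently discarded (dst_discard) rather than sent unprotected,
 * without the caller having to special-case anything.
 */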
2542
2543 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2544                                     const struct sock *sk)
2545 {
2546         struct rtable *rt = __ip_route_output_key(net, flp4);
2547
2548         if (IS_ERR(rt))
2549                 return rt;
2550
2551         if (flp4->flowi4_proto)
2552                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2553                                                         flowi4_to_flowi(flp4),
2554                                                         sk, 0);
2555
2556         return rt;
2557 }
2558 EXPORT_SYMBOL_GPL(ip_route_output_flow);
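
/*
 * A minimal usage sketch (illustrative, hypothetical caller): callers
 * typically fill the flowi4 with flowi4_init_output() and resolve it
 * here, e.g.:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, RT_TOS(tos),
 *			   RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0,
 *			   daddr, saddr, dport, sport, sk->sk_uid);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */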
2559
2560 /* called with rcu_read_lock held */
2561 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2562                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2563                         u32 seq)
2564 {
2565         struct rtable *rt = skb_rtable(skb);
2566         struct rtmsg *r;
2567         struct nlmsghdr *nlh;
2568         unsigned long expires = 0;
2569         u32 error;
2570         u32 metrics[RTAX_MAX];
2571
2572         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2573         if (!nlh)
2574                 return -EMSGSIZE;
2575
2576         r = nlmsg_data(nlh);
2577         r->rtm_family    = AF_INET;
2578         r->rtm_dst_len  = 32;
2579         r->rtm_src_len  = 0;
2580         r->rtm_tos      = fl4->flowi4_tos;
2581         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2582         if (nla_put_u32(skb, RTA_TABLE, table_id))
2583                 goto nla_put_failure;
2584         r->rtm_type     = rt->rt_type;
2585         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2586         r->rtm_protocol = RTPROT_UNSPEC;
2587         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2588         if (rt->rt_flags & RTCF_NOTIFY)
2589                 r->rtm_flags |= RTM_F_NOTIFY;
2590         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2591                 r->rtm_flags |= RTCF_DOREDIRECT;
2592
2593         if (nla_put_in_addr(skb, RTA_DST, dst))
2594                 goto nla_put_failure;
2595         if (src) {
2596                 r->rtm_src_len = 32;
2597                 if (nla_put_in_addr(skb, RTA_SRC, src))
2598                         goto nla_put_failure;
2599         }
2600         if (rt->dst.dev &&
2601             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2602                 goto nla_put_failure;
2603 #ifdef CONFIG_IP_ROUTE_CLASSID
2604         if (rt->dst.tclassid &&
2605             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2606                 goto nla_put_failure;
2607 #endif
2608         if (!rt_is_input_route(rt) &&
2609             fl4->saddr != src) {
2610                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2611                         goto nla_put_failure;
2612         }
2613         if (rt->rt_uses_gateway &&
2614             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2615                 goto nla_put_failure;
2616
2617         expires = rt->dst.expires;
2618         if (expires) {
2619                 unsigned long now = jiffies;
2620
2621                 if (time_before(now, expires))
2622                         expires -= now;
2623                 else
2624                         expires = 0;
2625         }
2626
2627         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2628         if (rt->rt_pmtu && expires)
2629                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2630         if (rt->rt_mtu_locked && expires)
2631                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2632         if (rtnetlink_put_metrics(skb, metrics) < 0)
2633                 goto nla_put_failure;
2634
2635         if (fl4->flowi4_mark &&
2636             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2637                 goto nla_put_failure;
2638
2639         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2640             nla_put_u32(skb, RTA_UID,
2641                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2642                 goto nla_put_failure;
2643
2644         error = rt->dst.error;
2645
2646         if (rt_is_input_route(rt)) {
2647 #ifdef CONFIG_IP_MROUTE
2648                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2649                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2650                         int err = ipmr_get_route(net, skb,
2651                                                  fl4->saddr, fl4->daddr,
2652                                                  r, portid);
2653
2654                         if (err <= 0) {
2655                                 if (err == 0)
2656                                         return 0;
2657                                 goto nla_put_failure;
2658                         }
2659                 } else
2660 #endif
2661                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2662                                 goto nla_put_failure;
2663         }
2664
2665         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2666                 goto nla_put_failure;
2667
2668         nlmsg_end(skb, nlh);
2669         return 0;
2670
2671 nla_put_failure:
2672         nlmsg_cancel(skb, nlh);
2673         return -EMSGSIZE;
2674 }
2675
2676 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2677                              struct netlink_ext_ack *extack)
2678 {
2679         struct net *net = sock_net(in_skb->sk);
2680         struct rtmsg *rtm;
2681         struct nlattr *tb[RTA_MAX+1];
2682         struct fib_result res = {};
2683         struct rtable *rt = NULL;
2684         struct flowi4 fl4;
2685         __be32 dst = 0;
2686         __be32 src = 0;
2687         u32 iif;
2688         int err;
2689         int mark;
2690         struct sk_buff *skb;
2691         u32 table_id = RT_TABLE_MAIN;
2692         kuid_t uid;
2693
2694         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2695                           extack);
2696         if (err < 0)
2697                 goto errout;
2698
2699         rtm = nlmsg_data(nlh);
2700
2701         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2702         if (!skb) {
2703                 err = -ENOBUFS;
2704                 goto errout;
2705         }
2706
2707         /* Reserve room for dummy headers; this skb can pass
2708            through a good chunk of the routing engine.
2709          */
2710         skb_reset_mac_header(skb);
2711         skb_reset_network_header(skb);
2712
2713         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2714         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2715         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2716         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2717         if (tb[RTA_UID])
2718                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2719         else
2720                 uid = (iif ? INVALID_UID : current_uid());
2721
2722         /* Bugfix: need to give ip_route_input enough of an IP header to
2723          * not gag.
2724          */
2725         ip_hdr(skb)->protocol = IPPROTO_UDP;
2726         ip_hdr(skb)->saddr = src;
2727         ip_hdr(skb)->daddr = dst;
2728
2729         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2730
2731         memset(&fl4, 0, sizeof(fl4));
2732         fl4.daddr = dst;
2733         fl4.saddr = src;
2734         fl4.flowi4_tos = rtm->rtm_tos;
2735         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2736         fl4.flowi4_mark = mark;
2737         fl4.flowi4_uid = uid;
2738
2739         rcu_read_lock();
2740
2741         if (iif) {
2742                 struct net_device *dev;
2743
2744                 dev = dev_get_by_index_rcu(net, iif);
2745                 if (!dev) {
2746                         err = -ENODEV;
2747                         goto errout_free;
2748                 }
2749
2750                 skb->protocol   = htons(ETH_P_IP);
2751                 skb->dev        = dev;
2752                 skb->mark       = mark;
2753                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2754                                          dev, &res);
2755
2756                 rt = skb_rtable(skb);
2757                 if (err == 0 && rt->dst.error)
2758                         err = -rt->dst.error;
2759         } else {
2760                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2761                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2762                 err = 0;
2763                 if (IS_ERR(rt))
2764                         err = PTR_ERR(rt);
2765                 else
2766                         skb_dst_set(skb, &rt->dst);
2767         }
2768
2769         if (err)
2770                 goto errout_free;
2771
2772         if (rtm->rtm_flags & RTM_F_NOTIFY)
2773                 rt->rt_flags |= RTCF_NOTIFY;
2774
2775         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2776                 table_id = res.table ? res.table->tb_id : 0;
2777
2778         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2779                 if (!res.fi) {
2780                         err = fib_props[res.type].error;
2781                         if (!err)
2782                                 err = -EHOSTUNREACH;
2783                         goto errout_free;
2784                 }
2785                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2786                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2787                                     rt->rt_type, res.prefix, res.prefixlen,
2788                                     fl4.flowi4_tos, res.fi, 0);
2789         } else {
2790                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2791                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2792         }
2793         if (err < 0)
2794                 goto errout_free;
2795
2796         rcu_read_unlock();
2797
2798         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2799 errout:
2800         return err;
2801
2802 errout_free:
2803         rcu_read_unlock();
2804         kfree_skb(skb);
2805         goto errout;
2806 }
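
/*
 * This handler answers RTM_GETROUTE requests, i.e. what "ip route get
 * 8.8.8.8" sends: the reply is built by rt_fill_info() above, or by
 * fib_dump_info() when RTM_F_FIB_MATCH asks for the matching FIB entry
 * rather than the resulting dst.
 */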
2807
2808 void ip_rt_multicast_event(struct in_device *in_dev)
2809 {
2810         rt_cache_flush(dev_net(in_dev->dev));
2811 }
2812
2813 #ifdef CONFIG_SYSCTL
2814 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2815 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2816 static int ip_rt_gc_elasticity __read_mostly    = 8;
2817 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2818
2819 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2820                                         void __user *buffer,
2821                                         size_t *lenp, loff_t *ppos)
2822 {
2823         struct net *net = (struct net *)__ctl->extra1;
2824
2825         if (write) {
2826                 rt_cache_flush(net);
2827                 fnhe_genid_bump(net);
2828                 return 0;
2829         }
2830
2831         return -EINVAL;
2832 }
2833
2834 static struct ctl_table ipv4_route_table[] = {
2835         {
2836                 .procname       = "gc_thresh",
2837                 .data           = &ipv4_dst_ops.gc_thresh,
2838                 .maxlen         = sizeof(int),
2839                 .mode           = 0644,
2840                 .proc_handler   = proc_dointvec,
2841         },
2842         {
2843                 .procname       = "max_size",
2844                 .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

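/*
 * "flush" is kept in a table of its own because, unlike the knobs above,
 * it is registered per network namespace (see sysctl_route_net_init()
 * below).  It is write-only; writing any value flushes the routing
 * cache, e.g. from userspace:
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 */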
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { }
};

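/*
 * Per-netns sysctl setup.  Namespaces other than init_net get their own
 * copy of the flush table; the owning netns is stashed in tbl[0].extra1
 * for the flush handler to retrieve.  In unprivileged user namespaces
 * the entry is hidden by clearing its procname.
 */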
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

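/*
 * init_net is never torn down, so any table reaching here must be a
 * kmemdup()ed per-netns copy; the BUG_ON() asserts that invariant
 * before the table is freed.
 */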
static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

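/*
 * Per-netns generation counters.  Bumping rt_genid invalidates every
 * cached dst in the namespace; dev_addr_genid starts from a random
 * value, presumably so stale values cannot match across a namespace
 * being destroyed and re-created.
 */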
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

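/*
 * Each namespace owns a private inetpeer tree, which caches long-lived
 * per-remote-host state (used, among other things, for ICMP rate
 * limiting).
 */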
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

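/*
 * Boot-time initialization of the IPv4 routing layer: IP-ID state,
 * per-cpu uncached lists, dst slab caches, proc files, sysctls and the
 * RTM_GETROUTE netlink handler.
 */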
int __init ip_rt_init(void)
{
        int cpu;

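        /*
         * ip_idents[]/ip_tstamps[] back the IPv4 identification-field
         * generator; seed the identifiers with random data.
         */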
        ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
                                  GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

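        /*
         * Per-cpu lists of uncached rtables; kept so they can still be
         * found and invalidated later (e.g. on device unregister).
         */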
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
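        /* one accounting slot per possible 8-bit routing realm (classid) */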
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
                                    __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

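        /*
         * With the old routing cache gone, dst garbage collection is
         * effectively disabled: no gc threshold, no size cap.
         */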
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
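        /* netlink handler behind "ip route get" (RTM_GETROUTE) */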
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the IPv4 init order; once that is done,
 * this static registration can go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif