net: rt_cache_flush() cleanup
[linux-block.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
111
112 #define RT_FL_TOS(oldflp4) \
113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/* .ifdown hook of ipv4_dst_ops: intentionally does nothing. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how)
{
}
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         WARN_ON(1);
156         return NULL;
157 }
158
/* Forward declaration: needed by the ipv4_dst_ops initializer below. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb, const void *daddr);
162
163 static struct dst_ops ipv4_dst_ops = {
164         .family =               AF_INET,
165         .protocol =             cpu_to_be16(ETH_P_IP),
166         .check =                ipv4_dst_check,
167         .default_advmss =       ipv4_default_advmss,
168         .mtu =                  ipv4_mtu,
169         .cow_metrics =          ipv4_cow_metrics,
170         .destroy =              ipv4_dst_destroy,
171         .ifdown =               ipv4_dst_ifdown,
172         .negative_advice =      ipv4_negative_advice,
173         .link_failure =         ipv4_link_failure,
174         .update_pmtu =          ip_rt_update_pmtu,
175         .redirect =             ip_do_redirect,
176         .local_out =            __ip_local_out,
177         .neigh_lookup =         ipv4_neigh_lookup,
178 };
179
180 #define ECN_OR_COST(class)      TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183         TC_PRIO_BESTEFFORT,
184         ECN_OR_COST(BESTEFFORT),
185         TC_PRIO_BESTEFFORT,
186         ECN_OR_COST(BESTEFFORT),
187         TC_PRIO_BULK,
188         ECN_OR_COST(BULK),
189         TC_PRIO_BULK,
190         ECN_OR_COST(BULK),
191         TC_PRIO_INTERACTIVE,
192         ECN_OR_COST(INTERACTIVE),
193         TC_PRIO_INTERACTIVE,
194         ECN_OR_COST(INTERACTIVE),
195         TC_PRIO_INTERACTIVE_BULK,
196         ECN_OR_COST(INTERACTIVE_BULK),
197         TC_PRIO_INTERACTIVE_BULK,
198         ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 static inline int rt_genid(struct net *net)
206 {
207         return atomic_read(&net->ipv4.rt_genid);
208 }
209
210 #ifdef CONFIG_PROC_FS
211 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
212 {
213         if (*pos)
214                 return NULL;
215         return SEQ_START_TOKEN;
216 }
217
218 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
219 {
220         ++*pos;
221         return NULL;
222 }
223
/* seq_file .stop for rt_cache_seq_ops: nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
227
228 static int rt_cache_seq_show(struct seq_file *seq, void *v)
229 {
230         if (v == SEQ_START_TOKEN)
231                 seq_printf(seq, "%-127s\n",
232                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
233                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
234                            "HHUptod\tSpecDst");
235         return 0;
236 }
237
238 static const struct seq_operations rt_cache_seq_ops = {
239         .start  = rt_cache_seq_start,
240         .next   = rt_cache_seq_next,
241         .stop   = rt_cache_seq_stop,
242         .show   = rt_cache_seq_show,
243 };
244
245 static int rt_cache_seq_open(struct inode *inode, struct file *file)
246 {
247         return seq_open(file, &rt_cache_seq_ops);
248 }
249
250 static const struct file_operations rt_cache_seq_fops = {
251         .owner   = THIS_MODULE,
252         .open    = rt_cache_seq_open,
253         .read    = seq_read,
254         .llseek  = seq_lseek,
255         .release = seq_release,
256 };
257
258
259 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
260 {
261         int cpu;
262
263         if (*pos == 0)
264                 return SEQ_START_TOKEN;
265
266         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
267                 if (!cpu_possible(cpu))
268                         continue;
269                 *pos = cpu+1;
270                 return &per_cpu(rt_cache_stat, cpu);
271         }
272         return NULL;
273 }
274
275 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 {
277         int cpu;
278
279         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
280                 if (!cpu_possible(cpu))
281                         continue;
282                 *pos = cpu+1;
283                 return &per_cpu(rt_cache_stat, cpu);
284         }
285         return NULL;
286
287 }
288
/* seq_file .stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
293
294 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
295 {
296         struct rt_cache_stat *st = v;
297
298         if (v == SEQ_START_TOKEN) {
299                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
300                 return 0;
301         }
302
303         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
304                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
305                    dst_entries_get_slow(&ipv4_dst_ops),
306                    st->in_hit,
307                    st->in_slow_tot,
308                    st->in_slow_mc,
309                    st->in_no_route,
310                    st->in_brd,
311                    st->in_martian_dst,
312                    st->in_martian_src,
313
314                    st->out_hit,
315                    st->out_slow_tot,
316                    st->out_slow_mc,
317
318                    st->gc_total,
319                    st->gc_ignored,
320                    st->gc_goal_miss,
321                    st->gc_dst_overflow,
322                    st->in_hlist_search,
323                    st->out_hlist_search
324                 );
325         return 0;
326 }
327
328 static const struct seq_operations rt_cpu_seq_ops = {
329         .start  = rt_cpu_seq_start,
330         .next   = rt_cpu_seq_next,
331         .stop   = rt_cpu_seq_stop,
332         .show   = rt_cpu_seq_show,
333 };
334
335
336 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
337 {
338         return seq_open(file, &rt_cpu_seq_ops);
339 }
340
341 static const struct file_operations rt_cpu_seq_fops = {
342         .owner   = THIS_MODULE,
343         .open    = rt_cpu_seq_open,
344         .read    = seq_read,
345         .llseek  = seq_lseek,
346         .release = seq_release,
347 };
348
349 #ifdef CONFIG_IP_ROUTE_CLASSID
350 static int rt_acct_proc_show(struct seq_file *m, void *v)
351 {
352         struct ip_rt_acct *dst, *src;
353         unsigned int i, j;
354
355         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
356         if (!dst)
357                 return -ENOMEM;
358
359         for_each_possible_cpu(i) {
360                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
361                 for (j = 0; j < 256; j++) {
362                         dst[j].o_bytes   += src[j].o_bytes;
363                         dst[j].o_packets += src[j].o_packets;
364                         dst[j].i_bytes   += src[j].i_bytes;
365                         dst[j].i_packets += src[j].i_packets;
366                 }
367         }
368
369         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
370         kfree(dst);
371         return 0;
372 }
373
374 static int rt_acct_proc_open(struct inode *inode, struct file *file)
375 {
376         return single_open(file, rt_acct_proc_show, NULL);
377 }
378
379 static const struct file_operations rt_acct_proc_fops = {
380         .owner          = THIS_MODULE,
381         .open           = rt_acct_proc_open,
382         .read           = seq_read,
383         .llseek         = seq_lseek,
384         .release        = single_release,
385 };
386 #endif
387
388 static int __net_init ip_rt_do_proc_init(struct net *net)
389 {
390         struct proc_dir_entry *pde;
391
392         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
393                         &rt_cache_seq_fops);
394         if (!pde)
395                 goto err1;
396
397         pde = proc_create("rt_cache", S_IRUGO,
398                           net->proc_net_stat, &rt_cpu_seq_fops);
399         if (!pde)
400                 goto err2;
401
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
404         if (!pde)
405                 goto err3;
406 #endif
407         return 0;
408
409 #ifdef CONFIG_IP_ROUTE_CLASSID
410 err3:
411         remove_proc_entry("rt_cache", net->proc_net_stat);
412 #endif
413 err2:
414         remove_proc_entry("rt_cache", net->proc_net);
415 err1:
416         return -ENOMEM;
417 }
418
419 static void __net_exit ip_rt_do_proc_exit(struct net *net)
420 {
421         remove_proc_entry("rt_cache", net->proc_net_stat);
422         remove_proc_entry("rt_cache", net->proc_net);
423 #ifdef CONFIG_IP_ROUTE_CLASSID
424         remove_proc_entry("rt_acct", net->proc_net);
425 #endif
426 }
427
428 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
429         .init = ip_rt_do_proc_init,
430         .exit = ip_rt_do_proc_exit,
431 };
432
433 static int __init ip_rt_proc_init(void)
434 {
435         return register_pernet_subsys(&ip_rt_proc_ops);
436 }
437
438 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
443 #endif /* CONFIG_PROC_FS */
444
445 static inline bool rt_is_expired(const struct rtable *rth)
446 {
447         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
448 }
449
450 void rt_cache_flush(struct net *net)
451 {
452         atomic_inc(&net->ipv4.rt_genid);
453 }
454
455 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
456                                            struct sk_buff *skb,
457                                            const void *daddr)
458 {
459         struct net_device *dev = dst->dev;
460         const __be32 *pkey = daddr;
461         const struct rtable *rt;
462         struct neighbour *n;
463
464         rt = (const struct rtable *) dst;
465         if (rt->rt_gateway)
466                 pkey = (const __be32 *) &rt->rt_gateway;
467         else if (skb)
468                 pkey = &ip_hdr(skb)->daddr;
469
470         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
471         if (n)
472                 return n;
473         return neigh_create(&arp_tbl, pkey, dev);
474 }
475
476 /*
477  * Peer allocation may fail only in serious out-of-memory conditions.  However
478  * we still can generate some output.
479  * Random ID selection looks a bit dangerous because we have no chances to
480  * select ID being unique in a reasonable period of time.
481  * But broken packet identifier may be better than no packet at all.
482  */
483 static void ip_select_fb_ident(struct iphdr *iph)
484 {
485         static DEFINE_SPINLOCK(ip_fb_id_lock);
486         static u32 ip_fallback_id;
487         u32 salt;
488
489         spin_lock_bh(&ip_fb_id_lock);
490         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
491         iph->id = htons(salt & 0xFFFF);
492         ip_fallback_id = salt;
493         spin_unlock_bh(&ip_fb_id_lock);
494 }
495
496 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
497 {
498         struct net *net = dev_net(dst->dev);
499         struct inet_peer *peer;
500
501         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
502         if (peer) {
503                 iph->id = htons(inet_getid(peer, more));
504                 inet_putpeer(peer);
505                 return;
506         }
507
508         ip_select_fb_ident(iph);
509 }
510 EXPORT_SYMBOL(__ip_select_ident);
511
512 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
513                              const struct iphdr *iph,
514                              int oif, u8 tos,
515                              u8 prot, u32 mark, int flow_flags)
516 {
517         if (sk) {
518                 const struct inet_sock *inet = inet_sk(sk);
519
520                 oif = sk->sk_bound_dev_if;
521                 mark = sk->sk_mark;
522                 tos = RT_CONN_FLAGS(sk);
523                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
524         }
525         flowi4_init_output(fl4, oif, mark, tos,
526                            RT_SCOPE_UNIVERSE, prot,
527                            flow_flags,
528                            iph->daddr, iph->saddr, 0, 0);
529 }
530
531 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
532                                const struct sock *sk)
533 {
534         const struct iphdr *iph = ip_hdr(skb);
535         int oif = skb->dev->ifindex;
536         u8 tos = RT_TOS(iph->tos);
537         u8 prot = iph->protocol;
538         u32 mark = skb->mark;
539
540         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
541 }
542
543 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
544 {
545         const struct inet_sock *inet = inet_sk(sk);
546         const struct ip_options_rcu *inet_opt;
547         __be32 daddr = inet->inet_daddr;
548
549         rcu_read_lock();
550         inet_opt = rcu_dereference(inet->inet_opt);
551         if (inet_opt && inet_opt->opt.srr)
552                 daddr = inet_opt->opt.faddr;
553         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
554                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
555                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
556                            inet_sk_flowi_flags(sk),
557                            daddr, inet->inet_saddr, 0, 0);
558         rcu_read_unlock();
559 }
560
/* Build a flow key from @skb when one is supplied, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
569
570 static inline void rt_free(struct rtable *rt)
571 {
572         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
573 }
574
/* Held by update_or_create_fnhe() while it modifies next-hop exception tables. */
static DEFINE_SPINLOCK(fnhe_lock);
576
577 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
578 {
579         struct fib_nh_exception *fnhe, *oldest;
580         struct rtable *orig;
581
582         oldest = rcu_dereference(hash->chain);
583         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
584              fnhe = rcu_dereference(fnhe->fnhe_next)) {
585                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
586                         oldest = fnhe;
587         }
588         orig = rcu_dereference(oldest->fnhe_rth);
589         if (orig) {
590                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
591                 rt_free(orig);
592         }
593         return oldest;
594 }
595
596 static inline u32 fnhe_hashfun(__be32 daddr)
597 {
598         u32 hval;
599
600         hval = (__force u32) daddr;
601         hval ^= (hval >> 11) ^ (hval >> 22);
602
603         return hval & (FNHE_HASH_SIZE - 1);
604 }
605
606 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
607                                   u32 pmtu, unsigned long expires)
608 {
609         struct fnhe_hash_bucket *hash;
610         struct fib_nh_exception *fnhe;
611         int depth;
612         u32 hval = fnhe_hashfun(daddr);
613
614         spin_lock_bh(&fnhe_lock);
615
616         hash = nh->nh_exceptions;
617         if (!hash) {
618                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
619                 if (!hash)
620                         goto out_unlock;
621                 nh->nh_exceptions = hash;
622         }
623
624         hash += hval;
625
626         depth = 0;
627         for (fnhe = rcu_dereference(hash->chain); fnhe;
628              fnhe = rcu_dereference(fnhe->fnhe_next)) {
629                 if (fnhe->fnhe_daddr == daddr)
630                         break;
631                 depth++;
632         }
633
634         if (fnhe) {
635                 if (gw)
636                         fnhe->fnhe_gw = gw;
637                 if (pmtu) {
638                         fnhe->fnhe_pmtu = pmtu;
639                         fnhe->fnhe_expires = expires;
640                 }
641         } else {
642                 if (depth > FNHE_RECLAIM_DEPTH)
643                         fnhe = fnhe_oldest(hash);
644                 else {
645                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
646                         if (!fnhe)
647                                 goto out_unlock;
648
649                         fnhe->fnhe_next = hash->chain;
650                         rcu_assign_pointer(hash->chain, fnhe);
651                 }
652                 fnhe->fnhe_daddr = daddr;
653                 fnhe->fnhe_gw = gw;
654                 fnhe->fnhe_pmtu = pmtu;
655                 fnhe->fnhe_expires = expires;
656         }
657
658         fnhe->fnhe_stamp = jiffies;
659
660 out_unlock:
661         spin_unlock_bh(&fnhe_lock);
662         return;
663 }
664
665 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
666                              bool kill_route)
667 {
668         __be32 new_gw = icmp_hdr(skb)->un.gateway;
669         __be32 old_gw = ip_hdr(skb)->saddr;
670         struct net_device *dev = skb->dev;
671         struct in_device *in_dev;
672         struct fib_result res;
673         struct neighbour *n;
674         struct net *net;
675
676         switch (icmp_hdr(skb)->code & 7) {
677         case ICMP_REDIR_NET:
678         case ICMP_REDIR_NETTOS:
679         case ICMP_REDIR_HOST:
680         case ICMP_REDIR_HOSTTOS:
681                 break;
682
683         default:
684                 return;
685         }
686
687         if (rt->rt_gateway != old_gw)
688                 return;
689
690         in_dev = __in_dev_get_rcu(dev);
691         if (!in_dev)
692                 return;
693
694         net = dev_net(dev);
695         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
696             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
697             ipv4_is_zeronet(new_gw))
698                 goto reject_redirect;
699
700         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
701                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
702                         goto reject_redirect;
703                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
704                         goto reject_redirect;
705         } else {
706                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
707                         goto reject_redirect;
708         }
709
710         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
711         if (n) {
712                 if (!(n->nud_state & NUD_VALID)) {
713                         neigh_event_send(n, NULL);
714                 } else {
715                         if (fib_lookup(net, fl4, &res) == 0) {
716                                 struct fib_nh *nh = &FIB_RES_NH(res);
717
718                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
719                                                       0, 0);
720                         }
721                         if (kill_route)
722                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
723                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
724                 }
725                 neigh_release(n);
726         }
727         return;
728
729 reject_redirect:
730 #ifdef CONFIG_IP_ROUTE_VERBOSE
731         if (IN_DEV_LOG_MARTIANS(in_dev)) {
732                 const struct iphdr *iph = (const struct iphdr *) skb->data;
733                 __be32 daddr = iph->daddr;
734                 __be32 saddr = iph->saddr;
735
736                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
737                                      "  Advised path = %pI4 -> %pI4\n",
738                                      &old_gw, dev->name, &new_gw,
739                                      &saddr, &daddr);
740         }
741 #endif
742         ;
743 }
744
745 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
746 {
747         struct rtable *rt;
748         struct flowi4 fl4;
749
750         rt = (struct rtable *) dst;
751
752         ip_rt_build_flow_key(&fl4, sk, skb);
753         __ip_do_redirect(rt, skb, &fl4, true);
754 }
755
756 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
757 {
758         struct rtable *rt = (struct rtable *)dst;
759         struct dst_entry *ret = dst;
760
761         if (rt) {
762                 if (dst->obsolete > 0) {
763                         ip_rt_put(rt);
764                         ret = NULL;
765                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
766                            rt->dst.expires) {
767                         ip_rt_put(rt);
768                         ret = NULL;
769                 }
770         }
771         return ret;
772 }
773
774 /*
775  * Algorithm:
776  *      1. The first ip_rt_redirect_number redirects are sent
777  *         with exponential backoff, then we stop sending them at all,
778  *         assuming that the host ignores our redirects.
779  *      2. If we did not see packets requiring redirects
780  *         during ip_rt_redirect_silence, we assume that the host
781  *         forgot redirected route and start to send redirects again.
782  *
783  * This algorithm is much cheaper and more intelligent than dumb load limiting
784  * in icmp.c.
785  *
786  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
787  * and "frag. need" (breaks PMTU discovery) in icmp.c.
788  */
789
790 void ip_rt_send_redirect(struct sk_buff *skb)
791 {
792         struct rtable *rt = skb_rtable(skb);
793         struct in_device *in_dev;
794         struct inet_peer *peer;
795         struct net *net;
796         int log_martians;
797
798         rcu_read_lock();
799         in_dev = __in_dev_get_rcu(rt->dst.dev);
800         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
801                 rcu_read_unlock();
802                 return;
803         }
804         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
805         rcu_read_unlock();
806
807         net = dev_net(rt->dst.dev);
808         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
809         if (!peer) {
810                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
811                 return;
812         }
813
814         /* No redirected packets during ip_rt_redirect_silence;
815          * reset the algorithm.
816          */
817         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
818                 peer->rate_tokens = 0;
819
820         /* Too many ignored redirects; do not send anything
821          * set dst.rate_last to the last seen redirected packet.
822          */
823         if (peer->rate_tokens >= ip_rt_redirect_number) {
824                 peer->rate_last = jiffies;
825                 goto out_put_peer;
826         }
827
828         /* Check for load limit; set rate_last to the latest sent
829          * redirect.
830          */
831         if (peer->rate_tokens == 0 ||
832             time_after(jiffies,
833                        (peer->rate_last +
834                         (ip_rt_redirect_load << peer->rate_tokens)))) {
835                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
836                 peer->rate_last = jiffies;
837                 ++peer->rate_tokens;
838 #ifdef CONFIG_IP_ROUTE_VERBOSE
839                 if (log_martians &&
840                     peer->rate_tokens == ip_rt_redirect_number)
841                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
842                                              &ip_hdr(skb)->saddr, inet_iif(skb),
843                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
844 #endif
845         }
846 out_put_peer:
847         inet_putpeer(peer);
848 }
849
850 static int ip_error(struct sk_buff *skb)
851 {
852         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
853         struct rtable *rt = skb_rtable(skb);
854         struct inet_peer *peer;
855         unsigned long now;
856         struct net *net;
857         bool send;
858         int code;
859
860         net = dev_net(rt->dst.dev);
861         if (!IN_DEV_FORWARD(in_dev)) {
862                 switch (rt->dst.error) {
863                 case EHOSTUNREACH:
864                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
865                         break;
866
867                 case ENETUNREACH:
868                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
869                         break;
870                 }
871                 goto out;
872         }
873
874         switch (rt->dst.error) {
875         case EINVAL:
876         default:
877                 goto out;
878         case EHOSTUNREACH:
879                 code = ICMP_HOST_UNREACH;
880                 break;
881         case ENETUNREACH:
882                 code = ICMP_NET_UNREACH;
883                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
884                 break;
885         case EACCES:
886                 code = ICMP_PKT_FILTERED;
887                 break;
888         }
889
890         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
891
892         send = true;
893         if (peer) {
894                 now = jiffies;
895                 peer->rate_tokens += now - peer->rate_last;
896                 if (peer->rate_tokens > ip_rt_error_burst)
897                         peer->rate_tokens = ip_rt_error_burst;
898                 peer->rate_last = now;
899                 if (peer->rate_tokens >= ip_rt_error_cost)
900                         peer->rate_tokens -= ip_rt_error_cost;
901                 else
902                         send = false;
903                 inet_putpeer(peer);
904         }
905         if (send)
906                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
907
908 out:    kfree_skb(skb);
909         return 0;
910 }
911
912 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
913 {
914         struct fib_result res;
915
916         if (mtu < ip_rt_min_pmtu)
917                 mtu = ip_rt_min_pmtu;
918
919         rcu_read_lock();
920         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
921                 struct fib_nh *nh = &FIB_RES_NH(res);
922
923                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
924                                       jiffies + ip_rt_mtu_expires);
925         }
926         rcu_read_unlock();
927         return mtu;
928 }
929
930 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
931                               struct sk_buff *skb, u32 mtu)
932 {
933         struct rtable *rt = (struct rtable *) dst;
934         struct flowi4 fl4;
935
936         ip_rt_build_flow_key(&fl4, sk, skb);
937         mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
938
939         if (!rt->rt_pmtu) {
940                 dst->obsolete = DST_OBSOLETE_KILL;
941         } else {
942                 rt->rt_pmtu = mtu;
943                 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
944         }
945 }
946
947 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
948                       int oif, u32 mark, u8 protocol, int flow_flags)
949 {
950         const struct iphdr *iph = (const struct iphdr *) skb->data;
951         struct flowi4 fl4;
952         struct rtable *rt;
953
954         __build_flow_key(&fl4, NULL, iph, oif,
955                          RT_TOS(iph->tos), protocol, mark, flow_flags);
956         rt = __ip_route_output_key(net, &fl4);
957         if (!IS_ERR(rt)) {
958                 __ip_rt_update_pmtu(rt, &fl4, mtu);
959                 ip_rt_put(rt);
960         }
961 }
962 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
963
964 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
965 {
966         const struct iphdr *iph = (const struct iphdr *) skb->data;
967         struct flowi4 fl4;
968         struct rtable *rt;
969
970         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
971         rt = __ip_route_output_key(sock_net(sk), &fl4);
972         if (!IS_ERR(rt)) {
973                 __ip_rt_update_pmtu(rt, &fl4, mtu);
974                 ip_rt_put(rt);
975         }
976 }
977 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
978
979 void ipv4_redirect(struct sk_buff *skb, struct net *net,
980                    int oif, u32 mark, u8 protocol, int flow_flags)
981 {
982         const struct iphdr *iph = (const struct iphdr *) skb->data;
983         struct flowi4 fl4;
984         struct rtable *rt;
985
986         __build_flow_key(&fl4, NULL, iph, oif,
987                          RT_TOS(iph->tos), protocol, mark, flow_flags);
988         rt = __ip_route_output_key(net, &fl4);
989         if (!IS_ERR(rt)) {
990                 __ip_do_redirect(rt, skb, &fl4, false);
991                 ip_rt_put(rt);
992         }
993 }
994 EXPORT_SYMBOL_GPL(ipv4_redirect);
995
996 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
997 {
998         const struct iphdr *iph = (const struct iphdr *) skb->data;
999         struct flowi4 fl4;
1000         struct rtable *rt;
1001
1002         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1003         rt = __ip_route_output_key(sock_net(sk), &fl4);
1004         if (!IS_ERR(rt)) {
1005                 __ip_do_redirect(rt, skb, &fl4, false);
1006                 ip_rt_put(rt);
1007         }
1008 }
1009 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1010
1011 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1012 {
1013         struct rtable *rt = (struct rtable *) dst;
1014
1015         /* All IPV4 dsts are created with ->obsolete set to the value
1016          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1017          * into this function always.
1018          *
1019          * When a PMTU/redirect information update invalidates a
1020          * route, this is indicated by setting obsolete to
1021          * DST_OBSOLETE_KILL.
1022          */
1023         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1024                 return NULL;
1025         return dst;
1026 }
1027
1028 static void ipv4_link_failure(struct sk_buff *skb)
1029 {
1030         struct rtable *rt;
1031
1032         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1033
1034         rt = skb_rtable(skb);
1035         if (rt)
1036                 dst_set_expires(&rt->dst, 0);
1037 }
1038
1039 static int ip_rt_bug(struct sk_buff *skb)
1040 {
1041         pr_debug("%s: %pI4 -> %pI4, %s\n",
1042                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1043                  skb->dev ? skb->dev->name : "?");
1044         kfree_skb(skb);
1045         WARN_ON(1);
1046         return 0;
1047 }
1048
1049 /*
1050    We do not cache source address of outgoing interface,
1051    because it is used only by IP RR, TS and SRR options,
1052    so that it out of fast path.
1053
1054    BTW remember: "addr" is allowed to be not aligned
1055    in IP options!
1056  */
1057
1058 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1059 {
1060         __be32 src;
1061
1062         if (rt_is_output_route(rt))
1063                 src = ip_hdr(skb)->saddr;
1064         else {
1065                 struct fib_result res;
1066                 struct flowi4 fl4;
1067                 struct iphdr *iph;
1068
1069                 iph = ip_hdr(skb);
1070
1071                 memset(&fl4, 0, sizeof(fl4));
1072                 fl4.daddr = iph->daddr;
1073                 fl4.saddr = iph->saddr;
1074                 fl4.flowi4_tos = RT_TOS(iph->tos);
1075                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1076                 fl4.flowi4_iif = skb->dev->ifindex;
1077                 fl4.flowi4_mark = skb->mark;
1078
1079                 rcu_read_lock();
1080                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1081                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1082                 else
1083                         src = inet_select_addr(rt->dst.dev,
1084                                                rt_nexthop(rt, iph->daddr),
1085                                                RT_SCOPE_UNIVERSE);
1086                 rcu_read_unlock();
1087         }
1088         memcpy(addr, &src, 4);
1089 }
1090
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently: a half is taken from @tag only while that half
 * of the route's tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
        unsigned int i;

        for (i = 0; i < 2; i++) {
                if (rt->dst.tclassid & halves[i])
                        continue;
                rt->dst.tclassid |= tag & halves[i];
        }
}
#endif
1100
1101 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1102 {
1103         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1104
1105         if (advmss == 0) {
1106                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1107                                ip_rt_min_advmss);
1108                 if (advmss > 65535 - 40)
1109                         advmss = 65535 - 40;
1110         }
1111         return advmss;
1112 }
1113
1114 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1115 {
1116         const struct rtable *rt = (const struct rtable *) dst;
1117         unsigned int mtu = rt->rt_pmtu;
1118
1119         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1120                 mtu = dst_metric_raw(dst, RTAX_MTU);
1121
1122         if (mtu && rt_is_output_route(rt))
1123                 return mtu;
1124
1125         mtu = dst->dev->mtu;
1126
1127         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1128                 if (rt->rt_gateway && mtu > 576)
1129                         mtu = 576;
1130         }
1131
1132         if (mtu > IP_MAX_MTU)
1133                 mtu = IP_MAX_MTU;
1134
1135         return mtu;
1136 }
1137
1138 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1139 {
1140         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1141         struct fib_nh_exception *fnhe;
1142         u32 hval;
1143
1144         if (!hash)
1145                 return NULL;
1146
1147         hval = fnhe_hashfun(daddr);
1148
1149         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1150              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1151                 if (fnhe->fnhe_daddr == daddr)
1152                         return fnhe;
1153         }
1154         return NULL;
1155 }
1156
1157 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1158                               __be32 daddr)
1159 {
1160         bool ret = false;
1161
1162         spin_lock_bh(&fnhe_lock);
1163
1164         if (daddr == fnhe->fnhe_daddr) {
1165                 struct rtable *orig;
1166
1167                 if (fnhe->fnhe_pmtu) {
1168                         unsigned long expires = fnhe->fnhe_expires;
1169                         unsigned long diff = expires - jiffies;
1170
1171                         if (time_before(jiffies, expires)) {
1172                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1173                                 dst_set_expires(&rt->dst, diff);
1174                         }
1175                 }
1176                 if (fnhe->fnhe_gw) {
1177                         rt->rt_flags |= RTCF_REDIRECTED;
1178                         rt->rt_gateway = fnhe->fnhe_gw;
1179                 }
1180
1181                 orig = rcu_dereference(fnhe->fnhe_rth);
1182                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1183                 if (orig)
1184                         rt_free(orig);
1185
1186                 fnhe->fnhe_stamp = jiffies;
1187                 ret = true;
1188         } else {
1189                 /* Routes we intend to cache in nexthop exception have
1190                  * the DST_NOCACHE bit clear.  However, if we are
1191                  * unsuccessful at storing this route into the cache
1192                  * we really need to set it.
1193                  */
1194                 rt->dst.flags |= DST_NOCACHE;
1195         }
1196         spin_unlock_bh(&fnhe_lock);
1197
1198         return ret;
1199 }
1200
1201 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1202 {
1203         struct rtable *orig, *prev, **p;
1204         bool ret = true;
1205
1206         if (rt_is_input_route(rt)) {
1207                 p = (struct rtable **)&nh->nh_rth_input;
1208         } else {
1209                 if (!nh->nh_pcpu_rth_output)
1210                         goto nocache;
1211                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1212         }
1213         orig = *p;
1214
1215         prev = cmpxchg(p, orig, rt);
1216         if (prev == orig) {
1217                 if (orig)
1218                         rt_free(orig);
1219         } else {
1220                 /* Routes we intend to cache in the FIB nexthop have
1221                  * the DST_NOCACHE bit clear.  However, if we are
1222                  * unsuccessful at storing this route into the cache
1223                  * we really need to set it.
1224                  */
1225 nocache:
1226                 rt->dst.flags |= DST_NOCACHE;
1227                 ret = false;
1228         }
1229
1230         return ret;
1231 }
1232
1233 static DEFINE_SPINLOCK(rt_uncached_lock);
1234 static LIST_HEAD(rt_uncached_list);
1235
1236 static void rt_add_uncached_list(struct rtable *rt)
1237 {
1238         spin_lock_bh(&rt_uncached_lock);
1239         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1240         spin_unlock_bh(&rt_uncached_lock);
1241 }
1242
1243 static void ipv4_dst_destroy(struct dst_entry *dst)
1244 {
1245         struct rtable *rt = (struct rtable *) dst;
1246
1247         if (!list_empty(&rt->rt_uncached)) {
1248                 spin_lock_bh(&rt_uncached_lock);
1249                 list_del(&rt->rt_uncached);
1250                 spin_unlock_bh(&rt_uncached_lock);
1251         }
1252 }
1253
1254 void rt_flush_dev(struct net_device *dev)
1255 {
1256         if (!list_empty(&rt_uncached_list)) {
1257                 struct net *net = dev_net(dev);
1258                 struct rtable *rt;
1259
1260                 spin_lock_bh(&rt_uncached_lock);
1261                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1262                         if (rt->dst.dev != dev)
1263                                 continue;
1264                         rt->dst.dev = net->loopback_dev;
1265                         dev_hold(rt->dst.dev);
1266                         dev_put(dev);
1267                 }
1268                 spin_unlock_bh(&rt_uncached_lock);
1269         }
1270 }
1271
1272 static bool rt_cache_valid(const struct rtable *rt)
1273 {
1274         return  rt &&
1275                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1276                 !rt_is_expired(rt);
1277 }
1278
1279 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1280                            const struct fib_result *res,
1281                            struct fib_nh_exception *fnhe,
1282                            struct fib_info *fi, u16 type, u32 itag)
1283 {
1284         bool cached = false;
1285
1286         if (fi) {
1287                 struct fib_nh *nh = &FIB_RES_NH(*res);
1288
1289                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1290                         rt->rt_gateway = nh->nh_gw;
1291                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1292 #ifdef CONFIG_IP_ROUTE_CLASSID
1293                 rt->dst.tclassid = nh->nh_tclassid;
1294 #endif
1295                 if (unlikely(fnhe))
1296                         cached = rt_bind_exception(rt, fnhe, daddr);
1297                 else if (!(rt->dst.flags & DST_NOCACHE))
1298                         cached = rt_cache_route(nh, rt);
1299         }
1300         if (unlikely(!cached))
1301                 rt_add_uncached_list(rt);
1302
1303 #ifdef CONFIG_IP_ROUTE_CLASSID
1304 #ifdef CONFIG_IP_MULTIPLE_TABLES
1305         set_class_tag(rt, res->tclassid);
1306 #endif
1307         set_class_tag(rt, itag);
1308 #endif
1309 }
1310
1311 static struct rtable *rt_dst_alloc(struct net_device *dev,
1312                                    bool nopolicy, bool noxfrm, bool will_cache)
1313 {
1314         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1315                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1316                          (nopolicy ? DST_NOPOLICY : 0) |
1317                          (noxfrm ? DST_NOXFRM : 0));
1318 }
1319
1320 /* called in rcu_read_lock() section */
1321 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1322                                 u8 tos, struct net_device *dev, int our)
1323 {
1324         struct rtable *rth;
1325         struct in_device *in_dev = __in_dev_get_rcu(dev);
1326         u32 itag = 0;
1327         int err;
1328
1329         /* Primary sanity checks. */
1330
1331         if (in_dev == NULL)
1332                 return -EINVAL;
1333
1334         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1335             skb->protocol != htons(ETH_P_IP))
1336                 goto e_inval;
1337
1338         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1339                 if (ipv4_is_loopback(saddr))
1340                         goto e_inval;
1341
1342         if (ipv4_is_zeronet(saddr)) {
1343                 if (!ipv4_is_local_multicast(daddr))
1344                         goto e_inval;
1345         } else {
1346                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1347                                           in_dev, &itag);
1348                 if (err < 0)
1349                         goto e_err;
1350         }
1351         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1352                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1353         if (!rth)
1354                 goto e_nobufs;
1355
1356 #ifdef CONFIG_IP_ROUTE_CLASSID
1357         rth->dst.tclassid = itag;
1358 #endif
1359         rth->dst.output = ip_rt_bug;
1360
1361         rth->rt_genid   = rt_genid(dev_net(dev));
1362         rth->rt_flags   = RTCF_MULTICAST;
1363         rth->rt_type    = RTN_MULTICAST;
1364         rth->rt_is_input= 1;
1365         rth->rt_iif     = 0;
1366         rth->rt_pmtu    = 0;
1367         rth->rt_gateway = 0;
1368         INIT_LIST_HEAD(&rth->rt_uncached);
1369         if (our) {
1370                 rth->dst.input= ip_local_deliver;
1371                 rth->rt_flags |= RTCF_LOCAL;
1372         }
1373
1374 #ifdef CONFIG_IP_MROUTE
1375         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1376                 rth->dst.input = ip_mr_input;
1377 #endif
1378         RT_CACHE_STAT_INC(in_slow_mc);
1379
1380         skb_dst_set(skb, &rth->dst);
1381         return 0;
1382
1383 e_nobufs:
1384         return -ENOBUFS;
1385 e_inval:
1386         return -EINVAL;
1387 e_err:
1388         return err;
1389 }
1390
1391
1392 static void ip_handle_martian_source(struct net_device *dev,
1393                                      struct in_device *in_dev,
1394                                      struct sk_buff *skb,
1395                                      __be32 daddr,
1396                                      __be32 saddr)
1397 {
1398         RT_CACHE_STAT_INC(in_martian_src);
1399 #ifdef CONFIG_IP_ROUTE_VERBOSE
1400         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1401                 /*
1402                  *      RFC1812 recommendation, if source is martian,
1403                  *      the only hint is MAC header.
1404                  */
1405                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1406                         &daddr, &saddr, dev->name);
1407                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1408                         print_hex_dump(KERN_WARNING, "ll header: ",
1409                                        DUMP_PREFIX_OFFSET, 16, 1,
1410                                        skb_mac_header(skb),
1411                                        dev->hard_header_len, true);
1412                 }
1413         }
1414 #endif
1415 }
1416
1417 /* called in rcu_read_lock() section */
1418 static int __mkroute_input(struct sk_buff *skb,
1419                            const struct fib_result *res,
1420                            struct in_device *in_dev,
1421                            __be32 daddr, __be32 saddr, u32 tos)
1422 {
1423         struct rtable *rth;
1424         int err;
1425         struct in_device *out_dev;
1426         unsigned int flags = 0;
1427         bool do_cache;
1428         u32 itag;
1429
1430         /* get a working reference to the output device */
1431         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1432         if (out_dev == NULL) {
1433                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1434                 return -EINVAL;
1435         }
1436
1437
1438         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1439                                   in_dev->dev, in_dev, &itag);
1440         if (err < 0) {
1441                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1442                                          saddr);
1443
1444                 goto cleanup;
1445         }
1446
1447         if (out_dev == in_dev && err &&
1448             (IN_DEV_SHARED_MEDIA(out_dev) ||
1449              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1450                 flags |= RTCF_DOREDIRECT;
1451
1452         if (skb->protocol != htons(ETH_P_IP)) {
1453                 /* Not IP (i.e. ARP). Do not create route, if it is
1454                  * invalid for proxy arp. DNAT routes are always valid.
1455                  *
1456                  * Proxy arp feature have been extended to allow, ARP
1457                  * replies back to the same interface, to support
1458                  * Private VLAN switch technologies. See arp.c.
1459                  */
1460                 if (out_dev == in_dev &&
1461                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1462                         err = -EINVAL;
1463                         goto cleanup;
1464                 }
1465         }
1466
1467         do_cache = false;
1468         if (res->fi) {
1469                 if (!itag) {
1470                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1471                         if (rt_cache_valid(rth)) {
1472                                 skb_dst_set_noref(skb, &rth->dst);
1473                                 goto out;
1474                         }
1475                         do_cache = true;
1476                 }
1477         }
1478
1479         rth = rt_dst_alloc(out_dev->dev,
1480                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1481                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1482         if (!rth) {
1483                 err = -ENOBUFS;
1484                 goto cleanup;
1485         }
1486
1487         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1488         rth->rt_flags = flags;
1489         rth->rt_type = res->type;
1490         rth->rt_is_input = 1;
1491         rth->rt_iif     = 0;
1492         rth->rt_pmtu    = 0;
1493         rth->rt_gateway = 0;
1494         INIT_LIST_HEAD(&rth->rt_uncached);
1495
1496         rth->dst.input = ip_forward;
1497         rth->dst.output = ip_output;
1498
1499         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1500         skb_dst_set(skb, &rth->dst);
1501 out:
1502         err = 0;
1503  cleanup:
1504         return err;
1505 }
1506
1507 static int ip_mkroute_input(struct sk_buff *skb,
1508                             struct fib_result *res,
1509                             const struct flowi4 *fl4,
1510                             struct in_device *in_dev,
1511                             __be32 daddr, __be32 saddr, u32 tos)
1512 {
1513 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1514         if (res->fi && res->fi->fib_nhs > 1)
1515                 fib_select_multipath(res);
1516 #endif
1517
1518         /* create a routing cache entry */
1519         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1520 }
1521
1522 /*
1523  *      NOTE. We drop all the packets that has local source
1524  *      addresses, because every properly looped back packet
1525  *      must have correct destination already attached by output routine.
1526  *
1527  *      Such approach solves two big problems:
1528  *      1. Not simplex devices are handled properly.
1529  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1530  *      called with rcu_read_lock()
1531  */
1532
1533 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1534                                u8 tos, struct net_device *dev)
1535 {
1536         struct fib_result res;
1537         struct in_device *in_dev = __in_dev_get_rcu(dev);
1538         struct flowi4   fl4;
1539         unsigned int    flags = 0;
1540         u32             itag = 0;
1541         struct rtable   *rth;
1542         int             err = -EINVAL;
1543         struct net    *net = dev_net(dev);
1544         bool do_cache;
1545
1546         /* IP on this device is disabled. */
1547
1548         if (!in_dev)
1549                 goto out;
1550
1551         /* Check for the most weird martians, which can be not detected
1552            by fib_lookup.
1553          */
1554
1555         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1556                 goto martian_source;
1557
1558         res.fi = NULL;
1559         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1560                 goto brd_input;
1561
1562         /* Accept zero addresses only to limited broadcast;
1563          * I even do not know to fix it or not. Waiting for complains :-)
1564          */
1565         if (ipv4_is_zeronet(saddr))
1566                 goto martian_source;
1567
1568         if (ipv4_is_zeronet(daddr))
1569                 goto martian_destination;
1570
1571         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1572          * and call it once if daddr or/and saddr are loopback addresses
1573          */
1574         if (ipv4_is_loopback(daddr)) {
1575                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1576                         goto martian_destination;
1577         } else if (ipv4_is_loopback(saddr)) {
1578                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1579                         goto martian_source;
1580         }
1581
1582         /*
1583          *      Now we are ready to route packet.
1584          */
1585         fl4.flowi4_oif = 0;
1586         fl4.flowi4_iif = dev->ifindex;
1587         fl4.flowi4_mark = skb->mark;
1588         fl4.flowi4_tos = tos;
1589         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1590         fl4.daddr = daddr;
1591         fl4.saddr = saddr;
1592         err = fib_lookup(net, &fl4, &res);
1593         if (err != 0)
1594                 goto no_route;
1595
1596         RT_CACHE_STAT_INC(in_slow_tot);
1597
1598         if (res.type == RTN_BROADCAST)
1599                 goto brd_input;
1600
1601         if (res.type == RTN_LOCAL) {
1602                 err = fib_validate_source(skb, saddr, daddr, tos,
1603                                           LOOPBACK_IFINDEX,
1604                                           dev, in_dev, &itag);
1605                 if (err < 0)
1606                         goto martian_source_keep_err;
1607                 goto local_input;
1608         }
1609
1610         if (!IN_DEV_FORWARD(in_dev))
1611                 goto no_route;
1612         if (res.type != RTN_UNICAST)
1613                 goto martian_destination;
1614
1615         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1616 out:    return err;
1617
1618 brd_input:
1619         if (skb->protocol != htons(ETH_P_IP))
1620                 goto e_inval;
1621
1622         if (!ipv4_is_zeronet(saddr)) {
1623                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1624                                           in_dev, &itag);
1625                 if (err < 0)
1626                         goto martian_source_keep_err;
1627         }
1628         flags |= RTCF_BROADCAST;
1629         res.type = RTN_BROADCAST;
1630         RT_CACHE_STAT_INC(in_brd);
1631
1632 local_input:
1633         do_cache = false;
1634         if (res.fi) {
1635                 if (!itag) {
1636                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1637                         if (rt_cache_valid(rth)) {
1638                                 skb_dst_set_noref(skb, &rth->dst);
1639                                 err = 0;
1640                                 goto out;
1641                         }
1642                         do_cache = true;
1643                 }
1644         }
1645
1646         rth = rt_dst_alloc(net->loopback_dev,
1647                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1648         if (!rth)
1649                 goto e_nobufs;
1650
1651         rth->dst.input= ip_local_deliver;
1652         rth->dst.output= ip_rt_bug;
1653 #ifdef CONFIG_IP_ROUTE_CLASSID
1654         rth->dst.tclassid = itag;
1655 #endif
1656
1657         rth->rt_genid = rt_genid(net);
1658         rth->rt_flags   = flags|RTCF_LOCAL;
1659         rth->rt_type    = res.type;
1660         rth->rt_is_input = 1;
1661         rth->rt_iif     = 0;
1662         rth->rt_pmtu    = 0;
1663         rth->rt_gateway = 0;
1664         INIT_LIST_HEAD(&rth->rt_uncached);
1665         if (res.type == RTN_UNREACHABLE) {
1666                 rth->dst.input= ip_error;
1667                 rth->dst.error= -err;
1668                 rth->rt_flags   &= ~RTCF_LOCAL;
1669         }
1670         if (do_cache)
1671                 rt_cache_route(&FIB_RES_NH(res), rth);
1672         skb_dst_set(skb, &rth->dst);
1673         err = 0;
1674         goto out;
1675
1676 no_route:
1677         RT_CACHE_STAT_INC(in_no_route);
1678         res.type = RTN_UNREACHABLE;
1679         if (err == -ESRCH)
1680                 err = -ENETUNREACH;
1681         goto local_input;
1682
1683         /*
1684          *      Do not cache martian addresses: they should be logged (RFC1812)
1685          */
1686 martian_destination:
1687         RT_CACHE_STAT_INC(in_martian_dst);
1688 #ifdef CONFIG_IP_ROUTE_VERBOSE
1689         if (IN_DEV_LOG_MARTIANS(in_dev))
1690                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1691                                      &daddr, &saddr, dev->name);
1692 #endif
1693
1694 e_inval:
1695         err = -EINVAL;
1696         goto out;
1697
1698 e_nobufs:
1699         err = -ENOBUFS;
1700         goto out;
1701
1702 martian_source:
1703         err = -EINVAL;
1704 martian_source_keep_err:
1705         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1706         goto out;
1707 }
1708
1709 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1710                          u8 tos, struct net_device *dev)
1711 {
1712         int res;
1713
1714         rcu_read_lock();
1715
1716         /* Multicast recognition logic is moved from route cache to here.
1717            The problem was that too many Ethernet cards have broken/missing
1718            hardware multicast filters :-( As result the host on multicasting
1719            network acquires a lot of useless route cache entries, sort of
1720            SDR messages from all the world. Now we try to get rid of them.
1721            Really, provided software IP multicast filter is organized
1722            reasonably (at least, hashed), it does not result in a slowdown
1723            comparing with route cache reject entries.
1724            Note, that multicast routers are not affected, because
1725            route cache entry is created eventually.
1726          */
1727         if (ipv4_is_multicast(daddr)) {
1728                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1729
1730                 if (in_dev) {
1731                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1732                                                   ip_hdr(skb)->protocol);
1733                         if (our
1734 #ifdef CONFIG_IP_MROUTE
1735                                 ||
1736                             (!ipv4_is_local_multicast(daddr) &&
1737                              IN_DEV_MFORWARD(in_dev))
1738 #endif
1739                            ) {
1740                                 int res = ip_route_input_mc(skb, daddr, saddr,
1741                                                             tos, dev, our);
1742                                 rcu_read_unlock();
1743                                 return res;
1744                         }
1745                 }
1746                 rcu_read_unlock();
1747                 return -EINVAL;
1748         }
1749         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1750         rcu_read_unlock();
1751         return res;
1752 }
1753 EXPORT_SYMBOL(ip_route_input_noref);
1754
1755 /* called with rcu_read_lock() */
1756 static struct rtable *__mkroute_output(const struct fib_result *res,
1757                                        const struct flowi4 *fl4, int orig_oif,
1758                                        struct net_device *dev_out,
1759                                        unsigned int flags)
1760 {
1761         struct fib_info *fi = res->fi;
1762         struct fib_nh_exception *fnhe;
1763         struct in_device *in_dev;
1764         u16 type = res->type;
1765         struct rtable *rth;
1766
1767         in_dev = __in_dev_get_rcu(dev_out);
1768         if (!in_dev)
1769                 return ERR_PTR(-EINVAL);
1770
1771         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1772                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1773                         return ERR_PTR(-EINVAL);
1774
1775         if (ipv4_is_lbcast(fl4->daddr))
1776                 type = RTN_BROADCAST;
1777         else if (ipv4_is_multicast(fl4->daddr))
1778                 type = RTN_MULTICAST;
1779         else if (ipv4_is_zeronet(fl4->daddr))
1780                 return ERR_PTR(-EINVAL);
1781
1782         if (dev_out->flags & IFF_LOOPBACK)
1783                 flags |= RTCF_LOCAL;
1784
1785         if (type == RTN_BROADCAST) {
1786                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1787                 fi = NULL;
1788         } else if (type == RTN_MULTICAST) {
1789                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1790                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1791                                      fl4->flowi4_proto))
1792                         flags &= ~RTCF_LOCAL;
1793                 /* If multicast route do not exist use
1794                  * default one, but do not gateway in this case.
1795                  * Yes, it is hack.
1796                  */
1797                 if (fi && res->prefixlen < 4)
1798                         fi = NULL;
1799         }
1800
1801         fnhe = NULL;
1802         if (fi) {
1803                 struct rtable __rcu **prth;
1804
1805                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1806                 if (fnhe)
1807                         prth = &fnhe->fnhe_rth;
1808                 else
1809                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1810                 rth = rcu_dereference(*prth);
1811                 if (rt_cache_valid(rth)) {
1812                         dst_hold(&rth->dst);
1813                         return rth;
1814                 }
1815         }
1816         rth = rt_dst_alloc(dev_out,
1817                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1818                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1819                            fi);
1820         if (!rth)
1821                 return ERR_PTR(-ENOBUFS);
1822
1823         rth->dst.output = ip_output;
1824
1825         rth->rt_genid = rt_genid(dev_net(dev_out));
1826         rth->rt_flags   = flags;
1827         rth->rt_type    = type;
1828         rth->rt_is_input = 0;
1829         rth->rt_iif     = orig_oif ? : 0;
1830         rth->rt_pmtu    = 0;
1831         rth->rt_gateway = 0;
1832         INIT_LIST_HEAD(&rth->rt_uncached);
1833
1834         RT_CACHE_STAT_INC(out_slow_tot);
1835
1836         if (flags & RTCF_LOCAL)
1837                 rth->dst.input = ip_local_deliver;
1838         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1839                 if (flags & RTCF_LOCAL &&
1840                     !(dev_out->flags & IFF_LOOPBACK)) {
1841                         rth->dst.output = ip_mc_output;
1842                         RT_CACHE_STAT_INC(out_slow_mc);
1843                 }
1844 #ifdef CONFIG_IP_MROUTE
1845                 if (type == RTN_MULTICAST) {
1846                         if (IN_DEV_MFORWARD(in_dev) &&
1847                             !ipv4_is_local_multicast(fl4->daddr)) {
1848                                 rth->dst.input = ip_mr_input;
1849                                 rth->dst.output = ip_mc_output;
1850                         }
1851                 }
1852 #endif
1853         }
1854
1855         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1856
1857         return rth;
1858 }
1859
1860 /*
1861  * Major route resolver routine.
1862  */
1863
1864 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1865 {
1866         struct net_device *dev_out = NULL;
1867         __u8 tos = RT_FL_TOS(fl4);
1868         unsigned int flags = 0;
1869         struct fib_result res;
1870         struct rtable *rth;
1871         int orig_oif;
1872
1873         res.tclassid    = 0;
1874         res.fi          = NULL;
1875         res.table       = NULL;
1876
1877         orig_oif = fl4->flowi4_oif;
1878
1879         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1880         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1881         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1882                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1883
1884         rcu_read_lock();
1885         if (fl4->saddr) {
1886                 rth = ERR_PTR(-EINVAL);
1887                 if (ipv4_is_multicast(fl4->saddr) ||
1888                     ipv4_is_lbcast(fl4->saddr) ||
1889                     ipv4_is_zeronet(fl4->saddr))
1890                         goto out;
1891
1892                 /* I removed check for oif == dev_out->oif here.
1893                    It was wrong for two reasons:
1894                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1895                       is assigned to multiple interfaces.
1896                    2. Moreover, we are allowed to send packets with saddr
1897                       of another iface. --ANK
1898                  */
1899
1900                 if (fl4->flowi4_oif == 0 &&
1901                     (ipv4_is_multicast(fl4->daddr) ||
1902                      ipv4_is_lbcast(fl4->daddr))) {
1903                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1904                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1905                         if (dev_out == NULL)
1906                                 goto out;
1907
1908                         /* Special hack: user can direct multicasts
1909                            and limited broadcast via necessary interface
1910                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1911                            This hack is not just for fun, it allows
1912                            vic,vat and friends to work.
1913                            They bind socket to loopback, set ttl to zero
1914                            and expect that it will work.
1915                            From the viewpoint of routing cache they are broken,
1916                            because we are not allowed to build multicast path
1917                            with loopback source addr (look, routing cache
1918                            cannot know, that ttl is zero, so that packet
1919                            will not leave this host and route is valid).
1920                            Luckily, this hack is good workaround.
1921                          */
1922
1923                         fl4->flowi4_oif = dev_out->ifindex;
1924                         goto make_route;
1925                 }
1926
1927                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1928                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1929                         if (!__ip_dev_find(net, fl4->saddr, false))
1930                                 goto out;
1931                 }
1932         }
1933
1934
1935         if (fl4->flowi4_oif) {
1936                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1937                 rth = ERR_PTR(-ENODEV);
1938                 if (dev_out == NULL)
1939                         goto out;
1940
1941                 /* RACE: Check return value of inet_select_addr instead. */
1942                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1943                         rth = ERR_PTR(-ENETUNREACH);
1944                         goto out;
1945                 }
1946                 if (ipv4_is_local_multicast(fl4->daddr) ||
1947                     ipv4_is_lbcast(fl4->daddr)) {
1948                         if (!fl4->saddr)
1949                                 fl4->saddr = inet_select_addr(dev_out, 0,
1950                                                               RT_SCOPE_LINK);
1951                         goto make_route;
1952                 }
1953                 if (fl4->saddr) {
1954                         if (ipv4_is_multicast(fl4->daddr))
1955                                 fl4->saddr = inet_select_addr(dev_out, 0,
1956                                                               fl4->flowi4_scope);
1957                         else if (!fl4->daddr)
1958                                 fl4->saddr = inet_select_addr(dev_out, 0,
1959                                                               RT_SCOPE_HOST);
1960                 }
1961         }
1962
1963         if (!fl4->daddr) {
1964                 fl4->daddr = fl4->saddr;
1965                 if (!fl4->daddr)
1966                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1967                 dev_out = net->loopback_dev;
1968                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1969                 res.type = RTN_LOCAL;
1970                 flags |= RTCF_LOCAL;
1971                 goto make_route;
1972         }
1973
1974         if (fib_lookup(net, fl4, &res)) {
1975                 res.fi = NULL;
1976                 res.table = NULL;
1977                 if (fl4->flowi4_oif) {
1978                         /* Apparently, routing tables are wrong. Assume,
1979                            that the destination is on link.
1980
1981                            WHY? DW.
1982                            Because we are allowed to send to iface
1983                            even if it has NO routes and NO assigned
1984                            addresses. When oif is specified, routing
1985                            tables are looked up with only one purpose:
1986                            to catch if destination is gatewayed, rather than
1987                            direct. Moreover, if MSG_DONTROUTE is set,
1988                            we send packet, ignoring both routing tables
1989                            and ifaddr state. --ANK
1990
1991
1992                            We could make it even if oif is unknown,
1993                            likely IPv6, but we do not.
1994                          */
1995
1996                         if (fl4->saddr == 0)
1997                                 fl4->saddr = inet_select_addr(dev_out, 0,
1998                                                               RT_SCOPE_LINK);
1999                         res.type = RTN_UNICAST;
2000                         goto make_route;
2001                 }
2002                 rth = ERR_PTR(-ENETUNREACH);
2003                 goto out;
2004         }
2005
2006         if (res.type == RTN_LOCAL) {
2007                 if (!fl4->saddr) {
2008                         if (res.fi->fib_prefsrc)
2009                                 fl4->saddr = res.fi->fib_prefsrc;
2010                         else
2011                                 fl4->saddr = fl4->daddr;
2012                 }
2013                 dev_out = net->loopback_dev;
2014                 fl4->flowi4_oif = dev_out->ifindex;
2015                 flags |= RTCF_LOCAL;
2016                 goto make_route;
2017         }
2018
2019 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2020         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2021                 fib_select_multipath(&res);
2022         else
2023 #endif
2024         if (!res.prefixlen &&
2025             res.table->tb_num_default > 1 &&
2026             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2027                 fib_select_default(&res);
2028
2029         if (!fl4->saddr)
2030                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2031
2032         dev_out = FIB_RES_DEV(res);
2033         fl4->flowi4_oif = dev_out->ifindex;
2034
2035
2036 make_route:
2037         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2038
2039 out:
2040         rcu_read_unlock();
2041         return rth;
2042 }
2043 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2044
2045 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2046 {
2047         return NULL;
2048 }
2049
2050 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2051 {
2052         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2053
2054         return mtu ? : dst->dev->mtu;
2055 }
2056
2057 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2058                                           struct sk_buff *skb, u32 mtu)
2059 {
2060 }
2061
/* .redirect hook for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2066
2067 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2068                                           unsigned long old)
2069 {
2070         return NULL;
2071 }
2072
2073 static struct dst_ops ipv4_dst_blackhole_ops = {
2074         .family                 =       AF_INET,
2075         .protocol               =       cpu_to_be16(ETH_P_IP),
2076         .check                  =       ipv4_blackhole_dst_check,
2077         .mtu                    =       ipv4_blackhole_mtu,
2078         .default_advmss         =       ipv4_default_advmss,
2079         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2080         .redirect               =       ipv4_rt_blackhole_redirect,
2081         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2082         .neigh_lookup           =       ipv4_neigh_lookup,
2083 };
2084
2085 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2086 {
2087         struct rtable *ort = (struct rtable *) dst_orig;
2088         struct rtable *rt;
2089
2090         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2091         if (rt) {
2092                 struct dst_entry *new = &rt->dst;
2093
2094                 new->__use = 1;
2095                 new->input = dst_discard;
2096                 new->output = dst_discard;
2097
2098                 new->dev = ort->dst.dev;
2099                 if (new->dev)
2100                         dev_hold(new->dev);
2101
2102                 rt->rt_is_input = ort->rt_is_input;
2103                 rt->rt_iif = ort->rt_iif;
2104                 rt->rt_pmtu = ort->rt_pmtu;
2105
2106                 rt->rt_genid = rt_genid(net);
2107                 rt->rt_flags = ort->rt_flags;
2108                 rt->rt_type = ort->rt_type;
2109                 rt->rt_gateway = ort->rt_gateway;
2110
2111                 INIT_LIST_HEAD(&rt->rt_uncached);
2112
2113                 dst_free(new);
2114         }
2115
2116         dst_release(dst_orig);
2117
2118         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2119 }
2120
2121 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2122                                     struct sock *sk)
2123 {
2124         struct rtable *rt = __ip_route_output_key(net, flp4);
2125
2126         if (IS_ERR(rt))
2127                 return rt;
2128
2129         if (flp4->flowi4_proto)
2130                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2131                                                    flowi4_to_flowi(flp4),
2132                                                    sk, 0);
2133
2134         return rt;
2135 }
2136 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2137
2138 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2139                         struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2140                         u32 seq, int event, int nowait, unsigned int flags)
2141 {
2142         struct rtable *rt = skb_rtable(skb);
2143         struct rtmsg *r;
2144         struct nlmsghdr *nlh;
2145         unsigned long expires = 0;
2146         u32 error;
2147         u32 metrics[RTAX_MAX];
2148
2149         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2150         if (nlh == NULL)
2151                 return -EMSGSIZE;
2152
2153         r = nlmsg_data(nlh);
2154         r->rtm_family    = AF_INET;
2155         r->rtm_dst_len  = 32;
2156         r->rtm_src_len  = 0;
2157         r->rtm_tos      = fl4->flowi4_tos;
2158         r->rtm_table    = RT_TABLE_MAIN;
2159         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2160                 goto nla_put_failure;
2161         r->rtm_type     = rt->rt_type;
2162         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2163         r->rtm_protocol = RTPROT_UNSPEC;
2164         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2165         if (rt->rt_flags & RTCF_NOTIFY)
2166                 r->rtm_flags |= RTM_F_NOTIFY;
2167
2168         if (nla_put_be32(skb, RTA_DST, dst))
2169                 goto nla_put_failure;
2170         if (src) {
2171                 r->rtm_src_len = 32;
2172                 if (nla_put_be32(skb, RTA_SRC, src))
2173                         goto nla_put_failure;
2174         }
2175         if (rt->dst.dev &&
2176             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2177                 goto nla_put_failure;
2178 #ifdef CONFIG_IP_ROUTE_CLASSID
2179         if (rt->dst.tclassid &&
2180             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2181                 goto nla_put_failure;
2182 #endif
2183         if (!rt_is_input_route(rt) &&
2184             fl4->saddr != src) {
2185                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2186                         goto nla_put_failure;
2187         }
2188         if (rt->rt_gateway &&
2189             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2190                 goto nla_put_failure;
2191
2192         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2193         if (rt->rt_pmtu)
2194                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2195         if (rtnetlink_put_metrics(skb, metrics) < 0)
2196                 goto nla_put_failure;
2197
2198         if (fl4->flowi4_mark &&
2199             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2200                 goto nla_put_failure;
2201
2202         error = rt->dst.error;
2203         expires = rt->dst.expires;
2204         if (expires) {
2205                 if (time_before(jiffies, expires))
2206                         expires -= jiffies;
2207                 else
2208                         expires = 0;
2209         }
2210
2211         if (rt_is_input_route(rt)) {
2212                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2213                         goto nla_put_failure;
2214         }
2215
2216         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2217                 goto nla_put_failure;
2218
2219         return nlmsg_end(skb, nlh);
2220
2221 nla_put_failure:
2222         nlmsg_cancel(skb, nlh);
2223         return -EMSGSIZE;
2224 }
2225
2226 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2227 {
2228         struct net *net = sock_net(in_skb->sk);
2229         struct rtmsg *rtm;
2230         struct nlattr *tb[RTA_MAX+1];
2231         struct rtable *rt = NULL;
2232         struct flowi4 fl4;
2233         __be32 dst = 0;
2234         __be32 src = 0;
2235         u32 iif;
2236         int err;
2237         int mark;
2238         struct sk_buff *skb;
2239
2240         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2241         if (err < 0)
2242                 goto errout;
2243
2244         rtm = nlmsg_data(nlh);
2245
2246         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2247         if (skb == NULL) {
2248                 err = -ENOBUFS;
2249                 goto errout;
2250         }
2251
2252         /* Reserve room for dummy headers, this skb can pass
2253            through good chunk of routing engine.
2254          */
2255         skb_reset_mac_header(skb);
2256         skb_reset_network_header(skb);
2257
2258         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2259         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2260         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2261
2262         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2263         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2264         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2265         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2266
2267         memset(&fl4, 0, sizeof(fl4));
2268         fl4.daddr = dst;
2269         fl4.saddr = src;
2270         fl4.flowi4_tos = rtm->rtm_tos;
2271         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2272         fl4.flowi4_mark = mark;
2273
2274         if (iif) {
2275                 struct net_device *dev;
2276
2277                 dev = __dev_get_by_index(net, iif);
2278                 if (dev == NULL) {
2279                         err = -ENODEV;
2280                         goto errout_free;
2281                 }
2282
2283                 skb->protocol   = htons(ETH_P_IP);
2284                 skb->dev        = dev;
2285                 skb->mark       = mark;
2286                 local_bh_disable();
2287                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2288                 local_bh_enable();
2289
2290                 rt = skb_rtable(skb);
2291                 if (err == 0 && rt->dst.error)
2292                         err = -rt->dst.error;
2293         } else {
2294                 rt = ip_route_output_key(net, &fl4);
2295
2296                 err = 0;
2297                 if (IS_ERR(rt))
2298                         err = PTR_ERR(rt);
2299         }
2300
2301         if (err)
2302                 goto errout_free;
2303
2304         skb_dst_set(skb, &rt->dst);
2305         if (rtm->rtm_flags & RTM_F_NOTIFY)
2306                 rt->rt_flags |= RTCF_NOTIFY;
2307
2308         err = rt_fill_info(net, dst, src, &fl4, skb,
2309                            NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2310                            RTM_NEWROUTE, 0, 0);
2311         if (err <= 0)
2312                 goto errout_free;
2313
2314         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2315 errout:
2316         return err;
2317
2318 errout_free:
2319         kfree_skb(skb);
2320         goto errout;
2321 }
2322
2323 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2324 {
2325         return skb->len;
2326 }
2327
2328 void ip_rt_multicast_event(struct in_device *in_dev)
2329 {
2330         rt_cache_flush(dev_net(in_dev->dev));
2331 }
2332
2333 #ifdef CONFIG_SYSCTL
2334 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2335                                         void __user *buffer,
2336                                         size_t *lenp, loff_t *ppos)
2337 {
2338         if (write) {
2339                 rt_cache_flush((struct net *)__ctl->extra1);
2340                 return 0;
2341         }
2342
2343         return -EINVAL;
2344 }
2345
2346 static ctl_table ipv4_route_table[] = {
2347         {
2348                 .procname       = "gc_thresh",
2349                 .data           = &ipv4_dst_ops.gc_thresh,
2350                 .maxlen         = sizeof(int),
2351                 .mode           = 0644,
2352                 .proc_handler   = proc_dointvec,
2353         },
2354         {
2355                 .procname       = "max_size",
2356                 .data           = &ip_rt_max_size,
2357                 .maxlen         = sizeof(int),
2358                 .mode           = 0644,
2359                 .proc_handler   = proc_dointvec,
2360         },
2361         {
2362                 /*  Deprecated. Use gc_min_interval_ms */
2363
2364                 .procname       = "gc_min_interval",
2365                 .data           = &ip_rt_gc_min_interval,
2366                 .maxlen         = sizeof(int),
2367                 .mode           = 0644,
2368                 .proc_handler   = proc_dointvec_jiffies,
2369         },
2370         {
2371                 .procname       = "gc_min_interval_ms",
2372                 .data           = &ip_rt_gc_min_interval,
2373                 .maxlen         = sizeof(int),
2374                 .mode           = 0644,
2375                 .proc_handler   = proc_dointvec_ms_jiffies,
2376         },
2377         {
2378                 .procname       = "gc_timeout",
2379                 .data           = &ip_rt_gc_timeout,
2380                 .maxlen         = sizeof(int),
2381                 .mode           = 0644,
2382                 .proc_handler   = proc_dointvec_jiffies,
2383         },
2384         {
2385                 .procname       = "gc_interval",
2386                 .data           = &ip_rt_gc_interval,
2387                 .maxlen         = sizeof(int),
2388                 .mode           = 0644,
2389                 .proc_handler   = proc_dointvec_jiffies,
2390         },
2391         {
2392                 .procname       = "redirect_load",
2393                 .data           = &ip_rt_redirect_load,
2394                 .maxlen         = sizeof(int),
2395                 .mode           = 0644,
2396                 .proc_handler   = proc_dointvec,
2397         },
2398         {
2399                 .procname       = "redirect_number",
2400                 .data           = &ip_rt_redirect_number,
2401                 .maxlen         = sizeof(int),
2402                 .mode           = 0644,
2403                 .proc_handler   = proc_dointvec,
2404         },
2405         {
2406                 .procname       = "redirect_silence",
2407                 .data           = &ip_rt_redirect_silence,
2408                 .maxlen         = sizeof(int),
2409                 .mode           = 0644,
2410                 .proc_handler   = proc_dointvec,
2411         },
2412         {
2413                 .procname       = "error_cost",
2414                 .data           = &ip_rt_error_cost,
2415                 .maxlen         = sizeof(int),
2416                 .mode           = 0644,
2417                 .proc_handler   = proc_dointvec,
2418         },
2419         {
2420                 .procname       = "error_burst",
2421                 .data           = &ip_rt_error_burst,
2422                 .maxlen         = sizeof(int),
2423                 .mode           = 0644,
2424                 .proc_handler   = proc_dointvec,
2425         },
2426         {
2427                 .procname       = "gc_elasticity",
2428                 .data           = &ip_rt_gc_elasticity,
2429                 .maxlen         = sizeof(int),
2430                 .mode           = 0644,
2431                 .proc_handler   = proc_dointvec,
2432         },
2433         {
2434                 .procname       = "mtu_expires",
2435                 .data           = &ip_rt_mtu_expires,
2436                 .maxlen         = sizeof(int),
2437                 .mode           = 0644,
2438                 .proc_handler   = proc_dointvec_jiffies,
2439         },
2440         {
2441                 .procname       = "min_pmtu",
2442                 .data           = &ip_rt_min_pmtu,
2443                 .maxlen         = sizeof(int),
2444                 .mode           = 0644,
2445                 .proc_handler   = proc_dointvec,
2446         },
2447         {
2448                 .procname       = "min_adv_mss",
2449                 .data           = &ip_rt_min_advmss,
2450                 .maxlen         = sizeof(int),
2451                 .mode           = 0644,
2452                 .proc_handler   = proc_dointvec,
2453         },
2454         { }
2455 };
2456
2457 static struct ctl_table ipv4_route_flush_table[] = {
2458         {
2459                 .procname       = "flush",
2460                 .maxlen         = sizeof(int),
2461                 .mode           = 0200,
2462                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2463         },
2464         { },
2465 };
2466
2467 static __net_init int sysctl_route_net_init(struct net *net)
2468 {
2469         struct ctl_table *tbl;
2470
2471         tbl = ipv4_route_flush_table;
2472         if (!net_eq(net, &init_net)) {
2473                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2474                 if (tbl == NULL)
2475                         goto err_dup;
2476         }
2477         tbl[0].extra1 = net;
2478
2479         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2480         if (net->ipv4.route_hdr == NULL)
2481                 goto err_reg;
2482         return 0;
2483
2484 err_reg:
2485         if (tbl != ipv4_route_flush_table)
2486                 kfree(tbl);
2487 err_dup:
2488         return -ENOMEM;
2489 }
2490
2491 static __net_exit void sysctl_route_net_exit(struct net *net)
2492 {
2493         struct ctl_table *tbl;
2494
2495         tbl = net->ipv4.route_hdr->ctl_table_arg;
2496         unregister_net_sysctl_table(net->ipv4.route_hdr);
2497         BUG_ON(tbl == ipv4_route_flush_table);
2498         kfree(tbl);
2499 }
2500
2501 static __net_initdata struct pernet_operations sysctl_route_ops = {
2502         .init = sysctl_route_net_init,
2503         .exit = sysctl_route_net_exit,
2504 };
2505 #endif
2506
2507 static __net_init int rt_genid_init(struct net *net)
2508 {
2509         atomic_set(&net->ipv4.rt_genid, 0);
2510         get_random_bytes(&net->ipv4.dev_addr_genid,
2511                          sizeof(net->ipv4.dev_addr_genid));
2512         return 0;
2513 }
2514
2515 static __net_initdata struct pernet_operations rt_genid_ops = {
2516         .init = rt_genid_init,
2517 };
2518
2519 static int __net_init ipv4_inetpeer_init(struct net *net)
2520 {
2521         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2522
2523         if (!bp)
2524                 return -ENOMEM;
2525         inet_peer_base_init(bp);
2526         net->ipv4.peers = bp;
2527         return 0;
2528 }
2529
2530 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2531 {
2532         struct inet_peer_base *bp = net->ipv4.peers;
2533
2534         net->ipv4.peers = NULL;
2535         inetpeer_invalidate_tree(bp);
2536         kfree(bp);
2537 }
2538
2539 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2540         .init   =       ipv4_inetpeer_init,
2541         .exit   =       ipv4_inetpeer_exit,
2542 };
2543
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting array; allocated with 256 entries in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2547
2548 int __init ip_rt_init(void)
2549 {
2550         int rc = 0;
2551
2552 #ifdef CONFIG_IP_ROUTE_CLASSID
2553         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2554         if (!ip_rt_acct)
2555                 panic("IP: failed to allocate ip_rt_acct\n");
2556 #endif
2557
2558         ipv4_dst_ops.kmem_cachep =
2559                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2560                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2561
2562         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2563
2564         if (dst_entries_init(&ipv4_dst_ops) < 0)
2565                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2566
2567         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2568                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2569
2570         ipv4_dst_ops.gc_thresh = ~0;
2571         ip_rt_max_size = INT_MAX;
2572
2573         devinet_init();
2574         ip_fib_init();
2575
2576         if (ip_rt_proc_init())
2577                 pr_err("Unable to create route proc files\n");
2578 #ifdef CONFIG_XFRM
2579         xfrm_init();
2580         xfrm4_init(ip_rt_max_size);
2581 #endif
2582         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2583
2584 #ifdef CONFIG_SYSCTL
2585         register_pernet_subsys(&sysctl_route_ops);
2586 #endif
2587         register_pernet_subsys(&rt_genid_ops);
2588         register_pernet_subsys(&ipv4_inetpeer_ops);
2589         return rc;
2590 }
2591
2592 #ifdef CONFIG_SYSCTL
2593 /*
2594  * We really need to sanitize the damn ipv4 init order, then all
2595  * this nonsense will go away.
2596  */
2597 void __init ip_static_sysctl_init(void)
2598 {
2599         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2600 }
2601 #endif