ACPI / SBS: Add 5 us delay to fix SBS hangs on MacBook
[linux-2.6-block.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #include <linux/kmemleak.h>
110 #endif
111 #include <net/secure_seq.h>
112
113 #define RT_FL_TOS(oldflp4) \
114         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
115
116 #define RT_GC_TIMEOUT (300*HZ)
117
118 static int ip_rt_max_size;
119 static int ip_rt_redirect_number __read_mostly  = 9;
120 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
121 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
122 static int ip_rt_error_cost __read_mostly       = HZ;
123 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
124 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
125 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
126 static int ip_rt_min_advmss __read_mostly       = 256;
127
128 /*
129  *      Interface to generic destination cache.
130  */
131
132 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
133 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
134 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
135 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
136 static void              ipv4_link_failure(struct sk_buff *skb);
137 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
138                                            struct sk_buff *skb, u32 mtu);
139 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
140                                         struct sk_buff *skb);
141 static void             ipv4_dst_destroy(struct dst_entry *dst);
142
143 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
144 {
145         WARN_ON(1);
146         return NULL;
147 }
148
149 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
150                                            struct sk_buff *skb,
151                                            const void *daddr);
152
153 static struct dst_ops ipv4_dst_ops = {
154         .family =               AF_INET,
155         .check =                ipv4_dst_check,
156         .default_advmss =       ipv4_default_advmss,
157         .mtu =                  ipv4_mtu,
158         .cow_metrics =          ipv4_cow_metrics,
159         .destroy =              ipv4_dst_destroy,
160         .negative_advice =      ipv4_negative_advice,
161         .link_failure =         ipv4_link_failure,
162         .update_pmtu =          ip_rt_update_pmtu,
163         .redirect =             ip_do_redirect,
164         .local_out =            __ip_local_out,
165         .neigh_lookup =         ipv4_neigh_lookup,
166 };
167
168 #define ECN_OR_COST(class)      TC_PRIO_##class
169
170 const __u8 ip_tos2prio[16] = {
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(BESTEFFORT),
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(BESTEFFORT),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK)
187 };
188 EXPORT_SYMBOL(ip_tos2prio);
189
190 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
191 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
192
193 #ifdef CONFIG_PROC_FS
194 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
195 {
196         if (*pos)
197                 return NULL;
198         return SEQ_START_TOKEN;
199 }
200
201 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
202 {
203         ++*pos;
204         return NULL;
205 }
206
/* seq_file .stop: nothing was acquired in .start, so nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
210
211 static int rt_cache_seq_show(struct seq_file *seq, void *v)
212 {
213         if (v == SEQ_START_TOKEN)
214                 seq_printf(seq, "%-127s\n",
215                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
216                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
217                            "HHUptod\tSpecDst");
218         return 0;
219 }
220
221 static const struct seq_operations rt_cache_seq_ops = {
222         .start  = rt_cache_seq_start,
223         .next   = rt_cache_seq_next,
224         .stop   = rt_cache_seq_stop,
225         .show   = rt_cache_seq_show,
226 };
227
228 static int rt_cache_seq_open(struct inode *inode, struct file *file)
229 {
230         return seq_open(file, &rt_cache_seq_ops);
231 }
232
233 static const struct file_operations rt_cache_seq_fops = {
234         .owner   = THIS_MODULE,
235         .open    = rt_cache_seq_open,
236         .read    = seq_read,
237         .llseek  = seq_lseek,
238         .release = seq_release,
239 };
240
241
242 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
243 {
244         int cpu;
245
246         if (*pos == 0)
247                 return SEQ_START_TOKEN;
248
249         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
250                 if (!cpu_possible(cpu))
251                         continue;
252                 *pos = cpu+1;
253                 return &per_cpu(rt_cache_stat, cpu);
254         }
255         return NULL;
256 }
257
258 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
259 {
260         int cpu;
261
262         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
263                 if (!cpu_possible(cpu))
264                         continue;
265                 *pos = cpu+1;
266                 return &per_cpu(rt_cache_stat, cpu);
267         }
268         return NULL;
269
270 }
271
/* seq_file .stop: the iterator holds no resources. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
276
277 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
278 {
279         struct rt_cache_stat *st = v;
280
281         if (v == SEQ_START_TOKEN) {
282                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
283                 return 0;
284         }
285
286         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
287                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
288                    dst_entries_get_slow(&ipv4_dst_ops),
289                    0, /* st->in_hit */
290                    st->in_slow_tot,
291                    st->in_slow_mc,
292                    st->in_no_route,
293                    st->in_brd,
294                    st->in_martian_dst,
295                    st->in_martian_src,
296
297                    0, /* st->out_hit */
298                    st->out_slow_tot,
299                    st->out_slow_mc,
300
301                    0, /* st->gc_total */
302                    0, /* st->gc_ignored */
303                    0, /* st->gc_goal_miss */
304                    0, /* st->gc_dst_overflow */
305                    0, /* st->in_hlist_search */
306                    0  /* st->out_hlist_search */
307                 );
308         return 0;
309 }
310
311 static const struct seq_operations rt_cpu_seq_ops = {
312         .start  = rt_cpu_seq_start,
313         .next   = rt_cpu_seq_next,
314         .stop   = rt_cpu_seq_stop,
315         .show   = rt_cpu_seq_show,
316 };
317
318
319 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
320 {
321         return seq_open(file, &rt_cpu_seq_ops);
322 }
323
324 static const struct file_operations rt_cpu_seq_fops = {
325         .owner   = THIS_MODULE,
326         .open    = rt_cpu_seq_open,
327         .read    = seq_read,
328         .llseek  = seq_lseek,
329         .release = seq_release,
330 };
331
332 #ifdef CONFIG_IP_ROUTE_CLASSID
333 static int rt_acct_proc_show(struct seq_file *m, void *v)
334 {
335         struct ip_rt_acct *dst, *src;
336         unsigned int i, j;
337
338         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
339         if (!dst)
340                 return -ENOMEM;
341
342         for_each_possible_cpu(i) {
343                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
344                 for (j = 0; j < 256; j++) {
345                         dst[j].o_bytes   += src[j].o_bytes;
346                         dst[j].o_packets += src[j].o_packets;
347                         dst[j].i_bytes   += src[j].i_bytes;
348                         dst[j].i_packets += src[j].i_packets;
349                 }
350         }
351
352         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
353         kfree(dst);
354         return 0;
355 }
356
357 static int rt_acct_proc_open(struct inode *inode, struct file *file)
358 {
359         return single_open(file, rt_acct_proc_show, NULL);
360 }
361
362 static const struct file_operations rt_acct_proc_fops = {
363         .owner          = THIS_MODULE,
364         .open           = rt_acct_proc_open,
365         .read           = seq_read,
366         .llseek         = seq_lseek,
367         .release        = single_release,
368 };
369 #endif
370
371 static int __net_init ip_rt_do_proc_init(struct net *net)
372 {
373         struct proc_dir_entry *pde;
374
375         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
376                           &rt_cache_seq_fops);
377         if (!pde)
378                 goto err1;
379
380         pde = proc_create("rt_cache", S_IRUGO,
381                           net->proc_net_stat, &rt_cpu_seq_fops);
382         if (!pde)
383                 goto err2;
384
385 #ifdef CONFIG_IP_ROUTE_CLASSID
386         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
387         if (!pde)
388                 goto err3;
389 #endif
390         return 0;
391
392 #ifdef CONFIG_IP_ROUTE_CLASSID
393 err3:
394         remove_proc_entry("rt_cache", net->proc_net_stat);
395 #endif
396 err2:
397         remove_proc_entry("rt_cache", net->proc_net);
398 err1:
399         return -ENOMEM;
400 }
401
402 static void __net_exit ip_rt_do_proc_exit(struct net *net)
403 {
404         remove_proc_entry("rt_cache", net->proc_net_stat);
405         remove_proc_entry("rt_cache", net->proc_net);
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407         remove_proc_entry("rt_acct", net->proc_net);
408 #endif
409 }
410
411 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
412         .init = ip_rt_do_proc_init,
413         .exit = ip_rt_do_proc_exit,
414 };
415
416 static int __init ip_rt_proc_init(void)
417 {
418         return register_pernet_subsys(&ip_rt_proc_ops);
419 }
420
421 #else
/* Without CONFIG_PROC_FS there is nothing to register: always succeeds. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
426 #endif /* CONFIG_PROC_FS */
427
428 static inline bool rt_is_expired(const struct rtable *rth)
429 {
430         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
431 }
432
/* Flush by bumping the namespace's IPv4 generation id; rt_is_expired() then
 * reports every route stamped with the previous id as expired.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
437
438 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
439                                            struct sk_buff *skb,
440                                            const void *daddr)
441 {
442         struct net_device *dev = dst->dev;
443         const __be32 *pkey = daddr;
444         const struct rtable *rt;
445         struct neighbour *n;
446
447         rt = (const struct rtable *) dst;
448         if (rt->rt_gateway)
449                 pkey = (const __be32 *) &rt->rt_gateway;
450         else if (skb)
451                 pkey = &ip_hdr(skb)->daddr;
452
453         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
454         if (n)
455                 return n;
456         return neigh_create(&arp_tbl, pkey, dev);
457 }
458
459 #define IP_IDENTS_SZ 2048u
460 struct ip_ident_bucket {
461         atomic_t        id;
462         u32             stamp32;
463 };
464
465 static struct ip_ident_bucket *ip_idents __read_mostly;
466
467 /* In order to protect privacy, we add a perturbation to identifiers
468  * if one generator is seldom used. This makes hard for an attacker
469  * to infer how many packets were sent between two points in time.
470  */
471 u32 ip_idents_reserve(u32 hash, int segs)
472 {
473         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
474         u32 old = ACCESS_ONCE(bucket->stamp32);
475         u32 now = (u32)jiffies;
476         u32 delta = 0;
477
478         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
479                 delta = prandom_u32_max(now - old);
480
481         return atomic_add_return(segs + delta, &bucket->id) - segs;
482 }
483 EXPORT_SYMBOL(ip_idents_reserve);
484
485 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
486 {
487         static u32 ip_idents_hashrnd __read_mostly;
488         u32 hash, id;
489
490         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
491
492         hash = jhash_3words((__force u32)iph->daddr,
493                             (__force u32)iph->saddr,
494                             iph->protocol ^ net_hash_mix(net),
495                             ip_idents_hashrnd);
496         id = ip_idents_reserve(hash, segs);
497         iph->id = htons(id);
498 }
499 EXPORT_SYMBOL(__ip_select_ident);
500
501 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
502                              const struct iphdr *iph,
503                              int oif, u8 tos,
504                              u8 prot, u32 mark, int flow_flags)
505 {
506         if (sk) {
507                 const struct inet_sock *inet = inet_sk(sk);
508
509                 oif = sk->sk_bound_dev_if;
510                 mark = sk->sk_mark;
511                 tos = RT_CONN_FLAGS(sk);
512                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
513         }
514         flowi4_init_output(fl4, oif, mark, tos,
515                            RT_SCOPE_UNIVERSE, prot,
516                            flow_flags,
517                            iph->daddr, iph->saddr, 0, 0);
518 }
519
520 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
521                                const struct sock *sk)
522 {
523         const struct iphdr *iph = ip_hdr(skb);
524         int oif = skb->dev->ifindex;
525         u8 tos = RT_TOS(iph->tos);
526         u8 prot = iph->protocol;
527         u32 mark = skb->mark;
528
529         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
530 }
531
532 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
533 {
534         const struct inet_sock *inet = inet_sk(sk);
535         const struct ip_options_rcu *inet_opt;
536         __be32 daddr = inet->inet_daddr;
537
538         rcu_read_lock();
539         inet_opt = rcu_dereference(inet->inet_opt);
540         if (inet_opt && inet_opt->opt.srr)
541                 daddr = inet_opt->opt.faddr;
542         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
543                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
544                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
545                            inet_sk_flowi_flags(sk),
546                            daddr, inet->inet_saddr, 0, 0);
547         rcu_read_unlock();
548 }
549
/* Build a flow key from @skb when one is supplied, otherwise from @sk. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
558
559 static inline void rt_free(struct rtable *rt)
560 {
561         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
562 }
563
/* Held by update_or_create_fnhe() while it modifies nexthop exceptions. */
static DEFINE_SPINLOCK(fnhe_lock);
565
566 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
567 {
568         struct rtable *rt;
569
570         rt = rcu_dereference(fnhe->fnhe_rth_input);
571         if (rt) {
572                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
573                 rt_free(rt);
574         }
575         rt = rcu_dereference(fnhe->fnhe_rth_output);
576         if (rt) {
577                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
578                 rt_free(rt);
579         }
580 }
581
582 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
583 {
584         struct fib_nh_exception *fnhe, *oldest;
585
586         oldest = rcu_dereference(hash->chain);
587         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
588              fnhe = rcu_dereference(fnhe->fnhe_next)) {
589                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
590                         oldest = fnhe;
591         }
592         fnhe_flush_routes(oldest);
593         return oldest;
594 }
595
596 static inline u32 fnhe_hashfun(__be32 daddr)
597 {
598         static u32 fnhe_hashrnd __read_mostly;
599         u32 hval;
600
601         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
602         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
603         return hash_32(hval, FNHE_HASH_SHIFT);
604 }
605
606 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
607 {
608         rt->rt_pmtu = fnhe->fnhe_pmtu;
609         rt->dst.expires = fnhe->fnhe_expires;
610
611         if (fnhe->fnhe_gw) {
612                 rt->rt_flags |= RTCF_REDIRECTED;
613                 rt->rt_gateway = fnhe->fnhe_gw;
614                 rt->rt_uses_gateway = 1;
615         }
616 }
617
618 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
619                                   u32 pmtu, unsigned long expires)
620 {
621         struct fnhe_hash_bucket *hash;
622         struct fib_nh_exception *fnhe;
623         struct rtable *rt;
624         unsigned int i;
625         int depth;
626         u32 hval = fnhe_hashfun(daddr);
627
628         spin_lock_bh(&fnhe_lock);
629
630         hash = rcu_dereference(nh->nh_exceptions);
631         if (!hash) {
632                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
633                 if (!hash)
634                         goto out_unlock;
635                 rcu_assign_pointer(nh->nh_exceptions, hash);
636         }
637
638         hash += hval;
639
640         depth = 0;
641         for (fnhe = rcu_dereference(hash->chain); fnhe;
642              fnhe = rcu_dereference(fnhe->fnhe_next)) {
643                 if (fnhe->fnhe_daddr == daddr)
644                         break;
645                 depth++;
646         }
647
648         if (fnhe) {
649                 if (gw)
650                         fnhe->fnhe_gw = gw;
651                 if (pmtu) {
652                         fnhe->fnhe_pmtu = pmtu;
653                         fnhe->fnhe_expires = max(1UL, expires);
654                 }
655                 /* Update all cached dsts too */
656                 rt = rcu_dereference(fnhe->fnhe_rth_input);
657                 if (rt)
658                         fill_route_from_fnhe(rt, fnhe);
659                 rt = rcu_dereference(fnhe->fnhe_rth_output);
660                 if (rt)
661                         fill_route_from_fnhe(rt, fnhe);
662         } else {
663                 if (depth > FNHE_RECLAIM_DEPTH)
664                         fnhe = fnhe_oldest(hash);
665                 else {
666                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
667                         if (!fnhe)
668                                 goto out_unlock;
669
670                         fnhe->fnhe_next = hash->chain;
671                         rcu_assign_pointer(hash->chain, fnhe);
672                 }
673                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
674                 fnhe->fnhe_daddr = daddr;
675                 fnhe->fnhe_gw = gw;
676                 fnhe->fnhe_pmtu = pmtu;
677                 fnhe->fnhe_expires = expires;
678
679                 /* Exception created; mark the cached routes for the nexthop
680                  * stale, so anyone caching it rechecks if this exception
681                  * applies to them.
682                  */
683                 rt = rcu_dereference(nh->nh_rth_input);
684                 if (rt)
685                         rt->dst.obsolete = DST_OBSOLETE_KILL;
686
687                 for_each_possible_cpu(i) {
688                         struct rtable __rcu **prt;
689                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
690                         rt = rcu_dereference(*prt);
691                         if (rt)
692                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
693                 }
694         }
695
696         fnhe->fnhe_stamp = jiffies;
697
698 out_unlock:
699         spin_unlock_bh(&fnhe_lock);
700 }
701
702 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
703                              bool kill_route)
704 {
705         __be32 new_gw = icmp_hdr(skb)->un.gateway;
706         __be32 old_gw = ip_hdr(skb)->saddr;
707         struct net_device *dev = skb->dev;
708         struct in_device *in_dev;
709         struct fib_result res;
710         struct neighbour *n;
711         struct net *net;
712
713         switch (icmp_hdr(skb)->code & 7) {
714         case ICMP_REDIR_NET:
715         case ICMP_REDIR_NETTOS:
716         case ICMP_REDIR_HOST:
717         case ICMP_REDIR_HOSTTOS:
718                 break;
719
720         default:
721                 return;
722         }
723
724         if (rt->rt_gateway != old_gw)
725                 return;
726
727         in_dev = __in_dev_get_rcu(dev);
728         if (!in_dev)
729                 return;
730
731         net = dev_net(dev);
732         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
733             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
734             ipv4_is_zeronet(new_gw))
735                 goto reject_redirect;
736
737         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
738                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
739                         goto reject_redirect;
740                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
741                         goto reject_redirect;
742         } else {
743                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
744                         goto reject_redirect;
745         }
746
747         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
748         if (!IS_ERR(n)) {
749                 if (!(n->nud_state & NUD_VALID)) {
750                         neigh_event_send(n, NULL);
751                 } else {
752                         if (fib_lookup(net, fl4, &res) == 0) {
753                                 struct fib_nh *nh = &FIB_RES_NH(res);
754
755                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
756                                                       0, 0);
757                         }
758                         if (kill_route)
759                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
760                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
761                 }
762                 neigh_release(n);
763         }
764         return;
765
766 reject_redirect:
767 #ifdef CONFIG_IP_ROUTE_VERBOSE
768         if (IN_DEV_LOG_MARTIANS(in_dev)) {
769                 const struct iphdr *iph = (const struct iphdr *) skb->data;
770                 __be32 daddr = iph->daddr;
771                 __be32 saddr = iph->saddr;
772
773                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
774                                      "  Advised path = %pI4 -> %pI4\n",
775                                      &old_gw, dev->name, &new_gw,
776                                      &saddr, &daddr);
777         }
778 #endif
779         ;
780 }
781
782 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
783 {
784         struct rtable *rt;
785         struct flowi4 fl4;
786         const struct iphdr *iph = (const struct iphdr *) skb->data;
787         int oif = skb->dev->ifindex;
788         u8 tos = RT_TOS(iph->tos);
789         u8 prot = iph->protocol;
790         u32 mark = skb->mark;
791
792         rt = (struct rtable *) dst;
793
794         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
795         __ip_do_redirect(rt, skb, &fl4, true);
796 }
797
798 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
799 {
800         struct rtable *rt = (struct rtable *)dst;
801         struct dst_entry *ret = dst;
802
803         if (rt) {
804                 if (dst->obsolete > 0) {
805                         ip_rt_put(rt);
806                         ret = NULL;
807                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
808                            rt->dst.expires) {
809                         ip_rt_put(rt);
810                         ret = NULL;
811                 }
812         }
813         return ret;
814 }
815
816 /*
817  * Algorithm:
818  *      1. The first ip_rt_redirect_number redirects are sent
819  *         with exponential backoff, then we stop sending them at all,
820  *         assuming that the host ignores our redirects.
821  *      2. If we did not see packets requiring redirects
822  *         during ip_rt_redirect_silence, we assume that the host
823  *         forgot redirected route and start to send redirects again.
824  *
825  * This algorithm is much cheaper and more intelligent than dumb load limiting
826  * in icmp.c.
827  *
828  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
829  * and "frag. need" (breaks PMTU discovery) in icmp.c.
830  */
831
832 void ip_rt_send_redirect(struct sk_buff *skb)
833 {
834         struct rtable *rt = skb_rtable(skb);
835         struct in_device *in_dev;
836         struct inet_peer *peer;
837         struct net *net;
838         int log_martians;
839
840         rcu_read_lock();
841         in_dev = __in_dev_get_rcu(rt->dst.dev);
842         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
843                 rcu_read_unlock();
844                 return;
845         }
846         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
847         rcu_read_unlock();
848
849         net = dev_net(rt->dst.dev);
850         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
851         if (!peer) {
852                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
853                           rt_nexthop(rt, ip_hdr(skb)->daddr));
854                 return;
855         }
856
857         /* No redirected packets during ip_rt_redirect_silence;
858          * reset the algorithm.
859          */
860         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
861                 peer->rate_tokens = 0;
862
863         /* Too many ignored redirects; do not send anything
864          * set dst.rate_last to the last seen redirected packet.
865          */
866         if (peer->rate_tokens >= ip_rt_redirect_number) {
867                 peer->rate_last = jiffies;
868                 goto out_put_peer;
869         }
870
871         /* Check for load limit; set rate_last to the latest sent
872          * redirect.
873          */
874         if (peer->rate_tokens == 0 ||
875             time_after(jiffies,
876                        (peer->rate_last +
877                         (ip_rt_redirect_load << peer->rate_tokens)))) {
878                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
879
880                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
881                 peer->rate_last = jiffies;
882                 ++peer->rate_tokens;
883 #ifdef CONFIG_IP_ROUTE_VERBOSE
884                 if (log_martians &&
885                     peer->rate_tokens == ip_rt_redirect_number)
886                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
887                                              &ip_hdr(skb)->saddr, inet_iif(skb),
888                                              &ip_hdr(skb)->daddr, &gw);
889 #endif
890         }
891 out_put_peer:
892         inet_putpeer(peer);
893 }
894
895 static int ip_error(struct sk_buff *skb)
896 {
897         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
898         struct rtable *rt = skb_rtable(skb);
899         struct inet_peer *peer;
900         unsigned long now;
901         struct net *net;
902         bool send;
903         int code;
904
905         net = dev_net(rt->dst.dev);
906         if (!IN_DEV_FORWARD(in_dev)) {
907                 switch (rt->dst.error) {
908                 case EHOSTUNREACH:
909                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
910                         break;
911
912                 case ENETUNREACH:
913                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
914                         break;
915                 }
916                 goto out;
917         }
918
919         switch (rt->dst.error) {
920         case EINVAL:
921         default:
922                 goto out;
923         case EHOSTUNREACH:
924                 code = ICMP_HOST_UNREACH;
925                 break;
926         case ENETUNREACH:
927                 code = ICMP_NET_UNREACH;
928                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
929                 break;
930         case EACCES:
931                 code = ICMP_PKT_FILTERED;
932                 break;
933         }
934
935         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
936
937         send = true;
938         if (peer) {
939                 now = jiffies;
940                 peer->rate_tokens += now - peer->rate_last;
941                 if (peer->rate_tokens > ip_rt_error_burst)
942                         peer->rate_tokens = ip_rt_error_burst;
943                 peer->rate_last = now;
944                 if (peer->rate_tokens >= ip_rt_error_cost)
945                         peer->rate_tokens -= ip_rt_error_cost;
946                 else
947                         send = false;
948                 inet_putpeer(peer);
949         }
950         if (send)
951                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
952
953 out:    kfree_skb(skb);
954         return 0;
955 }
956
957 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
958 {
959         struct dst_entry *dst = &rt->dst;
960         struct fib_result res;
961
962         if (dst_metric_locked(dst, RTAX_MTU))
963                 return;
964
965         if (dst->dev->mtu < mtu)
966                 return;
967
968         if (rt->rt_pmtu && rt->rt_pmtu < mtu)
969                 return;
970
971         if (mtu < ip_rt_min_pmtu)
972                 mtu = ip_rt_min_pmtu;
973
974         if (rt->rt_pmtu == mtu &&
975             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
976                 return;
977
978         rcu_read_lock();
979         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
980                 struct fib_nh *nh = &FIB_RES_NH(res);
981
982                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
983                                       jiffies + ip_rt_mtu_expires);
984         }
985         rcu_read_unlock();
986 }
987
988 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
989                               struct sk_buff *skb, u32 mtu)
990 {
991         struct rtable *rt = (struct rtable *) dst;
992         struct flowi4 fl4;
993
994         ip_rt_build_flow_key(&fl4, sk, skb);
995         __ip_rt_update_pmtu(rt, &fl4, mtu);
996 }
997
998 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
999                       int oif, u32 mark, u8 protocol, int flow_flags)
1000 {
1001         const struct iphdr *iph = (const struct iphdr *) skb->data;
1002         struct flowi4 fl4;
1003         struct rtable *rt;
1004
1005         if (!mark)
1006                 mark = IP4_REPLY_MARK(net, skb->mark);
1007
1008         __build_flow_key(&fl4, NULL, iph, oif,
1009                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1010         rt = __ip_route_output_key(net, &fl4);
1011         if (!IS_ERR(rt)) {
1012                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1013                 ip_rt_put(rt);
1014         }
1015 }
1016 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1017
1018 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1019 {
1020         const struct iphdr *iph = (const struct iphdr *) skb->data;
1021         struct flowi4 fl4;
1022         struct rtable *rt;
1023
1024         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1025
1026         if (!fl4.flowi4_mark)
1027                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1028
1029         rt = __ip_route_output_key(sock_net(sk), &fl4);
1030         if (!IS_ERR(rt)) {
1031                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1032                 ip_rt_put(rt);
1033         }
1034 }
1035
1036 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1037 {
1038         const struct iphdr *iph = (const struct iphdr *) skb->data;
1039         struct flowi4 fl4;
1040         struct rtable *rt;
1041         struct dst_entry *odst = NULL;
1042         bool new = false;
1043
1044         bh_lock_sock(sk);
1045
1046         if (!ip_sk_accept_pmtu(sk))
1047                 goto out;
1048
1049         odst = sk_dst_get(sk);
1050
1051         if (sock_owned_by_user(sk) || !odst) {
1052                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1053                 goto out;
1054         }
1055
1056         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1057
1058         rt = (struct rtable *)odst;
1059         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1060                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1061                 if (IS_ERR(rt))
1062                         goto out;
1063
1064                 new = true;
1065         }
1066
1067         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1068
1069         if (!dst_check(&rt->dst, 0)) {
1070                 if (new)
1071                         dst_release(&rt->dst);
1072
1073                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1074                 if (IS_ERR(rt))
1075                         goto out;
1076
1077                 new = true;
1078         }
1079
1080         if (new)
1081                 sk_dst_set(sk, &rt->dst);
1082
1083 out:
1084         bh_unlock_sock(sk);
1085         dst_release(odst);
1086 }
1087 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1088
1089 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1090                    int oif, u32 mark, u8 protocol, int flow_flags)
1091 {
1092         const struct iphdr *iph = (const struct iphdr *) skb->data;
1093         struct flowi4 fl4;
1094         struct rtable *rt;
1095
1096         __build_flow_key(&fl4, NULL, iph, oif,
1097                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1098         rt = __ip_route_output_key(net, &fl4);
1099         if (!IS_ERR(rt)) {
1100                 __ip_do_redirect(rt, skb, &fl4, false);
1101                 ip_rt_put(rt);
1102         }
1103 }
1104 EXPORT_SYMBOL_GPL(ipv4_redirect);
1105
1106 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1107 {
1108         const struct iphdr *iph = (const struct iphdr *) skb->data;
1109         struct flowi4 fl4;
1110         struct rtable *rt;
1111
1112         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1113         rt = __ip_route_output_key(sock_net(sk), &fl4);
1114         if (!IS_ERR(rt)) {
1115                 __ip_do_redirect(rt, skb, &fl4, false);
1116                 ip_rt_put(rt);
1117         }
1118 }
1119 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1120
1121 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1122 {
1123         struct rtable *rt = (struct rtable *) dst;
1124
1125         /* All IPV4 dsts are created with ->obsolete set to the value
1126          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1127          * into this function always.
1128          *
1129          * When a PMTU/redirect information update invalidates a route,
1130          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1131          * DST_OBSOLETE_DEAD by dst_free().
1132          */
1133         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1134                 return NULL;
1135         return dst;
1136 }
1137
1138 static void ipv4_link_failure(struct sk_buff *skb)
1139 {
1140         struct rtable *rt;
1141
1142         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1143
1144         rt = skb_rtable(skb);
1145         if (rt)
1146                 dst_set_expires(&rt->dst, 0);
1147 }
1148
1149 static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1150 {
1151         pr_debug("%s: %pI4 -> %pI4, %s\n",
1152                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1153                  skb->dev ? skb->dev->name : "?");
1154         kfree_skb(skb);
1155         WARN_ON(1);
1156         return 0;
1157 }
1158
1159 /*
1160    We do not cache source address of outgoing interface,
1161    because it is used only by IP RR, TS and SRR options,
1162    so that it out of fast path.
1163
1164    BTW remember: "addr" is allowed to be not aligned
1165    in IP options!
1166  */
1167
1168 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1169 {
1170         __be32 src;
1171
1172         if (rt_is_output_route(rt))
1173                 src = ip_hdr(skb)->saddr;
1174         else {
1175                 struct fib_result res;
1176                 struct flowi4 fl4;
1177                 struct iphdr *iph;
1178
1179                 iph = ip_hdr(skb);
1180
1181                 memset(&fl4, 0, sizeof(fl4));
1182                 fl4.daddr = iph->daddr;
1183                 fl4.saddr = iph->saddr;
1184                 fl4.flowi4_tos = RT_TOS(iph->tos);
1185                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1186                 fl4.flowi4_iif = skb->dev->ifindex;
1187                 fl4.flowi4_mark = skb->mark;
1188
1189                 rcu_read_lock();
1190                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1191                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1192                 else
1193                         src = inet_select_addr(rt->dst.dev,
1194                                                rt_nexthop(rt, iph->daddr),
1195                                                RT_SCOPE_UNIVERSE);
1196                 rcu_read_unlock();
1197         }
1198         memcpy(addr, &src, 4);
1199 }
1200
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a half
 * is taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1210
1211 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1212 {
1213         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1214
1215         if (advmss == 0) {
1216                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1217                                ip_rt_min_advmss);
1218                 if (advmss > 65535 - 40)
1219                         advmss = 65535 - 40;
1220         }
1221         return advmss;
1222 }
1223
1224 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1225 {
1226         const struct rtable *rt = (const struct rtable *) dst;
1227         unsigned int mtu = rt->rt_pmtu;
1228
1229         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1230                 mtu = dst_metric_raw(dst, RTAX_MTU);
1231
1232         if (mtu)
1233                 return mtu;
1234
1235         mtu = dst->dev->mtu;
1236
1237         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1238                 if (rt->rt_uses_gateway && mtu > 576)
1239                         mtu = 576;
1240         }
1241
1242         return min_t(unsigned int, mtu, IP_MAX_MTU);
1243 }
1244
1245 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1246 {
1247         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1248         struct fib_nh_exception *fnhe;
1249         u32 hval;
1250
1251         if (!hash)
1252                 return NULL;
1253
1254         hval = fnhe_hashfun(daddr);
1255
1256         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1257              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1258                 if (fnhe->fnhe_daddr == daddr)
1259                         return fnhe;
1260         }
1261         return NULL;
1262 }
1263
1264 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1265                               __be32 daddr)
1266 {
1267         bool ret = false;
1268
1269         spin_lock_bh(&fnhe_lock);
1270
1271         if (daddr == fnhe->fnhe_daddr) {
1272                 struct rtable __rcu **porig;
1273                 struct rtable *orig;
1274                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1275
1276                 if (rt_is_input_route(rt))
1277                         porig = &fnhe->fnhe_rth_input;
1278                 else
1279                         porig = &fnhe->fnhe_rth_output;
1280                 orig = rcu_dereference(*porig);
1281
1282                 if (fnhe->fnhe_genid != genid) {
1283                         fnhe->fnhe_genid = genid;
1284                         fnhe->fnhe_gw = 0;
1285                         fnhe->fnhe_pmtu = 0;
1286                         fnhe->fnhe_expires = 0;
1287                         fnhe_flush_routes(fnhe);
1288                         orig = NULL;
1289                 }
1290                 fill_route_from_fnhe(rt, fnhe);
1291                 if (!rt->rt_gateway)
1292                         rt->rt_gateway = daddr;
1293
1294                 if (!(rt->dst.flags & DST_NOCACHE)) {
1295                         rcu_assign_pointer(*porig, rt);
1296                         if (orig)
1297                                 rt_free(orig);
1298                         ret = true;
1299                 }
1300
1301                 fnhe->fnhe_stamp = jiffies;
1302         }
1303         spin_unlock_bh(&fnhe_lock);
1304
1305         return ret;
1306 }
1307
1308 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1309 {
1310         struct rtable *orig, *prev, **p;
1311         bool ret = true;
1312
1313         if (rt_is_input_route(rt)) {
1314                 p = (struct rtable **)&nh->nh_rth_input;
1315         } else {
1316                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1317         }
1318         orig = *p;
1319
1320         prev = cmpxchg(p, orig, rt);
1321         if (prev == orig) {
1322                 if (orig)
1323                         rt_free(orig);
1324         } else
1325                 ret = false;
1326
1327         return ret;
1328 }
1329
1330 struct uncached_list {
1331         spinlock_t              lock;
1332         struct list_head        head;
1333 };
1334
1335 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1336
1337 static void rt_add_uncached_list(struct rtable *rt)
1338 {
1339         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1340
1341         rt->rt_uncached_list = ul;
1342
1343         spin_lock_bh(&ul->lock);
1344         list_add_tail(&rt->rt_uncached, &ul->head);
1345         spin_unlock_bh(&ul->lock);
1346 }
1347
1348 static void ipv4_dst_destroy(struct dst_entry *dst)
1349 {
1350         struct rtable *rt = (struct rtable *) dst;
1351
1352         if (!list_empty(&rt->rt_uncached)) {
1353                 struct uncached_list *ul = rt->rt_uncached_list;
1354
1355                 spin_lock_bh(&ul->lock);
1356                 list_del(&rt->rt_uncached);
1357                 spin_unlock_bh(&ul->lock);
1358         }
1359 }
1360
1361 void rt_flush_dev(struct net_device *dev)
1362 {
1363         struct net *net = dev_net(dev);
1364         struct rtable *rt;
1365         int cpu;
1366
1367         for_each_possible_cpu(cpu) {
1368                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1369
1370                 spin_lock_bh(&ul->lock);
1371                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1372                         if (rt->dst.dev != dev)
1373                                 continue;
1374                         rt->dst.dev = net->loopback_dev;
1375                         dev_hold(rt->dst.dev);
1376                         dev_put(dev);
1377                 }
1378                 spin_unlock_bh(&ul->lock);
1379         }
1380 }
1381
1382 static bool rt_cache_valid(const struct rtable *rt)
1383 {
1384         return  rt &&
1385                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1386                 !rt_is_expired(rt);
1387 }
1388
1389 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1390                            const struct fib_result *res,
1391                            struct fib_nh_exception *fnhe,
1392                            struct fib_info *fi, u16 type, u32 itag)
1393 {
1394         bool cached = false;
1395
1396         if (fi) {
1397                 struct fib_nh *nh = &FIB_RES_NH(*res);
1398
1399                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1400                         rt->rt_gateway = nh->nh_gw;
1401                         rt->rt_uses_gateway = 1;
1402                 }
1403                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1404 #ifdef CONFIG_IP_ROUTE_CLASSID
1405                 rt->dst.tclassid = nh->nh_tclassid;
1406 #endif
1407                 if (unlikely(fnhe))
1408                         cached = rt_bind_exception(rt, fnhe, daddr);
1409                 else if (!(rt->dst.flags & DST_NOCACHE))
1410                         cached = rt_cache_route(nh, rt);
1411                 if (unlikely(!cached)) {
1412                         /* Routes we intend to cache in nexthop exception or
1413                          * FIB nexthop have the DST_NOCACHE bit clear.
1414                          * However, if we are unsuccessful at storing this
1415                          * route into the cache we really need to set it.
1416                          */
1417                         rt->dst.flags |= DST_NOCACHE;
1418                         if (!rt->rt_gateway)
1419                                 rt->rt_gateway = daddr;
1420                         rt_add_uncached_list(rt);
1421                 }
1422         } else
1423                 rt_add_uncached_list(rt);
1424
1425 #ifdef CONFIG_IP_ROUTE_CLASSID
1426 #ifdef CONFIG_IP_MULTIPLE_TABLES
1427         set_class_tag(rt, res->tclassid);
1428 #endif
1429         set_class_tag(rt, itag);
1430 #endif
1431 }
1432
1433 static struct rtable *rt_dst_alloc(struct net_device *dev,
1434                                    bool nopolicy, bool noxfrm, bool will_cache)
1435 {
1436         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1437                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1438                          (nopolicy ? DST_NOPOLICY : 0) |
1439                          (noxfrm ? DST_NOXFRM : 0));
1440 }
1441
1442 /* called in rcu_read_lock() section */
1443 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1444                                 u8 tos, struct net_device *dev, int our)
1445 {
1446         struct rtable *rth;
1447         struct in_device *in_dev = __in_dev_get_rcu(dev);
1448         u32 itag = 0;
1449         int err;
1450
1451         /* Primary sanity checks. */
1452
1453         if (!in_dev)
1454                 return -EINVAL;
1455
1456         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1457             skb->protocol != htons(ETH_P_IP))
1458                 goto e_inval;
1459
1460         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1461                 if (ipv4_is_loopback(saddr))
1462                         goto e_inval;
1463
1464         if (ipv4_is_zeronet(saddr)) {
1465                 if (!ipv4_is_local_multicast(daddr))
1466                         goto e_inval;
1467         } else {
1468                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1469                                           in_dev, &itag);
1470                 if (err < 0)
1471                         goto e_err;
1472         }
1473         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1474                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1475         if (!rth)
1476                 goto e_nobufs;
1477
1478 #ifdef CONFIG_IP_ROUTE_CLASSID
1479         rth->dst.tclassid = itag;
1480 #endif
1481         rth->dst.output = ip_rt_bug;
1482
1483         rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
1484         rth->rt_flags   = RTCF_MULTICAST;
1485         rth->rt_type    = RTN_MULTICAST;
1486         rth->rt_is_input= 1;
1487         rth->rt_iif     = 0;
1488         rth->rt_pmtu    = 0;
1489         rth->rt_gateway = 0;
1490         rth->rt_uses_gateway = 0;
1491         INIT_LIST_HEAD(&rth->rt_uncached);
1492         if (our) {
1493                 rth->dst.input= ip_local_deliver;
1494                 rth->rt_flags |= RTCF_LOCAL;
1495         }
1496
1497 #ifdef CONFIG_IP_MROUTE
1498         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1499                 rth->dst.input = ip_mr_input;
1500 #endif
1501         RT_CACHE_STAT_INC(in_slow_mc);
1502
1503         skb_dst_set(skb, &rth->dst);
1504         return 0;
1505
1506 e_nobufs:
1507         return -ENOBUFS;
1508 e_inval:
1509         return -EINVAL;
1510 e_err:
1511         return err;
1512 }
1513
1514
1515 static void ip_handle_martian_source(struct net_device *dev,
1516                                      struct in_device *in_dev,
1517                                      struct sk_buff *skb,
1518                                      __be32 daddr,
1519                                      __be32 saddr)
1520 {
1521         RT_CACHE_STAT_INC(in_martian_src);
1522 #ifdef CONFIG_IP_ROUTE_VERBOSE
1523         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1524                 /*
1525                  *      RFC1812 recommendation, if source is martian,
1526                  *      the only hint is MAC header.
1527                  */
1528                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1529                         &daddr, &saddr, dev->name);
1530                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1531                         print_hex_dump(KERN_WARNING, "ll header: ",
1532                                        DUMP_PREFIX_OFFSET, 16, 1,
1533                                        skb_mac_header(skb),
1534                                        dev->hard_header_len, true);
1535                 }
1536         }
1537 #endif
1538 }
1539
1540 /* called in rcu_read_lock() section */
1541 static int __mkroute_input(struct sk_buff *skb,
1542                            const struct fib_result *res,
1543                            struct in_device *in_dev,
1544                            __be32 daddr, __be32 saddr, u32 tos)
1545 {
1546         struct fib_nh_exception *fnhe;
1547         struct rtable *rth;
1548         int err;
1549         struct in_device *out_dev;
1550         unsigned int flags = 0;
1551         bool do_cache;
1552         u32 itag = 0;
1553
1554         /* get a working reference to the output device */
1555         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1556         if (!out_dev) {
1557                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1558                 return -EINVAL;
1559         }
1560
1561         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1562                                   in_dev->dev, in_dev, &itag);
1563         if (err < 0) {
1564                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1565                                          saddr);
1566
1567                 goto cleanup;
1568         }
1569
1570         do_cache = res->fi && !itag;
1571         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1572             skb->protocol == htons(ETH_P_IP) &&
1573             (IN_DEV_SHARED_MEDIA(out_dev) ||
1574              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1575                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1576
1577         if (skb->protocol != htons(ETH_P_IP)) {
1578                 /* Not IP (i.e. ARP). Do not create route, if it is
1579                  * invalid for proxy arp. DNAT routes are always valid.
1580                  *
1581                  * Proxy arp feature have been extended to allow, ARP
1582                  * replies back to the same interface, to support
1583                  * Private VLAN switch technologies. See arp.c.
1584                  */
1585                 if (out_dev == in_dev &&
1586                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1587                         err = -EINVAL;
1588                         goto cleanup;
1589                 }
1590         }
1591
1592         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1593         if (do_cache) {
1594                 if (fnhe)
1595                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1596                 else
1597                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1598
1599                 if (rt_cache_valid(rth)) {
1600                         skb_dst_set_noref(skb, &rth->dst);
1601                         goto out;
1602                 }
1603         }
1604
1605         rth = rt_dst_alloc(out_dev->dev,
1606                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1607                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1608         if (!rth) {
1609                 err = -ENOBUFS;
1610                 goto cleanup;
1611         }
1612
1613         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1614         rth->rt_flags = flags;
1615         rth->rt_type = res->type;
1616         rth->rt_is_input = 1;
1617         rth->rt_iif     = 0;
1618         rth->rt_pmtu    = 0;
1619         rth->rt_gateway = 0;
1620         rth->rt_uses_gateway = 0;
1621         INIT_LIST_HEAD(&rth->rt_uncached);
1622         RT_CACHE_STAT_INC(in_slow_tot);
1623
1624         rth->dst.input = ip_forward;
1625         rth->dst.output = ip_output;
1626
1627         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1628         skb_dst_set(skb, &rth->dst);
1629 out:
1630         err = 0;
1631  cleanup:
1632         return err;
1633 }
1634
1635 static int ip_mkroute_input(struct sk_buff *skb,
1636                             struct fib_result *res,
1637                             const struct flowi4 *fl4,
1638                             struct in_device *in_dev,
1639                             __be32 daddr, __be32 saddr, u32 tos)
1640 {
1641 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1642         if (res->fi && res->fi->fib_nhs > 1)
1643                 fib_select_multipath(res);
1644 #endif
1645
1646         /* create a routing cache entry */
1647         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1648 }
1649
/*
 *	NOTE. We drop all packets that have local source
 *	addresses, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Called with rcu_read_lock() held.
 */
1660
1661 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1662                                u8 tos, struct net_device *dev)
1663 {
1664         struct fib_result res;
1665         struct in_device *in_dev = __in_dev_get_rcu(dev);
1666         struct flowi4   fl4;
1667         unsigned int    flags = 0;
1668         u32             itag = 0;
1669         struct rtable   *rth;
1670         int             err = -EINVAL;
1671         struct net    *net = dev_net(dev);
1672         bool do_cache;
1673
1674         /* IP on this device is disabled. */
1675
1676         if (!in_dev)
1677                 goto out;
1678
1679         /* Check for the most weird martians, which can be not detected
1680            by fib_lookup.
1681          */
1682
1683         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1684                 goto martian_source;
1685
1686         res.fi = NULL;
1687         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1688                 goto brd_input;
1689
1690         /* Accept zero addresses only to limited broadcast;
1691          * I even do not know to fix it or not. Waiting for complains :-)
1692          */
1693         if (ipv4_is_zeronet(saddr))
1694                 goto martian_source;
1695
1696         if (ipv4_is_zeronet(daddr))
1697                 goto martian_destination;
1698
1699         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1700          * and call it once if daddr or/and saddr are loopback addresses
1701          */
1702         if (ipv4_is_loopback(daddr)) {
1703                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1704                         goto martian_destination;
1705         } else if (ipv4_is_loopback(saddr)) {
1706                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1707                         goto martian_source;
1708         }
1709
1710         /*
1711          *      Now we are ready to route packet.
1712          */
1713         fl4.flowi4_oif = 0;
1714         fl4.flowi4_iif = dev->ifindex;
1715         fl4.flowi4_mark = skb->mark;
1716         fl4.flowi4_tos = tos;
1717         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1718         fl4.daddr = daddr;
1719         fl4.saddr = saddr;
1720         err = fib_lookup(net, &fl4, &res);
1721         if (err != 0) {
1722                 if (!IN_DEV_FORWARD(in_dev))
1723                         err = -EHOSTUNREACH;
1724                 goto no_route;
1725         }
1726
1727         if (res.type == RTN_BROADCAST)
1728                 goto brd_input;
1729
1730         if (res.type == RTN_LOCAL) {
1731                 err = fib_validate_source(skb, saddr, daddr, tos,
1732                                           0, dev, in_dev, &itag);
1733                 if (err < 0)
1734                         goto martian_source_keep_err;
1735                 goto local_input;
1736         }
1737
1738         if (!IN_DEV_FORWARD(in_dev)) {
1739                 err = -EHOSTUNREACH;
1740                 goto no_route;
1741         }
1742         if (res.type != RTN_UNICAST)
1743                 goto martian_destination;
1744
1745         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1746 out:    return err;
1747
1748 brd_input:
1749         if (skb->protocol != htons(ETH_P_IP))
1750                 goto e_inval;
1751
1752         if (!ipv4_is_zeronet(saddr)) {
1753                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1754                                           in_dev, &itag);
1755                 if (err < 0)
1756                         goto martian_source_keep_err;
1757         }
1758         flags |= RTCF_BROADCAST;
1759         res.type = RTN_BROADCAST;
1760         RT_CACHE_STAT_INC(in_brd);
1761
1762 local_input:
1763         do_cache = false;
1764         if (res.fi) {
1765                 if (!itag) {
1766                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1767                         if (rt_cache_valid(rth)) {
1768                                 skb_dst_set_noref(skb, &rth->dst);
1769                                 err = 0;
1770                                 goto out;
1771                         }
1772                         do_cache = true;
1773                 }
1774         }
1775
1776         rth = rt_dst_alloc(net->loopback_dev,
1777                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1778         if (!rth)
1779                 goto e_nobufs;
1780
1781         rth->dst.input= ip_local_deliver;
1782         rth->dst.output= ip_rt_bug;
1783 #ifdef CONFIG_IP_ROUTE_CLASSID
1784         rth->dst.tclassid = itag;
1785 #endif
1786
1787         rth->rt_genid = rt_genid_ipv4(net);
1788         rth->rt_flags   = flags|RTCF_LOCAL;
1789         rth->rt_type    = res.type;
1790         rth->rt_is_input = 1;
1791         rth->rt_iif     = 0;
1792         rth->rt_pmtu    = 0;
1793         rth->rt_gateway = 0;
1794         rth->rt_uses_gateway = 0;
1795         INIT_LIST_HEAD(&rth->rt_uncached);
1796         RT_CACHE_STAT_INC(in_slow_tot);
1797         if (res.type == RTN_UNREACHABLE) {
1798                 rth->dst.input= ip_error;
1799                 rth->dst.error= -err;
1800                 rth->rt_flags   &= ~RTCF_LOCAL;
1801         }
1802         if (do_cache) {
1803                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1804                         rth->dst.flags |= DST_NOCACHE;
1805                         rt_add_uncached_list(rth);
1806                 }
1807         }
1808         skb_dst_set(skb, &rth->dst);
1809         err = 0;
1810         goto out;
1811
1812 no_route:
1813         RT_CACHE_STAT_INC(in_no_route);
1814         res.type = RTN_UNREACHABLE;
1815         res.fi = NULL;
1816         goto local_input;
1817
1818         /*
1819          *      Do not cache martian addresses: they should be logged (RFC1812)
1820          */
1821 martian_destination:
1822         RT_CACHE_STAT_INC(in_martian_dst);
1823 #ifdef CONFIG_IP_ROUTE_VERBOSE
1824         if (IN_DEV_LOG_MARTIANS(in_dev))
1825                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1826                                      &daddr, &saddr, dev->name);
1827 #endif
1828
1829 e_inval:
1830         err = -EINVAL;
1831         goto out;
1832
1833 e_nobufs:
1834         err = -ENOBUFS;
1835         goto out;
1836
1837 martian_source:
1838         err = -EINVAL;
1839 martian_source_keep_err:
1840         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1841         goto out;
1842 }
1843
1844 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1845                          u8 tos, struct net_device *dev)
1846 {
1847         int res;
1848
1849         rcu_read_lock();
1850
1851         /* Multicast recognition logic is moved from route cache to here.
1852            The problem was that too many Ethernet cards have broken/missing
1853            hardware multicast filters :-( As result the host on multicasting
1854            network acquires a lot of useless route cache entries, sort of
1855            SDR messages from all the world. Now we try to get rid of them.
1856            Really, provided software IP multicast filter is organized
1857            reasonably (at least, hashed), it does not result in a slowdown
1858            comparing with route cache reject entries.
1859            Note, that multicast routers are not affected, because
1860            route cache entry is created eventually.
1861          */
1862         if (ipv4_is_multicast(daddr)) {
1863                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1864
1865                 if (in_dev) {
1866                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1867                                                   ip_hdr(skb)->protocol);
1868                         if (our
1869 #ifdef CONFIG_IP_MROUTE
1870                                 ||
1871                             (!ipv4_is_local_multicast(daddr) &&
1872                              IN_DEV_MFORWARD(in_dev))
1873 #endif
1874                            ) {
1875                                 int res = ip_route_input_mc(skb, daddr, saddr,
1876                                                             tos, dev, our);
1877                                 rcu_read_unlock();
1878                                 return res;
1879                         }
1880                 }
1881                 rcu_read_unlock();
1882                 return -EINVAL;
1883         }
1884         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1885         rcu_read_unlock();
1886         return res;
1887 }
1888 EXPORT_SYMBOL(ip_route_input_noref);
1889
1890 /* called with rcu_read_lock() */
1891 static struct rtable *__mkroute_output(const struct fib_result *res,
1892                                        const struct flowi4 *fl4, int orig_oif,
1893                                        struct net_device *dev_out,
1894                                        unsigned int flags)
1895 {
1896         struct fib_info *fi = res->fi;
1897         struct fib_nh_exception *fnhe;
1898         struct in_device *in_dev;
1899         u16 type = res->type;
1900         struct rtable *rth;
1901         bool do_cache;
1902
1903         in_dev = __in_dev_get_rcu(dev_out);
1904         if (!in_dev)
1905                 return ERR_PTR(-EINVAL);
1906
1907         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1908                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1909                         return ERR_PTR(-EINVAL);
1910
1911         if (ipv4_is_lbcast(fl4->daddr))
1912                 type = RTN_BROADCAST;
1913         else if (ipv4_is_multicast(fl4->daddr))
1914                 type = RTN_MULTICAST;
1915         else if (ipv4_is_zeronet(fl4->daddr))
1916                 return ERR_PTR(-EINVAL);
1917
1918         if (dev_out->flags & IFF_LOOPBACK)
1919                 flags |= RTCF_LOCAL;
1920
1921         do_cache = true;
1922         if (type == RTN_BROADCAST) {
1923                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1924                 fi = NULL;
1925         } else if (type == RTN_MULTICAST) {
1926                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1927                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1928                                      fl4->flowi4_proto))
1929                         flags &= ~RTCF_LOCAL;
1930                 else
1931                         do_cache = false;
1932                 /* If multicast route do not exist use
1933                  * default one, but do not gateway in this case.
1934                  * Yes, it is hack.
1935                  */
1936                 if (fi && res->prefixlen < 4)
1937                         fi = NULL;
1938         }
1939
1940         fnhe = NULL;
1941         do_cache &= fi != NULL;
1942         if (do_cache) {
1943                 struct rtable __rcu **prth;
1944                 struct fib_nh *nh = &FIB_RES_NH(*res);
1945
1946                 fnhe = find_exception(nh, fl4->daddr);
1947                 if (fnhe)
1948                         prth = &fnhe->fnhe_rth_output;
1949                 else {
1950                         if (unlikely(fl4->flowi4_flags &
1951                                      FLOWI_FLAG_KNOWN_NH &&
1952                                      !(nh->nh_gw &&
1953                                        nh->nh_scope == RT_SCOPE_LINK))) {
1954                                 do_cache = false;
1955                                 goto add;
1956                         }
1957                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
1958                 }
1959                 rth = rcu_dereference(*prth);
1960                 if (rt_cache_valid(rth)) {
1961                         dst_hold(&rth->dst);
1962                         return rth;
1963                 }
1964         }
1965
1966 add:
1967         rth = rt_dst_alloc(dev_out,
1968                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1969                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1970                            do_cache);
1971         if (!rth)
1972                 return ERR_PTR(-ENOBUFS);
1973
1974         rth->dst.output = ip_output;
1975
1976         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1977         rth->rt_flags   = flags;
1978         rth->rt_type    = type;
1979         rth->rt_is_input = 0;
1980         rth->rt_iif     = orig_oif ? : 0;
1981         rth->rt_pmtu    = 0;
1982         rth->rt_gateway = 0;
1983         rth->rt_uses_gateway = 0;
1984         INIT_LIST_HEAD(&rth->rt_uncached);
1985
1986         RT_CACHE_STAT_INC(out_slow_tot);
1987
1988         if (flags & RTCF_LOCAL)
1989                 rth->dst.input = ip_local_deliver;
1990         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1991                 if (flags & RTCF_LOCAL &&
1992                     !(dev_out->flags & IFF_LOOPBACK)) {
1993                         rth->dst.output = ip_mc_output;
1994                         RT_CACHE_STAT_INC(out_slow_mc);
1995                 }
1996 #ifdef CONFIG_IP_MROUTE
1997                 if (type == RTN_MULTICAST) {
1998                         if (IN_DEV_MFORWARD(in_dev) &&
1999                             !ipv4_is_local_multicast(fl4->daddr)) {
2000                                 rth->dst.input = ip_mr_input;
2001                                 rth->dst.output = ip_mc_output;
2002                         }
2003                 }
2004 #endif
2005         }
2006
2007         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2008
2009         return rth;
2010 }
2011
2012 /*
2013  * Major route resolver routine.
2014  */
2015
2016 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2017 {
2018         struct net_device *dev_out = NULL;
2019         __u8 tos = RT_FL_TOS(fl4);
2020         unsigned int flags = 0;
2021         struct fib_result res;
2022         struct rtable *rth;
2023         int orig_oif;
2024
2025         res.tclassid    = 0;
2026         res.fi          = NULL;
2027         res.table       = NULL;
2028
2029         orig_oif = fl4->flowi4_oif;
2030
2031         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2032         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2033         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2034                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2035
2036         rcu_read_lock();
2037         if (fl4->saddr) {
2038                 rth = ERR_PTR(-EINVAL);
2039                 if (ipv4_is_multicast(fl4->saddr) ||
2040                     ipv4_is_lbcast(fl4->saddr) ||
2041                     ipv4_is_zeronet(fl4->saddr))
2042                         goto out;
2043
2044                 /* I removed check for oif == dev_out->oif here.
2045                    It was wrong for two reasons:
2046                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2047                       is assigned to multiple interfaces.
2048                    2. Moreover, we are allowed to send packets with saddr
2049                       of another iface. --ANK
2050                  */
2051
2052                 if (fl4->flowi4_oif == 0 &&
2053                     (ipv4_is_multicast(fl4->daddr) ||
2054                      ipv4_is_lbcast(fl4->daddr))) {
2055                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2056                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2057                         if (!dev_out)
2058                                 goto out;
2059
2060                         /* Special hack: user can direct multicasts
2061                            and limited broadcast via necessary interface
2062                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2063                            This hack is not just for fun, it allows
2064                            vic,vat and friends to work.
2065                            They bind socket to loopback, set ttl to zero
2066                            and expect that it will work.
2067                            From the viewpoint of routing cache they are broken,
2068                            because we are not allowed to build multicast path
2069                            with loopback source addr (look, routing cache
2070                            cannot know, that ttl is zero, so that packet
2071                            will not leave this host and route is valid).
2072                            Luckily, this hack is good workaround.
2073                          */
2074
2075                         fl4->flowi4_oif = dev_out->ifindex;
2076                         goto make_route;
2077                 }
2078
2079                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2080                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2081                         if (!__ip_dev_find(net, fl4->saddr, false))
2082                                 goto out;
2083                 }
2084         }
2085
2086
2087         if (fl4->flowi4_oif) {
2088                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2089                 rth = ERR_PTR(-ENODEV);
2090                 if (!dev_out)
2091                         goto out;
2092
2093                 /* RACE: Check return value of inet_select_addr instead. */
2094                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2095                         rth = ERR_PTR(-ENETUNREACH);
2096                         goto out;
2097                 }
2098                 if (ipv4_is_local_multicast(fl4->daddr) ||
2099                     ipv4_is_lbcast(fl4->daddr)) {
2100                         if (!fl4->saddr)
2101                                 fl4->saddr = inet_select_addr(dev_out, 0,
2102                                                               RT_SCOPE_LINK);
2103                         goto make_route;
2104                 }
2105                 if (!fl4->saddr) {
2106                         if (ipv4_is_multicast(fl4->daddr))
2107                                 fl4->saddr = inet_select_addr(dev_out, 0,
2108                                                               fl4->flowi4_scope);
2109                         else if (!fl4->daddr)
2110                                 fl4->saddr = inet_select_addr(dev_out, 0,
2111                                                               RT_SCOPE_HOST);
2112                 }
2113         }
2114
2115         if (!fl4->daddr) {
2116                 fl4->daddr = fl4->saddr;
2117                 if (!fl4->daddr)
2118                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2119                 dev_out = net->loopback_dev;
2120                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2121                 res.type = RTN_LOCAL;
2122                 flags |= RTCF_LOCAL;
2123                 goto make_route;
2124         }
2125
2126         if (fib_lookup(net, fl4, &res)) {
2127                 res.fi = NULL;
2128                 res.table = NULL;
2129                 if (fl4->flowi4_oif) {
2130                         /* Apparently, routing tables are wrong. Assume,
2131                            that the destination is on link.
2132
2133                            WHY? DW.
2134                            Because we are allowed to send to iface
2135                            even if it has NO routes and NO assigned
2136                            addresses. When oif is specified, routing
2137                            tables are looked up with only one purpose:
2138                            to catch if destination is gatewayed, rather than
2139                            direct. Moreover, if MSG_DONTROUTE is set,
2140                            we send packet, ignoring both routing tables
2141                            and ifaddr state. --ANK
2142
2143
2144                            We could make it even if oif is unknown,
2145                            likely IPv6, but we do not.
2146                          */
2147
2148                         if (fl4->saddr == 0)
2149                                 fl4->saddr = inet_select_addr(dev_out, 0,
2150                                                               RT_SCOPE_LINK);
2151                         res.type = RTN_UNICAST;
2152                         goto make_route;
2153                 }
2154                 rth = ERR_PTR(-ENETUNREACH);
2155                 goto out;
2156         }
2157
2158         if (res.type == RTN_LOCAL) {
2159                 if (!fl4->saddr) {
2160                         if (res.fi->fib_prefsrc)
2161                                 fl4->saddr = res.fi->fib_prefsrc;
2162                         else
2163                                 fl4->saddr = fl4->daddr;
2164                 }
2165                 dev_out = net->loopback_dev;
2166                 fl4->flowi4_oif = dev_out->ifindex;
2167                 flags |= RTCF_LOCAL;
2168                 goto make_route;
2169         }
2170
2171 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2172         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2173                 fib_select_multipath(&res);
2174         else
2175 #endif
2176         if (!res.prefixlen &&
2177             res.table->tb_num_default > 1 &&
2178             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2179                 fib_select_default(&res);
2180
2181         if (!fl4->saddr)
2182                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2183
2184         dev_out = FIB_RES_DEV(res);
2185         fl4->flowi4_oif = dev_out->ifindex;
2186
2187
2188 make_route:
2189         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2190
2191 out:
2192         rcu_read_unlock();
2193         return rth;
2194 }
2195 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2196
2197 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2198 {
2199         return NULL;
2200 }
2201
2202 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2203 {
2204         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2205
2206         return mtu ? : dst->dev->mtu;
2207 }
2208
2209 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2210                                           struct sk_buff *skb, u32 mtu)
2211 {
2212 }
2213
/* .redirect hook of ipv4_dst_blackhole_ops: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2218
2219 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2220                                           unsigned long old)
2221 {
2222         return NULL;
2223 }
2224
2225 static struct dst_ops ipv4_dst_blackhole_ops = {
2226         .family                 =       AF_INET,
2227         .check                  =       ipv4_blackhole_dst_check,
2228         .mtu                    =       ipv4_blackhole_mtu,
2229         .default_advmss         =       ipv4_default_advmss,
2230         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2231         .redirect               =       ipv4_rt_blackhole_redirect,
2232         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2233         .neigh_lookup           =       ipv4_neigh_lookup,
2234 };
2235
2236 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2237 {
2238         struct rtable *ort = (struct rtable *) dst_orig;
2239         struct rtable *rt;
2240
2241         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2242         if (rt) {
2243                 struct dst_entry *new = &rt->dst;
2244
2245                 new->__use = 1;
2246                 new->input = dst_discard;
2247                 new->output = dst_discard_sk;
2248
2249                 new->dev = ort->dst.dev;
2250                 if (new->dev)
2251                         dev_hold(new->dev);
2252
2253                 rt->rt_is_input = ort->rt_is_input;
2254                 rt->rt_iif = ort->rt_iif;
2255                 rt->rt_pmtu = ort->rt_pmtu;
2256
2257                 rt->rt_genid = rt_genid_ipv4(net);
2258                 rt->rt_flags = ort->rt_flags;
2259                 rt->rt_type = ort->rt_type;
2260                 rt->rt_gateway = ort->rt_gateway;
2261                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2262
2263                 INIT_LIST_HEAD(&rt->rt_uncached);
2264
2265                 dst_free(new);
2266         }
2267
2268         dst_release(dst_orig);
2269
2270         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2271 }
2272
2273 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2274                                     struct sock *sk)
2275 {
2276         struct rtable *rt = __ip_route_output_key(net, flp4);
2277
2278         if (IS_ERR(rt))
2279                 return rt;
2280
2281         if (flp4->flowi4_proto)
2282                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2283                                                         flowi4_to_flowi(flp4),
2284                                                         sk, 0);
2285
2286         return rt;
2287 }
2288 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2289
2290 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2291                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2292                         u32 seq, int event, int nowait, unsigned int flags)
2293 {
2294         struct rtable *rt = skb_rtable(skb);
2295         struct rtmsg *r;
2296         struct nlmsghdr *nlh;
2297         unsigned long expires = 0;
2298         u32 error;
2299         u32 metrics[RTAX_MAX];
2300
2301         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2302         if (!nlh)
2303                 return -EMSGSIZE;
2304
2305         r = nlmsg_data(nlh);
2306         r->rtm_family    = AF_INET;
2307         r->rtm_dst_len  = 32;
2308         r->rtm_src_len  = 0;
2309         r->rtm_tos      = fl4->flowi4_tos;
2310         r->rtm_table    = RT_TABLE_MAIN;
2311         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2312                 goto nla_put_failure;
2313         r->rtm_type     = rt->rt_type;
2314         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2315         r->rtm_protocol = RTPROT_UNSPEC;
2316         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2317         if (rt->rt_flags & RTCF_NOTIFY)
2318                 r->rtm_flags |= RTM_F_NOTIFY;
2319         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2320                 r->rtm_flags |= RTCF_DOREDIRECT;
2321
2322         if (nla_put_in_addr(skb, RTA_DST, dst))
2323                 goto nla_put_failure;
2324         if (src) {
2325                 r->rtm_src_len = 32;
2326                 if (nla_put_in_addr(skb, RTA_SRC, src))
2327                         goto nla_put_failure;
2328         }
2329         if (rt->dst.dev &&
2330             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2331                 goto nla_put_failure;
2332 #ifdef CONFIG_IP_ROUTE_CLASSID
2333         if (rt->dst.tclassid &&
2334             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2335                 goto nla_put_failure;
2336 #endif
2337         if (!rt_is_input_route(rt) &&
2338             fl4->saddr != src) {
2339                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2340                         goto nla_put_failure;
2341         }
2342         if (rt->rt_uses_gateway &&
2343             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2344                 goto nla_put_failure;
2345
2346         expires = rt->dst.expires;
2347         if (expires) {
2348                 unsigned long now = jiffies;
2349
2350                 if (time_before(now, expires))
2351                         expires -= now;
2352                 else
2353                         expires = 0;
2354         }
2355
2356         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2357         if (rt->rt_pmtu && expires)
2358                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2359         if (rtnetlink_put_metrics(skb, metrics) < 0)
2360                 goto nla_put_failure;
2361
2362         if (fl4->flowi4_mark &&
2363             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2364                 goto nla_put_failure;
2365
2366         error = rt->dst.error;
2367
2368         if (rt_is_input_route(rt)) {
2369 #ifdef CONFIG_IP_MROUTE
2370                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2371                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2372                         int err = ipmr_get_route(net, skb,
2373                                                  fl4->saddr, fl4->daddr,
2374                                                  r, nowait);
2375                         if (err <= 0) {
2376                                 if (!nowait) {
2377                                         if (err == 0)
2378                                                 return 0;
2379                                         goto nla_put_failure;
2380                                 } else {
2381                                         if (err == -EMSGSIZE)
2382                                                 goto nla_put_failure;
2383                                         error = err;
2384                                 }
2385                         }
2386                 } else
2387 #endif
2388                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2389                                 goto nla_put_failure;
2390         }
2391
2392         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2393                 goto nla_put_failure;
2394
2395         nlmsg_end(skb, nlh);
2396         return 0;
2397
2398 nla_put_failure:
2399         nlmsg_cancel(skb, nlh);
2400         return -EMSGSIZE;
2401 }
2402
2403 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2404 {
2405         struct net *net = sock_net(in_skb->sk);
2406         struct rtmsg *rtm;
2407         struct nlattr *tb[RTA_MAX+1];
2408         struct rtable *rt = NULL;
2409         struct flowi4 fl4;
2410         __be32 dst = 0;
2411         __be32 src = 0;
2412         u32 iif;
2413         int err;
2414         int mark;
2415         struct sk_buff *skb;
2416
2417         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2418         if (err < 0)
2419                 goto errout;
2420
2421         rtm = nlmsg_data(nlh);
2422
2423         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2424         if (!skb) {
2425                 err = -ENOBUFS;
2426                 goto errout;
2427         }
2428
2429         /* Reserve room for dummy headers, this skb can pass
2430            through good chunk of routing engine.
2431          */
2432         skb_reset_mac_header(skb);
2433         skb_reset_network_header(skb);
2434
2435         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2436         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2437         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2438
2439         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2440         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2441         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2442         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2443
2444         memset(&fl4, 0, sizeof(fl4));
2445         fl4.daddr = dst;
2446         fl4.saddr = src;
2447         fl4.flowi4_tos = rtm->rtm_tos;
2448         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2449         fl4.flowi4_mark = mark;
2450
2451         if (iif) {
2452                 struct net_device *dev;
2453
2454                 dev = __dev_get_by_index(net, iif);
2455                 if (!dev) {
2456                         err = -ENODEV;
2457                         goto errout_free;
2458                 }
2459
2460                 skb->protocol   = htons(ETH_P_IP);
2461                 skb->dev        = dev;
2462                 skb->mark       = mark;
2463                 local_bh_disable();
2464                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2465                 local_bh_enable();
2466
2467                 rt = skb_rtable(skb);
2468                 if (err == 0 && rt->dst.error)
2469                         err = -rt->dst.error;
2470         } else {
2471                 rt = ip_route_output_key(net, &fl4);
2472
2473                 err = 0;
2474                 if (IS_ERR(rt))
2475                         err = PTR_ERR(rt);
2476         }
2477
2478         if (err)
2479                 goto errout_free;
2480
2481         skb_dst_set(skb, &rt->dst);
2482         if (rtm->rtm_flags & RTM_F_NOTIFY)
2483                 rt->rt_flags |= RTCF_NOTIFY;
2484
2485         err = rt_fill_info(net, dst, src, &fl4, skb,
2486                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2487                            RTM_NEWROUTE, 0, 0);
2488         if (err < 0)
2489                 goto errout_free;
2490
2491         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2492 errout:
2493         return err;
2494
2495 errout_free:
2496         kfree_skb(skb);
2497         goto errout;
2498 }
2499
2500 void ip_rt_multicast_event(struct in_device *in_dev)
2501 {
2502         rt_cache_flush(dev_net(in_dev->dev));
2503 }
2504
2505 #ifdef CONFIG_SYSCTL
2506 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2507 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2508 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2509 static int ip_rt_gc_elasticity __read_mostly    = 8;
2510
2511 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2512                                         void __user *buffer,
2513                                         size_t *lenp, loff_t *ppos)
2514 {
2515         struct net *net = (struct net *)__ctl->extra1;
2516
2517         if (write) {
2518                 rt_cache_flush(net);
2519                 fnhe_genid_bump(net);
2520                 return 0;
2521         }
2522
2523         return -EINVAL;
2524 }
2525
2526 static struct ctl_table ipv4_route_table[] = {
2527         {
2528                 .procname       = "gc_thresh",
2529                 .data           = &ipv4_dst_ops.gc_thresh,
2530                 .maxlen         = sizeof(int),
2531                 .mode           = 0644,
2532                 .proc_handler   = proc_dointvec,
2533         },
2534         {
2535                 .procname       = "max_size",
2536                 .data           = &ip_rt_max_size,
2537                 .maxlen         = sizeof(int),
2538                 .mode           = 0644,
2539                 .proc_handler   = proc_dointvec,
2540         },
2541         {
2542                 /*  Deprecated. Use gc_min_interval_ms */
2543
2544                 .procname       = "gc_min_interval",
2545                 .data           = &ip_rt_gc_min_interval,
2546                 .maxlen         = sizeof(int),
2547                 .mode           = 0644,
2548                 .proc_handler   = proc_dointvec_jiffies,
2549         },
2550         {
2551                 .procname       = "gc_min_interval_ms",
2552                 .data           = &ip_rt_gc_min_interval,
2553                 .maxlen         = sizeof(int),
2554                 .mode           = 0644,
2555                 .proc_handler   = proc_dointvec_ms_jiffies,
2556         },
2557         {
2558                 .procname       = "gc_timeout",
2559                 .data           = &ip_rt_gc_timeout,
2560                 .maxlen         = sizeof(int),
2561                 .mode           = 0644,
2562                 .proc_handler   = proc_dointvec_jiffies,
2563         },
2564         {
2565                 .procname       = "gc_interval",
2566                 .data           = &ip_rt_gc_interval,
2567                 .maxlen         = sizeof(int),
2568                 .mode           = 0644,
2569                 .proc_handler   = proc_dointvec_jiffies,
2570         },
2571         {
2572                 .procname       = "redirect_load",
2573                 .data           = &ip_rt_redirect_load,
2574                 .maxlen         = sizeof(int),
2575                 .mode           = 0644,
2576                 .proc_handler   = proc_dointvec,
2577         },
2578         {
2579                 .procname       = "redirect_number",
2580                 .data           = &ip_rt_redirect_number,
2581                 .maxlen         = sizeof(int),
2582                 .mode           = 0644,
2583                 .proc_handler   = proc_dointvec,
2584         },
2585         {
2586                 .procname       = "redirect_silence",
2587                 .data           = &ip_rt_redirect_silence,
2588                 .maxlen         = sizeof(int),
2589                 .mode           = 0644,
2590                 .proc_handler   = proc_dointvec,
2591         },
2592         {
2593                 .procname       = "error_cost",
2594                 .data           = &ip_rt_error_cost,
2595                 .maxlen         = sizeof(int),
2596                 .mode           = 0644,
2597                 .proc_handler   = proc_dointvec,
2598         },
2599         {
2600                 .procname       = "error_burst",
2601                 .data           = &ip_rt_error_burst,
2602                 .maxlen         = sizeof(int),
2603                 .mode           = 0644,
2604                 .proc_handler   = proc_dointvec,
2605         },
2606         {
2607                 .procname       = "gc_elasticity",
2608                 .data           = &ip_rt_gc_elasticity,
2609                 .maxlen         = sizeof(int),
2610                 .mode           = 0644,
2611                 .proc_handler   = proc_dointvec,
2612         },
2613         {
2614                 .procname       = "mtu_expires",
2615                 .data           = &ip_rt_mtu_expires,
2616                 .maxlen         = sizeof(int),
2617                 .mode           = 0644,
2618                 .proc_handler   = proc_dointvec_jiffies,
2619         },
2620         {
2621                 .procname       = "min_pmtu",
2622                 .data           = &ip_rt_min_pmtu,
2623                 .maxlen         = sizeof(int),
2624                 .mode           = 0644,
2625                 .proc_handler   = proc_dointvec,
2626         },
2627         {
2628                 .procname       = "min_adv_mss",
2629                 .data           = &ip_rt_min_advmss,
2630                 .maxlen         = sizeof(int),
2631                 .mode           = 0644,
2632                 .proc_handler   = proc_dointvec,
2633         },
2634         { }
2635 };
2636
2637 static struct ctl_table ipv4_route_flush_table[] = {
2638         {
2639                 .procname       = "flush",
2640                 .maxlen         = sizeof(int),
2641                 .mode           = 0200,
2642                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2643         },
2644         { },
2645 };
2646
2647 static __net_init int sysctl_route_net_init(struct net *net)
2648 {
2649         struct ctl_table *tbl;
2650
2651         tbl = ipv4_route_flush_table;
2652         if (!net_eq(net, &init_net)) {
2653                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2654                 if (!tbl)
2655                         goto err_dup;
2656
2657                 /* Don't export sysctls to unprivileged users */
2658                 if (net->user_ns != &init_user_ns)
2659                         tbl[0].procname = NULL;
2660         }
2661         tbl[0].extra1 = net;
2662
2663         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2664         if (!net->ipv4.route_hdr)
2665                 goto err_reg;
2666         return 0;
2667
2668 err_reg:
2669         if (tbl != ipv4_route_flush_table)
2670                 kfree(tbl);
2671 err_dup:
2672         return -ENOMEM;
2673 }
2674
2675 static __net_exit void sysctl_route_net_exit(struct net *net)
2676 {
2677         struct ctl_table *tbl;
2678
2679         tbl = net->ipv4.route_hdr->ctl_table_arg;
2680         unregister_net_sysctl_table(net->ipv4.route_hdr);
2681         BUG_ON(tbl == ipv4_route_flush_table);
2682         kfree(tbl);
2683 }
2684
2685 static __net_initdata struct pernet_operations sysctl_route_ops = {
2686         .init = sysctl_route_net_init,
2687         .exit = sysctl_route_net_exit,
2688 };
2689 #endif
2690
2691 static __net_init int rt_genid_init(struct net *net)
2692 {
2693         atomic_set(&net->ipv4.rt_genid, 0);
2694         atomic_set(&net->fnhe_genid, 0);
2695         get_random_bytes(&net->ipv4.dev_addr_genid,
2696                          sizeof(net->ipv4.dev_addr_genid));
2697         return 0;
2698 }
2699
2700 static __net_initdata struct pernet_operations rt_genid_ops = {
2701         .init = rt_genid_init,
2702 };
2703
2704 static int __net_init ipv4_inetpeer_init(struct net *net)
2705 {
2706         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2707
2708         if (!bp)
2709                 return -ENOMEM;
2710         inet_peer_base_init(bp);
2711         net->ipv4.peers = bp;
2712         return 0;
2713 }
2714
2715 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2716 {
2717         struct inet_peer_base *bp = net->ipv4.peers;
2718
2719         net->ipv4.peers = NULL;
2720         inetpeer_invalidate_tree(bp);
2721         kfree(bp);
2722 }
2723
2724 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2725         .init   =       ipv4_inetpeer_init,
2726         .exit   =       ipv4_inetpeer_exit,
2727 };
2728
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; allocated by ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2732
2733 int __init ip_rt_init(void)
2734 {
2735         int rc = 0;
2736         int cpu;
2737
2738         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2739         if (!ip_idents)
2740                 panic("IP: failed to allocate ip_idents\n");
2741
2742         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2743
2744         for_each_possible_cpu(cpu) {
2745                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2746
2747                 INIT_LIST_HEAD(&ul->head);
2748                 spin_lock_init(&ul->lock);
2749         }
2750 #ifdef CONFIG_IP_ROUTE_CLASSID
2751         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2752         if (!ip_rt_acct)
2753                 panic("IP: failed to allocate ip_rt_acct\n");
2754 #endif
2755
2756         ipv4_dst_ops.kmem_cachep =
2757                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2758                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2759
2760         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2761
2762         if (dst_entries_init(&ipv4_dst_ops) < 0)
2763                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2764
2765         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2766                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2767
2768         ipv4_dst_ops.gc_thresh = ~0;
2769         ip_rt_max_size = INT_MAX;
2770
2771         devinet_init();
2772         ip_fib_init();
2773
2774         if (ip_rt_proc_init())
2775                 pr_err("Unable to create route proc files\n");
2776 #ifdef CONFIG_XFRM
2777         xfrm_init();
2778         xfrm4_init();
2779 #endif
2780         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2781
2782 #ifdef CONFIG_SYSCTL
2783         register_pernet_subsys(&sysctl_route_ops);
2784 #endif
2785         register_pernet_subsys(&rt_genid_ops);
2786         register_pernet_subsys(&ipv4_inetpeer_ops);
2787         return rc;
2788 }
2789
2790 #ifdef CONFIG_SYSCTL
2791 /*
2792  * We really need to sanitize the damn ipv4 init order, then all
2793  * this nonsense will go away.
2794  */
2795 void __init ip_static_sysctl_init(void)
2796 {
2797         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2798 }
2799 #endif