// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
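
/* These defaults are visible and tunable through sysctls under
 * /proc/sys/net/ipv4/route/ (redirect_number, redirect_load, min_pmtu,
 * mtu_expires, ...); see the ipv4_route_table definition later in this
 * file.  As an example, with HZ=1000 the redirect silence window is
 * (1000/50) << 10 = 20480 jiffies, roughly 20 seconds.
 */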

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
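
/* ip_tos2prio[] is indexed by the four RFC 1349 TOS bits:
 * rt_tos2priority() in include/net/route.h computes
 * ip_tos2prio[IPTOS_TOS(tos) >> 1], so each even/odd pair maps one
 * traffic class, with the odd ECN_OR_COST() slots covering TOS values
 * whose low "minimize cost" bit (since reused by ECN) is set.
 */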

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}
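
/* The IPv4 routing cache itself was removed in Linux 3.6, so the show
 * callback above only ever emits the header line; the seq_file
 * boilerplate is kept so /proc/net/rt_cache retains its historical
 * format for userspace readers.
 */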

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
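
/* Worked example: if a bucket was last touched 100 jiffies ago, delta
 * becomes a random value in [0, 100), so consecutive allocations leak
 * no precise packet count to an observer.  The return value is the
 * first of "segs" consecutive IDs reserved for the caller.
 */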

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}
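
/* Note the precedence above: when a socket is supplied, its bound
 * device, mark, TOS and protocol override whatever was derived from the
 * packet headers, so connected sockets re-route against their own
 * parameters rather than the skb that triggered the lookup.
 */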

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
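
/* Exceptions hash into FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) buckets
 * per nexthop; the boot-time random seed prevents remote hosts from
 * deliberately colliding their destinations into a single chain.
 */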

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
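
/* Note the bound above: once a chain exceeds FNHE_RECLAIM_DEPTH, the
 * stalest entry is recycled instead of allocating a new one, so a burst
 * of ICMP-induced exceptions cannot grow a nexthop's table without
 * limit.
 */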

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
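
/* Concretely, with the defaults above and HZ=1000: redirect n+1 is only
 * sent once rate_last + (ip_rt_redirect_load << n) has passed, i.e.
 * after 20ms, 40ms, 80ms, ..., and after ip_rt_redirect_number (9)
 * unanswered redirects the peer is left alone until it has been quiet
 * for ip_rt_redirect_silence (~20s).
 */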

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything and
         * set dst.rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
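
/* The limiter above is a token bucket: tokens accrue one per jiffy up
 * to ip_rt_error_burst (5 * HZ) and each ICMP error costs
 * ip_rt_error_cost (HZ), allowing bursts of five errors and a sustained
 * rate of one per second per peer.
 */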

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
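
/* Learned PMTU values below ip_rt_min_pmtu (552 by default) are not
 * taken at face value: the exception keeps the smaller of the old MTU
 * and the floor and is marked locked, so forged ICMP FRAG_NEEDED
 * messages cannot force the path MTU arbitrarily low.
 */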

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
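
/* Precedence here mirrors ip_mtu_from_fib_result() below: an unexpired
 * per-route PMTU wins, then an explicit RTAX_MTU metric, then the
 * device MTU, capped at IP_MAX_MTU and reduced by any lightweight
 * tunnel encapsulation headroom.
 */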

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}
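
/* Expired exceptions are reaped lazily: the first lookup that trips
 * over a stale entry deletes it under fnhe_lock and reports a miss, so
 * no periodic garbage collection pass is required.
 */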

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}
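
/* Input routes are cached once per nexthop while output routes get a
 * per-CPU slot; the cmpxchg() lets concurrent writers race safely, the
 * loser dropping its extra reference and returning false so the caller
 * falls back to the uncached list.
 */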
1498
1499 struct uncached_list {
1500         spinlock_t              lock;
1501         struct list_head        head;
1502 };
1503
1504 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1505
1506 void rt_add_uncached_list(struct rtable *rt)
1507 {
1508         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1509
1510         rt->rt_uncached_list = ul;
1511
1512         spin_lock_bh(&ul->lock);
1513         list_add_tail(&rt->rt_uncached, &ul->head);
1514         spin_unlock_bh(&ul->lock);
1515 }
1516
1517 void rt_del_uncached_list(struct rtable *rt)
1518 {
1519         if (!list_empty(&rt->rt_uncached)) {
1520                 struct uncached_list *ul = rt->rt_uncached_list;
1521
1522                 spin_lock_bh(&ul->lock);
1523                 list_del(&rt->rt_uncached);
1524                 spin_unlock_bh(&ul->lock);
1525         }
1526 }
1527
static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;

        ip_dst_metrics_put(dst);
        rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = blackhole_netdev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

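/* rt_flush_dev() does not free routes: it re-points dst.dev at
 * blackhole_netdev, a device that is never unregistered, so late users
 * of the dst keep a valid device reference while the real device goes
 * away.
 */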
static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag,
                           const bool do_cache)
{
        bool cached = false;

        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
                        rt->rt_uses_gateway = 1;
                        rt->rt_gw_family = nhc->nhc_gw_family;
                        /* only INET and INET6 are supported */
                        if (likely(nhc->nhc_gw_family == AF_INET))
                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
                        else
                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
                }

                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
                if (nhc->nhc_family == AF_INET) {
                        struct fib_nh *nh;

                        nh = container_of(nhc, struct fib_nh, nh_common);
                        rt->dst.tclassid = nh->nh_tclassid;
                }
#endif
                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
                else if (do_cache)
                        cached = rt_cache_route(nhc, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        if (!rt->rt_gw4) {
                                rt->rt_gw_family = AF_INET;
                                rt->rt_gw4 = daddr;
                        }
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

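/* Decision summary for rt_set_nexthop(): with an exception (fnhe) the
 * route is bound to it; otherwise, if caching was requested, it goes
 * into the nexthop cache.  When neither sticks, the route lands on the
 * uncached list and, lacking a cached gateway, rt_gw4 falls back to the
 * destination address itself.
 */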
struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool nopolicy, bool noxfrm)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                       (nopolicy ? DST_NOPOLICY : 0) |
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_mtu_locked = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_gw_family = 0;
                rt->rt_gw4 = 0;
                INIT_LIST_HEAD(&rt->rt_uncached);

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

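/* Example (illustrative sketch): a local-delivery allocation, as done
 * in ip_route_input_slow() below, follows the pattern
 *
 *      rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL,
 *                         res->type,
 *                         IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
 *      if (!rth)
 *              goto e_nobufs;
 *
 * Passing RTCF_LOCAL makes dst.input point at ip_local_deliver; callers
 * override dst.input/dst.output afterwards as needed.
 */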
struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
        struct rtable *new_rt;

        new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                           rt->dst.flags);

        if (new_rt) {
                new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                new_rt->rt_flags = rt->rt_flags;
                new_rt->rt_type = rt->rt_type;
                new_rt->rt_is_input = rt->rt_is_input;
                new_rt->rt_iif = rt->rt_iif;
                new_rt->rt_pmtu = rt->rt_pmtu;
                new_rt->rt_mtu_locked = rt->rt_mtu_locked;
                new_rt->rt_gw_family = rt->rt_gw_family;
                if (rt->rt_gw_family == AF_INET)
                        new_rt->rt_gw4 = rt->rt_gw4;
                else if (rt->rt_gw_family == AF_INET6)
                        new_rt->rt_gw6 = rt->rt_gw6;
                INIT_LIST_HEAD(&new_rt->rt_uncached);

                new_rt->dst.input = rt->dst.input;
                new_rt->dst.output = rt->dst.output;
                new_rt->dst.error = rt->dst.error;
                new_rt->dst.lastuse = jiffies;
                new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
        }
        return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                          u8 tos, struct net_device *dev,
                          struct in_device *in_dev, u32 *itag)
{
        int err;

        /* Primary sanity checks. */
        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                return -EINVAL;

        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
                return -EINVAL;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr) &&
                    ip_hdr(skb)->protocol != IPPROTO_IGMP)
                        return -EINVAL;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, itag);
                if (err < 0)
                        return err;
        }
        return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                             u8 tos, struct net_device *dev, int our)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        unsigned int flags = RTCF_MULTICAST;
        struct rtable *rth;
        u32 itag = 0;
        int err;

        err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
        if (err)
                return err;

        if (our)
                flags |= RTCF_LOCAL;

        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
        if (!rth)
                return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;
        rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;
}

static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, false);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_common *nhc = FIB_RES_NHC(*res);
        struct net_device *dev = nhc->nhc_dev;
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(dev);
        if (!out_dev) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            skb->protocol == htons(ETH_P_IP)) {
                __be32 gw;

                gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
                if (IN_DEV_SHARED_MEDIA(out_dev) ||
                    inet_addr_onlink(out_dev, saddr, gw))
                        IPCB(skb)->flags |= IPSKB_DOREDIRECT;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back on the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        fnhe = find_exception(nhc, daddr);
        if (do_cache) {
                if (fnhe)
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
                else
                        rth = rcu_dereference(nhc->nhc_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM));
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_is_input = 1;
        RT_CACHE_STAT_INC(in_slow_tot);

        rth->dst.input = ip_forward;

        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
                       do_cache);
        lwtunnel_set_redirect(&rth->dst);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
 cleanup:
        return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
                                 struct flow_keys *hash_keys)
{
        const struct iphdr *outer_iph = ip_hdr(skb);
        const struct iphdr *key_iph = outer_iph;
        const struct iphdr *inner_iph;
        const struct icmphdr *icmph;
        struct iphdr _inner_iph;
        struct icmphdr _icmph;

        if (likely(outer_iph->protocol != IPPROTO_ICMP))
                goto out;

        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
                goto out;

        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
                                   &_icmph);
        if (!icmph)
                goto out;

        if (!icmp_is_err(icmph->type))
                goto out;

        inner_iph = skb_header_pointer(skb,
                                       outer_iph->ihl * 4 + sizeof(_icmph),
                                       sizeof(_inner_iph), &_inner_iph);
        if (!inner_iph)
                goto out;

        key_iph = inner_iph;
out:
        hash_keys->addrs.v4addrs.src = key_iph->saddr;
        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
        struct flow_keys hash_keys;
        u32 mhash;

        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
        case 0:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (skb) {
                        ip_multipath_l3_keys(skb, &hash_keys);
                } else {
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                }
                break;
        case 1:
                /* skb is currently provided only when forwarding */
                if (skb) {
                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
                        struct flow_keys keys;

                        /* short-circuit if we already have L4 hash present */
                        if (skb->l4_hash)
                                return skb_get_hash_raw(skb) >> 1;

                        memset(&hash_keys, 0, sizeof(hash_keys));

                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }

                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                } else {
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                        hash_keys.ports.src = fl4->fl4_sport;
                        hash_keys.ports.dst = fl4->fl4_dport;
                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
                }
                break;
        case 2:
                memset(&hash_keys, 0, sizeof(hash_keys));
                /* skb is currently provided only when forwarding */
                if (skb) {
                        struct flow_keys keys;

                        skb_flow_dissect_flow_keys(skb, &keys, 0);
                        /* Inner can be v4 or v6 */
                        if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
                                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
                        } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                                hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
                                hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
                                hash_keys.tags.flow_label = keys.tags.flow_label;
                                hash_keys.basic.ip_proto = keys.basic.ip_proto;
                        } else {
                                /* Same as case 0 */
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                ip_multipath_l3_keys(skb, &hash_keys);
                        }
                } else {
                        /* Same as case 0 */
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                }
                break;
        }
        mhash = flow_hash_from_keys(&hash_keys);

        if (multipath_hash)
                mhash = jhash_2words(mhash, multipath_hash, 0);

        return mhash >> 1;
}
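
/* The cases above follow the net.ipv4.fib_multipath_hash_policy sysctl:
 *   0 - L3 addresses (the inner addresses for ICMP errors, see
 *       ip_multipath_l3_keys() above)
 *   1 - L4 five-tuple
 *   2 - L3 addresses, taken from the inner headers of encapsulated
 *       packets when possible
 * From userspace, for example:
 *      sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */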
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos,
                            struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && fib_info_num_path(res->fi) > 1) {
                int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

                fib_select_multipath(res, h);
        }
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/* Implements the same saddr-related checks as ip_route_input_slow(),
 * assuming daddr is valid and the destination is not a local broadcast one.
 * Uses the provided hint instead of performing a route lookup.
 */
int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                      u8 tos, struct net_device *dev,
                      const struct sk_buff *hint)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct rtable *rt = (struct rtable *)hint;
        struct net *net = dev_net(dev);
        int err = -EINVAL;
        u32 tag = 0;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;

        if (ipv4_is_zeronet(saddr))
                goto martian_source;

        if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                goto martian_source;

        if (rt->rt_type != RTN_LOCAL)
                goto skip_validate_source;

        tos &= IPTOS_RT_MASK;
        err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
        if (err < 0)
                goto martian_source;

skip_validate_source:
        skb_dst_copy(skb, hint);
        return 0;

martian_source:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        return err;
}

/*
 *      NOTE. We drop all packets that have a local source
 *      address, because every properly looped back packet
 *      must have the correct destination already attached by the output
 *      routine. Changes in the enforced policies must also be applied
 *      to ip_route_use_hint().
 *
 *      Such an approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 *      called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev,
                               struct fib_result *res)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct flow_keys *flkeys = NULL, _flkeys;
        struct net    *net = dev_net(dev);
        struct ip_tunnel_info *tun_info;
        int             err = -EINVAL;
        unsigned int    flags = 0;
        u32             itag = 0;
        struct rtable   *rth;
        struct flowi4   fl4;
        bool do_cache = true;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        /* Check for the most weird martians, which cannot be detected
         * by fib_lookup.
         */

        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
        else
                fl4.flowi4_tun_key.tun_id = 0;
        skb_dst_drop(skb);

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;

        res->fi = NULL;
        res->table = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only for the limited broadcast;
         * I do not even know whether to fix it or not. Waiting for complaints :-)
         */
        if (ipv4_is_zeronet(saddr))
                goto martian_source;

        if (ipv4_is_zeronet(daddr))
                goto martian_destination;

        /* The following code tries to avoid calling
         * IN_DEV_NET_ROUTE_LOCALNET() twice, calling it at most once when
         * daddr and/or saddr is a loopback address.
         */
        if (ipv4_is_loopback(daddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_destination;
        } else if (ipv4_is_loopback(saddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_source;
        }

        /*
         *      Now we are ready to route the packet.
         */
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_flags = 0;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        fl4.flowi4_uid = sock_net_uid(net, NULL);

        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
                flkeys = &_flkeys;
        } else {
                fl4.flowi4_proto = 0;
                fl4.fl4_sport = 0;
                fl4.fl4_dport = 0;
        }

        err = fib_lookup(net, &fl4, res, 0);
        if (err != 0) {
                if (!IN_DEV_FORWARD(in_dev))
                        err = -EHOSTUNREACH;
                goto no_route;
        }

        if (res->type == RTN_BROADCAST) {
                if (IN_DEV_BFORWARD(in_dev))
                        goto make_route;
                /* do not cache if bc_forwarding is enabled */
                if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
                        do_cache = false;
                goto brd_input;
        }

        if (res->type == RTN_LOCAL) {
                err = fib_validate_source(skb, saddr, daddr, tos,
                                          0, dev, in_dev, &itag);
                if (err < 0)
                        goto martian_source;
                goto local_input;
        }

        if (!IN_DEV_FORWARD(in_dev)) {
                err = -EHOSTUNREACH;
                goto no_route;
        }
        if (res->type != RTN_UNICAST)
                goto martian_destination;

make_route:
        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:    return err;

brd_input:
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (!ipv4_is_zeronet(saddr)) {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto martian_source;
        }
        flags |= RTCF_BROADCAST;
        res->type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        do_cache &= res->fi && !itag;
        if (do_cache) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                rth = rcu_dereference(nhc->nhc_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        err = 0;
                        goto out;
                }
        }

        rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
                           flags | RTCF_LOCAL, res->type,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
        if (!rth)
                goto e_nobufs;

        rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->rt_is_input = 1;

        RT_CACHE_STAT_INC(in_slow_tot);
        if (res->type == RTN_UNREACHABLE) {
                rth->dst.input = ip_error;
                rth->dst.error = -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }

        if (do_cache) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
                        WARN_ON(rth->dst.input == lwtunnel_input);
                        rth->dst.lwtstate->orig_input = rth->dst.input;
                        rth->dst.input = lwtunnel_input;
                }

                if (unlikely(!rt_cache_route(nhc, rth)))
                        rt_add_uncached_list(rth);
        }
        skb_dst_set(skb, &rth->dst);
        err = 0;
        goto out;

no_route:
        RT_CACHE_STAT_INC(in_no_route);
        res->type = RTN_UNREACHABLE;
        res->fi = NULL;
        res->table = NULL;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev))
                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
                                     &daddr, &saddr, dev->name);
#endif

e_inval:
        err = -EINVAL;
        goto out;

e_nobufs:
        err = -ENOBUFS;
        goto out;

martian_source:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto out;
}

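/* Flow summary for ip_route_input_slow(): martian filtering first, then
 * the FIB lookup decides among broadcast, local and forwarded delivery.
 * Local and broadcast results are cached right here; forwarded ones are
 * cached from __mkroute_input() via rt_set_nexthop().
 */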
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                         u8 tos, struct net_device *dev)
{
        struct fib_result res;
        int err;

        tos &= IPTOS_RT_MASK;
        rcu_read_lock();
        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(ip_route_input_noref);

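/* Example (illustrative sketch): an input-path caller typically routes
 * a received packet as
 *
 *      const struct iphdr *iph = ip_hdr(skb);
 *      int err;
 *
 *      err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *                                 iph->tos, skb->dev);
 *      if (err)
 *              goto drop;
 *
 * "noref" refers to the dst attached to the skb: cached routes may be
 * attached via skb_dst_set_noref(), i.e. without taking a reference.
 */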
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                       u8 tos, struct net_device *dev, struct fib_result *res)
{
        /* Multicast recognition logic is moved from the route cache to here.
           The problem was that too many Ethernet cards have broken/missing
           hardware multicast filters :-( As a result, a host on a multicast
           network acquires a lot of useless route cache entries, sort of
           SDR messages from all over the world. Now we try to get rid of them.
           Really, provided the software IP multicast filter is organized
           reasonably (at least, hashed), it does not result in a slowdown
           compared with route cache reject entries.
           Note that multicast routers are not affected, because a
           route cache entry is created eventually.
         */
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);
                int our = 0;
                int err = -EINVAL;

                if (!in_dev)
                        return err;
                our = ip_check_mc_rcu(in_dev, daddr, saddr,
                                      ip_hdr(skb)->protocol);

                /* check l3 master if no match yet */
                if (!our && netif_is_l3_slave(dev)) {
                        struct in_device *l3_in_dev;

                        l3_in_dev = __in_dev_get_rcu(skb->dev);
                        if (l3_in_dev)
                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
                                                      ip_hdr(skb)->protocol);
                }

                if (our
#ifdef CONFIG_IP_MROUTE
                        ||
                    (!ipv4_is_local_multicast(daddr) &&
                     IN_DEV_MFORWARD(in_dev))
#endif
                   ) {
                        err = ip_route_input_mc(skb, daddr, saddr,
                                                tos, dev, our);
                }
                return err;
        }

        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
                                       const struct flowi4 *fl4, int orig_oif,
                                       struct net_device *dev_out,
                                       unsigned int flags)
{
        struct fib_info *fi = res->fi;
        struct fib_nh_exception *fnhe;
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
        bool do_cache;

        in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
                return ERR_PTR(-EINVAL);

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(fl4->saddr) &&
                    !(dev_out->flags & IFF_LOOPBACK) &&
                    !netif_is_l3_master(dev_out))
                        return ERR_PTR(-EINVAL);

        if (ipv4_is_lbcast(fl4->daddr))
                type = RTN_BROADCAST;
        else if (ipv4_is_multicast(fl4->daddr))
                type = RTN_MULTICAST;
        else if (ipv4_is_zeronet(fl4->daddr))
                return ERR_PTR(-EINVAL);

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        do_cache = true;
        if (type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                fi = NULL;
        } else if (type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST | RTCF_LOCAL;
                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
                                     fl4->flowi4_proto))
                        flags &= ~RTCF_LOCAL;
                else
                        do_cache = false;
                /* If a multicast route does not exist, use
                 * the default one, but do not use a gateway in this case.
                 * Yes, it is a hack.
                 */
                if (fi && res->prefixlen < 4)
                        fi = NULL;
        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
                   (orig_oif != dev_out->ifindex)) {
                /* For local routes that require a particular output interface
                 * we do not want to cache the result.  Caching the result
                 * causes incorrect behaviour when there are multiple source
                 * addresses on the interface, the end result being that if the
                 * intended recipient is waiting on that interface for the
                 * packet they won't receive it because it will be delivered on
                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
                 * be set to the loopback interface as well.
                 */
                do_cache = false;
        }

        fnhe = NULL;
        do_cache &= fi != NULL;
        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
                struct rtable __rcu **prth;

                fnhe = find_exception(nhc, fl4->daddr);
                if (!do_cache)
                        goto add;
                if (fnhe) {
                        prth = &fnhe->fnhe_rth_output;
                } else {
                        if (unlikely(fl4->flowi4_flags &
                                     FLOWI_FLAG_KNOWN_NH &&
                                     !(nhc->nhc_gw_family &&
                                       nhc->nhc_scope == RT_SCOPE_LINK))) {
                                do_cache = false;
                                goto add;
                        }
                        prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
                }
                rth = rcu_dereference(*prth);
                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
                        return rth;
        }

add:
        rth = rt_dst_alloc(dev_out, flags, type,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(in_dev, NOXFRM));
        if (!rth)
                return ERR_PTR(-ENOBUFS);

        rth->rt_iif = orig_oif;

        RT_CACHE_STAT_INC(out_slow_tot);

        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                if (flags & RTCF_LOCAL &&
                    !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->dst.output = ip_mc_output;
                        RT_CACHE_STAT_INC(out_slow_mc);
                }
#ifdef CONFIG_IP_MROUTE
                if (type == RTN_MULTICAST) {
                        if (IN_DEV_MFORWARD(in_dev) &&
                            !ipv4_is_local_multicast(fl4->daddr)) {
                                rth->dst.input = ip_mr_input;
                                rth->dst.output = ip_mc_output;
                        }
                }
#endif
        }

        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
        lwtunnel_set_redirect(&rth->dst);

        return rth;
}

/*
 * Major route resolver routine.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
                                        const struct sk_buff *skb)
{
        __u8 tos = RT_FL_TOS(fl4);
        struct fib_result res = {
                .type           = RTN_UNSPEC,
                .fi             = NULL,
                .table          = NULL,
                .tclassid       = 0,
        };
        struct rtable *rth;

        fl4->flowi4_iif = LOOPBACK_IFINDEX;
        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

        rcu_read_lock();
        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
        rcu_read_unlock();

        return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);

struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
                                            struct fib_result *res,
                                            const struct sk_buff *skb)
{
        struct net_device *dev_out = NULL;
        int orig_oif = fl4->flowi4_oif;
        unsigned int flags = 0;
        struct rtable *rth;
        int err;

        if (fl4->saddr) {
                if (ipv4_is_multicast(fl4->saddr) ||
                    ipv4_is_lbcast(fl4->saddr) ||
                    ipv4_is_zeronet(fl4->saddr)) {
                        rth = ERR_PTR(-EINVAL);
                        goto out;
                }

                rth = ERR_PTR(-ENETUNREACH);

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(net, saddr) can return the wrong iface, if
                      saddr is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with saddr
                      of another iface. --ANK
                 */

                if (fl4->flowi4_oif == 0 &&
                    (ipv4_is_multicast(fl4->daddr) ||
                     ipv4_is_lbcast(fl4->daddr))) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        dev_out = __ip_dev_find(net, fl4->saddr, false);
                        if (!dev_out)
                                goto out;

                        /* Special hack: user can direct multicasts
                           and limited broadcast via the necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                           This hack is not just for fun, it allows
                           vic, vat and friends to work.
                           They bind a socket to loopback, set ttl to zero
                           and expect that it will work.
                           From the viewpoint of the routing cache they are broken,
                           because we are not allowed to build a multicast path
                           with a loopback source addr (look, the routing cache
                           cannot know that ttl is zero, so that the packet
                           will not leave this host and the route is valid).
                           Luckily, this hack is a good workaround.
                         */

                        fl4->flowi4_oif = dev_out->ifindex;
                        goto make_route;
                }

                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        if (!__ip_dev_find(net, fl4->saddr, false))
                                goto out;
                }
        }

        if (fl4->flowi4_oif) {
                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
                rth = ERR_PTR(-ENODEV);
                if (!dev_out)
                        goto out;

                /* RACE: Check return value of inet_select_addr instead. */
                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
                        rth = ERR_PTR(-ENETUNREACH);
                        goto out;
                }
                if (ipv4_is_local_multicast(fl4->daddr) ||
                    ipv4_is_lbcast(fl4->daddr) ||
                    fl4->flowi4_proto == IPPROTO_IGMP) {
                        if (!fl4->saddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!fl4->saddr) {
                        if (ipv4_is_multicast(fl4->daddr))
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              fl4->flowi4_scope);
                        else if (!fl4->daddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
        }

        if (!fl4->daddr) {
                fl4->daddr = fl4->saddr;
                if (!fl4->daddr)
                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
                dev_out = net->loopback_dev;
                fl4->flowi4_oif = LOOPBACK_IFINDEX;
                res->type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        err = fib_lookup(net, fl4, res, 0);
        if (err) {
                res->fi = NULL;
                res->table = NULL;
                if (fl4->flowi4_oif &&
                    (ipv4_is_multicast(fl4->daddr) ||
                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
                        /* Apparently, routing tables are wrong. Assume
                           that the destination is on link.

                           WHY? DW.
                           Because we are allowed to send to an iface
                           even if it has NO routes and NO assigned
                           addresses. When oif is specified, routing
                           tables are looked up with only one purpose:
                           to catch if the destination is gatewayed, rather
                           than direct. Moreover, if MSG_DONTROUTE is set,
                           we send the packet, ignoring both routing tables
                           and ifaddr state. --ANK

                           We could make it even if oif is unknown,
                           likely IPv6, but we do not.
                         */

                        if (fl4->saddr == 0)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        res->type = RTN_UNICAST;
                        goto make_route;
                }
                rth = ERR_PTR(err);
                goto out;
        }

        if (res->type == RTN_LOCAL) {
                if (!fl4->saddr) {
                        if (res->fi->fib_prefsrc)
                                fl4->saddr = res->fi->fib_prefsrc;
                        else
                                fl4->saddr = fl4->daddr;
                }

                /* L3 master device is the loopback for that domain */
                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
                        net->loopback_dev;

                /* make sure orig_oif points to the fib result device even
                 * though packet rx/tx happens over loopback or l3mdev
                 */
                orig_oif = FIB_RES_OIF(*res);

                fl4->flowi4_oif = dev_out->ifindex;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        fib_select_path(net, res, fl4, skb);

        dev_out = FIB_RES_DEV(*res);
        fl4->flowi4_oif = dev_out->ifindex;

make_route:
        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
        return rth;
}

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

        return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                          struct sk_buff *skb, u32 mtu,
                                          bool confirm_neigh)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                          unsigned long old)
{
        return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                 =       AF_INET,
        .check                  =       ipv4_blackhole_dst_check,
        .mtu                    =       ipv4_blackhole_mtu,
        .default_advmss         =       ipv4_default_advmss,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
        .redirect               =       ipv4_rt_blackhole_redirect,
        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
        .neigh_lookup           =       ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rtable *ort = (struct rtable *) dst_orig;
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;

                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard_out;

                new->dev = net->loopback_dev;
                if (new->dev)
                        dev_hold(new->dev);

                rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
                rt->rt_pmtu = ort->rt_pmtu;
                rt->rt_mtu_locked = ort->rt_mtu_locked;

                rt->rt_genid = rt_genid_ipv4(net);
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
                rt->rt_uses_gateway = ort->rt_uses_gateway;
                rt->rt_gw_family = ort->rt_gw_family;
                if (rt->rt_gw_family == AF_INET)
                        rt->rt_gw4 = ort->rt_gw4;
                else if (rt->rt_gw_family == AF_INET6)
                        rt->rt_gw6 = ort->rt_gw6;

                INIT_LIST_HEAD(&rt->rt_uncached);
        }

        dst_release(dst_orig);

        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

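/* A blackhole route copies the identity of an existing route but
 * discards every packet (dst_discard on both input and output).  It is
 * used, e.g. by the xfrm layer, when a dst must stay alive for the
 * caller while traffic over it has to be silently dropped.
 */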
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
                                    const struct sock *sk)
{
        struct rtable *rt = __ip_route_output_key(net, flp4);

        if (IS_ERR(rt))
                return rt;

        if (flp4->flowi4_proto)
                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
                                                        flowi4_to_flowi(flp4),
                                                        sk, 0);

        return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

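/* Example (illustrative sketch): a typical caller fills a flowi4 and
 * resolves it, releasing the route when done:
 *
 *      struct flowi4 fl4 = {
 *              .daddr = daddr,
 *              .saddr = saddr,
 *              .flowi4_proto = IPPROTO_UDP,
 *      };
 *      struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *      if (IS_ERR(rt))
 *              return PTR_ERR(rt);
 *      ...
 *      ip_rt_put(rt);
 *
 * On success the caller owns a reference on rt->dst and must drop it
 * with ip_rt_put().
 */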
struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
                                      struct net_device *dev,
                                      struct net *net, __be32 *saddr,
                                      const struct ip_tunnel_info *info,
                                      u8 protocol, bool use_cache)
{
#ifdef CONFIG_DST_CACHE
        struct dst_cache *dst_cache;
#endif
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __u8 tos;

#ifdef CONFIG_DST_CACHE
        dst_cache = (struct dst_cache *)&info->dst_cache;
        if (use_cache) {
                rt = dst_cache_get_ip4(dst_cache, saddr);
                if (rt)
                        return rt;
        }
#endif
        memset(&fl4, 0, sizeof(fl4));
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_proto = protocol;
        fl4.daddr = info->key.u.ipv4.dst;
        fl4.saddr = info->key.u.ipv4.src;
        tos = info->key.tos;
        fl4.flowi4_tos = RT_TOS(tos);

        rt = ip_route_output_key(net, &fl4);
        if (IS_ERR(rt)) {
                netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
                return ERR_PTR(-ENETUNREACH);
        }
        if (rt->dst.dev == dev) { /* is this necessary? */
                netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
                ip_rt_put(rt);
                return ERR_PTR(-ELOOP);
        }
#ifdef CONFIG_DST_CACHE
        if (use_cache)
                dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
#endif
        *saddr = fl4.saddr;
        return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_tunnel);

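/* Note for callers of ip_route_output_tunnel(): on success the source
 * address selected by the lookup is returned through *saddr, and
 * use_cache is only meaningful when info->dst_cache has been set up
 * (see dst_cache_init()), which is assumed here rather than checked.
 */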
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq,
			unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

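/* Walk one nexthop's exception hash and emit an RTM_NEWROUTE entry for
 * every live exception (current genid, not expired) that still holds a
 * cached input or output route.  *fa_index/fa_start let an interrupted
 * netlink dump resume where the previous pass stopped.  Called with
 * rcu_read_lock held.
 */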
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

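/* Dump the cached exceptions attached to every live nexthop of @fi,
 * skipping dead nexthops.  Each bucket walk runs under rcu_read_lock
 * because exceptions can be torn down concurrently.
 */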
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}

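/* Build a dummy packet (IPv4 header plus a zeroed UDP, TCP or ICMP
 * header) for an RTM_GETROUTE request, so the lookup can be driven
 * through the real routing code, which may inspect transport headers.
 */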
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		/* len is __be16, so convert to network byte order */
		udph->len = htons(sizeof(struct udphdr));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}

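/* Validate an RTM_GETROUTE request.  Sockets that opted in to strict
 * netlink checking get full header, flag and attribute validation;
 * everything else falls back to the lenient legacy parser for
 * compatibility.
 */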
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

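/* RTM_GETROUTE doit handler.  Builds a dummy skb from the request,
 * resolves it through the input path (when RTA_IIF is given) or the
 * output path, then reuses the same skb for the netlink reply.  With
 * RTM_F_FIB_MATCH the matched FIB entry is reported instead of the
 * resulting dst.
 */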
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.tos = fl4.flowi4_tos;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_tos == fri.tos &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = fa->offload;
					fri.trap = fa->trap;
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

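/* Handler for the write-only /proc/sys/net/ipv4/route/flush file: any
 * write (e.g. "echo 1 > /proc/sys/net/ipv4/route/flush") flushes the
 * cache and bumps the exception genid for the owning netns, carried in
 * ->extra1; reads are rejected.
 */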
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

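/* Global (not per-netns) routing tunables exposed under
 * /proc/sys/net/ipv4/route/.  Each entry maps straight onto one of the
 * ip_rt_* variables; jiffies-valued knobs go through the
 * proc_dointvec_jiffies handlers so userspace reads and writes seconds
 * (or milliseconds for the _ms variant).
 */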
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

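/* Per-netns registration of the flush sysctl.  Namespaces other than
 * init_net get their own copy of the table so that ->extra1 can carry
 * the owning struct net for the handler; "flush" is the one entry
 * deliberately whitelisted for user-namespace-owned netns.
 */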
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

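/* Per-netns generation counters: rt_genid and fnhe_genid start at
 * zero, while dev_addr_genid starts from a random value.
 */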
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

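/* Each netns gets its own inet_peer base; on exit the peer tree is
 * invalidated before the base itself is freed.
 */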
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

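/* Boot-time initialisation: allocate the IP-ID generator state and the
 * per-cpu uncached route lists, set up the dst kmem cache and entry
 * counters, then register the proc files, the RTM_GETROUTE handler and
 * the per-netns subsystems.  Allocation failures here are fatal, since
 * routing cannot work without these structures.
 */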
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif