/* ipv6: check fn->leaf before it is used
 * [linux-2.6-block.git] net/ipv6/route.c
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Neighbour-reachability verdicts used when scoring candidate routes.
 * Negative values are failures; callers (see rt6_score_route() /
 * find_match()) compare against 0.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* do not use this route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED, wants a probe */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry, try round-robin */
	RT6_NUD_SUCCEED = 1		/* next hop (probably) reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-cpu list of "uncached" routes, i.e. rt6_info dsts that are not
 * inserted in the fib6 tree.  They are tracked here so that device
 * unregister can rehome them to loopback (rt6_uncached_list_flush_dev()).
 */
struct uncached_list {
	spinlock_t		lock;	/* protects @head */
	struct list_head	head;	/* chain of rt6_info.rt6i_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146
147                 spin_lock_bh(&ul->lock);
148                 list_del(&rt->rt6i_uncached);
149                 spin_unlock_bh(&ul->lock);
150         }
151 }
152
/* On device teardown, walk every cpu's uncached-route list and move any
 * route still referencing @dev over to the loopback device, so the
 * dying device's references can be dropped.  No-op when @dev itself is
 * the loopback device.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the inet6_dev reference to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Swap the dst's device reference as well. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
/* Per-cpu route clones share metrics with the parent route they were
 * cloned from (dst.from); hand out a writable pointer to those.
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
189
190 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
191 {
192         struct rt6_info *rt = (struct rt6_info *)dst;
193
194         if (rt->rt6i_flags & RTF_PCPU)
195                 return rt6_pcpu_cow_metrics(rt);
196         else if (rt->rt6i_flags & RTF_CACHE)
197                 return NULL;
198         else
199                 return dst_cow_metrics_generic(dst, old);
200 }
201
202 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
203                                              struct sk_buff *skb,
204                                              const void *daddr)
205 {
206         struct in6_addr *p = &rt->rt6i_gateway;
207
208         if (!ipv6_addr_any(p))
209                 return (const void *) p;
210         else if (skb)
211                 return &ipv6_hdr(skb)->daddr;
212         return daddr;
213 }
214
215 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
216                                           struct sk_buff *skb,
217                                           const void *daddr)
218 {
219         struct rt6_info *rt = (struct rt6_info *) dst;
220         struct neighbour *n;
221
222         daddr = choose_neigh_daddr(rt, skb, daddr);
223         n = __ipv6_neigh_lookup(dst->dev, daddr);
224         if (n)
225                 return n;
226         return neigh_create(&nd_tbl, daddr, dst->dev);
227 }
228
229 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
230 {
231         struct net_device *dev = dst->dev;
232         struct rt6_info *rt = (struct rt6_info *)dst;
233
234         daddr = choose_neigh_daddr(rt, NULL, daddr);
235         if (!daddr)
236                 return;
237         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
238                 return;
239         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
240                 return;
241         __ipv6_confirm_neigh(dev, daddr);
242 }
243
/* dst_ops for ordinary IPv6 routes.  This is a template: allocations go
 * through net->ipv6.ip6_dst_ops (see __ip6_dst_alloc()), presumably a
 * per-netns copy of this — initialisation is outside this chunk.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
262
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 {
265         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266
267         return mtu ? : dst->dev->mtu;
268 }
269
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
274
/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
279
/* dst_ops for blackholed IPv6 dsts: PMTU updates and redirects are
 * no-ops (see the stub handlers above); everything else mirrors the
 * normal ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
291
/* Metrics for the static route templates below; RTAX_HOPLIMIT is left
 * at 0 (unset).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
295
/* Template for the null route (net->ipv6.ip6_null_entry), returned
 * when a lookup matches nothing usable: rejects traffic with
 * -ENETUNREACH.  Permanently referenced (refcnt starts at 1).
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
310
311 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
312
/* Template for the prohibit route (multiple-tables only): rejects
 * traffic with -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
327
/* Template for the blackhole route (multiple-tables only): silently
 * discards traffic (dst_discard) with error -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
342
343 #endif
344
/* Initialise the rt6_info-specific part of a freshly allocated dst:
 * zero everything past the embedded dst_entry (which dst_alloc()
 * already set up) and prime the list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* rt6_info embeds its dst_entry first; wipe only what follows. */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
353
/* Allocate a dst with ip6_dst_ops and zero-initialise the
 * rt6_info-specific fields; returns NULL on allocation failure.
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}
367
368 struct rt6_info *ip6_dst_alloc(struct net *net,
369                                struct net_device *dev,
370                                int flags)
371 {
372         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
373
374         if (rt) {
375                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
376                 if (rt->rt6i_pcpu) {
377                         int cpu;
378
379                         for_each_possible_cpu(cpu) {
380                                 struct rt6_info **p;
381
382                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
383                                 /* no one shares rt */
384                                 *p =  NULL;
385                         }
386                 } else {
387                         dst_release_immediate(&rt->dst);
388                         return NULL;
389                 }
390         }
391
392         return rt;
393 }
394 EXPORT_SYMBOL(ip6_dst_alloc);
395
/* dst_ops->destroy: final teardown of a route.  Releases metrics,
 * per-cpu clones, uncached-list linkage, the inet6_dev reference, any
 * cached exception bucket, and finally the parent dst (dst.from) this
 * entry was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* We are the last user; the "1" lockdep condition asserts
	 * exclusive access to the bucket.
	 */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}
421
422 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
423                            int how)
424 {
425         struct rt6_info *rt = (struct rt6_info *)dst;
426         struct inet6_dev *idev = rt->rt6i_idev;
427         struct net_device *loopback_dev =
428                 dev_net(dev)->loopback_dev;
429
430         if (idev && idev->dev != loopback_dev) {
431                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
432                 if (loopback_idev) {
433                         rt->rt6i_idev = loopback_idev;
434                         in6_dev_put(idev);
435                 }
436         }
437 }
438
439 static bool __rt6_check_expired(const struct rt6_info *rt)
440 {
441         if (rt->rt6i_flags & RTF_EXPIRES)
442                 return time_after(jiffies, rt->dst.expires);
443         else
444                 return false;
445 }
446
/* Expiry check that also follows clones: a route without its own
 * RTF_EXPIRES inherits expiry from its parent (dst.from), and an
 * obsoleted parent counts as expired too.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
458
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460                                              struct flowi6 *fl6, int oif,
461                                              int strict)
462 {
463         struct rt6_info *sibling, *next_sibling;
464         int route_choosen;
465
466         /* We might have already computed the hash for ICMPv6 errors. In such
467          * case it will always be non-zero. Otherwise now is the time to do it.
468          */
469         if (!fl6->mp_hash)
470                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
471
472         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
473         /* Don't change the route, if route_choosen == 0
474          * (siblings does not include ourself)
475          */
476         if (route_choosen)
477                 list_for_each_entry_safe(sibling, next_sibling,
478                                 &match->rt6i_siblings, rt6i_siblings) {
479                         route_choosen--;
480                         if (route_choosen == 0) {
481                                 if (rt6_score_route(sibling, oif, strict) < 0)
482                                         break;
483                                 match = sibling;
484                                 break;
485                         }
486                 }
487         return match;
488 }
489
/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

/* Scan the leaf chain starting at @rt for the first route consistent
 * with the requested output interface (@oif) or source address
 * (@saddr).  Loopback routes are remembered as a fallback in @local.
 * Returns @rt unchanged when no constraint applies, and the null entry
 * when a strict interface match (RT6_LOOKUP_F_IFACE) fails.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* No constraints at all: keep the head route. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* Keep an earlier loopback route
					 * whose idev does sit on @oif.
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* No oif: match on source-address ownership. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
540
541 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request: rt6_probe() queues one so
 * the NS is sent from workqueue context (rt6_probe_deferred()).
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held; put after the NS is sent */
};
547
/* Workqueue handler: send the neighbour solicitation prepared by
 * rt6_probe(), then drop the device reference it took and free the
 * request.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
559
/* Router Reachability Probing: schedule a deferred neighbour
 * solicitation for @rt's gateway when its neighbour entry is absent or
 * stale.  Only applies to gateway routes.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		/* Already known reachable: nothing to probe. */
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* Rate-limit: only probe if the entry has been stale
		 * longer than rtr_probe_interval.
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: always worth one solicitation. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
606 #else
/* Router Reachability Probing is only built with CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
610 #endif
611
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */

/* Device-match score for @rt against @oif: 2 for an exact (or
 * unconstrained) device match, 1 for a loopback route whose inet6_dev
 * sits on @oif, 0 otherwise.
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}
625
/* Classify reachability of @rt's next hop (see enum rt6_nud_state).
 * Routes without a gateway trivially succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preference, tolerate anything short of
		 * NUD_FAILED, but request a probe in that case.
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* No entry at all: succeed (probe later) with router
		 * preference, otherwise ask for round-robin.
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
656
/* Score @rt for router selection: device match in the low bits, router
 * preference (if configured) shifted above them.  May return a negative
 * rt6_nud_state value when the route is unusable.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
675
/* Compare @rt against the best candidate so far (@match, scored
 * *@mpri) and return whichever wins.  *@do_rr is set when the winner
 * asked for round-robin (RT6_NUD_FAIL_DO_RR).
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* Optionally skip routes whose device lost carrier. */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
713
/* Find the best route with metric @metric, scanning from @rr_head to
 * the end of the leaf chain, then wrapping from @leaf back up to
 * @rr_head (round-robin order).  If nothing matched at that metric,
 * fall through to the remaining routes (@cont) with other metrics.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Wrap-around part of the round-robin scan. */
	for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
751
/* Select the route to use from fib6 node @fn, honouring round-robin
 * state (fn->rr_ptr).  fn->leaf may be NULL here, in which case the
 * null entry is returned instead of dereferencing it.
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = fn->leaf;
	struct rt6_info *match, *rt0;
	bool do_rr = false;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = leaf;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
782
/* True when the route has RTF_GATEWAY or RTF_NONEXTHOP set. */
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790                   const struct in6_addr *gwaddr)
791 {
792         struct net *net = dev_net(dev);
793         struct route_info *rinfo = (struct route_info *) opt;
794         struct in6_addr prefix_buf, *prefix;
795         unsigned int pref;
796         unsigned long lifetime;
797         struct rt6_info *rt;
798
799         if (len < sizeof(struct route_info)) {
800                 return -EINVAL;
801         }
802
803         /* Sanity check for prefix_len and length */
804         if (rinfo->length > 3) {
805                 return -EINVAL;
806         } else if (rinfo->prefix_len > 128) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 64) {
809                 if (rinfo->length < 2) {
810                         return -EINVAL;
811                 }
812         } else if (rinfo->prefix_len > 0) {
813                 if (rinfo->length < 1) {
814                         return -EINVAL;
815                 }
816         }
817
818         pref = rinfo->route_pref;
819         if (pref == ICMPV6_ROUTER_PREF_INVALID)
820                 return -EINVAL;
821
822         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823
824         if (rinfo->length == 3)
825                 prefix = (struct in6_addr *)rinfo->prefix;
826         else {
827                 /* this function is safe */
828                 ipv6_addr_prefix(&prefix_buf,
829                                  (struct in6_addr *)rinfo->prefix,
830                                  rinfo->prefix_len);
831                 prefix = &prefix_buf;
832         }
833
834         if (rinfo->prefix_len == 0)
835                 rt = rt6_get_dflt_router(gwaddr, dev);
836         else
837                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838                                         gwaddr, dev);
839
840         if (rt && !lifetime) {
841                 ip6_del_rt(rt);
842                 rt = NULL;
843         }
844
845         if (!rt && lifetime)
846                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847                                         dev, pref);
848         else if (rt)
849                 rt->rt6i_flags = RTF_ROUTEINFO |
850                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851
852         if (rt) {
853                 if (!addrconf_finite_timeout(lifetime))
854                         rt6_clean_expires(rt);
855                 else
856                         rt6_set_expires(rt, jiffies + HZ * lifetime);
857
858                 ip6_rt_put(rt);
859         }
860         return 0;
861 }
862 #endif
863
864 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
865                                         struct in6_addr *saddr)
866 {
867         struct fib6_node *pn;
868         while (1) {
869                 if (fn->fn_flags & RTN_TL_ROOT)
870                         return NULL;
871                 pn = fn->parent;
872                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
873                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
874                 else
875                         fn = pn;
876                 if (fn->fn_flags & RTN_RTINFO)
877                         return fn;
878         }
879 }
880
881 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
882                           bool null_fallback)
883 {
884         struct rt6_info *rt = *prt;
885
886         if (dst_hold_safe(&rt->dst))
887                 return true;
888         if (null_fallback) {
889                 rt = net->ipv6.ip6_null_entry;
890                 dst_hold(&rt->dst);
891         } else {
892                 rt = NULL;
893         }
894         *prt = rt;
895         return false;
896 }
897
898 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
899                                              struct fib6_table *table,
900                                              struct flowi6 *fl6, int flags)
901 {
902         struct rt6_info *rt, *rt_cache;
903         struct fib6_node *fn;
904
905         read_lock_bh(&table->tb6_lock);
906         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
907 restart:
908         rt = fn->leaf;
909         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
910         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
911                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
912         if (rt == net->ipv6.ip6_null_entry) {
913                 fn = fib6_backtrack(fn, &fl6->saddr);
914                 if (fn)
915                         goto restart;
916         }
917         /* Search through exception table */
918         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
919         if (rt_cache)
920                 rt = rt_cache;
921
922         if (ip6_hold_safe(net, &rt, true))
923                 dst_use_noref(&rt->dst, jiffies);
924
925         read_unlock_bh(&table->tb6_lock);
926
927         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
928
929         return rt;
930
931 }
932
933 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
934                                     int flags)
935 {
936         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
937 }
938 EXPORT_SYMBOL_GPL(ip6_route_lookup);
939
940 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
941                             const struct in6_addr *saddr, int oif, int strict)
942 {
943         struct flowi6 fl6 = {
944                 .flowi6_oif = oif,
945                 .daddr = *daddr,
946         };
947         struct dst_entry *dst;
948         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
949
950         if (saddr) {
951                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
952                 flags |= RT6_LOOKUP_F_HAS_SADDR;
953         }
954
955         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
956         if (dst->error == 0)
957                 return (struct rt6_info *) dst;
958
959         dst_release(dst);
960
961         return NULL;
962 }
963 EXPORT_SYMBOL(rt6_lookup);
964
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a dst reference before calling it.
 */
970
971 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
972                         struct mx6_config *mxc,
973                         struct netlink_ext_ack *extack)
974 {
975         int err;
976         struct fib6_table *table;
977
978         table = rt->rt6i_table;
979         write_lock_bh(&table->tb6_lock);
980         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
981         write_unlock_bh(&table->tb6_lock);
982
983         return err;
984 }
985
986 int ip6_ins_rt(struct rt6_info *rt)
987 {
988         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
989         struct mx6_config mxc = { .mx = NULL, };
990
991         /* Hold dst to account for the reference from the fib6 tree */
992         dst_hold(&rt->dst);
993         return __ip6_ins_rt(rt, &info, &mxc, NULL);
994 }
995
996 /* called with rcu_lock held */
997 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
998 {
999         struct net_device *dev = rt->dst.dev;
1000
1001         if (rt->rt6i_flags & RTF_LOCAL) {
1002                 /* for copies of local routes, dst->dev needs to be the
1003                  * device if it is a master device, the master device if
1004                  * device is enslaved, and the loopback as the default
1005                  */
1006                 if (netif_is_l3_slave(dev) &&
1007                     !rt6_need_strict(&rt->rt6i_dst.addr))
1008                         dev = l3mdev_master_dev_rcu(dev);
1009                 else if (!netif_is_l3_master(dev))
1010                         dev = dev_net(dev)->loopback_dev;
1011                 /* last case is netif_is_l3_master(dev) is true in which
1012                  * case we want dev returned to be dev
1013                  */
1014         }
1015
1016         return dev;
1017 }
1018
/* Allocate an RTF_CACHE clone of @ort for the (daddr, saddr) pair.
 * The clone is a /128 host route that is not linked into the fib6
 * tree.  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* A cache/pcpu clone must be based on the original fib6 entry,
	 * not on another clone: walk back via dst.from.
	 */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* Narrow the clone to the exact destination address */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		/* Also narrow the source prefix when 'ort' lives in a
		 * source-address subtree.
		 */
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1061
1062 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1063 {
1064         struct net_device *dev;
1065         struct rt6_info *pcpu_rt;
1066
1067         rcu_read_lock();
1068         dev = ip6_rt_get_dev_rcu(rt);
1069         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1070         rcu_read_unlock();
1071         if (!pcpu_rt)
1072                 return NULL;
1073         ip6_rt_copy_init(pcpu_rt, rt);
1074         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1075         pcpu_rt->rt6i_flags |= RTF_PCPU;
1076         return pcpu_rt;
1077 }
1078
1079 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1080 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1081 {
1082         struct rt6_info *pcpu_rt, **p;
1083
1084         p = this_cpu_ptr(rt->rt6i_pcpu);
1085         pcpu_rt = *p;
1086
1087         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1088                 rt6_dst_from_metrics_check(pcpu_rt);
1089
1090         return pcpu_rt;
1091 }
1092
/* Create and install the per-cpu clone of @rt for this CPU.
 * Returns a referenced clone, or the referenced null entry on
 * allocation failure.  May race with other CPUs installing the same
 * slot; the cmpxchg loser drops its copy and reuses the winner's.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* Reference for the caller; the pcpu slot conceptually keeps the
	 * allocation reference from ip6_rt_pcpu_alloc().
	 */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	if (prev) {
		/* If someone did it before us, return prev instead */
		/* release refcnt taken by ip6_rt_pcpu_alloc() */
		dst_release_immediate(&pcpu_rt->dst);
		/* release refcnt taken by above dst_hold() */
		dst_release_immediate(&pcpu_rt->dst);
		dst_hold(&prev->dst);
		pcpu_rt = prev;
	}

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1121
/* Exception (RTF_CACHE) hash table implementation */
1124 static DEFINE_SPINLOCK(rt6_exception_lock);
1125
1126 /* Remove rt6_ex from hash table and free the memory
1127  * Caller must hold rt6_exception_lock
1128  */
1129 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1130                                  struct rt6_exception *rt6_ex)
1131 {
1132         if (!bucket || !rt6_ex)
1133                 return;
1134         rt6_ex->rt6i->rt6i_node = NULL;
1135         hlist_del_rcu(&rt6_ex->hlist);
1136         rt6_release(rt6_ex->rt6i);
1137         kfree_rcu(rt6_ex, rcu);
1138         WARN_ON_ONCE(!bucket->depth);
1139         bucket->depth--;
1140 }
1141
1142 /* Remove oldest rt6_ex in bucket and free the memory
1143  * Caller must hold rt6_exception_lock
1144  */
1145 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1146 {
1147         struct rt6_exception *rt6_ex, *oldest = NULL;
1148
1149         if (!bucket)
1150                 return;
1151
1152         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1153                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1154                         oldest = rt6_ex;
1155         }
1156         rt6_remove_exception(bucket, oldest);
1157 }
1158
1159 static u32 rt6_exception_hash(const struct in6_addr *dst,
1160                               const struct in6_addr *src)
1161 {
1162         static u32 seed __read_mostly;
1163         u32 val;
1164
1165         net_get_random_once(&seed, sizeof(seed));
1166         val = jhash(dst, sizeof(*dst), seed);
1167
1168 #ifdef CONFIG_IPV6_SUBTREES
1169         if (src)
1170                 val = jhash(src, sizeof(*src), val);
1171 #endif
1172         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1173 }
1174
1175 /* Helper function to find the cached rt in the hash table
1176  * and update bucket pointer to point to the bucket for this
1177  * (daddr, saddr) pair
1178  * Caller must hold rt6_exception_lock
1179  */
1180 static struct rt6_exception *
1181 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1182                               const struct in6_addr *daddr,
1183                               const struct in6_addr *saddr)
1184 {
1185         struct rt6_exception *rt6_ex;
1186         u32 hval;
1187
1188         if (!(*bucket) || !daddr)
1189                 return NULL;
1190
1191         hval = rt6_exception_hash(daddr, saddr);
1192         *bucket += hval;
1193
1194         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1195                 struct rt6_info *rt6 = rt6_ex->rt6i;
1196                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1197
1198 #ifdef CONFIG_IPV6_SUBTREES
1199                 if (matched && saddr)
1200                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1201 #endif
1202                 if (matched)
1203                         return rt6_ex;
1204         }
1205         return NULL;
1206 }
1207
1208 /* Helper function to find the cached rt in the hash table
1209  * and update bucket pointer to point to the bucket for this
1210  * (daddr, saddr) pair
1211  * Caller must hold rcu_read_lock()
1212  */
1213 static struct rt6_exception *
1214 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1215                          const struct in6_addr *daddr,
1216                          const struct in6_addr *saddr)
1217 {
1218         struct rt6_exception *rt6_ex;
1219         u32 hval;
1220
1221         WARN_ON_ONCE(!rcu_read_lock_held());
1222
1223         if (!(*bucket) || !daddr)
1224                 return NULL;
1225
1226         hval = rt6_exception_hash(daddr, saddr);
1227         *bucket += hval;
1228
1229         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1230                 struct rt6_info *rt6 = rt6_ex->rt6i;
1231                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1232
1233 #ifdef CONFIG_IPV6_SUBTREES
1234                 if (matched && saddr)
1235                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1236 #endif
1237                 if (matched)
1238                         return rt6_ex;
1239         }
1240         return NULL;
1241 }
1242
/* Insert the RTF_CACHE clone @nrt into the exception table of its
 * origin route @ort.  Returns 0 on success or a negative errno.
 * On success, ort's sernum is bumped so stale cached dsts are
 * invalidated.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() has marked ort as going away; don't
	 * recreate the bucket list behind its back.
	 */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* First exception on ort: allocate the bucket array. */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* Replace a pre-existing exception for the same pair. */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;

	/* Bound the chain length by evicting the oldest entry. */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1328
1329 void rt6_flush_exceptions(struct rt6_info *rt)
1330 {
1331         struct rt6_exception_bucket *bucket;
1332         struct rt6_exception *rt6_ex;
1333         struct hlist_node *tmp;
1334         int i;
1335
1336         spin_lock_bh(&rt6_exception_lock);
1337         /* Prevent rt6_insert_exception() to recreate the bucket list */
1338         rt->exception_bucket_flushed = 1;
1339
1340         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1341                                     lockdep_is_held(&rt6_exception_lock));
1342         if (!bucket)
1343                 goto out;
1344
1345         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1346                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1347                         rt6_remove_exception(bucket, rt6_ex);
1348                 WARN_ON_ONCE(bucket->depth);
1349                 bucket++;
1350         }
1351
1352 out:
1353         spin_unlock_bh(&rt6_exception_lock);
1354 }
1355
1356 /* Find cached rt in the hash table inside passed in rt
1357  * Caller has to hold rcu_read_lock()
1358  */
1359 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1360                                            struct in6_addr *daddr,
1361                                            struct in6_addr *saddr)
1362 {
1363         struct rt6_exception_bucket *bucket;
1364         struct in6_addr *src_key = NULL;
1365         struct rt6_exception *rt6_ex;
1366         struct rt6_info *res = NULL;
1367
1368         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1369
1370 #ifdef CONFIG_IPV6_SUBTREES
1371         /* rt6i_src.plen != 0 indicates rt is in subtree
1372          * and exception table is indexed by a hash of
1373          * both rt6i_dst and rt6i_src.
1374          * Otherwise, the exception table is indexed by
1375          * a hash of only rt6i_dst.
1376          */
1377         if (rt->rt6i_src.plen)
1378                 src_key = saddr;
1379 #endif
1380         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1381
1382         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1383                 res = rt6_ex->rt6i;
1384
1385         return res;
1386 }
1387
1388 /* Remove the passed in cached rt from the hash table that contains it */
1389 int rt6_remove_exception_rt(struct rt6_info *rt)
1390 {
1391         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1392         struct rt6_exception_bucket *bucket;
1393         struct in6_addr *src_key = NULL;
1394         struct rt6_exception *rt6_ex;
1395         int err;
1396
1397         if (!from ||
1398             !(rt->rt6i_flags | RTF_CACHE))
1399                 return -EINVAL;
1400
1401         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1402                 return -ENOENT;
1403
1404         spin_lock_bh(&rt6_exception_lock);
1405         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1406                                     lockdep_is_held(&rt6_exception_lock));
1407 #ifdef CONFIG_IPV6_SUBTREES
1408         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1409          * and exception table is indexed by a hash of
1410          * both rt6i_dst and rt6i_src.
1411          * Otherwise, the exception table is indexed by
1412          * a hash of only rt6i_dst.
1413          */
1414         if (from->rt6i_src.plen)
1415                 src_key = &rt->rt6i_src.addr;
1416 #endif
1417         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1418                                                &rt->rt6i_dst.addr,
1419                                                src_key);
1420         if (rt6_ex) {
1421                 rt6_remove_exception(bucket, rt6_ex);
1422                 err = 0;
1423         } else {
1424                 err = -ENOENT;
1425         }
1426
1427         spin_unlock_bh(&rt6_exception_lock);
1428         return err;
1429 }
1430
1431 /* Find rt6_ex which contains the passed in rt cache and
1432  * refresh its stamp
1433  */
1434 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1435 {
1436         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1437         struct rt6_exception_bucket *bucket;
1438         struct in6_addr *src_key = NULL;
1439         struct rt6_exception *rt6_ex;
1440
1441         if (!from ||
1442             !(rt->rt6i_flags | RTF_CACHE))
1443                 return;
1444
1445         rcu_read_lock();
1446         bucket = rcu_dereference(from->rt6i_exception_bucket);
1447
1448 #ifdef CONFIG_IPV6_SUBTREES
1449         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1450          * and exception table is indexed by a hash of
1451          * both rt6i_dst and rt6i_src.
1452          * Otherwise, the exception table is indexed by
1453          * a hash of only rt6i_dst.
1454          */
1455         if (from->rt6i_src.plen)
1456                 src_key = &rt->rt6i_src.addr;
1457 #endif
1458         rt6_ex = __rt6_find_exception_rcu(&bucket,
1459                                           &rt->rt6i_dst.addr,
1460                                           src_key);
1461         if (rt6_ex)
1462                 rt6_ex->stamp = jiffies;
1463
1464         rcu_read_unlock();
1465 }
1466
1467 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1468 {
1469         struct rt6_exception_bucket *bucket;
1470         struct rt6_exception *rt6_ex;
1471         int i;
1472
1473         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1474                                         lockdep_is_held(&rt6_exception_lock));
1475
1476         if (bucket) {
1477                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1478                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1479                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1480                         }
1481                         bucket++;
1482                 }
1483         }
1484 }
1485
1486 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1487 {
1488         struct rt6_exception_bucket *bucket;
1489         struct rt6_exception *rt6_ex;
1490         int i;
1491
1492         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1493                                         lockdep_is_held(&rt6_exception_lock));
1494
1495         if (bucket) {
1496                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1497                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1498                                 struct rt6_info *entry = rt6_ex->rt6i;
1499                                 /* For RTF_CACHE with rt6i_pmtu == 0
1500                                  * (i.e. a redirected route),
1501                                  * the metrics of its rt->dst.from has already
1502                                  * been updated.
1503                                  */
1504                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1505                                         entry->rt6i_pmtu = mtu;
1506                         }
1507                         bucket++;
1508                 }
1509         }
1510 }
1511
1512 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1513
1514 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1515                                         struct in6_addr *gateway)
1516 {
1517         struct rt6_exception_bucket *bucket;
1518         struct rt6_exception *rt6_ex;
1519         struct hlist_node *tmp;
1520         int i;
1521
1522         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1523                 return;
1524
1525         spin_lock_bh(&rt6_exception_lock);
1526         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1527                                      lockdep_is_held(&rt6_exception_lock));
1528
1529         if (bucket) {
1530                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1531                         hlist_for_each_entry_safe(rt6_ex, tmp,
1532                                                   &bucket->chain, hlist) {
1533                                 struct rt6_info *entry = rt6_ex->rt6i;
1534
1535                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1536                                     RTF_CACHE_GATEWAY &&
1537                                     ipv6_addr_equal(gateway,
1538                                                     &entry->rt6i_gateway)) {
1539                                         rt6_remove_exception(bucket, rt6_ex);
1540                                 }
1541                         }
1542                         bucket++;
1543                 }
1544         }
1545
1546         spin_unlock_bh(&rt6_exception_lock);
1547 }
1548
1549 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1550                                       struct rt6_exception *rt6_ex,
1551                                       struct fib6_gc_args *gc_args,
1552                                       unsigned long now)
1553 {
1554         struct rt6_info *rt = rt6_ex->rt6i;
1555
1556         if (atomic_read(&rt->dst.__refcnt) == 1 &&
1557             time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1558                 RT6_TRACE("aging clone %p\n", rt);
1559                 rt6_remove_exception(bucket, rt6_ex);
1560                 return;
1561         } else if (rt->rt6i_flags & RTF_GATEWAY) {
1562                 struct neighbour *neigh;
1563                 __u8 neigh_flags = 0;
1564
1565                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1566                 if (neigh) {
1567                         neigh_flags = neigh->flags;
1568                         neigh_release(neigh);
1569                 }
1570                 if (!(neigh_flags & NTF_ROUTER)) {
1571                         RT6_TRACE("purging route %p via non-router but gateway\n",
1572                                   rt);
1573                         rt6_remove_exception(bucket, rt6_ex);
1574                         return;
1575                 }
1576         }
1577         gc_args->more++;
1578 }
1579
1580 void rt6_age_exceptions(struct rt6_info *rt,
1581                         struct fib6_gc_args *gc_args,
1582                         unsigned long now)
1583 {
1584         struct rt6_exception_bucket *bucket;
1585         struct rt6_exception *rt6_ex;
1586         struct hlist_node *tmp;
1587         int i;
1588
1589         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1590                 return;
1591
1592         spin_lock_bh(&rt6_exception_lock);
1593         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1594                                     lockdep_is_held(&rt6_exception_lock));
1595
1596         if (bucket) {
1597                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1598                         hlist_for_each_entry_safe(rt6_ex, tmp,
1599                                                   &bucket->chain, hlist) {
1600                                 rt6_age_examine_exception(bucket, rt6_ex,
1601                                                           gc_args, now);
1602                         }
1603                         bucket++;
1604                 }
1605         }
1606         spin_unlock_bh(&rt6_exception_lock);
1607 }
1608
/* Full policy route lookup in one fib6 table.  Unlike
 * ip6_pol_route_lookup(), this honours router reachability and
 * multipath selection, and returns either a cached clone, a
 * newly-created uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH case),
 * or a per-cpu copy of the matched route.  The returned route always
 * carries a reference for the caller.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Hosts (not forwarding) prefer reachable routers first. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		/* No match here: backtrack towards the root; once that is
		 * exhausted, retry from the original node without the
		 * reachability requirement.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		read_unlock_bh(&table->tb6_lock);
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		/* Cached clone: take a noref-style use and return it. */
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		read_unlock_bh(&table->tb6_lock);
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			/* ip6_hold_safe() substituted the null entry
			 * (already referenced); return it directly.
			 */
			read_unlock_bh(&table->tb6_lock);
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* We have to do the read_unlock first
				 * because rt6_make_pcpu_route() may trigger
				 * ip6_dst_gc() which will take the write_lock.
				 *
				 * No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				read_unlock_bh(&table->tb6_lock);
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				read_unlock_bh(&table->tb6_lock);
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1735
1736 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1737                                             struct flowi6 *fl6, int flags)
1738 {
1739         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1740 }
1741
1742 struct dst_entry *ip6_route_input_lookup(struct net *net,
1743                                          struct net_device *dev,
1744                                          struct flowi6 *fl6, int flags)
1745 {
1746         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1747                 flags |= RT6_LOOKUP_F_IFACE;
1748
1749         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1750 }
1751 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1752
1753 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1754                                   struct flow_keys *keys)
1755 {
1756         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1757         const struct ipv6hdr *key_iph = outer_iph;
1758         const struct ipv6hdr *inner_iph;
1759         const struct icmp6hdr *icmph;
1760         struct ipv6hdr _inner_iph;
1761
1762         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1763                 goto out;
1764
1765         icmph = icmp6_hdr(skb);
1766         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1767             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1768             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1769             icmph->icmp6_type != ICMPV6_PARAMPROB)
1770                 goto out;
1771
1772         inner_iph = skb_header_pointer(skb,
1773                                        skb_transport_offset(skb) + sizeof(*icmph),
1774                                        sizeof(_inner_iph), &_inner_iph);
1775         if (!inner_iph)
1776                 goto out;
1777
1778         key_iph = inner_iph;
1779 out:
1780         memset(keys, 0, sizeof(*keys));
1781         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1782         keys->addrs.v6addrs.src = key_iph->saddr;
1783         keys->addrs.v6addrs.dst = key_iph->daddr;
1784         keys->tags.flow_label = ip6_flowinfo(key_iph);
1785         keys->basic.ip_proto = key_iph->nexthdr;
1786 }
1787
1788 /* if skb is set it will be used and fl6 can be NULL */
1789 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1790 {
1791         struct flow_keys hash_keys;
1792
1793         if (skb) {
1794                 ip6_multipath_l3_keys(skb, &hash_keys);
1795                 return flow_hash_from_keys(&hash_keys);
1796         }
1797
1798         return get_hash_from_flowi6(fl6);
1799 }
1800
/* Route an incoming packet: build a flow key from its IPv6 header
 * (plus tunnel metadata and a multipath hash where applicable) and
 * attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* Collect-metadata (RX) tunnels also key the lookup on tun_id */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	/* ICMPv6 errors should hash like the flow that triggered them;
	 * rt6_multipath_hash() inspects the embedded packet for that.
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1824
1825 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1826                                              struct flowi6 *fl6, int flags)
1827 {
1828         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1829 }
1830
1831 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1832                                          struct flowi6 *fl6, int flags)
1833 {
1834         bool any_src;
1835
1836         if (rt6_need_strict(&fl6->daddr)) {
1837                 struct dst_entry *dst;
1838
1839                 dst = l3mdev_link_scope_lookup(net, fl6);
1840                 if (dst)
1841                         return dst;
1842         }
1843
1844         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1845
1846         any_src = ipv6_addr_any(&fl6->saddr);
1847         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1848             (fl6->flowi6_oif && any_src))
1849                 flags |= RT6_LOOKUP_F_IFACE;
1850
1851         if (!any_src)
1852                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1853         else if (sk)
1854                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1855
1856         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1857 }
1858 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1859
/* Clone @dst_orig into a "blackhole" route that silently discards all
 * traffic in both directions.
 *
 * Consumes the caller's reference on @dst_orig and returns the new
 * dst, or ERR_PTR(-ENOMEM) if allocation failed.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		/* Drop packets on both the input and output paths */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* The clone is not a per-cpu copy of the original */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1892
1893 /*
1894  *      Destination cache support functions
1895  */
1896
1897 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1898 {
1899         if (rt->dst.from &&
1900             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1901                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1902 }
1903
1904 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1905 {
1906         u32 rt_cookie = 0;
1907
1908         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1909                 return NULL;
1910
1911         if (rt6_check_expired(rt))
1912                 return NULL;
1913
1914         return &rt->dst;
1915 }
1916
1917 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1918 {
1919         if (!__rt6_check_expired(rt) &&
1920             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1921             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1922                 return &rt->dst;
1923         else
1924                 return NULL;
1925 }
1926
1927 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1928 {
1929         struct rt6_info *rt;
1930
1931         rt = (struct rt6_info *) dst;
1932
1933         /* All IPV6 dsts are created with ->obsolete set to the value
1934          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1935          * into this function always.
1936          */
1937
1938         rt6_dst_from_metrics_check(rt);
1939
1940         if (rt->rt6i_flags & RTF_PCPU ||
1941             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1942                 return rt6_dst_from_check(rt, cookie);
1943         else
1944                 return rt6_check(rt, cookie);
1945 }
1946
1947 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1948 {
1949         struct rt6_info *rt = (struct rt6_info *) dst;
1950
1951         if (rt) {
1952                 if (rt->rt6i_flags & RTF_CACHE) {
1953                         if (rt6_check_expired(rt)) {
1954                                 ip6_del_rt(rt);
1955                                 dst = NULL;
1956                         }
1957                 } else {
1958                         dst_release(dst);
1959                         dst = NULL;
1960                 }
1961         }
1962         return dst;
1963 }
1964
/* dst_ops->link_failure callback: tell the sender the destination is
 * unreachable and make sure the failing route is not reused.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Cached clones are simply deleted;
			 * dst_hold_safe() guards against racing with the
			 * final dst_release().
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* Poison the fib node's serial number so cached
			 * dsts hanging off it fail ip6_dst_check() and a
			 * fresh lookup is forced.  rt6i_node may already
			 * be NULL if the route left the tree.
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
1987
1988 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1989 {
1990         struct net *net = dev_net(rt->dst.dev);
1991
1992         rt->rt6i_flags |= RTF_MODIFIED;
1993         rt->rt6i_pmtu = mtu;
1994         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1995 }
1996
1997 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1998 {
1999         return !(rt->rt6i_flags & RTF_CACHE) &&
2000                 (rt->rt6i_flags & RTF_PCPU ||
2001                  rcu_access_pointer(rt->rt6i_node));
2002 }
2003
/* Core PMTU update.  @iph or @sk (whichever is set) supplies the
 * addresses used to confirm the neighbour and, when the route itself
 * must not be modified, to create a per-destination RTF_CACHE
 * exception carrying the new MTU.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Clamp to the IPv6 minimum MTU, and only ever shrink the path */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* rt6_insert_exception() takes ownership on
			 * success; on failure the clone must be freed.
			 */
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2047
2048 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2049                                struct sk_buff *skb, u32 mtu)
2050 {
2051         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2052 }
2053
2054 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2055                      int oif, u32 mark, kuid_t uid)
2056 {
2057         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2058         struct dst_entry *dst;
2059         struct flowi6 fl6;
2060
2061         memset(&fl6, 0, sizeof(fl6));
2062         fl6.flowi6_oif = oif;
2063         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2064         fl6.daddr = iph->daddr;
2065         fl6.saddr = iph->saddr;
2066         fl6.flowlabel = ip6_flowinfo(iph);
2067         fl6.flowi6_uid = uid;
2068
2069         dst = ip6_route_output(net, NULL, &fl6);
2070         if (!dst->error)
2071                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2072         dst_release(dst);
2073 }
2074 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2075
/* Socket-scoped PMTU update: apply the new MTU, then re-route the
 * socket's cached dst if the update made it obsolete.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the socket has no dst cached or the
	 * cached dst is still valid per its own check callback.
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	/* NOTE(review): v4-mapped destinations appear to be left to the
	 * IPv4 path — confirm against ip6_datagram_dst_update() callers.
	 */
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2094
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* flow the redirect applies to */
	struct in6_addr gateway;	/* router that sent the redirect */
};
2100
/* Look up the route a received redirect applies to.  @fl6 is really an
 * ip6rd_flowi carrying the redirecting router's address.  Returns a
 * held rt6_info (possibly ip6_null_entry when nothing matches).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	/* fn->leaf may be NULL; rt then stays NULL and we fall through
	 * to the null-entry / backtracking handling below.
	 */
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* No usable route at this node: walk back up the tree (to a less
	 * specific prefix) and retry.
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	ip6_hold_safe(net, &rt, true);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
2173
2174 static struct dst_entry *ip6_route_redirect(struct net *net,
2175                                         const struct flowi6 *fl6,
2176                                         const struct in6_addr *gateway)
2177 {
2178         int flags = RT6_LOOKUP_F_HAS_SADDR;
2179         struct ip6rd_flowi rdfl;
2180
2181         rdfl.fl6 = *fl6;
2182         rdfl.gateway = *gateway;
2183
2184         return fib6_rule_lookup(net, &rdfl.fl6,
2185                                 flags, __ip6_route_redirect);
2186 }
2187
2188 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2189                   kuid_t uid)
2190 {
2191         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2192         struct dst_entry *dst;
2193         struct flowi6 fl6;
2194
2195         memset(&fl6, 0, sizeof(fl6));
2196         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2197         fl6.flowi6_oif = oif;
2198         fl6.flowi6_mark = mark;
2199         fl6.daddr = iph->daddr;
2200         fl6.saddr = iph->saddr;
2201         fl6.flowlabel = ip6_flowinfo(iph);
2202         fl6.flowi6_uid = uid;
2203
2204         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2205         rt6_do_redirect(dst, NULL, skb);
2206         dst_release(dst);
2207 }
2208 EXPORT_SYMBOL_GPL(ip6_redirect);
2209
2210 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2211                             u32 mark)
2212 {
2213         const struct ipv6hdr *iph = ipv6_hdr(skb);
2214         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2215         struct dst_entry *dst;
2216         struct flowi6 fl6;
2217
2218         memset(&fl6, 0, sizeof(fl6));
2219         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2220         fl6.flowi6_oif = oif;
2221         fl6.flowi6_mark = mark;
2222         fl6.daddr = msg->dest;
2223         fl6.saddr = iph->daddr;
2224         fl6.flowi6_uid = sock_net_uid(net, NULL);
2225
2226         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2227         rt6_do_redirect(dst, NULL, skb);
2228         dst_release(dst);
2229 }
2230
2231 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2232 {
2233         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2234                      sk->sk_uid);
2235 }
2236 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2237
2238 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2239 {
2240         struct net_device *dev = dst->dev;
2241         unsigned int mtu = dst_mtu(dst);
2242         struct net *net = dev_net(dev);
2243
2244         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2245
2246         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2247                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2248
2249         /*
2250          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2251          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2252          * IPV6_MAXPLEN is also valid and means: "any MSS,
2253          * rely only on pmtu discovery"
2254          */
2255         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2256                 mtu = IPV6_MAXPLEN;
2257         return mtu;
2258 }
2259
2260 static unsigned int ip6_mtu(const struct dst_entry *dst)
2261 {
2262         const struct rt6_info *rt = (const struct rt6_info *)dst;
2263         unsigned int mtu = rt->rt6i_pmtu;
2264         struct inet6_dev *idev;
2265
2266         if (mtu)
2267                 goto out;
2268
2269         mtu = dst_metric_raw(dst, RTAX_MTU);
2270         if (mtu)
2271                 goto out;
2272
2273         mtu = IPV6_MIN_MTU;
2274
2275         rcu_read_lock();
2276         idev = __in6_dev_get(dst->dev);
2277         if (idev)
2278                 mtu = idev->cnf.mtu6;
2279         rcu_read_unlock();
2280
2281 out:
2282         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2283
2284         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2285 }
2286
/* Allocate a transient host route for an outgoing ICMPv6/NDISC packet
 * on @dev.  The route is never inserted into the fib tree; it goes on
 * the uncached list so device teardown can release it, and is finally
 * passed through xfrm_lookup().
 *
 * Returns the dst, or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* takes over the idev reference */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2323
/* dst_ops->gc callback: run the fib6 garbage collector when the dst
 * entry count or elapsed time warrants it.
 *
 * ip6_rt_gc_expire grows on each forced pass (making collection more
 * aggressive under pressure) and decays by 1/2^elasticity at the end.
 * Returns non-zero while the table is still over rt_max_size, which
 * makes new dst allocation fail.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	/* Skip GC entirely if we ran recently and are under the limit */
	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2348
2349 static int ip6_convert_metrics(struct mx6_config *mxc,
2350                                const struct fib6_config *cfg)
2351 {
2352         bool ecn_ca = false;
2353         struct nlattr *nla;
2354         int remaining;
2355         u32 *mp;
2356
2357         if (!cfg->fc_mx)
2358                 return 0;
2359
2360         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2361         if (unlikely(!mp))
2362                 return -ENOMEM;
2363
2364         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2365                 int type = nla_type(nla);
2366                 u32 val;
2367
2368                 if (!type)
2369                         continue;
2370                 if (unlikely(type > RTAX_MAX))
2371                         goto err;
2372
2373                 if (type == RTAX_CC_ALGO) {
2374                         char tmp[TCP_CA_NAME_MAX];
2375
2376                         nla_strlcpy(tmp, nla, sizeof(tmp));
2377                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2378                         if (val == TCP_CA_UNSPEC)
2379                                 goto err;
2380                 } else {
2381                         val = nla_get_u32(nla);
2382                 }
2383                 if (type == RTAX_HOPLIMIT && val > 255)
2384                         val = 255;
2385                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2386                         goto err;
2387
2388                 mp[type - 1] = val;
2389                 __set_bit(type - 1, mxc->mx_valid);
2390         }
2391
2392         if (ecn_ca) {
2393                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2394                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2395         }
2396
2397         mxc->mx = mp;
2398         return 0;
2399  err:
2400         kfree(mp);
2401         return -EINVAL;
2402 }
2403
2404 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2405                                             struct fib6_config *cfg,
2406                                             const struct in6_addr *gw_addr)
2407 {
2408         struct flowi6 fl6 = {
2409                 .flowi6_oif = cfg->fc_ifindex,
2410                 .daddr = *gw_addr,
2411                 .saddr = cfg->fc_prefsrc,
2412         };
2413         struct fib6_table *table;
2414         struct rt6_info *rt;
2415         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2416
2417         table = fib6_get_table(net, cfg->fc_table);
2418         if (!table)
2419                 return NULL;
2420
2421         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2422                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2423
2424         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2425
2426         /* if table lookup failed, fall back to full lookup */
2427         if (rt == net->ipv6.ip6_null_entry) {
2428                 ip6_rt_put(rt);
2429                 rt = NULL;
2430         }
2431
2432         return rt;
2433 }
2434
2435 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2436                                               struct netlink_ext_ack *extack)
2437 {
2438         struct net *net = cfg->fc_nlinfo.nl_net;
2439         struct rt6_info *rt = NULL;
2440         struct net_device *dev = NULL;
2441         struct inet6_dev *idev = NULL;
2442         struct fib6_table *table;
2443         int addr_type;
2444         int err = -EINVAL;
2445
2446         /* RTF_PCPU is an internal flag; can not be set by userspace */
2447         if (cfg->fc_flags & RTF_PCPU) {
2448                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2449                 goto out;
2450         }
2451
2452         if (cfg->fc_dst_len > 128) {
2453                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2454                 goto out;
2455         }
2456         if (cfg->fc_src_len > 128) {
2457                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2458                 goto out;
2459         }
2460 #ifndef CONFIG_IPV6_SUBTREES
2461         if (cfg->fc_src_len) {
2462                 NL_SET_ERR_MSG(extack,
2463                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2464                 goto out;
2465         }
2466 #endif
2467         if (cfg->fc_ifindex) {
2468                 err = -ENODEV;
2469                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2470                 if (!dev)
2471                         goto out;
2472                 idev = in6_dev_get(dev);
2473                 if (!idev)
2474                         goto out;
2475         }
2476
2477         if (cfg->fc_metric == 0)
2478                 cfg->fc_metric = IP6_RT_PRIO_USER;
2479
2480         err = -ENOBUFS;
2481         if (cfg->fc_nlinfo.nlh &&
2482             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2483                 table = fib6_get_table(net, cfg->fc_table);
2484                 if (!table) {
2485                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2486                         table = fib6_new_table(net, cfg->fc_table);
2487                 }
2488         } else {
2489                 table = fib6_new_table(net, cfg->fc_table);
2490         }
2491
2492         if (!table)
2493                 goto out;
2494
2495         rt = ip6_dst_alloc(net, NULL,
2496                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2497
2498         if (!rt) {
2499                 err = -ENOMEM;
2500                 goto out;
2501         }
2502
2503         if (cfg->fc_flags & RTF_EXPIRES)
2504                 rt6_set_expires(rt, jiffies +
2505                                 clock_t_to_jiffies(cfg->fc_expires));
2506         else
2507                 rt6_clean_expires(rt);
2508
2509         if (cfg->fc_protocol == RTPROT_UNSPEC)
2510                 cfg->fc_protocol = RTPROT_BOOT;
2511         rt->rt6i_protocol = cfg->fc_protocol;
2512
2513         addr_type = ipv6_addr_type(&cfg->fc_dst);
2514
2515         if (addr_type & IPV6_ADDR_MULTICAST)
2516                 rt->dst.input = ip6_mc_input;
2517         else if (cfg->fc_flags & RTF_LOCAL)
2518                 rt->dst.input = ip6_input;
2519         else
2520                 rt->dst.input = ip6_forward;
2521
2522         rt->dst.output = ip6_output;
2523
2524         if (cfg->fc_encap) {
2525                 struct lwtunnel_state *lwtstate;
2526
2527                 err = lwtunnel_build_state(cfg->fc_encap_type,
2528                                            cfg->fc_encap, AF_INET6, cfg,
2529                                            &lwtstate, extack);
2530                 if (err)
2531                         goto out;
2532                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2533                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2534                         rt->dst.lwtstate->orig_output = rt->dst.output;
2535                         rt->dst.output = lwtunnel_output;
2536                 }
2537                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2538                         rt->dst.lwtstate->orig_input = rt->dst.input;
2539                         rt->dst.input = lwtunnel_input;
2540                 }
2541         }
2542
2543         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2544         rt->rt6i_dst.plen = cfg->fc_dst_len;
2545         if (rt->rt6i_dst.plen == 128)
2546                 rt->dst.flags |= DST_HOST;
2547
2548 #ifdef CONFIG_IPV6_SUBTREES
2549         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2550         rt->rt6i_src.plen = cfg->fc_src_len;
2551 #endif
2552
2553         rt->rt6i_metric = cfg->fc_metric;
2554
2555         /* We cannot add true routes via loopback here,
2556            they would result in kernel looping; promote them to reject routes
2557          */
2558         if ((cfg->fc_flags & RTF_REJECT) ||
2559             (dev && (dev->flags & IFF_LOOPBACK) &&
2560              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2561              !(cfg->fc_flags & RTF_LOCAL))) {
2562                 /* hold loopback dev/idev if we haven't done so. */
2563                 if (dev != net->loopback_dev) {
2564                         if (dev) {
2565                                 dev_put(dev);
2566                                 in6_dev_put(idev);
2567                         }
2568                         dev = net->loopback_dev;
2569                         dev_hold(dev);
2570                         idev = in6_dev_get(dev);
2571                         if (!idev) {
2572                                 err = -ENODEV;
2573                                 goto out;
2574                         }
2575                 }
2576                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2577                 switch (cfg->fc_type) {
2578                 case RTN_BLACKHOLE:
2579                         rt->dst.error = -EINVAL;
2580                         rt->dst.output = dst_discard_out;
2581                         rt->dst.input = dst_discard;
2582                         break;
2583                 case RTN_PROHIBIT:
2584                         rt->dst.error = -EACCES;
2585                         rt->dst.output = ip6_pkt_prohibit_out;
2586                         rt->dst.input = ip6_pkt_prohibit;
2587                         break;
2588                 case RTN_THROW:
2589                 case RTN_UNREACHABLE:
2590                 default:
2591                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2592                                         : (cfg->fc_type == RTN_UNREACHABLE)
2593                                         ? -EHOSTUNREACH : -ENETUNREACH;
2594                         rt->dst.output = ip6_pkt_discard_out;
2595                         rt->dst.input = ip6_pkt_discard;
2596                         break;
2597                 }
2598                 goto install_route;
2599         }
2600
2601         if (cfg->fc_flags & RTF_GATEWAY) {
2602                 const struct in6_addr *gw_addr;
2603                 int gwa_type;
2604
2605                 gw_addr = &cfg->fc_gateway;
2606                 gwa_type = ipv6_addr_type(gw_addr);
2607
2608                 /* if gw_addr is local we will fail to detect this in case
2609                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2610                  * will return already-added prefix route via interface that
2611                  * prefix route was assigned to, which might be non-loopback.
2612                  */
2613                 err = -EINVAL;
2614                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2615                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2616                                             dev : NULL, 0, 0)) {
2617                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2618                         goto out;
2619                 }
2620                 rt->rt6i_gateway = *gw_addr;
2621
2622                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2623                         struct rt6_info *grt = NULL;
2624
2625                         /* IPv6 strictly inhibits using not link-local
2626                            addresses as nexthop address.
2627                            Otherwise, router will not able to send redirects.
2628                            It is very good, but in some (rare!) circumstances
2629                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2630                            some exceptions. --ANK
2631                            We allow IPv4-mapped nexthops to support RFC4798-type
2632                            addressing
2633                          */
2634                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2635                                           IPV6_ADDR_MAPPED))) {
2636                                 NL_SET_ERR_MSG(extack,
2637                                                "Invalid gateway address");
2638                                 goto out;
2639                         }
2640
2641                         if (cfg->fc_table) {
2642                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2643
2644                                 if (grt) {
2645                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2646                                             (dev && dev != grt->dst.dev)) {
2647                                                 ip6_rt_put(grt);
2648                                                 grt = NULL;
2649                                         }
2650                                 }
2651                         }
2652
2653                         if (!grt)
2654                                 grt = rt6_lookup(net, gw_addr, NULL,
2655                                                  cfg->fc_ifindex, 1);
2656
2657                         err = -EHOSTUNREACH;
2658                         if (!grt)
2659                                 goto out;
2660                         if (dev) {
2661                                 if (dev != grt->dst.dev) {
2662                                         ip6_rt_put(grt);
2663                                         goto out;
2664                                 }
2665                         } else {
2666                                 dev = grt->dst.dev;
2667                                 idev = grt->rt6i_idev;
2668                                 dev_hold(dev);
2669                                 in6_dev_hold(grt->rt6i_idev);
2670                         }
2671                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2672                                 err = 0;
2673                         ip6_rt_put(grt);
2674
2675                         if (err)
2676                                 goto out;
2677                 }
2678                 err = -EINVAL;
2679                 if (!dev) {
2680                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2681                         goto out;
2682                 } else if (dev->flags & IFF_LOOPBACK) {
2683                         NL_SET_ERR_MSG(extack,
2684                                        "Egress device can not be loopback device for this route");
2685                         goto out;
2686                 }
2687         }
2688
2689         err = -ENODEV;
2690         if (!dev)
2691                 goto out;
2692
2693         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2694                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2695                         NL_SET_ERR_MSG(extack, "Invalid source address");
2696                         err = -EINVAL;
2697                         goto out;
2698                 }
2699                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2700                 rt->rt6i_prefsrc.plen = 128;
2701         } else
2702                 rt->rt6i_prefsrc.plen = 0;
2703
2704         rt->rt6i_flags = cfg->fc_flags;
2705
2706 install_route:
2707         rt->dst.dev = dev;
2708         rt->rt6i_idev = idev;
2709         rt->rt6i_table = table;
2710
2711         cfg->fc_nlinfo.nl_net = dev_net(dev);
2712
2713         return rt;
2714 out:
2715         if (dev)
2716                 dev_put(dev);
2717         if (idev)
2718                 in6_dev_put(idev);
2719         if (rt)
2720                 dst_release_immediate(&rt->dst);
2721
2722         return ERR_PTR(err);
2723 }
2724
2725 int ip6_route_add(struct fib6_config *cfg,
2726                   struct netlink_ext_ack *extack)
2727 {
2728         struct mx6_config mxc = { .mx = NULL, };
2729         struct rt6_info *rt;
2730         int err;
2731
2732         rt = ip6_route_info_create(cfg, extack);
2733         if (IS_ERR(rt)) {
2734                 err = PTR_ERR(rt);
2735                 rt = NULL;
2736                 goto out;
2737         }
2738
2739         err = ip6_convert_metrics(&mxc, cfg);
2740         if (err)
2741                 goto out;
2742
2743         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2744
2745         kfree(mxc.mx);
2746
2747         return err;
2748 out:
2749         if (rt)
2750                 dst_release_immediate(&rt->dst);
2751
2752         return err;
2753 }
2754
2755 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2756 {
2757         int err;
2758         struct fib6_table *table;
2759         struct net *net = dev_net(rt->dst.dev);
2760
2761         if (rt == net->ipv6.ip6_null_entry) {
2762                 err = -ENOENT;
2763                 goto out;
2764         }
2765
2766         table = rt->rt6i_table;
2767         write_lock_bh(&table->tb6_lock);
2768         err = fib6_del(rt, info);
2769         write_unlock_bh(&table->tb6_lock);
2770
2771 out:
2772         ip6_rt_put(rt);
2773         return err;
2774 }
2775
2776 int ip6_del_rt(struct rt6_info *rt)
2777 {
2778         struct nl_info info = {
2779                 .nl_net = dev_net(rt->dst.dev),
2780         };
2781         return __ip6_del_rt(rt, &info);
2782 }
2783
/* Delete @rt and, when fc_delete_all_nh is set, all of its sibling
 * nexthops (a multipath route) in one pass under the table write lock.
 *
 * A single RTM_DELROUTE netlink message covering the whole route is
 * built up front when possible; the per-delete notifications are then
 * suppressed via info->skip_notify and the combined message is sent
 * after the lock is dropped.
 *
 * Consumes the caller's reference on @rt on every path.  Returns 0 on
 * success, -ENOENT for the null entry, or the fib6_del() error.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* The null entry is never deletable. */
	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* On fill failure fall back to per-delete
			 * notifications (skip_notify stays clear).
			 */
			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe iteration: fib6_del() presumably unlinks each
		 * entry from the siblings list — TODO confirm.
		 */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	write_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* Send the combined notification outside the table lock. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2835
/* Delete the route described by @cfg from its FIB table.
 *
 * Locates the fib6 node for the dst/src prefixes, then removes the
 * first leaf route that also matches the optional ifindex, gateway,
 * metric and protocol selectors.  With RTF_CACHE set the candidate is
 * taken from the route's exception cache (rt6_find_cached_rt())
 * instead of the leaf itself.
 *
 * Returns 0 on success, -ESRCH when nothing matched, or the error
 * from the underlying delete.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	read_lock_bh(&table->tb6_lock);

	/* NOTE(review): the final argument (exact_match) is cleared
	 * for RTF_CACHE lookups — presumably because cached clones
	 * hang off a covering route rather than an exact node; confirm
	 * against fib6_locate().
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* Take a reference before dropping the lock;
			 * if the route is already being released, give
			 * up rather than resurrect it.
			 */
			if (!dst_hold_safe(&rt->dst))
				break;
			read_unlock_bh(&table->tb6_lock);

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2892
/* Process a received ICMPv6 Redirect for the path cached in @dst.
 *
 * Validates the message (length, non-multicast destination,
 * link-local unicast target unless on-link, device accepts redirects,
 * parseable ND options), updates the neighbour entry for the new
 * first hop, then installs an RTF_CACHE clone of @dst pointing at the
 * redirect target and announces it via NETEVENT_REDIRECT.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link. */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers (forwarding) and interfaces configured to ignore
	 * redirects drop the message here.
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3010
3011 /*
3012  *      Misc support functions
3013  */
3014
3015 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3016 {
3017         BUG_ON(from->dst.from);
3018
3019         rt->rt6i_flags &= ~RTF_EXPIRES;
3020         dst_hold(&from->dst);
3021         rt->dst.from = &from->dst;
3022         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3023 }
3024
/* Copy routing state from @ort into the freshly allocated @rt and
 * link @rt to @ort via rt6_set_from().
 *
 * Ordering note: rt6i_flags is copied *before* rt6_set_from(), which
 * then clears RTF_EXPIRES on the copy — reordering these would leave
 * a stale RTF_EXPIRES bit on the clone.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	/* The copy holds its own reference on the inet6 device. */
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3046
3047 #ifdef CONFIG_IPV6_ROUTE_INFO
3048 static struct rt6_info *rt6_get_route_info(struct net *net,
3049                                            const struct in6_addr *prefix, int prefixlen,
3050                                            const struct in6_addr *gwaddr,
3051                                            struct net_device *dev)
3052 {
3053         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3054         int ifindex = dev->ifindex;
3055         struct fib6_node *fn;
3056         struct rt6_info *rt = NULL;
3057         struct fib6_table *table;
3058
3059         table = fib6_get_table(net, tb_id);
3060         if (!table)
3061                 return NULL;
3062
3063         read_lock_bh(&table->tb6_lock);
3064         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3065         if (!fn)
3066                 goto out;
3067
3068         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
3069                 if (rt->dst.dev->ifindex != ifindex)
3070                         continue;
3071                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3072                         continue;
3073                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3074                         continue;
3075                 ip6_hold_safe(NULL, &rt, false);
3076                 break;
3077         }
3078 out:
3079         read_unlock_bh(&table->tb6_lock);
3080         return rt;
3081 }
3082
3083 static struct rt6_info *rt6_add_route_info(struct net *net,
3084                                            const struct in6_addr *prefix, int prefixlen,
3085                                            const struct in6_addr *gwaddr,
3086                                            struct net_device *dev,
3087                                            unsigned int pref)
3088 {
3089         struct fib6_config cfg = {
3090                 .fc_metric      = IP6_RT_PRIO_USER,
3091                 .fc_ifindex     = dev->ifindex,
3092                 .fc_dst_len     = prefixlen,
3093                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3094                                   RTF_UP | RTF_PREF(pref),
3095                 .fc_protocol = RTPROT_RA,
3096                 .fc_nlinfo.portid = 0,
3097                 .fc_nlinfo.nlh = NULL,
3098                 .fc_nlinfo.nl_net = net,
3099         };
3100
3101         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3102         cfg.fc_dst = *prefix;
3103         cfg.fc_gateway = *gwaddr;
3104
3105         /* We should treat it as a default route if prefix length is 0. */
3106         if (!prefixlen)
3107                 cfg.fc_flags |= RTF_DEFAULT;
3108
3109         ip6_route_add(&cfg, NULL);
3110
3111         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3112 }
3113 #endif
3114
3115 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3116 {
3117         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3118         struct rt6_info *rt;
3119         struct fib6_table *table;
3120
3121         table = fib6_get_table(dev_net(dev), tb_id);
3122         if (!table)
3123                 return NULL;
3124
3125         read_lock_bh(&table->tb6_lock);
3126         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3127                 if (dev == rt->dst.dev &&
3128                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3129                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3130                         break;
3131         }
3132         if (rt)
3133                 ip6_hold_safe(NULL, &rt, false);
3134         read_unlock_bh(&table->tb6_lock);
3135         return rt;
3136 }
3137
3138 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3139                                      struct net_device *dev,
3140                                      unsigned int pref)
3141 {
3142         struct fib6_config cfg = {
3143                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3144                 .fc_metric      = IP6_RT_PRIO_USER,
3145                 .fc_ifindex     = dev->ifindex,
3146                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3147                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3148                 .fc_protocol = RTPROT_RA,
3149                 .fc_nlinfo.portid = 0,
3150                 .fc_nlinfo.nlh = NULL,
3151                 .fc_nlinfo.nl_net = dev_net(dev),
3152         };
3153
3154         cfg.fc_gateway = *gwaddr;
3155
3156         if (!ip6_route_add(&cfg, NULL)) {
3157                 struct fib6_table *table;
3158
3159                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3160                 if (table)
3161                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3162         }
3163
3164         return rt6_get_dflt_router(gwaddr, dev);
3165 }
3166
3167 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3168 {
3169         struct rt6_info *rt;
3170
3171 restart:
3172         read_lock_bh(&table->tb6_lock);
3173         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3174                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3175                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3176                         if (dst_hold_safe(&rt->dst)) {
3177                                 read_unlock_bh(&table->tb6_lock);
3178                                 ip6_del_rt(rt);
3179                         } else {
3180                                 read_unlock_bh(&table->tb6_lock);
3181                         }
3182                         goto restart;
3183                 }
3184         }
3185         read_unlock_bh(&table->tb6_lock);
3186
3187         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3188 }
3189
3190 void rt6_purge_dflt_routers(struct net *net)
3191 {
3192         struct fib6_table *table;
3193         struct hlist_head *head;
3194         unsigned int h;
3195
3196         rcu_read_lock();
3197
3198         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3199                 head = &net->ipv6.fib_table_hash[h];
3200                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3201                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3202                                 __rt6_purge_dflt_routers(table);
3203                 }
3204         }
3205
3206         rcu_read_unlock();
3207 }
3208
3209 static void rtmsg_to_fib6_config(struct net *net,
3210                                  struct in6_rtmsg *rtmsg,
3211                                  struct fib6_config *cfg)
3212 {
3213         memset(cfg, 0, sizeof(*cfg));
3214
3215         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3216                          : RT6_TABLE_MAIN;
3217         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3218         cfg->fc_metric = rtmsg->rtmsg_metric;
3219         cfg->fc_expires = rtmsg->rtmsg_info;
3220         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3221         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3222         cfg->fc_flags = rtmsg->rtmsg_flags;
3223
3224         cfg->fc_nlinfo.nl_net = net;
3225
3226         cfg->fc_dst = rtmsg->rtmsg_dst;
3227         cfg->fc_src = rtmsg->rtmsg_src;
3228         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3229 }
3230
3231 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3232 {
3233         struct fib6_config cfg;
3234         struct in6_rtmsg rtmsg;
3235         int err;
3236
3237         switch (cmd) {
3238         case SIOCADDRT:         /* Add a route */
3239         case SIOCDELRT:         /* Delete a route */
3240                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3241                         return -EPERM;
3242                 err = copy_from_user(&rtmsg, arg,
3243                                      sizeof(struct in6_rtmsg));
3244                 if (err)
3245                         return -EFAULT;
3246
3247                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3248
3249                 rtnl_lock();
3250                 switch (cmd) {
3251                 case SIOCADDRT:
3252                         err = ip6_route_add(&cfg, NULL);
3253                         break;
3254                 case SIOCDELRT:
3255                         err = ip6_route_del(&cfg, NULL);
3256                         break;
3257                 default:
3258                         err = -EINVAL;
3259                 }
3260                 rtnl_unlock();
3261
3262                 return err;
3263         }
3264
3265         return -EINVAL;
3266 }
3267
3268 /*
3269  *      Drop the packet on the floor
3270  */
3271
3272 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3273 {
3274         int type;
3275         struct dst_entry *dst = skb_dst(skb);
3276         switch (ipstats_mib_noroutes) {
3277         case IPSTATS_MIB_INNOROUTES:
3278                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3279                 if (type == IPV6_ADDR_ANY) {
3280                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3281                                       IPSTATS_MIB_INADDRERRORS);
3282                         break;
3283                 }
3284                 /* FALLTHROUGH */
3285         case IPSTATS_MIB_OUTNOROUTES:
3286                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3287                               ipstats_mib_noroutes);
3288                 break;
3289         }
3290         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3291         kfree_skb(skb);
3292         return 0;
3293 }
3294
/* dst.input handler for discard routes: drop with "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3299
/* dst.output handler for discard routes: point skb->dev at the
 * route's device, then drop with "no route (out)".
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3305
/* dst.input handler for prohibit routes: drop, administratively prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3310
/* dst.output handler for prohibit routes: point skb->dev at the
 * route's device, then drop as administratively prohibited.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3316
/*
 *	Allocate a dst for local (unicast / anycast) address.
 *	Returns the new rt6_info or ERR_PTR(-ENOMEM); the caller owns the
 *	returned route.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	/* DST_NOCOUNT: address routes are not counted against dst gc */
	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	/* the route keeps a reference on the inet6_dev it is bound to */
	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* /128 host route to the address itself; with RTF_NONEXTHOP the
	 * gateway field is set to the address as well.
	 */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* enslaved devices use their l3mdev (VRF) table, else the local table */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3356
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device being cleaned; NULL matches any device */
	struct net *net;	/* owning network namespace */
	struct in6_addr *addr;	/* preferred-source address being removed */
};
3363
3364 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3365 {
3366         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3367         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3368         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3369
3370         if (((void *)rt->dst.dev == dev || !dev) &&
3371             rt != net->ipv6.ip6_null_entry &&
3372             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3373                 spin_lock_bh(&rt6_exception_lock);
3374                 /* remove prefsrc entry */
3375                 rt->rt6i_prefsrc.plen = 0;
3376                 /* need to update cache as well */
3377                 rt6_exceptions_remove_prefsrc(rt);
3378                 spin_unlock_bh(&rt6_exception_lock);
3379         }
3380         return 0;
3381 }
3382
3383 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3384 {
3385         struct net *net = dev_net(ifp->idev->dev);
3386         struct arg_dev_net_ip adni = {
3387                 .dev = ifp->idev->dev,
3388                 .net = net,
3389                 .addr = &ifp->addr,
3390         };
3391         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3392 }
3393
/* flag combination identifying an RA-learned default route */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* An RA-learned default route via @gateway is stale once the
	 * gateway stops acting as a router; a negative return asks the
	 * fib6_clean_all() walker to delete this route.
	 */
	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3414
/* @gateway has turned from router into host: drop RA-learned default
 * routes through it and clean matching cached exception routes, across
 * all FIB tables of @net.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3419
/* argument bundle for fib6_ifdown() */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches all devices */
	struct net *net;	/* namespace being cleaned */
};
3424
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	/* Negative return flags the route for deletion by the walker.
	 * A route on the dying device (NULL dev matches all) is removed
	 * unless it is the shared null entry.  An ECMP route with
	 * siblings is kept only when the device is merely down (not
	 * unregistering) and the ignore_routes_with_linkdown sysctl is
	 * set, so the dead nexthop stays visible but is skipped.
	 */
	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return -1;

	return 0;
}
3440
/* Remove routes referencing @dev (all devices when @dev is NULL) from
 * every FIB table of @net, then flush the matching entries from the
 * uncached route list.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
}
3452
/* argument bundle for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new device MTU */
};
3457
3458 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3459 {
3460         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3461         struct inet6_dev *idev;
3462
3463         /* In IPv6 pmtu discovery is not optional,
3464            so that RTAX_MTU lock cannot disable it.
3465            We still use this lock to block changes
3466            caused by addrconf/ndisc.
3467         */
3468
3469         idev = __in6_dev_get(arg->dev);
3470         if (!idev)
3471                 return 0;
3472
3473         /* For administrative MTU increase, there is no way to discover
3474            IPv6 PMTU increase, so PMTU increase should be updated here.
3475            Since RFC 1981 doesn't include administrative MTU increase
3476            update PMTU increase is a MUST. (i.e. jumbo frame)
3477          */
3478         /*
3479            If new MTU is less than route PMTU, this new MTU will be the
3480            lowest MTU in the path, update the route PMTU to reflect PMTU
3481            decreases; if new MTU is greater than route PMTU, and the
3482            old MTU is the lowest MTU in the path, update the route PMTU
3483            to reflect the increase. In this case if the other nodes' MTU
3484            also have the lowest MTU, TOO BIG MESSAGE will be lead to
3485            PMTU discovery.
3486          */
3487         if (rt->dst.dev == arg->dev &&
3488             dst_metric_raw(&rt->dst, RTAX_MTU) &&
3489             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3490                 spin_lock_bh(&rt6_exception_lock);
3491                 if (dst_mtu(&rt->dst) >= arg->mtu ||
3492                     (dst_mtu(&rt->dst) < arg->mtu &&
3493                      dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3494                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3495                 }
3496                 rt6_exceptions_update_pmtu(rt, arg->mtu);
3497                 spin_unlock_bh(&rt6_exception_lock);
3498         }
3499         return 0;
3500 }
3501
3502 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3503 {
3504         struct rt6_mtu_change_arg arg = {
3505                 .dev = dev,
3506                 .mtu = mtu,
3507         };
3508
3509         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3510 }
3511
/* netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE requests */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
3526
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * fib6_config.  Returns 0 on success or a negative errno.  Note that
 * cfg->fc_mx and cfg->fc_mp point into the request message itself, so
 * they are only valid while @nlh is.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	/* default error for the validation failures below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* these route types all drop packets; they differ in dst.error */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* only prefix-length bytes need to be present */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the 8-bit table id from the rtmsg header */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown router preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeout means no expiry at all */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3650
/* per-nexthop bookkeeping while building a multipath route */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* config this nexthop was built from */
	struct mx6_config mxc;		/* converted metrics for insertion */
	struct list_head next;		/* link in the rt6_nh_list */
};
3657
3658 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3659 {
3660         struct rt6_nh *nh;
3661
3662         list_for_each_entry(nh, rt6_nh_list, next) {
3663                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3664                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3665                         nh->r_cfg.fc_ifindex);
3666         }
3667 }
3668
/* Append @rt to @rt6_nh_list unless an equivalent nexthop is already
 * listed.  On success the new list entry takes ownership of @rt and
 * stores a copy of @r_cfg; on error the caller still owns @rt.
 * Returns 0, -EEXIST for duplicates, or the metric-conversion error.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
3695
/* Send the single RTM_NEWROUTE notification for a multipath
 * add/replace/append.  @rt may be NULL when nothing was inserted.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
3716
/* Add (or replace/append) a multipath route: build one rt6_info per
 * nexthop in cfg->fc_mp, insert them all, send a single notification,
 * and roll back already-inserted nexthops if any insertion fails.
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the shared config, overridden
		 * by its own ifindex/gateway/encap attributes
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): assumes fc_nlinfo.nlh is non-NULL here; true
		 * for the netlink path that builds cfg — confirm for any
		 * other caller.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any nexthop that was never handed to the FIB */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3847
/* Delete each nexthop listed in cfg->fc_mp as an individual route.
 * Failures do not stop the loop; the last error seen is returned
 * (0 when every deletion succeeded).
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config: shared base plus own ifindex/gateway */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3885
3886 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3887                               struct netlink_ext_ack *extack)
3888 {
3889         struct fib6_config cfg;
3890         int err;
3891
3892         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3893         if (err < 0)
3894                 return err;
3895
3896         if (cfg.fc_mp)
3897                 return ip6_route_multipath_del(&cfg, extack);
3898         else {
3899                 cfg.fc_delete_all_nh = 1;
3900                 return ip6_route_del(&cfg, extack);
3901         }
3902 }
3903
3904 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3905                               struct netlink_ext_ack *extack)
3906 {
3907         struct fib6_config cfg;
3908         int err;
3909
3910         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3911         if (err < 0)
3912                 return err;
3913
3914         if (cfg.fc_mp)
3915                 return ip6_route_multipath_add(&cfg, extack);
3916         else
3917                 return ip6_route_add(&cfg, extack);
3918 }
3919
/* Upper bound on the netlink message size rt6_fill_node() may need for
 * @rt, including one RTA_MULTIPATH nexthop entry per ECMP sibling.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
3949
/* Emit the nexthop attributes of @rt (gateway, oif, encap) into @skb
 * and accumulate RTNH_F_* flags into @flags.  @skip_oif suppresses
 * RTA_OIF for multipath encoding, where struct rtnexthop already
 * carries the ifindex.  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		/* with this sysctl set, linkdown nexthops are also dead */
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3981
/* add multipath next hop: emit one struct rtnexthop followed by its
 * attributes inside an open RTA_MULTIPATH nest.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif: the ifindex above already identifies the device */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4008
/* Fill one RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.
 * @dst/@src, when non-NULL, are the looked-up addresses (forces /128
 * lengths in the reply); @iif is the input interface for input-route
 * queries.  Returns 0 on success or -EMSGSIZE (message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map the reject route's dst.error back to the RTN_* type it
	 * was created from (see rtm_to_fib6_config)
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* a caller-supplied dst means this answers a route query */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			/* ip6mr_get_route() completes the message itself */
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report the cached PMTU in place of the stored MTU metric */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4162
4163 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4164 {
4165         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4166         struct net *net = arg->net;
4167
4168         if (rt == net->ipv6.ip6_null_entry)
4169                 return 0;
4170
4171         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4172                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4173
4174                 /* user wants prefix routes only */
4175                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4176                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4177                         /* success since this is not a prefix route */
4178                         return 1;
4179                 }
4180         }
4181
4182         return rt6_fill_node(net,
4183                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4184                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4185                      NLM_F_MULTI);
4186 }
4187
/* RTM_GETROUTE handler: resolve the route for the flow described by the
 * netlink attributes and reply with a single RTM_NEWROUTE message.
 *
 * With RTM_F_FIB_MATCH set, the matching FIB entry itself is reported
 * (via ip6_route_lookup()) instead of the dst that traffic would use.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        int err, iif = 0, oif = 0;
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi6 fl6;
        bool fibmatch;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          extack);
        if (err < 0)
                goto errout;

        /* err is preset so any truncated-attribute bailout below
         * reports -EINVAL
         */
        err = -EINVAL;
        memset(&fl6, 0, sizeof(fl6));
        rtm = nlmsg_data(nlh);
        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

        if (tb[RTA_SRC]) {
                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
        }

        if (tb[RTA_DST]) {
                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
        }

        if (tb[RTA_IIF])
                iif = nla_get_u32(tb[RTA_IIF]);

        if (tb[RTA_OIF])
                oif = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_MARK])
                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

        if (tb[RTA_UID])
                fl6.flowi6_uid = make_kuid(current_user_ns(),
                                           nla_get_u32(tb[RTA_UID]));
        else
                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

        if (iif) {
                /* simulate an input-path lookup on the given device;
                 * the device pointer is only valid under RCU
                 */
                struct net_device *dev;
                int flags = 0;

                rcu_read_lock();

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }

                fl6.flowi6_iif = iif;

                if (!ipv6_addr_any(&fl6.saddr))
                        flags |= RT6_LOOKUP_F_HAS_SADDR;

                if (!fibmatch)
                        dst = ip6_route_input_lookup(net, dev, &fl6, flags);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);

                rcu_read_unlock();
        } else {
                /* output-path lookup */
                fl6.flowi6_oif = oif;

                if (!fibmatch)
                        dst = ip6_route_output(net, NULL, &fl6);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);
        }


        rt = container_of(dst, struct rt6_info, dst);
        /* lookup failures are reported via dst.error or by returning
         * the netns null entry; both are checked here
         */
        if (rt->dst.error) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        if (rt == net->ipv6.ip6_null_entry) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }

        /* the skb owns the route reference from here on; kfree_skb()
         * below also drops it
         */
        skb_dst_set(skb, &rt->dst);
        if (fibmatch)
                err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        else
                err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;
}
4313
4314 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4315                      unsigned int nlm_flags)
4316 {
4317         struct sk_buff *skb;
4318         struct net *net = info->nl_net;
4319         u32 seq;
4320         int err;
4321
4322         err = -ENOBUFS;
4323         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4324
4325         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4326         if (!skb)
4327                 goto errout;
4328
4329         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4330                                 event, info->portid, seq, nlm_flags);
4331         if (err < 0) {
4332                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4333                 WARN_ON(err == -EMSGSIZE);
4334                 kfree_skb(skb);
4335                 goto errout;
4336         }
4337         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4338                     info->nlh, gfp_any());
4339         return;
4340 errout:
4341         if (err < 0)
4342                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4343 }
4344
4345 static int ip6_route_dev_notify(struct notifier_block *this,
4346                                 unsigned long event, void *ptr)
4347 {
4348         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4349         struct net *net = dev_net(dev);
4350
4351         if (!(dev->flags & IFF_LOOPBACK))
4352                 return NOTIFY_OK;
4353
4354         if (event == NETDEV_REGISTER) {
4355                 net->ipv6.ip6_null_entry->dst.dev = dev;
4356                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4357 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4358                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4359                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4360                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4361                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4362 #endif
4363          } else if (event == NETDEV_UNREGISTER &&
4364                     dev->reg_state != NETREG_UNREGISTERED) {
4365                 /* NETDEV_UNREGISTER could be fired for multiple times by
4366                  * netdev_wait_allrefs(). Make sure we only call this once.
4367                  */
4368                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4369 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4370                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4371                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4372 #endif
4373         }
4374
4375         return NOTIFY_OK;
4376 }
4377
4378 /*
4379  *      /proc
4380  */
4381
4382 #ifdef CONFIG_PROC_FS
4383
/* seq_file interface for /proc/net/ipv6_route (route table dump). */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
4391
4392 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4393 {
4394         struct net *net = (struct net *)seq->private;
4395         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4396                    net->ipv6.rt6_stats->fib_nodes,
4397                    net->ipv6.rt6_stats->fib_route_nodes,
4398                    net->ipv6.rt6_stats->fib_rt_alloc,
4399                    net->ipv6.rt6_stats->fib_rt_entries,
4400                    net->ipv6.rt6_stats->fib_rt_cache,
4401                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4402                    net->ipv6.rt6_stats->fib_discarded_routes);
4403
4404         return 0;
4405 }
4406
/* Open handler: bind the stats show routine to a netns-aware,
 * single-record seq_file.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
4411
/* seq_file interface for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
4419 #endif  /* CONFIG_PROC_FS */
4420
4421 #ifdef CONFIG_SYSCTL
4422
4423 static
4424 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4425                               void __user *buffer, size_t *lenp, loff_t *ppos)
4426 {
4427         struct net *net;
4428         int delay;
4429         if (!write)
4430                 return -EINVAL;
4431
4432         net = (struct net *)ctl->extra1;
4433         delay = net->ipv6.sysctl.flush_delay;
4434         proc_dointvec(ctl, write, buffer, lenp, ppos);
4435         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4436         return 0;
4437 }
4438
/* Template for the per-netns net.ipv6.route sysctl table.  It is
 * cloned by ipv6_route_sysctl_init() below, which re-points every
 * entry's .data at the clone's own netns fields; the init_net
 * addresses here only serve as placeholders.
 */
struct ctl_table ipv6_route_table_template[] = {
        {
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* same variable as gc_min_interval, in milliseconds */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        { }     /* sentinel */
};
4512
/* Clone the sysctl template for one netns and re-point each entry's
 * .data at the netns' own storage.
 *
 * Returns the kmemdup()'ed table (owned by the caller, who registers
 * it and frees it on netns teardown) or NULL on allocation failure.
 *
 * NOTE(review): the table[N] indices below must stay in sync with the
 * entry order in ipv6_route_table_template above.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template),
                        GFP_KERNEL);

        if (table) {
                table[0].data = &net->ipv6.sysctl.flush_delay;
                table[0].extra1 = net;
                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        table[0].procname = NULL;
        }

        return table;
}
4541 #endif
4542
/* Per-netns init of the IPv6 routing engine: clone the dst_ops
 * template, allocate the special null (and, with multiple tables,
 * prohibit/blackhole) route entries, and seed the GC sysctl defaults.
 *
 * Returns 0 on success or -ENOMEM; the goto chain unwinds partial
 * allocations in reverse order of setup.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_ip6_dst_entries;
        /* each special entry is its own dst.path terminator */
        net->ipv6.ip6_null_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        /* GC tunables; mirrored by the net.ipv6.route sysctl table */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}
4615
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4625
4626 static int __net_init ip6_route_net_init_late(struct net *net)
4627 {
4628 #ifdef CONFIG_PROC_FS
4629         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4630         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4631 #endif
4632         return 0;
4633 }
4634
/* Late per-netns teardown: remove the /proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4642
/* Core per-netns lifecycle of the IPv6 routing state. */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
4647
4648 static int __net_init ipv6_inetpeer_init(struct net *net)
4649 {
4650         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4651
4652         if (!bp)
4653                 return -ENOMEM;
4654         inet_peer_base_init(bp);
4655         net->ipv6.peers = bp;
4656         return 0;
4657 }
4658
4659 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4660 {
4661         struct inet_peer_base *bp = net->ipv6.peers;
4662
4663         net->ipv6.peers = NULL;
4664         inetpeer_invalidate_tree(bp);
4665         kfree(bp);
4666 }
4667
/* Per-netns lifecycle of the IPv6 inetpeer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init   =       ipv6_inetpeer_init,
        .exit   =       ipv6_inetpeer_exit,
};
4672
/* Registered after the core routing pernet ops (see ip6_route_init())
 * so the /proc entries are created only once routing state exists.
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};
4677
/* Loopback-tracking notifier; priority is pinned relative to
 * ADDRCONF_NOTIFY_PRIORITY so its ordering w.r.t. addrconf's
 * notifier is fixed.
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4682
4683 void __init ip6_route_init_special_entries(void)
4684 {
4685         /* Registering of the loopback is done before this portion of code,
4686          * the loopback reference in rt6_info will not be taken, do it
4687          * manually for init_net */
4688         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4689         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4690   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4691         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4692         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4693         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4694         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4695   #endif
4696 }
4697
/* Boot-time init of the IPv6 routing subsystem: dst slab cache,
 * blackhole dst counters, pernet subsystems, fib6, xfrm6, policy
 * rules, rtnetlink handlers, the netdev notifier and the per-cpu
 * uncached-route lists.
 *
 * Returns 0 or a negative errno; the error labels unwind in exact
 * reverse order of the setup steps.
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* blackhole dsts share the regular rt6_info slab */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        ret = -ENOBUFS;
        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
                            RTNL_FLAG_DOIT_UNLOCKED))
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

        /* per-cpu lists of uncached routes; cannot fail */
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

out_register_late_subsys:
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
4779
/* Module/teardown counterpart of ip6_route_init(): unregister and
 * destroy everything in reverse order of setup.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}