ipv6: replace rwlock with rcu and spinlock in fib6_table
[linux-2.6-block.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Result codes for neighbour reachability checks (rt6_check_neigh()).
 * Failures are negative so callers can test "< 0"; the precise value
 * tells the caller how to recover.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour failed; probing possible */
	RT6_NUD_FAIL_DO_RR = -1,	/* fall back to round-robin selection */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-cpu list of rt6_info entries that are not linked into a fib6
 * tree ("uncached" routes).  Entries join in rt6_uncached_list_add()
 * and leave in rt6_uncached_list_del(); the spinlock protects @head.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
/* Link @rt onto this cpu's uncached list so device-down handling
 * (rt6_uncached_list_flush_dev()) can find and repoint it later.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	/* remember the owning list: rt6_uncached_list_del() uses this
	 * rather than recomputing a (possibly different) per-cpu pointer
	 */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
141
/* Unlink @rt from the uncached list it was added to, if any.
 * Safe to call for routes that were never added (list head stays
 * empty after rt6_info_init()).
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
152
/* @dev is going away: walk every cpu's uncached list and repoint any
 * route still referencing @dev (both the inet6_dev and the dst device
 * reference) at the namespace's loopback device.  Nothing to do when
 * @dev is the loopback device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* take the new device ref before dropping the old */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
/* Per-cpu clones do not own metrics; write through to the parent
 * route they were copied from (dst.from).
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
189
190 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
191 {
192         struct rt6_info *rt = (struct rt6_info *)dst;
193
194         if (rt->rt6i_flags & RTF_PCPU)
195                 return rt6_pcpu_cow_metrics(rt);
196         else if (rt->rt6i_flags & RTF_CACHE)
197                 return NULL;
198         else
199                 return dst_cow_metrics_generic(dst, old);
200 }
201
202 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
203                                              struct sk_buff *skb,
204                                              const void *daddr)
205 {
206         struct in6_addr *p = &rt->rt6i_gateway;
207
208         if (!ipv6_addr_any(p))
209                 return (const void *) p;
210         else if (skb)
211                 return &ipv6_hdr(skb)->daddr;
212         return daddr;
213 }
214
215 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
216                                           struct sk_buff *skb,
217                                           const void *daddr)
218 {
219         struct rt6_info *rt = (struct rt6_info *) dst;
220         struct neighbour *n;
221
222         daddr = choose_neigh_daddr(rt, skb, daddr);
223         n = __ipv6_neigh_lookup(dst->dev, daddr);
224         if (n)
225                 return n;
226         return neigh_create(&nd_tbl, daddr, dst->dev);
227 }
228
/* dst_ops->confirm_neigh: confirm reachability of @dst's nexthop
 * neighbour.  Skips destinations for which no neighbour entry exists:
 * NOARP/loopback devices and multicast addresses.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
243
/* dst_ops template for regular IPv6 routes (presumably copied into
 * each netns's net->ipv6.ip6_dst_ops — see __ip6_dst_alloc()).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
262
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 {
265         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266
267         return mtu ? : dst->dev->mtu;
268 }
269
/* Blackhole dsts deliberately ignore PMTU updates: no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
274
/* Blackhole dsts deliberately ignore redirects: no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
279
/* dst_ops for blackhole dsts: PMTU updates and redirects are no-ops
 * (handlers above); the rest mirrors the regular IPv6 dst_ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
291
/* All-zero metrics for the template routes below (the hoplimit entry
 * is spelled out explicitly, though its value is just 0).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
295
/* Template for the "null" route (net->ipv6.ip6_null_entry): rejects
 * traffic with -ENETUNREACH when no real route matches.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
310
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the "prohibit" route: rejects traffic with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Template for the "blackhole" route: silently discards traffic
 * (error -EINVAL, generic dst_discard handlers).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
344
/* Zero the rt6_info fields that follow the embedded dst_entry (the
 * dst part was already set up by dst_alloc()) and initialize the
 * sibling and uncached list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* "dst + 1" is the first byte past the dst_entry member */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
353
/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	/* initial refcount of 1; DST_OBSOLETE_FORCE_CHK makes users
	 * revalidate through dst->ops->check() on every use
	 */
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}
367
/* Allocate a rt6_info together with its per-cpu route-cache pointer
 * array.  Returns NULL on failure; if only the per-cpu allocation
 * fails, the partially built dst is released immediately.
 */
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			/* start every cpu's cached-clone slot empty */
			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p =  NULL;
			}
		} else {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
395
/* dst_ops->destroy: tear down a rt6_info when its last reference is
 * dropped.  Frees metrics, the per-cpu cache array, the uncached-list
 * linkage, the inet6_dev reference and any exception bucket, then
 * releases the reference on the parent dst (dst->from).
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* no readers can remain at destroy time, hence the "1"
	 * (lockdep condition) instead of a real lock check
	 */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}
421
/* dst_ops->ifdown: @dev is going down.  Move the route's inet6_dev
 * reference to the namespace's loopback device so the rt6_info does
 * not keep pinning @dev's inet6_dev.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
438
439 static bool __rt6_check_expired(const struct rt6_info *rt)
440 {
441         if (rt->rt6i_flags & RTF_EXPIRES)
442                 return time_after(jiffies, rt->dst.expires);
443         else
444                 return false;
445 }
446
/* A route is expired when RTF_EXPIRES is set and dst.expires has
 * passed.  A clone (dst.from set) without its own expiry is also
 * stale when its parent was obsoleted or has itself expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
458
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460                                              struct flowi6 *fl6, int oif,
461                                              int strict)
462 {
463         struct rt6_info *sibling, *next_sibling;
464         int route_choosen;
465
466         /* We might have already computed the hash for ICMPv6 errors. In such
467          * case it will always be non-zero. Otherwise now is the time to do it.
468          */
469         if (!fl6->mp_hash)
470                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
471
472         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
473         /* Don't change the route, if route_choosen == 0
474          * (siblings does not include ourself)
475          */
476         if (route_choosen)
477                 list_for_each_entry_safe(sibling, next_sibling,
478                                 &match->rt6i_siblings, rt6i_siblings) {
479                         route_choosen--;
480                         if (route_choosen == 0) {
481                                 if (rt6_score_route(sibling, oif, strict) < 0)
482                                         break;
483                                 match = sibling;
484                                 break;
485                         }
486                 }
487         return match;
488 }
489
490 /*
491  *      Route lookup. rcu_read_lock() should be held.
492  */
493
/* From the route list starting at @rt, pick the entry matching the
 * requested output interface @oif, or — when no oif is given — the
 * one whose device owns @saddr.  A loopback route may stand in for
 * the requested interface; with RT6_LOOKUP_F_IFACE an exact match is
 * required and ip6_null_entry is returned on failure.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to restrict on: keep the head of the list */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				/* remember the best loopback candidate,
				 * preferring one whose idev matches @oif
				 */
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
540
541 #ifdef CONFIG_IPV6_ROUTER_PREF
/* One deferred router-reachability probe (see rt6_probe()). */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* router address to solicit */
	struct net_device *dev;		/* held until the probe is sent */
};
547
/* Work handler: send a Neighbour Solicitation for the target router
 * to its solicited-node multicast address, then drop the device
 * reference taken by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
559
/* Schedule a reachability probe (deferred NS) for @rt's gateway when
 * its neighbour entry is missing or not in a VALID state.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* rate-limit: probe only if rtr_probe_interval has
		 * elapsed since the neighbour was last updated; the
		 * state is re-checked under neigh->lock
		 */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry at all: always worth a probe */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	/* send from workqueue context; hold the device until
	 * rt6_probe_deferred() has sent the NS
	 */
	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* !CONFIG_IPV6_ROUTER_PREF: probing compiled out */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
611
612 /*
613  * Default Router Selection (RFC 2461 6.3.6)
614  */
615 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
616 {
617         struct net_device *dev = rt->dst.dev;
618         if (!oif || dev->ifindex == oif)
619                 return 2;
620         if ((dev->flags & IFF_LOOPBACK) &&
621             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
622                 return 1;
623         return 0;
624 }
625
/* Classify the nexthop neighbour state of @rt for route scoring.
 * Routes with no gateway (or RTF_NONEXTHOP) trivially succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preference, unresolved-but-not-failed
		 * still counts as usable; a FAILED neighbour asks the
		 * caller to probe instead
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: round-robin away unless router
		 * preference (and its probing) is available
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
656
/* Score @rt for route selection; higher is better.  Returns a
 * negative RT6_NUD_* code when the route must not be used (device
 * mismatch under RT6_LOOKUP_F_IFACE, or a neighbour failure under
 * RT6_LOOKUP_F_REACHABLE).
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* device score in bits 0-1, decoded RA preference above it */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
675
/* Compare @rt against the best candidate so far and return the new
 * best.  *mpri tracks the winning score; *do_rr is set when the
 * winner requested round-robin (RT6_NUD_FAIL_DO_RR).
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* optionally skip routes whose device lost carrier */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
713
/* Scan a fib6 node's route list for the best match at @metric,
 * starting at @rr_head (the round-robin cursor) and wrapping around
 * via @leaf.  Routes at a different metric ("cont") are only
 * considered when nothing matched at @metric.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: rr_head to the end of the metric group */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: start of the list up to rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fallback: continue at the routes with other metrics */
	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
752
/* Select the best route from fib6 node @fn, advancing the node's
 * round-robin cursor (rr_ptr) when the winner asked for it.  Runs
 * under rcu_read_lock(); takes the table spinlock only to update
 * rr_ptr.  Returns ip6_null_entry when nothing is usable.
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
802
803 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
804 {
805         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
806 }
807
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a Route Information option received in a Router
 * Advertisement from @gwaddr: validate the option, then add, refresh
 * or withdraw the corresponding RTF_ROUTEINFO route.
 * Returns 0 on success, -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need the full option body */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix means the default route via this router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* refresh the preference on the existing route */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
883
/* Walk back up the fib6 tree from @fn looking for the next node that
 * carries route info (RTN_RTINFO).
 *
 * At each step move to the parent; if the parent owns a subtree and we
 * did not just come out of that subtree, search it for @saddr first
 * (source-address subtrees, CONFIG_IPV6_SUBTREES).
 *
 * Returns NULL once the tree root (RTN_TL_ROOT) is reached.
 * Must be called under rcu_read_lock() (fn->parent is rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
901
902 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
903                           bool null_fallback)
904 {
905         struct rt6_info *rt = *prt;
906
907         if (dst_hold_safe(&rt->dst))
908                 return true;
909         if (null_fallback) {
910                 rt = net->ipv6.ip6_null_entry;
911                 dst_hold(&rt->dst);
912         } else {
913                 rt = NULL;
914         }
915         *prt = rt;
916         return false;
917 }
918
/* Basic (non-policy) route lookup for one fib6 table, done entirely
 * under RCU.
 *
 * Find the longest-prefix node for fl6->daddr, narrow by device and
 * (for multipath routes with no oif) by flow hash, and backtrack toward
 * the root while only the null entry matches.  The exception (cached
 * clone) table is consulted last so a cached route overrides the tree
 * result.
 *
 * A reference is taken via ip6_hold_safe(); if the chosen route is
 * being destroyed the held ip6_null_entry is returned instead.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* noref use is only legal once we know the hold succeeded */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
959
/* Public entry point: run the simple lookup (ip6_pol_route_lookup)
 * through the policy-rule dispatcher.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
966
967 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
968                             const struct in6_addr *saddr, int oif, int strict)
969 {
970         struct flowi6 fl6 = {
971                 .flowi6_oif = oif,
972                 .daddr = *daddr,
973         };
974         struct dst_entry *dst;
975         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
976
977         if (saddr) {
978                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
979                 flags |= RT6_LOOKUP_F_HAS_SADDR;
980         }
981
982         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
983         if (dst->error == 0)
984                 return (struct rt6_info *) dst;
985
986         dst_release(dst);
987
988         return NULL;
989 }
990 EXPORT_SYMBOL(rt6_lookup);
991
/* ip6_ins_rt is called with table->tb6_lock NOT held (it takes the
 * lock itself).  It inserts a new route entry; if the addition fails
 * for any reason, the route is released.
 * The caller must hold a dst reference before calling it.
 */
997
998 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
999                         struct mx6_config *mxc,
1000                         struct netlink_ext_ack *extack)
1001 {
1002         int err;
1003         struct fib6_table *table;
1004
1005         table = rt->rt6i_table;
1006         spin_lock_bh(&table->tb6_lock);
1007         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1008         spin_unlock_bh(&table->tb6_lock);
1009
1010         return err;
1011 }
1012
/* Insert @rt into its fib6 table with default netlink info and no
 * extra metrics.  Returns 0 on success, -errno on failure.
 */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
1022
/* Pick the device a clone of @rt should use as its dst->dev.
 * Called with rcu_read_lock held (l3mdev_master_dev_rcu below).
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
1045
/* Allocate an RTF_CACHE host-route clone of @ort keyed by
 * (@daddr, @saddr).  If @ort itself is a cache or pcpu copy, the clone
 * is taken from its origin route (dst.from) instead.
 *
 * Returns the new clone (refcount from __ip6_dst_alloc()) or NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* the clone is always a /128 host route for daddr */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr equals the prefix address of a non-host route:
		 * treat it as the subnet-router anycast address
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1088
/* Allocate a per-cpu copy of @rt (flagged RTF_PCPU).
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1105
/* Fetch this CPU's cached copy of @rt and take a reference on it.
 * Returns NULL when no pcpu copy exists yet or when the reference
 * could not be taken (ip6_hold_safe() with no null fallback).
 * It should be called with rcu_read_lock() acquired.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
1119
/* Create this CPU's pcpu copy of @rt and publish it in rt->rt6i_pcpu.
 *
 * On allocation failure the (held) ip6_null_entry is returned.  The
 * cmpxchg() resolves the race with a concurrent creator: the loser
 * drops both of its references on the fresh copy and returns the
 * winner's copy (held) instead.  The returned route always carries a
 * reference for the caller.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* this hold is the reference owned by the pcpu cache slot */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	if (prev) {
		/* If someone did it before us, return prev instead */
		/* release refcnt taken by ip6_rt_pcpu_alloc() */
		dst_release_immediate(&pcpu_rt->dst);
		/* release refcnt taken by above dst_hold() */
		dst_release_immediate(&pcpu_rt->dst);
		dst_hold(&prev->dst);
		pcpu_rt = prev;
	}

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1148
1149 /* exception hash table implementation
1150  */
1151 static DEFINE_SPINLOCK(rt6_exception_lock);
1152
/* Remove rt6_ex from hash table and free the memory
 * (the entry itself via kfree_rcu; the cached route via rt6_release()).
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	if (!bucket || !rt6_ex)
		return;
	/* detach the clone from its tree node before unhashing */
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1168
/* Remove oldest rt6_ex in bucket and free the memory
 * ("oldest" = smallest ->stamp, i.e. least recently used).
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
1185
/* Hash (dst, src) into an exception-bucket index.
 * The jhash seed is initialized once, lazily, from the RNG; src only
 * contributes with CONFIG_IPV6_SUBTREES.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1201
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.
 * Returns the matching entry or NULL.
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the array base to the hashed slot */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1234
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.  RCU variant of
 * __rt6_find_exception_spinlock(); returns the match or NULL.
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the array base to the hashed slot */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1269
/* Insert cache route @nrt as an exception entry hanging off its origin
 * route @ort, replacing any existing entry for the same (dst, src) key.
 *
 * The bucket array is allocated lazily on first insert; insertion is
 * refused once rt6_flush_exceptions() has marked the origin flushed.
 * On success fn->fn_sernum is bumped so stale cached dsts get
 * invalidated.  Returns 0 or -errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* an existing entry for the same key is replaced, not duplicated */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;

	/* cap the chain length by evicting the least recently used entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1355
/* Remove every exception entry hanging off @rt and prevent new ones
 * from being created.  The bucket array itself is not freed here.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1382
/* Find cached rt in the hash table inside passed in rt.
 * Returns the cached clone for (daddr, saddr), or NULL when no entry
 * exists or the entry has expired.
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1414
1415 /* Remove the passed in cached rt from the hash table that contains it */
1416 int rt6_remove_exception_rt(struct rt6_info *rt)
1417 {
1418         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1419         struct rt6_exception_bucket *bucket;
1420         struct in6_addr *src_key = NULL;
1421         struct rt6_exception *rt6_ex;
1422         int err;
1423
1424         if (!from ||
1425             !(rt->rt6i_flags | RTF_CACHE))
1426                 return -EINVAL;
1427
1428         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1429                 return -ENOENT;
1430
1431         spin_lock_bh(&rt6_exception_lock);
1432         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1433                                     lockdep_is_held(&rt6_exception_lock));
1434 #ifdef CONFIG_IPV6_SUBTREES
1435         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1436          * and exception table is indexed by a hash of
1437          * both rt6i_dst and rt6i_src.
1438          * Otherwise, the exception table is indexed by
1439          * a hash of only rt6i_dst.
1440          */
1441         if (from->rt6i_src.plen)
1442                 src_key = &rt->rt6i_src.addr;
1443 #endif
1444         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1445                                                &rt->rt6i_dst.addr,
1446                                                src_key);
1447         if (rt6_ex) {
1448                 rt6_remove_exception(bucket, rt6_ex);
1449                 err = 0;
1450         } else {
1451                 err = -ENOENT;
1452         }
1453
1454         spin_unlock_bh(&rt6_exception_lock);
1455         return err;
1456 }
1457
1458 /* Find rt6_ex which contains the passed in rt cache and
1459  * refresh its stamp
1460  */
1461 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1462 {
1463         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1464         struct rt6_exception_bucket *bucket;
1465         struct in6_addr *src_key = NULL;
1466         struct rt6_exception *rt6_ex;
1467
1468         if (!from ||
1469             !(rt->rt6i_flags | RTF_CACHE))
1470                 return;
1471
1472         rcu_read_lock();
1473         bucket = rcu_dereference(from->rt6i_exception_bucket);
1474
1475 #ifdef CONFIG_IPV6_SUBTREES
1476         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1477          * and exception table is indexed by a hash of
1478          * both rt6i_dst and rt6i_src.
1479          * Otherwise, the exception table is indexed by
1480          * a hash of only rt6i_dst.
1481          */
1482         if (from->rt6i_src.plen)
1483                 src_key = &rt->rt6i_src.addr;
1484 #endif
1485         rt6_ex = __rt6_find_exception_rcu(&bucket,
1486                                           &rt->rt6i_dst.addr,
1487                                           src_key);
1488         if (rt6_ex)
1489                 rt6_ex->stamp = jiffies;
1490
1491         rcu_read_unlock();
1492 }
1493
/* Clear the saved preferred source address on every exception entry
 * hanging off @rt.
 * Caller must hold rt6_exception_lock (see rcu_dereference_protected).
 */
static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
1512
/* Lower rt6i_pmtu to @mtu on every exception entry of @rt whose
 * current pmtu exceeds it.
 * Caller must hold rt6_exception_lock (see rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from has already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}
1538
1539 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1540
/* Remove every cached entry of @rt that is both a cache and a gateway
 * route (RTF_CACHE_GATEWAY) and whose gateway address equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check: nothing to do when no bucket exists */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1575
/* GC helper: drop @rt6_ex when its route is unreferenced (refcnt == 1)
 * and idle past gc_args->timeout, or when it is a gateway route whose
 * neighbour entry no longer has NTF_ROUTER set.  Surviving entries are
 * counted in gc_args->more.
 * Caller must hold rt6_exception_lock (required by rt6_remove_exception()).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}
1606
/* Run exception garbage collection over all buckets of @rt,
 * examining each entry with rt6_age_examine_exception().
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check: nothing to do when no bucket exists */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
1635
1636 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1637                                int oif, struct flowi6 *fl6, int flags)
1638 {
1639         struct fib6_node *fn, *saved_fn;
1640         struct rt6_info *rt, *rt_cache;
1641         int strict = 0;
1642
1643         strict |= flags & RT6_LOOKUP_F_IFACE;
1644         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1645         if (net->ipv6.devconf_all->forwarding == 0)
1646                 strict |= RT6_LOOKUP_F_REACHABLE;
1647
1648         rcu_read_lock();
1649
1650         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1651         saved_fn = fn;
1652
1653         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1654                 oif = 0;
1655
1656 redo_rt6_select:
1657         rt = rt6_select(net, fn, oif, strict);
1658         if (rt->rt6i_nsiblings)
1659                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1660         if (rt == net->ipv6.ip6_null_entry) {
1661                 fn = fib6_backtrack(fn, &fl6->saddr);
1662                 if (fn)
1663                         goto redo_rt6_select;
1664                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1665                         /* also consider unreachable route */
1666                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1667                         fn = saved_fn;
1668                         goto redo_rt6_select;
1669                 }
1670         }
1671
1672         /*Search through exception table */
1673         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1674         if (rt_cache)
1675                 rt = rt_cache;
1676
1677         if (rt == net->ipv6.ip6_null_entry) {
1678                 rcu_read_unlock();
1679                 dst_hold(&rt->dst);
1680                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1681                 return rt;
1682         } else if (rt->rt6i_flags & RTF_CACHE) {
1683                 if (ip6_hold_safe(net, &rt, true)) {
1684                         dst_use_noref(&rt->dst, jiffies);
1685                         rt6_dst_from_metrics_check(rt);
1686                 }
1687                 rcu_read_unlock();
1688                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1689                 return rt;
1690         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1691                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1692                 /* Create a RTF_CACHE clone which will not be
1693                  * owned by the fib6 tree.  It is for the special case where
1694                  * the daddr in the skb during the neighbor look-up is different
1695                  * from the fl6->daddr used to look-up route here.
1696                  */
1697
1698                 struct rt6_info *uncached_rt;
1699
1700                 if (ip6_hold_safe(net, &rt, true)) {
1701                         dst_use_noref(&rt->dst, jiffies);
1702                 } else {
1703                         rcu_read_unlock();
1704                         uncached_rt = rt;
1705                         goto uncached_rt_out;
1706                 }
1707                 rcu_read_unlock();
1708
1709                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1710                 dst_release(&rt->dst);
1711
1712                 if (uncached_rt) {
1713                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1714                          * No need for another dst_hold()
1715                          */
1716                         rt6_uncached_list_add(uncached_rt);
1717                 } else {
1718                         uncached_rt = net->ipv6.ip6_null_entry;
1719                         dst_hold(&uncached_rt->dst);
1720                 }
1721
1722 uncached_rt_out:
1723                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1724                 return uncached_rt;
1725
1726         } else {
1727                 /* Get a percpu copy */
1728
1729                 struct rt6_info *pcpu_rt;
1730
1731                 dst_use_noref(&rt->dst, jiffies);
1732                 pcpu_rt = rt6_get_pcpu_route(rt);
1733
1734                 if (pcpu_rt) {
1735                         rcu_read_unlock();
1736                 } else {
1737                         /* atomic_inc_not_zero() is needed when using rcu */
1738                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1739                                 /* We have to do the read_unlock first
1740                                  * because rt6_make_pcpu_route() may trigger
1741                                  * ip6_dst_gc() which will take the write_lock.
1742                                  *
1743                                  * No dst_hold() on rt is needed because grabbing
1744                                  * rt->rt6i_ref makes sure rt can't be released.
1745                                  */
1746                                 rcu_read_unlock();
1747                                 pcpu_rt = rt6_make_pcpu_route(rt);
1748                                 rt6_release(rt);
1749                         } else {
1750                                 /* rt is already removed from tree */
1751                                 rcu_read_unlock();
1752                                 pcpu_rt = net->ipv6.ip6_null_entry;
1753                                 dst_hold(&pcpu_rt->dst);
1754                         }
1755                 }
1756
1757                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1758                 return pcpu_rt;
1759         }
1760 }
1761 EXPORT_SYMBOL_GPL(ip6_pol_route);
1762
1763 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1764                                             struct flowi6 *fl6, int flags)
1765 {
1766         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1767 }
1768
1769 struct dst_entry *ip6_route_input_lookup(struct net *net,
1770                                          struct net_device *dev,
1771                                          struct flowi6 *fl6, int flags)
1772 {
1773         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1774                 flags |= RT6_LOOKUP_F_IFACE;
1775
1776         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1777 }
1778 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1779
1780 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1781                                   struct flow_keys *keys)
1782 {
1783         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1784         const struct ipv6hdr *key_iph = outer_iph;
1785         const struct ipv6hdr *inner_iph;
1786         const struct icmp6hdr *icmph;
1787         struct ipv6hdr _inner_iph;
1788
1789         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1790                 goto out;
1791
1792         icmph = icmp6_hdr(skb);
1793         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1794             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1795             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1796             icmph->icmp6_type != ICMPV6_PARAMPROB)
1797                 goto out;
1798
1799         inner_iph = skb_header_pointer(skb,
1800                                        skb_transport_offset(skb) + sizeof(*icmph),
1801                                        sizeof(_inner_iph), &_inner_iph);
1802         if (!inner_iph)
1803                 goto out;
1804
1805         key_iph = inner_iph;
1806 out:
1807         memset(keys, 0, sizeof(*keys));
1808         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1809         keys->addrs.v6addrs.src = key_iph->saddr;
1810         keys->addrs.v6addrs.dst = key_iph->daddr;
1811         keys->tags.flow_label = ip6_flowinfo(key_iph);
1812         keys->basic.ip_proto = key_iph->nexthdr;
1813 }
1814
1815 /* if skb is set it will be used and fl6 can be NULL */
1816 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1817 {
1818         struct flow_keys hash_keys;
1819
1820         if (skb) {
1821                 ip6_multipath_l3_keys(skb, &hash_keys);
1822                 return flow_hash_from_keys(&hash_keys);
1823         }
1824
1825         return get_hash_from_flowi6(fl6);
1826 }
1827
1828 void ip6_route_input(struct sk_buff *skb)
1829 {
1830         const struct ipv6hdr *iph = ipv6_hdr(skb);
1831         struct net *net = dev_net(skb->dev);
1832         int flags = RT6_LOOKUP_F_HAS_SADDR;
1833         struct ip_tunnel_info *tun_info;
1834         struct flowi6 fl6 = {
1835                 .flowi6_iif = skb->dev->ifindex,
1836                 .daddr = iph->daddr,
1837                 .saddr = iph->saddr,
1838                 .flowlabel = ip6_flowinfo(iph),
1839                 .flowi6_mark = skb->mark,
1840                 .flowi6_proto = iph->nexthdr,
1841         };
1842
1843         tun_info = skb_tunnel_info(skb);
1844         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1845                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1846         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1847                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1848         skb_dst_drop(skb);
1849         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1850 }
1851
1852 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1853                                              struct flowi6 *fl6, int flags)
1854 {
1855         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1856 }
1857
1858 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1859                                          struct flowi6 *fl6, int flags)
1860 {
1861         bool any_src;
1862
1863         if (rt6_need_strict(&fl6->daddr)) {
1864                 struct dst_entry *dst;
1865
1866                 dst = l3mdev_link_scope_lookup(net, fl6);
1867                 if (dst)
1868                         return dst;
1869         }
1870
1871         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1872
1873         any_src = ipv6_addr_any(&fl6->saddr);
1874         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1875             (fl6->flowi6_oif && any_src))
1876                 flags |= RT6_LOOKUP_F_IFACE;
1877
1878         if (!any_src)
1879                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1880         else if (sk)
1881                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1882
1883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1884 }
1885 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1886
/* Create a standalone "blackhole" clone of @dst_orig: same addresses,
 * metrics and flags, but both input and output hooks simply discard
 * packets.  Consumes the caller's reference on @dst_orig.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* the clone is never a per-cpu copy even if @ort was */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	/* drop the reference passed in by the caller */
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1919
1920 /*
1921  *      Destination cache support functions
1922  */
1923
1924 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1925 {
1926         if (rt->dst.from &&
1927             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1928                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1929 }
1930
1931 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1932 {
1933         u32 rt_cookie = 0;
1934
1935         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1936                 return NULL;
1937
1938         if (rt6_check_expired(rt))
1939                 return NULL;
1940
1941         return &rt->dst;
1942 }
1943
1944 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1945 {
1946         if (!__rt6_check_expired(rt) &&
1947             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1948             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1949                 return &rt->dst;
1950         else
1951                 return NULL;
1952 }
1953
1954 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1955 {
1956         struct rt6_info *rt;
1957
1958         rt = (struct rt6_info *) dst;
1959
1960         /* All IPV6 dsts are created with ->obsolete set to the value
1961          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1962          * into this function always.
1963          */
1964
1965         rt6_dst_from_metrics_check(rt);
1966
1967         if (rt->rt6i_flags & RTF_PCPU ||
1968             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1969                 return rt6_dst_from_check(rt, cookie);
1970         else
1971                 return rt6_check(rt, cookie);
1972 }
1973
1974 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1975 {
1976         struct rt6_info *rt = (struct rt6_info *) dst;
1977
1978         if (rt) {
1979                 if (rt->rt6i_flags & RTF_CACHE) {
1980                         if (rt6_check_expired(rt)) {
1981                                 ip6_del_rt(rt);
1982                                 dst = NULL;
1983                         }
1984                 } else {
1985                         dst_release(dst);
1986                         dst = NULL;
1987                 }
1988         }
1989         return dst;
1990 }
1991
/* dst_ops link_failure handler: report unreachability to the sender and
 * invalidate the route the packet was using.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Cached clone: delete it, but only if we can still
			 * take a reference (it may already be going away).
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* For default routes, bump the fib node's serial
			 * number so outstanding cookies no longer match
			 * and cached dsts get re-validated.
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2014
2015 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2016 {
2017         struct net *net = dev_net(rt->dst.dev);
2018
2019         rt->rt6i_flags |= RTF_MODIFIED;
2020         rt->rt6i_pmtu = mtu;
2021         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2022 }
2023
2024 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2025 {
2026         return !(rt->rt6i_flags & RTF_CACHE) &&
2027                 (rt->rt6i_flags & RTF_PCPU ||
2028                  rcu_access_pointer(rt->rt6i_node));
2029 }
2030
/* Core PMTU update: record @mtu against @dst, either directly or via an
 * RTF_CACHE exception clone keyed on the flow's addresses.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* no PMTU tracking for routes to the local host */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* an administratively locked MTU must not be overridden */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Pick the addresses to key the exception on: prefer the packet
	 * header, fall back to the connected socket, else none.
	 */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* never go below the IPv6 minimum MTU, and ignore increases */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		/* Clone into an RTF_CACHE exception carrying the reduced
		 * MTU; free the clone if insertion fails.
		 */
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2074
2075 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2076                                struct sk_buff *skb, u32 mtu)
2077 {
2078         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2079 }
2080
2081 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2082                      int oif, u32 mark, kuid_t uid)
2083 {
2084         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2085         struct dst_entry *dst;
2086         struct flowi6 fl6;
2087
2088         memset(&fl6, 0, sizeof(fl6));
2089         fl6.flowi6_oif = oif;
2090         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2091         fl6.daddr = iph->daddr;
2092         fl6.saddr = iph->saddr;
2093         fl6.flowlabel = ip6_flowinfo(iph);
2094         fl6.flowi6_uid = uid;
2095
2096         dst = ip6_route_output(net, NULL, &fl6);
2097         if (!dst->error)
2098                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2099         dst_release(dst);
2100 }
2101 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2102
/* Socket-aware PMTU update: record the new MTU for the socket's flow
 * and refresh the socket's cached dst if it became invalid.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* nothing more to do if the cached dst still validates */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* Re-resolve under the socket lock; skip if the socket is owned
	 * by user context or is a v4-mapped (IPv4) connection.
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2121
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: code casts flowi6 <-> ip6rd_flowi */
	struct in6_addr gateway;	/* router that sent the redirect */
};
2127
2128 static struct rt6_info *__ip6_route_redirect(struct net *net,
2129                                              struct fib6_table *table,
2130                                              struct flowi6 *fl6,
2131                                              int flags)
2132 {
2133         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2134         struct rt6_info *rt, *rt_cache;
2135         struct fib6_node *fn;
2136
2137         /* Get the "current" route for this destination and
2138          * check if the redirect has come from appropriate router.
2139          *
2140          * RFC 4861 specifies that redirects should only be
2141          * accepted if they come from the nexthop to the target.
2142          * Due to the way the routes are chosen, this notion
2143          * is a bit fuzzy and one might need to check all possible
2144          * routes.
2145          */
2146
2147         rcu_read_lock();
2148         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2149 restart:
2150         for_each_fib6_node_rt_rcu(fn) {
2151                 if (rt6_check_expired(rt))
2152                         continue;
2153                 if (rt->dst.error)
2154                         break;
2155                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2156                         continue;
2157                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2158                         continue;
2159                 /* rt_cache's gateway might be different from its 'parent'
2160                  * in the case of an ip redirect.
2161                  * So we keep searching in the exception table if the gateway
2162                  * is different.
2163                  */
2164                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2165                         rt_cache = rt6_find_cached_rt(rt,
2166                                                       &fl6->daddr,
2167                                                       &fl6->saddr);
2168                         if (rt_cache &&
2169                             ipv6_addr_equal(&rdfl->gateway,
2170                                             &rt_cache->rt6i_gateway)) {
2171                                 rt = rt_cache;
2172                                 break;
2173                         }
2174                         continue;
2175                 }
2176                 break;
2177         }
2178
2179         if (!rt)
2180                 rt = net->ipv6.ip6_null_entry;
2181         else if (rt->dst.error) {
2182                 rt = net->ipv6.ip6_null_entry;
2183                 goto out;
2184         }
2185
2186         if (rt == net->ipv6.ip6_null_entry) {
2187                 fn = fib6_backtrack(fn, &fl6->saddr);
2188                 if (fn)
2189                         goto restart;
2190         }
2191
2192 out:
2193         ip6_hold_safe(net, &rt, true);
2194
2195         rcu_read_unlock();
2196
2197         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
2198         return rt;
2199 };
2200
2201 static struct dst_entry *ip6_route_redirect(struct net *net,
2202                                         const struct flowi6 *fl6,
2203                                         const struct in6_addr *gateway)
2204 {
2205         int flags = RT6_LOOKUP_F_HAS_SADDR;
2206         struct ip6rd_flowi rdfl;
2207
2208         rdfl.fl6 = *fl6;
2209         rdfl.gateway = *gateway;
2210
2211         return fib6_rule_lookup(net, &rdfl.fl6,
2212                                 flags, __ip6_route_redirect);
2213 }
2214
2215 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2216                   kuid_t uid)
2217 {
2218         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2219         struct dst_entry *dst;
2220         struct flowi6 fl6;
2221
2222         memset(&fl6, 0, sizeof(fl6));
2223         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2224         fl6.flowi6_oif = oif;
2225         fl6.flowi6_mark = mark;
2226         fl6.daddr = iph->daddr;
2227         fl6.saddr = iph->saddr;
2228         fl6.flowlabel = ip6_flowinfo(iph);
2229         fl6.flowi6_uid = uid;
2230
2231         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2232         rt6_do_redirect(dst, NULL, skb);
2233         dst_release(dst);
2234 }
2235 EXPORT_SYMBOL_GPL(ip6_redirect);
2236
2237 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2238                             u32 mark)
2239 {
2240         const struct ipv6hdr *iph = ipv6_hdr(skb);
2241         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2242         struct dst_entry *dst;
2243         struct flowi6 fl6;
2244
2245         memset(&fl6, 0, sizeof(fl6));
2246         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2247         fl6.flowi6_oif = oif;
2248         fl6.flowi6_mark = mark;
2249         fl6.daddr = msg->dest;
2250         fl6.saddr = iph->daddr;
2251         fl6.flowi6_uid = sock_net_uid(net, NULL);
2252
2253         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2254         rt6_do_redirect(dst, NULL, skb);
2255         dst_release(dst);
2256 }
2257
2258 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2259 {
2260         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2261                      sk->sk_uid);
2262 }
2263 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2264
2265 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2266 {
2267         struct net_device *dev = dst->dev;
2268         unsigned int mtu = dst_mtu(dst);
2269         struct net *net = dev_net(dev);
2270
2271         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2272
2273         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2274                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2275
2276         /*
2277          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2278          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2279          * IPV6_MAXPLEN is also valid and means: "any MSS,
2280          * rely only on pmtu discovery"
2281          */
2282         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2283                 mtu = IPV6_MAXPLEN;
2284         return mtu;
2285 }
2286
2287 static unsigned int ip6_mtu(const struct dst_entry *dst)
2288 {
2289         const struct rt6_info *rt = (const struct rt6_info *)dst;
2290         unsigned int mtu = rt->rt6i_pmtu;
2291         struct inet6_dev *idev;
2292
2293         if (mtu)
2294                 goto out;
2295
2296         mtu = dst_metric_raw(dst, RTAX_MTU);
2297         if (mtu)
2298                 goto out;
2299
2300         mtu = IPV6_MIN_MTU;
2301
2302         rcu_read_lock();
2303         idev = __in6_dev_get(dst->dev);
2304         if (idev)
2305                 mtu = idev->cnf.mtu6;
2306         rcu_read_unlock();
2307
2308 out:
2309         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2310
2311         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2312 }
2313
/* Allocate a host dst for an outgoing ICMPv6 packet.  The route is
 * never inserted into the fib tree; it lives on the uncached list
 * instead.  Returns the (possibly xfrm-transformed) dst or ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* rt now owns the idev reference */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2350
/* dst_ops gc callback: run fib6 garbage collection when the dst count
 * exceeds the configured limit or enough time has passed.  Returns
 * non-zero while still over ip6_rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	/* skip the scan if we ran recently and are within the limit */
	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* ip6_rt_gc_expire grows on each forced run, making successive
	 * passes more aggressive; it is reduced again once the entry
	 * count drops below gc_thresh.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the gc pressure, tuned by elasticity */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2375
2376 static int ip6_convert_metrics(struct mx6_config *mxc,
2377                                const struct fib6_config *cfg)
2378 {
2379         bool ecn_ca = false;
2380         struct nlattr *nla;
2381         int remaining;
2382         u32 *mp;
2383
2384         if (!cfg->fc_mx)
2385                 return 0;
2386
2387         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2388         if (unlikely(!mp))
2389                 return -ENOMEM;
2390
2391         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2392                 int type = nla_type(nla);
2393                 u32 val;
2394
2395                 if (!type)
2396                         continue;
2397                 if (unlikely(type > RTAX_MAX))
2398                         goto err;
2399
2400                 if (type == RTAX_CC_ALGO) {
2401                         char tmp[TCP_CA_NAME_MAX];
2402
2403                         nla_strlcpy(tmp, nla, sizeof(tmp));
2404                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2405                         if (val == TCP_CA_UNSPEC)
2406                                 goto err;
2407                 } else {
2408                         val = nla_get_u32(nla);
2409                 }
2410                 if (type == RTAX_HOPLIMIT && val > 255)
2411                         val = 255;
2412                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2413                         goto err;
2414
2415                 mp[type - 1] = val;
2416                 __set_bit(type - 1, mxc->mx_valid);
2417         }
2418
2419         if (ecn_ca) {
2420                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2421                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2422         }
2423
2424         mxc->mx = mp;
2425         return 0;
2426  err:
2427         kfree(mp);
2428         return -EINVAL;
2429 }
2430
2431 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2432                                             struct fib6_config *cfg,
2433                                             const struct in6_addr *gw_addr)
2434 {
2435         struct flowi6 fl6 = {
2436                 .flowi6_oif = cfg->fc_ifindex,
2437                 .daddr = *gw_addr,
2438                 .saddr = cfg->fc_prefsrc,
2439         };
2440         struct fib6_table *table;
2441         struct rt6_info *rt;
2442         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2443
2444         table = fib6_get_table(net, cfg->fc_table);
2445         if (!table)
2446                 return NULL;
2447
2448         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2449                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2450
2451         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2452
2453         /* if table lookup failed, fall back to full lookup */
2454         if (rt == net->ipv6.ip6_null_entry) {
2455                 ip6_rt_put(rt);
2456                 rt = NULL;
2457         }
2458
2459         return rt;
2460 }
2461
2462 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2463                                               struct netlink_ext_ack *extack)
2464 {
2465         struct net *net = cfg->fc_nlinfo.nl_net;
2466         struct rt6_info *rt = NULL;
2467         struct net_device *dev = NULL;
2468         struct inet6_dev *idev = NULL;
2469         struct fib6_table *table;
2470         int addr_type;
2471         int err = -EINVAL;
2472
2473         /* RTF_PCPU is an internal flag; can not be set by userspace */
2474         if (cfg->fc_flags & RTF_PCPU) {
2475                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2476                 goto out;
2477         }
2478
2479         if (cfg->fc_dst_len > 128) {
2480                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2481                 goto out;
2482         }
2483         if (cfg->fc_src_len > 128) {
2484                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2485                 goto out;
2486         }
2487 #ifndef CONFIG_IPV6_SUBTREES
2488         if (cfg->fc_src_len) {
2489                 NL_SET_ERR_MSG(extack,
2490                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2491                 goto out;
2492         }
2493 #endif
2494         if (cfg->fc_ifindex) {
2495                 err = -ENODEV;
2496                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2497                 if (!dev)
2498                         goto out;
2499                 idev = in6_dev_get(dev);
2500                 if (!idev)
2501                         goto out;
2502         }
2503
2504         if (cfg->fc_metric == 0)
2505                 cfg->fc_metric = IP6_RT_PRIO_USER;
2506
2507         err = -ENOBUFS;
2508         if (cfg->fc_nlinfo.nlh &&
2509             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2510                 table = fib6_get_table(net, cfg->fc_table);
2511                 if (!table) {
2512                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2513                         table = fib6_new_table(net, cfg->fc_table);
2514                 }
2515         } else {
2516                 table = fib6_new_table(net, cfg->fc_table);
2517         }
2518
2519         if (!table)
2520                 goto out;
2521
2522         rt = ip6_dst_alloc(net, NULL,
2523                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2524
2525         if (!rt) {
2526                 err = -ENOMEM;
2527                 goto out;
2528         }
2529
2530         if (cfg->fc_flags & RTF_EXPIRES)
2531                 rt6_set_expires(rt, jiffies +
2532                                 clock_t_to_jiffies(cfg->fc_expires));
2533         else
2534                 rt6_clean_expires(rt);
2535
2536         if (cfg->fc_protocol == RTPROT_UNSPEC)
2537                 cfg->fc_protocol = RTPROT_BOOT;
2538         rt->rt6i_protocol = cfg->fc_protocol;
2539
2540         addr_type = ipv6_addr_type(&cfg->fc_dst);
2541
2542         if (addr_type & IPV6_ADDR_MULTICAST)
2543                 rt->dst.input = ip6_mc_input;
2544         else if (cfg->fc_flags & RTF_LOCAL)
2545                 rt->dst.input = ip6_input;
2546         else
2547                 rt->dst.input = ip6_forward;
2548
2549         rt->dst.output = ip6_output;
2550
2551         if (cfg->fc_encap) {
2552                 struct lwtunnel_state *lwtstate;
2553
2554                 err = lwtunnel_build_state(cfg->fc_encap_type,
2555                                            cfg->fc_encap, AF_INET6, cfg,
2556                                            &lwtstate, extack);
2557                 if (err)
2558                         goto out;
2559                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2560                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2561                         rt->dst.lwtstate->orig_output = rt->dst.output;
2562                         rt->dst.output = lwtunnel_output;
2563                 }
2564                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2565                         rt->dst.lwtstate->orig_input = rt->dst.input;
2566                         rt->dst.input = lwtunnel_input;
2567                 }
2568         }
2569
2570         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2571         rt->rt6i_dst.plen = cfg->fc_dst_len;
2572         if (rt->rt6i_dst.plen == 128)
2573                 rt->dst.flags |= DST_HOST;
2574
2575 #ifdef CONFIG_IPV6_SUBTREES
2576         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2577         rt->rt6i_src.plen = cfg->fc_src_len;
2578 #endif
2579
2580         rt->rt6i_metric = cfg->fc_metric;
2581
2582         /* We cannot add true routes via loopback here,
2583            they would result in kernel looping; promote them to reject routes
2584          */
2585         if ((cfg->fc_flags & RTF_REJECT) ||
2586             (dev && (dev->flags & IFF_LOOPBACK) &&
2587              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2588              !(cfg->fc_flags & RTF_LOCAL))) {
2589                 /* hold loopback dev/idev if we haven't done so. */
2590                 if (dev != net->loopback_dev) {
2591                         if (dev) {
2592                                 dev_put(dev);
2593                                 in6_dev_put(idev);
2594                         }
2595                         dev = net->loopback_dev;
2596                         dev_hold(dev);
2597                         idev = in6_dev_get(dev);
2598                         if (!idev) {
2599                                 err = -ENODEV;
2600                                 goto out;
2601                         }
2602                 }
2603                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2604                 switch (cfg->fc_type) {
2605                 case RTN_BLACKHOLE:
2606                         rt->dst.error = -EINVAL;
2607                         rt->dst.output = dst_discard_out;
2608                         rt->dst.input = dst_discard;
2609                         break;
2610                 case RTN_PROHIBIT:
2611                         rt->dst.error = -EACCES;
2612                         rt->dst.output = ip6_pkt_prohibit_out;
2613                         rt->dst.input = ip6_pkt_prohibit;
2614                         break;
2615                 case RTN_THROW:
2616                 case RTN_UNREACHABLE:
2617                 default:
2618                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2619                                         : (cfg->fc_type == RTN_UNREACHABLE)
2620                                         ? -EHOSTUNREACH : -ENETUNREACH;
2621                         rt->dst.output = ip6_pkt_discard_out;
2622                         rt->dst.input = ip6_pkt_discard;
2623                         break;
2624                 }
2625                 goto install_route;
2626         }
2627
2628         if (cfg->fc_flags & RTF_GATEWAY) {
2629                 const struct in6_addr *gw_addr;
2630                 int gwa_type;
2631
2632                 gw_addr = &cfg->fc_gateway;
2633                 gwa_type = ipv6_addr_type(gw_addr);
2634
2635                 /* if gw_addr is local we will fail to detect this in case
2636                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2637                  * will return already-added prefix route via interface that
2638                  * prefix route was assigned to, which might be non-loopback.
2639                  */
2640                 err = -EINVAL;
2641                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2642                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2643                                             dev : NULL, 0, 0)) {
2644                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2645                         goto out;
2646                 }
2647                 rt->rt6i_gateway = *gw_addr;
2648
2649                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2650                         struct rt6_info *grt = NULL;
2651
2652                         /* IPv6 strictly inhibits using not link-local
2653                            addresses as nexthop address.
2654                            Otherwise, router will not able to send redirects.
2655                            It is very good, but in some (rare!) circumstances
2656                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2657                            some exceptions. --ANK
2658                            We allow IPv4-mapped nexthops to support RFC4798-type
2659                            addressing
2660                          */
2661                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2662                                           IPV6_ADDR_MAPPED))) {
2663                                 NL_SET_ERR_MSG(extack,
2664                                                "Invalid gateway address");
2665                                 goto out;
2666                         }
2667
2668                         if (cfg->fc_table) {
2669                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2670
2671                                 if (grt) {
2672                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2673                                             (dev && dev != grt->dst.dev)) {
2674                                                 ip6_rt_put(grt);
2675                                                 grt = NULL;
2676                                         }
2677                                 }
2678                         }
2679
2680                         if (!grt)
2681                                 grt = rt6_lookup(net, gw_addr, NULL,
2682                                                  cfg->fc_ifindex, 1);
2683
2684                         err = -EHOSTUNREACH;
2685                         if (!grt)
2686                                 goto out;
2687                         if (dev) {
2688                                 if (dev != grt->dst.dev) {
2689                                         ip6_rt_put(grt);
2690                                         goto out;
2691                                 }
2692                         } else {
2693                                 dev = grt->dst.dev;
2694                                 idev = grt->rt6i_idev;
2695                                 dev_hold(dev);
2696                                 in6_dev_hold(grt->rt6i_idev);
2697                         }
2698                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2699                                 err = 0;
2700                         ip6_rt_put(grt);
2701
2702                         if (err)
2703                                 goto out;
2704                 }
2705                 err = -EINVAL;
2706                 if (!dev) {
2707                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2708                         goto out;
2709                 } else if (dev->flags & IFF_LOOPBACK) {
2710                         NL_SET_ERR_MSG(extack,
2711                                        "Egress device can not be loopback device for this route");
2712                         goto out;
2713                 }
2714         }
2715
2716         err = -ENODEV;
2717         if (!dev)
2718                 goto out;
2719
2720         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2721                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2722                         NL_SET_ERR_MSG(extack, "Invalid source address");
2723                         err = -EINVAL;
2724                         goto out;
2725                 }
2726                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2727                 rt->rt6i_prefsrc.plen = 128;
2728         } else
2729                 rt->rt6i_prefsrc.plen = 0;
2730
2731         rt->rt6i_flags = cfg->fc_flags;
2732
2733 install_route:
2734         rt->dst.dev = dev;
2735         rt->rt6i_idev = idev;
2736         rt->rt6i_table = table;
2737
2738         cfg->fc_nlinfo.nl_net = dev_net(dev);
2739
2740         return rt;
2741 out:
2742         if (dev)
2743                 dev_put(dev);
2744         if (idev)
2745                 in6_dev_put(idev);
2746         if (rt)
2747                 dst_release_immediate(&rt->dst);
2748
2749         return ERR_PTR(err);
2750 }
2751
2752 int ip6_route_add(struct fib6_config *cfg,
2753                   struct netlink_ext_ack *extack)
2754 {
2755         struct mx6_config mxc = { .mx = NULL, };
2756         struct rt6_info *rt;
2757         int err;
2758
2759         rt = ip6_route_info_create(cfg, extack);
2760         if (IS_ERR(rt)) {
2761                 err = PTR_ERR(rt);
2762                 rt = NULL;
2763                 goto out;
2764         }
2765
2766         err = ip6_convert_metrics(&mxc, cfg);
2767         if (err)
2768                 goto out;
2769
2770         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2771
2772         kfree(mxc.mx);
2773
2774         return err;
2775 out:
2776         if (rt)
2777                 dst_release_immediate(&rt->dst);
2778
2779         return err;
2780 }
2781
2782 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2783 {
2784         int err;
2785         struct fib6_table *table;
2786         struct net *net = dev_net(rt->dst.dev);
2787
2788         if (rt == net->ipv6.ip6_null_entry) {
2789                 err = -ENOENT;
2790                 goto out;
2791         }
2792
2793         table = rt->rt6i_table;
2794         spin_lock_bh(&table->tb6_lock);
2795         err = fib6_del(rt, info);
2796         spin_unlock_bh(&table->tb6_lock);
2797
2798 out:
2799         ip6_rt_put(rt);
2800         return err;
2801 }
2802
2803 int ip6_del_rt(struct rt6_info *rt)
2804 {
2805         struct nl_info info = {
2806                 .nl_net = dev_net(rt->dst.dev),
2807         };
2808         return __ip6_del_rt(rt, &info);
2809 }
2810
/* Delete @rt and, when cfg->fc_delete_all_nh is set, every sibling nexthop
 * of the multipath route, all under a single tb6_lock hold.  Tries to emit
 * one RTM_DELROUTE notification describing all hops; if that skb cannot be
 * built, normal per-route notifications happen via fib6_del().
 * Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* the null entry is never deleted */
	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress per-route notifications below */
				info->skip_notify = 1;
		}

		/* _safe variant: fib6_del() unlinks each sibling as we go */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2862
/* Handle a route-delete request: locate the matching route (or, with
 * RTF_CACHE, the matching cached exception route) under RCU and delete
 * the first entry that passes all of @cfg's selectors.
 * Returns -ESRCH when no route matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* the final argument flips lookup behavior for cached routes —
	 * see fib6_locate() for its exact meaning
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* target is an exception route hanging off rt */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			/* filter on device, gateway, metric, protocol */
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* take a reference before leaving the RCU section;
			 * failure means the route is already being freed
			 */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
2919
/* Process a received ICMPv6 Redirect: validate the message against the
 * current path (@dst), update the neighbour cache for the new first hop,
 * and install a RTF_CACHE exception route towards the redirected
 * destination.  See the RFC 2461 8.1 note inline.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* ND option bytes follow the fixed rd_msg header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link;
	 * otherwise the new first hop must be a link-local unicast address
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	/* ignore redirects when forwarding or when the interface has
	 * accept_redirects disabled
	 */
	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* optional target link-layer address option */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* final argument 1: create the neighbour entry if absent */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* clone the current route into a cache entry towards msg->dest */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3037
3038 /*
3039  *      Misc support functions
3040  */
3041
/* Record @from as the origin of @rt: @rt pins @from's dst and shares its
 * metrics.  @from must not itself be derived from another route
 * (from->dst.from must be NULL, enforced by the BUG_ON).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	/* the derived route no longer carries its own expiry flag */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3051
/* Initialize @rt as a copy of @ort, taking references on the shared
 * idev, the origin dst (via rt6_set_from()) and the lwtunnel state.
 * Ordering note: rt6i_flags is copied *before* rt6_set_from(), which
 * then clears RTF_EXPIRES in the copy.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3073
3074 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find a router-advertised route-information (RTF_ROUTEINFO) route for
 * @prefix/@prefixlen that goes through gateway @gwaddr on @dev.
 * Returns the route with a reference taken, or NULL if not found.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* l3mdev (VRF) devices supply their own table id */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* grab a reference while still inside the RCU section */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3109
3110 static struct rt6_info *rt6_add_route_info(struct net *net,
3111                                            const struct in6_addr *prefix, int prefixlen,
3112                                            const struct in6_addr *gwaddr,
3113                                            struct net_device *dev,
3114                                            unsigned int pref)
3115 {
3116         struct fib6_config cfg = {
3117                 .fc_metric      = IP6_RT_PRIO_USER,
3118                 .fc_ifindex     = dev->ifindex,
3119                 .fc_dst_len     = prefixlen,
3120                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3121                                   RTF_UP | RTF_PREF(pref),
3122                 .fc_protocol = RTPROT_RA,
3123                 .fc_nlinfo.portid = 0,
3124                 .fc_nlinfo.nlh = NULL,
3125                 .fc_nlinfo.nl_net = net,
3126         };
3127
3128         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3129         cfg.fc_dst = *prefix;
3130         cfg.fc_gateway = *gwaddr;
3131
3132         /* We should treat it as a default route if prefix length is 0. */
3133         if (!prefixlen)
3134                 cfg.fc_flags |= RTF_DEFAULT;
3135
3136         ip6_route_add(&cfg, NULL);
3137
3138         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3139 }
3140 #endif
3141
/* Find the RA-installed (RTF_ADDRCONF|RTF_DEFAULT) default route through
 * gateway @addr on @dev.  Returns it with a reference held (via
 * ip6_hold_safe()), or NULL if no such route exists.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* rt ends up NULL when the walk completes without a match
	 * (see for_each_fib6_node_rt_rcu())
	 */
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3164
3165 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3166                                      struct net_device *dev,
3167                                      unsigned int pref)
3168 {
3169         struct fib6_config cfg = {
3170                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3171                 .fc_metric      = IP6_RT_PRIO_USER,
3172                 .fc_ifindex     = dev->ifindex,
3173                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3174                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3175                 .fc_protocol = RTPROT_RA,
3176                 .fc_nlinfo.portid = 0,
3177                 .fc_nlinfo.nlh = NULL,
3178                 .fc_nlinfo.nl_net = dev_net(dev),
3179         };
3180
3181         cfg.fc_gateway = *gwaddr;
3182
3183         if (!ip6_route_add(&cfg, NULL)) {
3184                 struct fib6_table *table;
3185
3186                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3187                 if (table)
3188                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3189         }
3190
3191         return rt6_get_dflt_router(gwaddr, dev);
3192 }
3193
/* Remove every RA-learned (RTF_DEFAULT/RTF_ADDRCONF) route from @table,
 * except on interfaces whose accept_ra is 2 (NOTE(review): presumably
 * "accept RAs even when forwarding" — confirm against addrconf).
 * The scan restarts from the top after every deletion because
 * ip6_del_rt() must run outside the RCU read-side section.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* dst_hold_safe() fails only when the route is
			 * already being released; skip deleting it then
			 */
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3216
3217 void rt6_purge_dflt_routers(struct net *net)
3218 {
3219         struct fib6_table *table;
3220         struct hlist_head *head;
3221         unsigned int h;
3222
3223         rcu_read_lock();
3224
3225         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3226                 head = &net->ipv6.fib_table_hash[h];
3227                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3228                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3229                                 __rt6_purge_dflt_routers(table);
3230                 }
3231         }
3232
3233         rcu_read_unlock();
3234 }
3235
3236 static void rtmsg_to_fib6_config(struct net *net,
3237                                  struct in6_rtmsg *rtmsg,
3238                                  struct fib6_config *cfg)
3239 {
3240         memset(cfg, 0, sizeof(*cfg));
3241
3242         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3243                          : RT6_TABLE_MAIN;
3244         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3245         cfg->fc_metric = rtmsg->rtmsg_metric;
3246         cfg->fc_expires = rtmsg->rtmsg_info;
3247         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3248         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3249         cfg->fc_flags = rtmsg->rtmsg_flags;
3250
3251         cfg->fc_nlinfo.nl_net = net;
3252
3253         cfg->fc_dst = rtmsg->rtmsg_dst;
3254         cfg->fc_src = rtmsg->rtmsg_src;
3255         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3256 }
3257
3258 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3259 {
3260         struct fib6_config cfg;
3261         struct in6_rtmsg rtmsg;
3262         int err;
3263
3264         switch (cmd) {
3265         case SIOCADDRT:         /* Add a route */
3266         case SIOCDELRT:         /* Delete a route */
3267                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3268                         return -EPERM;
3269                 err = copy_from_user(&rtmsg, arg,
3270                                      sizeof(struct in6_rtmsg));
3271                 if (err)
3272                         return -EFAULT;
3273
3274                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3275
3276                 rtnl_lock();
3277                 switch (cmd) {
3278                 case SIOCADDRT:
3279                         err = ip6_route_add(&cfg, NULL);
3280                         break;
3281                 case SIOCDELRT:
3282                         err = ip6_route_del(&cfg, NULL);
3283                         break;
3284                 default:
3285                         err = -EINVAL;
3286                 }
3287                 rtnl_unlock();
3288
3289                 return err;
3290         }
3291
3292         return -EINVAL;
3293 }
3294
3295 /*
3296  *      Drop the packet on the floor
3297  */
3298
3299 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3300 {
3301         int type;
3302         struct dst_entry *dst = skb_dst(skb);
3303         switch (ipstats_mib_noroutes) {
3304         case IPSTATS_MIB_INNOROUTES:
3305                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3306                 if (type == IPV6_ADDR_ANY) {
3307                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3308                                       IPSTATS_MIB_INADDRERRORS);
3309                         break;
3310                 }
3311                 /* FALLTHROUGH */
3312         case IPSTATS_MIB_OUTNOROUTES:
3313                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3314                               ipstats_mib_noroutes);
3315                 break;
3316         }
3317         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3318         kfree_skb(skb);
3319         return 0;
3320 }
3321
/* Input hook for unreachable/throw routes: drop with "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3326
/* Output hook for unreachable/throw routes: drop with "no route". */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3332
/* Input hook for RTN_PROHIBIT routes: drop with "administratively prohibited". */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3337
/* Output hook for RTN_PROHIBIT routes: drop with "administratively prohibited". */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3343
3344 /*
3345  *      Allocate a dst for local (unicast / anycast) address.
3346  */
3347
/* Build an rt6_info for a local unicast or anycast address on @idev.
 * Returns the new route or ERR_PTR(-ENOMEM).  The route is created
 * DST_NOCOUNT and marked RTF_ANYCAST or RTF_LOCAL depending on @anycast;
 * it is placed in the device's L3 master FIB table if one exists,
 * otherwise in the local table.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);	/* reference held via rt->rt6i_idev below */

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* host route: gateway and destination are the address itself */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3383
3384 /* remove deleted ip from prefsrc entries */
/* argument bundle passed to fib6_remove_prefsrc() via fib6_clean_all() */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device the address lived on; NULL matches any */
	struct net *net;
	struct in6_addr *addr;	/* the deleted preferred-source address */
};
3390
3391 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3392 {
3393         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3394         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3395         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3396
3397         if (((void *)rt->dst.dev == dev || !dev) &&
3398             rt != net->ipv6.ip6_null_entry &&
3399             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3400                 spin_lock_bh(&rt6_exception_lock);
3401                 /* remove prefsrc entry */
3402                 rt->rt6i_prefsrc.plen = 0;
3403                 /* need to update cache as well */
3404                 rt6_exceptions_remove_prefsrc(rt);
3405                 spin_unlock_bh(&rt6_exception_lock);
3406         }
3407         return 0;
3408 }
3409
3410 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3411 {
3412         struct net *net = dev_net(ifp->idev->dev);
3413         struct arg_dev_net_ip adni = {
3414                 .dev = ifp->idev->dev,
3415                 .net = net,
3416                 .addr = &ifp->addr,
3417         };
3418         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3419 }
3420
3421 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3422
3423 /* Remove routers and update dst entries when gateway turn into host. */
3424 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3425 {
3426         struct in6_addr *gateway = (struct in6_addr *)arg;
3427
3428         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3429             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3430                 return -1;
3431         }
3432
3433         /* Further clean up cached routes in exception table.
3434          * This is needed because cached route may have a different
3435          * gateway than its 'parent' in the case of an ip redirect.
3436          */
3437         rt6_exceptions_clean_tohost(rt, gateway);
3438
3439         return 0;
3440 }
3441
/* Walk all FIB tables and drop/clean routes using @gateway after it has
 * turned from a router into a host (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3446
/* argument bundle passed to fib6_ifdown() via fib6_clean_all() */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any device */
	struct net *net;
};
3451
/* called with the table lock held for the table containing rt */
3453 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3454 {
3455         const struct arg_dev_net *adn = arg;
3456         const struct net_device *dev = adn->dev;
3457
3458         if ((rt->dst.dev == dev || !dev) &&
3459             rt != adn->net->ipv6.ip6_null_entry &&
3460             (rt->rt6i_nsiblings == 0 ||
3461              (dev && netdev_unregistering(dev)) ||
3462              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3463                 return -1;
3464
3465         return 0;
3466 }
3467
3468 void rt6_ifdown(struct net *net, struct net_device *dev)
3469 {
3470         struct arg_dev_net adn = {
3471                 .dev = dev,
3472                 .net = net,
3473         };
3474
3475         fib6_clean_all(net, fib6_ifdown, &adn);
3476         if (dev)
3477                 rt6_uncached_list_flush_dev(net, dev);
3478 }
3479
/* argument bundle passed to rt6_mtu_change_route() via fib6_clean_all() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new device MTU */
};
3484
/* fib6_clean_all() callback: propagate a device MTU change (arg->mtu on
 * arg->dev) into matching routes' RTAX_MTU metric and into their cached
 * exception routes.  Always returns 0 so the walk continues.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		/* rt6_exception_lock serializes metric/exception updates */
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3528
3529 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3530 {
3531         struct rt6_mtu_change_arg arg = {
3532                 .dev = dev,
3533                 .mtu = mtu,
3534         };
3535
3536         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3537 }
3538
/* Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE requests,
 * used by rtm_to_fib6_config() when parsing the message.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
3553
3554 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3555                               struct fib6_config *cfg,
3556                               struct netlink_ext_ack *extack)
3557 {
3558         struct rtmsg *rtm;
3559         struct nlattr *tb[RTA_MAX+1];
3560         unsigned int pref;
3561         int err;
3562
3563         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3564                           NULL);
3565         if (err < 0)
3566                 goto errout;
3567
3568         err = -EINVAL;
3569         rtm = nlmsg_data(nlh);
3570         memset(cfg, 0, sizeof(*cfg));
3571
3572         cfg->fc_table = rtm->rtm_table;
3573         cfg->fc_dst_len = rtm->rtm_dst_len;
3574         cfg->fc_src_len = rtm->rtm_src_len;
3575         cfg->fc_flags = RTF_UP;
3576         cfg->fc_protocol = rtm->rtm_protocol;
3577         cfg->fc_type = rtm->rtm_type;
3578
3579         if (rtm->rtm_type == RTN_UNREACHABLE ||
3580             rtm->rtm_type == RTN_BLACKHOLE ||
3581             rtm->rtm_type == RTN_PROHIBIT ||
3582             rtm->rtm_type == RTN_THROW)
3583                 cfg->fc_flags |= RTF_REJECT;
3584
3585         if (rtm->rtm_type == RTN_LOCAL)
3586                 cfg->fc_flags |= RTF_LOCAL;
3587
3588         if (rtm->rtm_flags & RTM_F_CLONED)
3589                 cfg->fc_flags |= RTF_CACHE;
3590
3591         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3592         cfg->fc_nlinfo.nlh = nlh;
3593         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3594
3595         if (tb[RTA_GATEWAY]) {
3596                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3597                 cfg->fc_flags |= RTF_GATEWAY;
3598         }
3599
3600         if (tb[RTA_DST]) {
3601                 int plen = (rtm->rtm_dst_len + 7) >> 3;
3602
3603                 if (nla_len(tb[RTA_DST]) < plen)
3604                         goto errout;
3605
3606                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3607         }
3608
3609         if (tb[RTA_SRC]) {
3610                 int plen = (rtm->rtm_src_len + 7) >> 3;
3611
3612                 if (nla_len(tb[RTA_SRC]) < plen)
3613                         goto errout;
3614
3615                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3616         }
3617
3618         if (tb[RTA_PREFSRC])
3619                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3620
3621         if (tb[RTA_OIF])
3622                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3623
3624         if (tb[RTA_PRIORITY])
3625                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3626
3627         if (tb[RTA_METRICS]) {
3628                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3629                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3630         }
3631
3632         if (tb[RTA_TABLE])
3633                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3634
3635         if (tb[RTA_MULTIPATH]) {
3636                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3637                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3638
3639                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3640                                                      cfg->fc_mp_len, extack);
3641                 if (err < 0)
3642                         goto errout;
3643         }
3644
3645         if (tb[RTA_PREF]) {
3646                 pref = nla_get_u8(tb[RTA_PREF]);
3647                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3648                     pref != ICMPV6_ROUTER_PREF_HIGH)
3649                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3650                 cfg->fc_flags |= RTF_PREF(pref);
3651         }
3652
3653         if (tb[RTA_ENCAP])
3654                 cfg->fc_encap = tb[RTA_ENCAP];
3655
3656         if (tb[RTA_ENCAP_TYPE]) {
3657                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3658
3659                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3660                 if (err < 0)
3661                         goto errout;
3662         }
3663
3664         if (tb[RTA_EXPIRES]) {
3665                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3666
3667                 if (addrconf_finite_timeout(timeout)) {
3668                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3669                         cfg->fc_flags |= RTF_EXPIRES;
3670                 }
3671         }
3672
3673         err = 0;
3674 errout:
3675         return err;
3676 }
3677
/* Per-nexthop bookkeeping while building and inserting a multipath route
 * (see ip6_route_multipath_add()).
 */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route for this nexthop; NULL once inserted/freed */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct mx6_config mxc;		/* converted metrics (mxc.mx is kfree'd) */
	struct list_head next;		/* linkage on the local rt6_nh_list */
};
3684
3685 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3686 {
3687         struct rt6_nh *nh;
3688
3689         list_for_each_entry(nh, rt6_nh_list, next) {
3690                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3691                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3692                         nh->r_cfg.fc_ifindex);
3693         }
3694 }
3695
3696 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3697                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3698 {
3699         struct rt6_nh *nh;
3700         int err = -EEXIST;
3701
3702         list_for_each_entry(nh, rt6_nh_list, next) {
3703                 /* check if rt6_info already exists */
3704                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3705                         return err;
3706         }
3707
3708         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3709         if (!nh)
3710                 return -ENOMEM;
3711         nh->rt6_info = rt;
3712         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3713         if (err) {
3714                 kfree(nh);
3715                 return err;
3716         }
3717         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3718         list_add_tail(&nh->next, rt6_nh_list);
3719
3720         return 0;
3721 }
3722
/* Send the single RTM_NEWROUTE notification for a multipath add/replace.
 * @rt is the first route inserted, @rt_last the last; does nothing when
 * @rt ends up NULL (nothing was inserted).
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
3743
/* Add (or replace/append) a multipath route described by cfg->fc_mp.
 * Two phases: first parse every rtnexthop into a local rt6_nh_list, then
 * insert them one by one, sending a single combined notification at the
 * end.  On a mid-insert failure, routes already added are deleted again
 * so user space sees coherent add/delete notifications.
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* list did not take ownership; free the route */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3874
3875 static int ip6_route_multipath_del(struct fib6_config *cfg,
3876                                    struct netlink_ext_ack *extack)
3877 {
3878         struct fib6_config r_cfg;
3879         struct rtnexthop *rtnh;
3880         int remaining;
3881         int attrlen;
3882         int err = 1, last_err = 0;
3883
3884         remaining = cfg->fc_mp_len;
3885         rtnh = (struct rtnexthop *)cfg->fc_mp;
3886
3887         /* Parse a Multipath Entry */
3888         while (rtnh_ok(rtnh, remaining)) {
3889                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3890                 if (rtnh->rtnh_ifindex)
3891                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3892
3893                 attrlen = rtnh_attrlen(rtnh);
3894                 if (attrlen > 0) {
3895                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3896
3897                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3898                         if (nla) {
3899                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3900                                 r_cfg.fc_flags |= RTF_GATEWAY;
3901                         }
3902                 }
3903                 err = ip6_route_del(&r_cfg, extack);
3904                 if (err)
3905                         last_err = err;
3906
3907                 rtnh = rtnh_next(rtnh, &remaining);
3908         }
3909
3910         return last_err;
3911 }
3912
3913 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3914                               struct netlink_ext_ack *extack)
3915 {
3916         struct fib6_config cfg;
3917         int err;
3918
3919         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3920         if (err < 0)
3921                 return err;
3922
3923         if (cfg.fc_mp)
3924                 return ip6_route_multipath_del(&cfg, extack);
3925         else {
3926                 cfg.fc_delete_all_nh = 1;
3927                 return ip6_route_del(&cfg, extack);
3928         }
3929 }
3930
3931 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3932                               struct netlink_ext_ack *extack)
3933 {
3934         struct fib6_config cfg;
3935         int err;
3936
3937         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3938         if (err < 0)
3939                 return err;
3940
3941         if (cfg.fc_mp)
3942                 return ip6_route_multipath_add(&cfg, extack);
3943         else
3944                 return ip6_route_add(&cfg, extack);
3945 }
3946
/* Worst-case netlink message size needed by rt6_fill_node() for @rt,
 * including an RTA_MULTIPATH nest with one rtnexthop per sibling when
 * the route is multipath.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
3976
/* Emit the nexthop attributes for @rt into @skb (RTA_GATEWAY, RTA_OIF,
 * encap) and OR the nexthop state (RTNH_F_LINKDOWN/DEAD/OFFLOAD) into
 * *@flags.  @skip_oif suppresses RTA_OIF for multipath encoding, where
 * the ifindex lives in the rtnexthop header instead.
 * Returns 0 or -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4008
/* add multipath next hop: append one rtnexthop entry for @rt inside an
 * RTA_MULTIPATH nest.  Returns 0 or -EMSGSIZE on lack of skb space.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: ifindex is already in the rtnexthop header */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4035
/* Fill one netlink route message for @rt into @skb.
 * @dst/@src: specific addresses for an RTM_GETROUTE reply (force /128
 *            prefix lengths); NULL when dumping the route as stored.
 * @iif: input interface for a get-route reply, 0 otherwise.
 * Returns 0 on success or -EMSGSIZE (message is cancelled) on overflow.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	if (rt->rt6i_flags & RTF_REJECT) {
		/* map the reject route's error code back to a route type */
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report the cached PMTU instead of the raw metric when present */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4189
4190 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4191 {
4192         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4193         struct net *net = arg->net;
4194
4195         if (rt == net->ipv6.ip6_null_entry)
4196                 return 0;
4197
4198         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4199                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4200
4201                 /* user wants prefix routes only */
4202                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4203                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4204                         /* success since this is not a prefix route */
4205                         return 1;
4206                 }
4207         }
4208
4209         return rt6_fill_node(net,
4210                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4211                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4212                      NLM_F_MULTI);
4213 }
4214
/* RTM_GETROUTE handler: resolve the route selected for the flow described
 * by the request's attributes (RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF/RTA_MARK/
 * RTA_UID) and unicast the result back to the requesting socket.
 *
 * With RTM_F_FIB_MATCH set in rtm_flags, the plain FIB lookup
 * (ip6_route_lookup) is used instead of the full input/output route
 * resolution, so the matching table entry itself is reported.
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* Attributes shorter than an in6_addr are rejected as -EINVAL. */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	/* Without an explicit RTA_UID, input-side lookups (iif set) use
	 * INVALID_UID; output-side lookups use the caller's uid.
	 */
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* RCU protects the ifindex->dev lookup for the duration
		 * of the input-route resolution.
		 */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}


	rt = container_of(dst, struct rt6_info, dst);
	/* The lookups above never return NULL; failures are encoded in
	 * dst.error or the null entry.  Drop our reference on error.
	 */
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Hand the route reference to the skb; it is released when the
	 * skb is freed.
	 */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4340
/* Broadcast a route change (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE) for
 * @rt to RTNLGRP_IPV6_ROUTE listeners.  @info carries the netlink context
 * (netns, portid, and the originating nlmsg for the sequence number);
 * @nlm_flags is OR'ed into the emitted message's nlmsg_flags.
 * On failure the error is recorded on the group via rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any(): this can be called from both process and softirq
	 * context, so pick the allocation mode accordingly.
	 */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
4371
/* Netdevice notifier: wire the per-netns special routes (null, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also prohibit/blackhole) to the namespace's
 * loopback device on NETDEV_REGISTER, and drop their idev references on
 * NETDEV_UNREGISTER.  Events for non-loopback devices are ignored.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4404
4405 /*
4406  *      /proc
4407  */
4408
4409 #ifdef CONFIG_PROC_FS
4410
/* File operations for /proc/net/ipv6_route (seq_file based; the open
 * routine ipv6_route_open is defined elsewhere in this file).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4418
4419 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4420 {
4421         struct net *net = (struct net *)seq->private;
4422         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4423                    net->ipv6.rt6_stats->fib_nodes,
4424                    net->ipv6.rt6_stats->fib_route_nodes,
4425                    net->ipv6.rt6_stats->fib_rt_alloc,
4426                    net->ipv6.rt6_stats->fib_rt_entries,
4427                    net->ipv6.rt6_stats->fib_rt_cache,
4428                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4429                    net->ipv6.rt6_stats->fib_discarded_routes);
4430
4431         return 0;
4432 }
4433
/* open() for /proc/net/rt6_stats: single-record, per-netns seq file. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4438
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4446 #endif  /* CONFIG_PROC_FS */
4447
4448 #ifdef CONFIG_SYSCTL
4449
4450 static
4451 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4452                               void __user *buffer, size_t *lenp, loff_t *ppos)
4453 {
4454         struct net *net;
4455         int delay;
4456         if (!write)
4457                 return -EINVAL;
4458
4459         net = (struct net *)ctl->extra1;
4460         delay = net->ipv6.sysctl.flush_delay;
4461         proc_dointvec(ctl, write, buffer, lenp, ppos);
4462         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4463         return 0;
4464 }
4465
/* Template for the per-namespace net.ipv6.route sysctl table.
 * ipv6_route_sysctl_init() kmemdup()s this and rewrites each entry's
 * .data pointer by index, so the order of entries here must stay in
 * sync with the table[N] assignments there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger (mode 0200): flushes the route cache. */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same storage as gc_min_interval, exposed in milliseconds. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4539
4540 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4541 {
4542         struct ctl_table *table;
4543
4544         table = kmemdup(ipv6_route_table_template,
4545                         sizeof(ipv6_route_table_template),
4546                         GFP_KERNEL);
4547
4548         if (table) {
4549                 table[0].data = &net->ipv6.sysctl.flush_delay;
4550                 table[0].extra1 = net;
4551                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4552                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4553                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4554                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4555                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4556                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4557                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4558                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4559                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4560
4561                 /* Don't export sysctls to unprivileged users */
4562                 if (net->user_ns != &init_user_ns)
4563                         table[0].procname = NULL;
4564         }
4565
4566         return table;
4567 }
4568 #endif
4569
/* Per-namespace setup: clone the dst_ops and special route templates
 * (null entry, plus prohibit/blackhole under IPV6_MULTIPLE_TABLES) and
 * seed the namespace's routing sysctl defaults.  Uses the classic
 * goto-unwind pattern: each failure label frees everything allocated
 * before it.  Returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	/* Each template copy points back at itself as its own dst path
	 * and at this namespace's dst_ops.
	 */
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default routing sysctl values for a fresh namespace. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4642
/* Per-namespace teardown: free the special route copies made by
 * ip6_route_net_init() and destroy the namespace's dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4652
/* Late per-namespace init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries (no-op without CONFIG_PROC_FS).
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
4661
/* Late per-namespace teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4669
/* Main per-netns route state (registered in ip6_route_init()). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4674
4675 static int __net_init ipv6_inetpeer_init(struct net *net)
4676 {
4677         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4678
4679         if (!bp)
4680                 return -ENOMEM;
4681         inet_peer_base_init(bp);
4682         net->ipv6.peers = bp;
4683         return 0;
4684 }
4685
4686 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4687 {
4688         struct inet_peer_base *bp = net->ipv6.peers;
4689
4690         net->ipv6.peers = NULL;
4691         inetpeer_invalidate_tree(bp);
4692         kfree(bp);
4693 }
4694
/* Per-netns inet_peer base lifecycle. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4699
/* Late per-netns state (proc entries); registered after fib6_rules_init. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4704
/* Runs shortly before addrconf's own notifier (lower priority value
 * would run later; -10 keeps this ahead of it).
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4709
/* Attach init_net's special routes to its loopback device by hand:
 * loopback registers before ip6_route_dev_notify() exists, so the
 * NETDEV_REGISTER path in that notifier never ran for init_net.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4724
/* Boot-time initialization of the IPv6 routing subsystem: slab cache,
 * dst accounting, pernet subsystems, fib6, xfrm6, policy rules, rtnetlink
 * handlers, the device notifier, and the per-cpu uncached-route lists.
 * On any failure the goto chain unregisters everything set up so far,
 * in reverse order.  Returns 0 or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole routes share the rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* RTM_GETROUTE may run without the rtnl lock held. */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4806
/* Module/subsystem teardown: undo ip6_route_init() in reverse order of
 * registration (notifier first, slab cache last).
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}