Merge branch '10GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next...
[linux-2.6-block.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Neighbour-unreachability verdicts used when scoring candidate routes.
 * Negative values are failures; RT6_NUD_FAIL_DO_RR additionally tells the
 * caller to rotate the round-robin pointer (see find_match()).
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* unusable; never select this route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; try round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-cpu list head for "uncached" rt6_info entries (see
 * rt6_uncached_list_add()).  @lock protects @head and is taken with
 * bottom halves disabled.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
/* Retarget every uncached route that references @dev onto @net's loopback
 * device so @dev can be unregistered.  Walks all CPUs' uncached lists,
 * each under its own lock.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Loopback itself never goes away; nothing to retarget. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Move the inet6_dev reference over to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Move the net_device reference over to loopback. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(rt->dst.from);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
/* dst_ops->confirm_neigh: confirm reachability of the neighbour this dst
 * uses (gateway or final destination), skipping cases where neighbour
 * state is not kept.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	/* NOARP and loopback devices keep no neighbour state. */
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	/* Multicast destinations have no unicast neighbour entry. */
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
245
/* dst_ops for ordinary IPv6 routes; used as the template for each
 * namespace's ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276
/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
/* dst_ops for blackhole dsts: PMTU updates and redirects are no-ops. */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
293
/* Metrics used by the route templates below; only the hop-limit slot is
 * explicitly initialized (to 0).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297
/* Template for the per-namespace "null" route: statically refcounted so
 * it is never freed, and rejects every packet with -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for the "prohibit" route: rejects packets with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
329
/* Template for the "blackhole" route: silently discards packets
 * (error -EINVAL, generic dst_discard handlers).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
344
345 #endif
346
/* Initialize the rt6_info-specific part of a freshly allocated route. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* Zero everything after the embedded dst_entry, which dst_alloc()
	 * has already set up.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
/* dst_ops->destroy: release everything an rt6_info owns — metrics,
 * per-cpu cache, uncached-list membership, idev reference, exception
 * bucket, and the reference on the route it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* We hold the last reference, so the bucket can be fetched and
	 * freed without further RCU protection.
	 */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}
415
/* dst_ops->ifdown: @dev is going away, so repoint this route's idev
 * reference at the namespace's loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
/* Like __rt6_check_expired(), but a clone without its own RTF_EXPIRES
 * also counts as expired when its parent (dst.from) is obsolete or
 * itself expired (checked recursively).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         /* We might have already computed the hash for ICMPv6 errors. In such
461          * case it will always be non-zero. Otherwise now is the time to do it.
462          */
463         if (!fl6->mp_hash)
464                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465
466         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467         /* Don't change the route, if route_choosen == 0
468          * (siblings does not include ourself)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 if (rt6_score_route(sibling, oif, strict) < 0)
476                                         break;
477                                 match = sibling;
478                                 break;
479                         }
480                 }
481         return match;
482 }
483
484 /*
485  *      Route lookup. rcu_read_lock() should be held.
486  */
487
/* Walk the route list starting at @rt and pick the entry matching the
 * requested output interface (@oif) and/or source address (@saddr).
 * Returns @rt unchanged when there is nothing to match against, and the
 * null entry when a strict interface match (RT6_LOOKUP_F_IFACE) fails.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				/* Keep a loopback route as a fallback,
				 * preferring one whose idev matches @oif.
				 */
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* No oif given: match on the source address. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request: queued by rt6_probe(), run by
 * rt6_probe_deferred().  Holds a reference on @dev until the work runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
541
/* Workqueue handler: send a neighbour solicitation for work->target,
 * then drop the device reference taken by rt6_probe() and free the work.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
553
/* Schedule a reachability probe for @rt's gateway, if one is due.
 * Only gateway routes are probed; the actual NS transmit is deferred
 * to a workqueue.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Rate-limit: only probe when the neighbour has not been
		 * updated within rtr_probe_interval.
		 */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: always worth probing. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* Reference dropped by rt6_probe_deferred(). */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
600 #else
/* Router Reachability Probing is only built with CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
604 #endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
/* Score how well @rt's device matches @oif: 2 = exact match (or no
 * constraint), 1 = loopback route whose idev matches @oif, 0 = mismatch.
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}
619
/* Classify the reachability of @rt's nexthop neighbour (see
 * enum rt6_nud_state).  Routes without a gateway nexthop always succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preference, anything not outright FAILED
		 * still counts as (probably) reachable.
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
650
/* Combine device match and (when RT6_LOOKUP_F_REACHABLE is set)
 * neighbour reachability into a single route score.  Returns a negative
 * rt6_nud_state value on failure.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* Fold the RA router-preference bits into the score. */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
669
/* Compare @rt against the best candidate so far (@match with score
 * *@mpri) and return whichever wins.  Sets *@do_rr when the round-robin
 * pointer should be rotated.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* Optionally skip routes whose device has no carrier. */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
707
/* Find the best route within the @metric group, scanning from the
 * round-robin head (@rr_head) to the group's end, then wrapping around
 * from @leaf back to @rr_head.  Routes with a different metric (from
 * @cont onward) are only considered when the preferred group matched
 * nothing.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First pass: from the round-robin head to the end of the group. */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second pass: wrap around from the leaf up to the head. */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Fall back to the remaining (different-metric) routes. */
	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
746
/* Select the best route at fib6 node @fn, honouring the node's
 * round-robin pointer (fn->rr_ptr) among equal-metric routes.  Returns
 * the null entry when the node has no usable routes.  Caller must hold
 * rcu_read_lock().
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
796
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
798 {
799         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
800 }
801
802 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a Route Information option from a received Router Advertisement
 * (RFC 4191): validate it, then add, refresh, or withdraw the matching
 * RTF_ROUTEINFO route.  Returns 0 on success or -EINVAL on a bad option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero-length prefix refers to the default router entry. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		/* Finite lifetime sets an expiry; infinite clears it. */
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
876 #endif
877
/* Walk back up the fib tree from @fn looking for the next node carrying
 * route info (RTN_RTINFO).  When the parent owns a source subtree other
 * than the one we came from, descend into it using @saddr.  Returns
 * NULL once the tree root (RTN_TL_ROOT) is reached.
 *
 * Callers hold rcu_read_lock() (parent pointers are rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
895
896 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
897                           bool null_fallback)
898 {
899         struct rt6_info *rt = *prt;
900
901         if (dst_hold_safe(&rt->dst))
902                 return true;
903         if (null_fallback) {
904                 rt = net->ipv6.ip6_null_entry;
905                 dst_hold(&rt->dst);
906         } else {
907                 rt = NULL;
908         }
909         *prt = rt;
910         return false;
911 }
912
/* Per-table lookup used by ip6_route_lookup()/rt6_lookup().
 *
 * Finds the fib node for fl6->daddr/saddr, applies device matching and
 * (for oif == 0) multipath selection, and backtracks towards less
 * specific nodes whenever only the null entry matched.  A hit in the
 * exception (cached-route) table overrides the fib result.
 *
 * Returns a held route; may be the (held) null entry.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* Only touch lastuse when the hold actually succeeded on rt itself */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
953
954 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
955                                     int flags)
956 {
957         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
958 }
959 EXPORT_SYMBOL_GPL(ip6_route_lookup);
960
961 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
962                             const struct in6_addr *saddr, int oif, int strict)
963 {
964         struct flowi6 fl6 = {
965                 .flowi6_oif = oif,
966                 .daddr = *daddr,
967         };
968         struct dst_entry *dst;
969         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
970
971         if (saddr) {
972                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
973                 flags |= RT6_LOOKUP_F_HAS_SADDR;
974         }
975
976         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
977         if (dst->error == 0)
978                 return (struct rt6_info *) dst;
979
980         dst_release(dst);
981
982         return NULL;
983 }
984 EXPORT_SYMBOL(rt6_lookup);
985
/* ip6_ins_rt is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */
991
992 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
993                         struct mx6_config *mxc,
994                         struct netlink_ext_ack *extack)
995 {
996         int err;
997         struct fib6_table *table;
998
999         table = rt->rt6i_table;
1000         spin_lock_bh(&table->tb6_lock);
1001         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1002         spin_unlock_bh(&table->tb6_lock);
1003
1004         return err;
1005 }
1006
1007 int ip6_ins_rt(struct rt6_info *rt)
1008 {
1009         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1010         struct mx6_config mxc = { .mx = NULL, };
1011
1012         /* Hold dst to account for the reference from the fib6 tree */
1013         dst_hold(&rt->dst);
1014         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1015 }
1016
/* Pick the device to use for a copy of @rt.
 * Called with rcu_read_lock held (required by l3mdev_master_dev_rcu).
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
1039
/* Allocate an RTF_CACHE clone of @ort keyed by (@daddr, @saddr).
 *
 * If @ort is itself a cache or pcpu copy, the clone is taken from its
 * parent (ort->dst.from) instead.  Returns the new route with its
 * allocation reference, or NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* The clone is a host route for daddr */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1082
1083 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1084 {
1085         struct net_device *dev;
1086         struct rt6_info *pcpu_rt;
1087
1088         rcu_read_lock();
1089         dev = ip6_rt_get_dev_rcu(rt);
1090         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1091         rcu_read_unlock();
1092         if (!pcpu_rt)
1093                 return NULL;
1094         ip6_rt_copy_init(pcpu_rt, rt);
1095         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1096         pcpu_rt->rt6i_flags |= RTF_PCPU;
1097         return pcpu_rt;
1098 }
1099
/* Return this cpu's cached copy of @rt with a reference held, or NULL
 * when no copy exists yet or its refcount could not be taken.
 * It should be called with rcu_read_lock() acquired.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* ip6_hold_safe(..., false) resets pcpu_rt to NULL on failure */
	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
1113
/* Create this cpu's copy of @rt and publish it in the per-cpu slot.
 * On allocation failure the (held) null entry is returned instead.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* Extra hold for the per-cpu slot, on top of the allocation ref
	 * handed back to the caller.
	 */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* Slot is expected to be empty; a concurrent fill would be a bug */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1134
/* Exception (cached-route) hash table implementation.
 */
1137 static DEFINE_SPINLOCK(rt6_exception_lock);
1138
1139 /* Remove rt6_ex from hash table and free the memory
1140  * Caller must hold rt6_exception_lock
1141  */
1142 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1143                                  struct rt6_exception *rt6_ex)
1144 {
1145         struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
1146
1147         if (!bucket || !rt6_ex)
1148                 return;
1149         rt6_ex->rt6i->rt6i_node = NULL;
1150         hlist_del_rcu(&rt6_ex->hlist);
1151         rt6_release(rt6_ex->rt6i);
1152         kfree_rcu(rt6_ex, rcu);
1153         WARN_ON_ONCE(!bucket->depth);
1154         bucket->depth--;
1155         net->ipv6.rt6_stats->fib_rt_cache--;
1156 }
1157
1158 /* Remove oldest rt6_ex in bucket and free the memory
1159  * Caller must hold rt6_exception_lock
1160  */
1161 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1162 {
1163         struct rt6_exception *rt6_ex, *oldest = NULL;
1164
1165         if (!bucket)
1166                 return;
1167
1168         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1169                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1170                         oldest = rt6_ex;
1171         }
1172         rt6_remove_exception(bucket, oldest);
1173 }
1174
1175 static u32 rt6_exception_hash(const struct in6_addr *dst,
1176                               const struct in6_addr *src)
1177 {
1178         static u32 seed __read_mostly;
1179         u32 val;
1180
1181         net_get_random_once(&seed, sizeof(seed));
1182         val = jhash(dst, sizeof(*dst), seed);
1183
1184 #ifdef CONFIG_IPV6_SUBTREES
1185         if (src)
1186                 val = jhash(src, sizeof(*src), val);
1187 #endif
1188         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1189 }
1190
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair: on entry *bucket is the base of the bucket
 * array, on return it has been advanced to the hashed bucket.
 * Returns the matching entry or NULL.
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* saddr is only significant when subtrees are in use */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1223
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair: on entry *bucket is the base of the bucket
 * array, on return it has been advanced to the hashed bucket.
 * RCU-traversal twin of __rt6_find_exception_spinlock().
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* saddr is only significant when subtrees are in use */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1258
/* Insert cached route @nrt into the exception table of its parent @ort.
 *
 * Replaces any existing entry for the same (daddr, saddr) key, evicts
 * the oldest entry if the bucket grows past FIB6_MAX_DEPTH, and bumps
 * the fib serial number so previously cached dsts are revalidated.
 *
 * Returns 0 on success or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being torn down (rt6_flush_exceptions); don't recreate
	 * its bucket list.
	 */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* First exception on ort: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* Replace a pre-existing entry for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1346
/* Remove every exception entry hanging off @rt and mark @rt so that
 * rt6_insert_exception() cannot recreate the bucket list afterwards.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* depth must have dropped to zero with the chain empty */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1373
/* Find cached rt in the hash table inside passed in rt.
 * Returns the non-expired matching cached route, or NULL.
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* Expired entries are skipped (they are reaped elsewhere) */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1405
1406 /* Remove the passed in cached rt from the hash table that contains it */
1407 int rt6_remove_exception_rt(struct rt6_info *rt)
1408 {
1409         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1410         struct rt6_exception_bucket *bucket;
1411         struct in6_addr *src_key = NULL;
1412         struct rt6_exception *rt6_ex;
1413         int err;
1414
1415         if (!from ||
1416             !(rt->rt6i_flags | RTF_CACHE))
1417                 return -EINVAL;
1418
1419         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1420                 return -ENOENT;
1421
1422         spin_lock_bh(&rt6_exception_lock);
1423         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1424                                     lockdep_is_held(&rt6_exception_lock));
1425 #ifdef CONFIG_IPV6_SUBTREES
1426         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1427          * and exception table is indexed by a hash of
1428          * both rt6i_dst and rt6i_src.
1429          * Otherwise, the exception table is indexed by
1430          * a hash of only rt6i_dst.
1431          */
1432         if (from->rt6i_src.plen)
1433                 src_key = &rt->rt6i_src.addr;
1434 #endif
1435         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1436                                                &rt->rt6i_dst.addr,
1437                                                src_key);
1438         if (rt6_ex) {
1439                 rt6_remove_exception(bucket, rt6_ex);
1440                 err = 0;
1441         } else {
1442                 err = -ENOENT;
1443         }
1444
1445         spin_unlock_bh(&rt6_exception_lock);
1446         return err;
1447 }
1448
1449 /* Find rt6_ex which contains the passed in rt cache and
1450  * refresh its stamp
1451  */
1452 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1453 {
1454         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1455         struct rt6_exception_bucket *bucket;
1456         struct in6_addr *src_key = NULL;
1457         struct rt6_exception *rt6_ex;
1458
1459         if (!from ||
1460             !(rt->rt6i_flags | RTF_CACHE))
1461                 return;
1462
1463         rcu_read_lock();
1464         bucket = rcu_dereference(from->rt6i_exception_bucket);
1465
1466 #ifdef CONFIG_IPV6_SUBTREES
1467         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1468          * and exception table is indexed by a hash of
1469          * both rt6i_dst and rt6i_src.
1470          * Otherwise, the exception table is indexed by
1471          * a hash of only rt6i_dst.
1472          */
1473         if (from->rt6i_src.plen)
1474                 src_key = &rt->rt6i_src.addr;
1475 #endif
1476         rt6_ex = __rt6_find_exception_rcu(&bucket,
1477                                           &rt->rt6i_dst.addr,
1478                                           src_key);
1479         if (rt6_ex)
1480                 rt6_ex->stamp = jiffies;
1481
1482         rcu_read_unlock();
1483 }
1484
1485 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1486 {
1487         struct rt6_exception_bucket *bucket;
1488         struct rt6_exception *rt6_ex;
1489         int i;
1490
1491         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1492                                         lockdep_is_held(&rt6_exception_lock));
1493
1494         if (bucket) {
1495                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1496                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1497                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1498                         }
1499                         bucket++;
1500                 }
1501         }
1502 }
1503
1504 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1505 {
1506         struct rt6_exception_bucket *bucket;
1507         struct rt6_exception *rt6_ex;
1508         int i;
1509
1510         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1511                                         lockdep_is_held(&rt6_exception_lock));
1512
1513         if (bucket) {
1514                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1515                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1516                                 struct rt6_info *entry = rt6_ex->rt6i;
1517                                 /* For RTF_CACHE with rt6i_pmtu == 0
1518                                  * (i.e. a redirected route),
1519                                  * the metrics of its rt->dst.from has already
1520                                  * been updated.
1521                                  */
1522                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1523                                         entry->rt6i_pmtu = mtu;
1524                         }
1525                         bucket++;
1526                 }
1527         }
1528 }
1529
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every cached-gateway exception entry under @rt whose gateway
 * address equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check before taking the lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				/* Entry must have both flags set AND the
				 * matching gateway address
				 */
				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1566
/* Decide the fate of one exception entry during gc: remove it when it
 * is unreferenced and past the gc timeout, or when it routes via a
 * gateway whose neighbour entry no longer has NTF_ROUTER set; otherwise
 * count it in gc_args->more.
 * Caller must hold rt6_exception_lock (callers take it in
 * rt6_age_exceptions()).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* __refcnt == 1 means only the exception table holds it */
	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}
1597
1598 void rt6_age_exceptions(struct rt6_info *rt,
1599                         struct fib6_gc_args *gc_args,
1600                         unsigned long now)
1601 {
1602         struct rt6_exception_bucket *bucket;
1603         struct rt6_exception *rt6_ex;
1604         struct hlist_node *tmp;
1605         int i;
1606
1607         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1608                 return;
1609
1610         spin_lock_bh(&rt6_exception_lock);
1611         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1612                                     lockdep_is_held(&rt6_exception_lock));
1613
1614         if (bucket) {
1615                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1616                         hlist_for_each_entry_safe(rt6_ex, tmp,
1617                                                   &bucket->chain, hlist) {
1618                                 rt6_age_examine_exception(bucket, rt6_ex,
1619                                                           gc_args, now);
1620                         }
1621                         bucket++;
1622                 }
1623         }
1624         spin_unlock_bh(&rt6_exception_lock);
1625 }
1626
1627 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1628                                int oif, struct flowi6 *fl6, int flags)
1629 {
1630         struct fib6_node *fn, *saved_fn;
1631         struct rt6_info *rt, *rt_cache;
1632         int strict = 0;
1633
1634         strict |= flags & RT6_LOOKUP_F_IFACE;
1635         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1636         if (net->ipv6.devconf_all->forwarding == 0)
1637                 strict |= RT6_LOOKUP_F_REACHABLE;
1638
1639         rcu_read_lock();
1640
1641         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1642         saved_fn = fn;
1643
1644         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1645                 oif = 0;
1646
1647 redo_rt6_select:
1648         rt = rt6_select(net, fn, oif, strict);
1649         if (rt->rt6i_nsiblings)
1650                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1651         if (rt == net->ipv6.ip6_null_entry) {
1652                 fn = fib6_backtrack(fn, &fl6->saddr);
1653                 if (fn)
1654                         goto redo_rt6_select;
1655                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1656                         /* also consider unreachable route */
1657                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1658                         fn = saved_fn;
1659                         goto redo_rt6_select;
1660                 }
1661         }
1662
1663         /*Search through exception table */
1664         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1665         if (rt_cache)
1666                 rt = rt_cache;
1667
1668         if (rt == net->ipv6.ip6_null_entry) {
1669                 rcu_read_unlock();
1670                 dst_hold(&rt->dst);
1671                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1672                 return rt;
1673         } else if (rt->rt6i_flags & RTF_CACHE) {
1674                 if (ip6_hold_safe(net, &rt, true)) {
1675                         dst_use_noref(&rt->dst, jiffies);
1676                         rt6_dst_from_metrics_check(rt);
1677                 }
1678                 rcu_read_unlock();
1679                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1680                 return rt;
1681         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1682                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1683                 /* Create a RTF_CACHE clone which will not be
1684                  * owned by the fib6 tree.  It is for the special case where
1685                  * the daddr in the skb during the neighbor look-up is different
1686                  * from the fl6->daddr used to look-up route here.
1687                  */
1688
1689                 struct rt6_info *uncached_rt;
1690
1691                 if (ip6_hold_safe(net, &rt, true)) {
1692                         dst_use_noref(&rt->dst, jiffies);
1693                 } else {
1694                         rcu_read_unlock();
1695                         uncached_rt = rt;
1696                         goto uncached_rt_out;
1697                 }
1698                 rcu_read_unlock();
1699
1700                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1701                 dst_release(&rt->dst);
1702
1703                 if (uncached_rt) {
1704                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1705                          * No need for another dst_hold()
1706                          */
1707                         rt6_uncached_list_add(uncached_rt);
1708                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1709                 } else {
1710                         uncached_rt = net->ipv6.ip6_null_entry;
1711                         dst_hold(&uncached_rt->dst);
1712                 }
1713
1714 uncached_rt_out:
1715                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1716                 return uncached_rt;
1717
1718         } else {
1719                 /* Get a percpu copy */
1720
1721                 struct rt6_info *pcpu_rt;
1722
1723                 dst_use_noref(&rt->dst, jiffies);
1724                 local_bh_disable();
1725                 pcpu_rt = rt6_get_pcpu_route(rt);
1726
1727                 if (!pcpu_rt) {
1728                         /* atomic_inc_not_zero() is needed when using rcu */
1729                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1730                                 /* No dst_hold() on rt is needed because grabbing
1731                                  * rt->rt6i_ref makes sure rt can't be released.
1732                                  */
1733                                 pcpu_rt = rt6_make_pcpu_route(rt);
1734                                 rt6_release(rt);
1735                         } else {
1736                                 /* rt is already removed from tree */
1737                                 pcpu_rt = net->ipv6.ip6_null_entry;
1738                                 dst_hold(&pcpu_rt->dst);
1739                         }
1740                 }
1741                 local_bh_enable();
1742                 rcu_read_unlock();
1743                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1744                 return pcpu_rt;
1745         }
1746 }
1747 EXPORT_SYMBOL_GPL(ip6_pol_route);
1748
/* Table-lookup callback for inbound packets: perform the policy route
 * lookup keyed on the incoming interface (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1754
1755 struct dst_entry *ip6_route_input_lookup(struct net *net,
1756                                          struct net_device *dev,
1757                                          struct flowi6 *fl6, int flags)
1758 {
1759         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1760                 flags |= RT6_LOOKUP_F_IFACE;
1761
1762         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1763 }
1764 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1765
1766 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1767                                   struct flow_keys *keys)
1768 {
1769         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1770         const struct ipv6hdr *key_iph = outer_iph;
1771         const struct ipv6hdr *inner_iph;
1772         const struct icmp6hdr *icmph;
1773         struct ipv6hdr _inner_iph;
1774
1775         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1776                 goto out;
1777
1778         icmph = icmp6_hdr(skb);
1779         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1780             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1781             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1782             icmph->icmp6_type != ICMPV6_PARAMPROB)
1783                 goto out;
1784
1785         inner_iph = skb_header_pointer(skb,
1786                                        skb_transport_offset(skb) + sizeof(*icmph),
1787                                        sizeof(_inner_iph), &_inner_iph);
1788         if (!inner_iph)
1789                 goto out;
1790
1791         key_iph = inner_iph;
1792 out:
1793         memset(keys, 0, sizeof(*keys));
1794         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1795         keys->addrs.v6addrs.src = key_iph->saddr;
1796         keys->addrs.v6addrs.dst = key_iph->daddr;
1797         keys->tags.flow_label = ip6_flowinfo(key_iph);
1798         keys->basic.ip_proto = key_iph->nexthdr;
1799 }
1800
1801 /* if skb is set it will be used and fl6 can be NULL */
1802 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1803 {
1804         struct flow_keys hash_keys;
1805
1806         if (skb) {
1807                 ip6_multipath_l3_keys(skb, &hash_keys);
1808                 return flow_hash_from_keys(&hash_keys);
1809         }
1810
1811         return get_hash_from_flowi6(fl6);
1812 }
1813
1814 void ip6_route_input(struct sk_buff *skb)
1815 {
1816         const struct ipv6hdr *iph = ipv6_hdr(skb);
1817         struct net *net = dev_net(skb->dev);
1818         int flags = RT6_LOOKUP_F_HAS_SADDR;
1819         struct ip_tunnel_info *tun_info;
1820         struct flowi6 fl6 = {
1821                 .flowi6_iif = skb->dev->ifindex,
1822                 .daddr = iph->daddr,
1823                 .saddr = iph->saddr,
1824                 .flowlabel = ip6_flowinfo(iph),
1825                 .flowi6_mark = skb->mark,
1826                 .flowi6_proto = iph->nexthdr,
1827         };
1828
1829         tun_info = skb_tunnel_info(skb);
1830         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1831                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1832         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1833                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1834         skb_dst_drop(skb);
1835         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1836 }
1837
/* Table-lookup callback for locally generated packets: perform the
 * policy route lookup keyed on the outgoing interface (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1843
1844 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1845                                          struct flowi6 *fl6, int flags)
1846 {
1847         bool any_src;
1848
1849         if (rt6_need_strict(&fl6->daddr)) {
1850                 struct dst_entry *dst;
1851
1852                 dst = l3mdev_link_scope_lookup(net, fl6);
1853                 if (dst)
1854                         return dst;
1855         }
1856
1857         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1858
1859         any_src = ipv6_addr_any(&fl6->saddr);
1860         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1861             (fl6->flowi6_oif && any_src))
1862                 flags |= RT6_LOOKUP_F_IFACE;
1863
1864         if (!any_src)
1865                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1866         else if (sk)
1867                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1868
1869         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1870 }
1871 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1872
1873 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1874 {
1875         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1876         struct net_device *loopback_dev = net->loopback_dev;
1877         struct dst_entry *new = NULL;
1878
1879         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1880                        DST_OBSOLETE_NONE, 0);
1881         if (rt) {
1882                 rt6_info_init(rt);
1883                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1884
1885                 new = &rt->dst;
1886                 new->__use = 1;
1887                 new->input = dst_discard;
1888                 new->output = dst_discard_out;
1889
1890                 dst_copy_metrics(new, &ort->dst);
1891
1892                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1893                 rt->rt6i_gateway = ort->rt6i_gateway;
1894                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1895                 rt->rt6i_metric = 0;
1896
1897                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1898 #ifdef CONFIG_IPV6_SUBTREES
1899                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1900 #endif
1901         }
1902
1903         dst_release(dst_orig);
1904         return new ? new : ERR_PTR(-ENOMEM);
1905 }
1906
1907 /*
1908  *      Destination cache support functions
1909  */
1910
1911 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1912 {
1913         if (rt->dst.from &&
1914             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1915                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1916 }
1917
1918 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1919 {
1920         u32 rt_cookie = 0;
1921
1922         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1923                 return NULL;
1924
1925         if (rt6_check_expired(rt))
1926                 return NULL;
1927
1928         return &rt->dst;
1929 }
1930
1931 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1932 {
1933         if (!__rt6_check_expired(rt) &&
1934             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1935             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1936                 return &rt->dst;
1937         else
1938                 return NULL;
1939 }
1940
1941 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1942 {
1943         struct rt6_info *rt;
1944
1945         rt = (struct rt6_info *) dst;
1946
1947         /* All IPV6 dsts are created with ->obsolete set to the value
1948          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1949          * into this function always.
1950          */
1951
1952         rt6_dst_from_metrics_check(rt);
1953
1954         if (rt->rt6i_flags & RTF_PCPU ||
1955             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1956                 return rt6_dst_from_check(rt, cookie);
1957         else
1958                 return rt6_check(rt, cookie);
1959 }
1960
1961 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1962 {
1963         struct rt6_info *rt = (struct rt6_info *) dst;
1964
1965         if (rt) {
1966                 if (rt->rt6i_flags & RTF_CACHE) {
1967                         if (rt6_check_expired(rt)) {
1968                                 ip6_del_rt(rt);
1969                                 dst = NULL;
1970                         }
1971                 } else {
1972                         dst_release(dst);
1973                         dst = NULL;
1974                 }
1975         }
1976         return dst;
1977 }
1978
/* dst_ops->link_failure handler: report unreachability back to the
 * sender and invalidate the failed route so it is not chosen again.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Cached clone: delete it outright (needs a ref
			 * taken safely first).
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* For default routes, poison the fib node's serial
			 * number so dsts referencing this node are
			 * revalidated on their next cookie check.
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2001
/* Record a new path MTU on @rt, mark it as modified, and (re)arm the
 * PMTU expiry timer from the ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2010
2011 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2012 {
2013         return !(rt->rt6i_flags & RTF_CACHE) &&
2014                 (rt->rt6i_flags & RTF_PCPU ||
2015                  rcu_access_pointer(rt->rt6i_node));
2016 }
2017
/* Core PMTU update.  Depending on the route kind, either record the new
 * MTU on the route itself or create an RTF_CACHE exception entry that
 * carries it.  @iph or @sk (either may be NULL) supply the addresses
 * used for neighbour confirmation and as the exception key.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* An administratively locked MTU must not be changed by PMTUD. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Clamp to the IPv6 minimum MTU; ignore non-shrinking updates. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* If inserting into the exception tree fails, the
			 * clone is still ours to free.
			 */
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2061
/* dst_ops->update_pmtu hook: pull the IPv6 header out of @skb, if any,
 * and delegate to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2067
2068 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2069                      int oif, u32 mark, kuid_t uid)
2070 {
2071         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2072         struct dst_entry *dst;
2073         struct flowi6 fl6;
2074
2075         memset(&fl6, 0, sizeof(fl6));
2076         fl6.flowi6_oif = oif;
2077         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2078         fl6.daddr = iph->daddr;
2079         fl6.saddr = iph->saddr;
2080         fl6.flowlabel = ip6_flowinfo(iph);
2081         fl6.flowi6_uid = uid;
2082
2083         dst = ip6_route_output(net, NULL, &fl6);
2084         if (!dst->error)
2085                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2086         dst_release(dst);
2087 }
2088 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2089
/* Socket-scoped PMTU update: apply the new MTU to the socket's flow,
 * then refresh the socket's cached dst if it has become stale so the
 * new MTU takes effect immediately.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if there is no cached dst or it still
	 * validates against the socket's cookie.
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* Only re-route when the socket is not owned by user context;
	 * v4-mapped destinations are routed via IPv4 and skipped here.
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2108
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: cast back and forth with flowi6 */
	struct in6_addr gateway;	/* source address of the received redirect */
};
2114
2115 static struct rt6_info *__ip6_route_redirect(struct net *net,
2116                                              struct fib6_table *table,
2117                                              struct flowi6 *fl6,
2118                                              int flags)
2119 {
2120         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2121         struct rt6_info *rt, *rt_cache;
2122         struct fib6_node *fn;
2123
2124         /* Get the "current" route for this destination and
2125          * check if the redirect has come from appropriate router.
2126          *
2127          * RFC 4861 specifies that redirects should only be
2128          * accepted if they come from the nexthop to the target.
2129          * Due to the way the routes are chosen, this notion
2130          * is a bit fuzzy and one might need to check all possible
2131          * routes.
2132          */
2133
2134         rcu_read_lock();
2135         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2136 restart:
2137         for_each_fib6_node_rt_rcu(fn) {
2138                 if (rt6_check_expired(rt))
2139                         continue;
2140                 if (rt->dst.error)
2141                         break;
2142                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2143                         continue;
2144                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2145                         continue;
2146                 /* rt_cache's gateway might be different from its 'parent'
2147                  * in the case of an ip redirect.
2148                  * So we keep searching in the exception table if the gateway
2149                  * is different.
2150                  */
2151                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2152                         rt_cache = rt6_find_cached_rt(rt,
2153                                                       &fl6->daddr,
2154                                                       &fl6->saddr);
2155                         if (rt_cache &&
2156                             ipv6_addr_equal(&rdfl->gateway,
2157                                             &rt_cache->rt6i_gateway)) {
2158                                 rt = rt_cache;
2159                                 break;
2160                         }
2161                         continue;
2162                 }
2163                 break;
2164         }
2165
2166         if (!rt)
2167                 rt = net->ipv6.ip6_null_entry;
2168         else if (rt->dst.error) {
2169                 rt = net->ipv6.ip6_null_entry;
2170                 goto out;
2171         }
2172
2173         if (rt == net->ipv6.ip6_null_entry) {
2174                 fn = fib6_backtrack(fn, &fl6->saddr);
2175                 if (fn)
2176                         goto restart;
2177         }
2178
2179 out:
2180         ip6_hold_safe(net, &rt, true);
2181
2182         rcu_read_unlock();
2183
2184         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
2185         return rt;
2186 };
2187
2188 static struct dst_entry *ip6_route_redirect(struct net *net,
2189                                         const struct flowi6 *fl6,
2190                                         const struct in6_addr *gateway)
2191 {
2192         int flags = RT6_LOOKUP_F_HAS_SADDR;
2193         struct ip6rd_flowi rdfl;
2194
2195         rdfl.fl6 = *fl6;
2196         rdfl.gateway = *gateway;
2197
2198         return fib6_rule_lookup(net, &rdfl.fl6,
2199                                 flags, __ip6_route_redirect);
2200 }
2201
2202 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2203                   kuid_t uid)
2204 {
2205         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2206         struct dst_entry *dst;
2207         struct flowi6 fl6;
2208
2209         memset(&fl6, 0, sizeof(fl6));
2210         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2211         fl6.flowi6_oif = oif;
2212         fl6.flowi6_mark = mark;
2213         fl6.daddr = iph->daddr;
2214         fl6.saddr = iph->saddr;
2215         fl6.flowlabel = ip6_flowinfo(iph);
2216         fl6.flowi6_uid = uid;
2217
2218         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2219         rt6_do_redirect(dst, NULL, skb);
2220         dst_release(dst);
2221 }
2222 EXPORT_SYMBOL_GPL(ip6_redirect);
2223
2224 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2225                             u32 mark)
2226 {
2227         const struct ipv6hdr *iph = ipv6_hdr(skb);
2228         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2229         struct dst_entry *dst;
2230         struct flowi6 fl6;
2231
2232         memset(&fl6, 0, sizeof(fl6));
2233         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2234         fl6.flowi6_oif = oif;
2235         fl6.flowi6_mark = mark;
2236         fl6.daddr = msg->dest;
2237         fl6.saddr = iph->daddr;
2238         fl6.flowi6_uid = sock_net_uid(net, NULL);
2239
2240         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2241         rt6_do_redirect(dst, NULL, skb);
2242         dst_release(dst);
2243 }
2244
/* Socket-scoped redirect handling: process the redirect in @skb using
 * the socket's bound device, mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2251
2252 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2253 {
2254         struct net_device *dev = dst->dev;
2255         unsigned int mtu = dst_mtu(dst);
2256         struct net *net = dev_net(dev);
2257
2258         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2259
2260         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2261                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2262
2263         /*
2264          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2265          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2266          * IPV6_MAXPLEN is also valid and means: "any MSS,
2267          * rely only on pmtu discovery"
2268          */
2269         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2270                 mtu = IPV6_MAXPLEN;
2271         return mtu;
2272 }
2273
2274 static unsigned int ip6_mtu(const struct dst_entry *dst)
2275 {
2276         const struct rt6_info *rt = (const struct rt6_info *)dst;
2277         unsigned int mtu = rt->rt6i_pmtu;
2278         struct inet6_dev *idev;
2279
2280         if (mtu)
2281                 goto out;
2282
2283         mtu = dst_metric_raw(dst, RTAX_MTU);
2284         if (mtu)
2285                 goto out;
2286
2287         mtu = IPV6_MIN_MTU;
2288
2289         rcu_read_lock();
2290         idev = __in6_dev_get(dst->dev);
2291         if (idev)
2292                 mtu = idev->cnf.mtu6;
2293         rcu_read_unlock();
2294
2295 out:
2296         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2297
2298         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2299 }
2300
/* Allocate a standalone (uncached) dst for an outgoing ICMPv6 packet.
 * The route is never inserted into the fib; it is placed on the
 * uncached list so device teardown can release its net_device
 * reference.  Returns the (possibly xfrm-transformed) dst or an
 * ERR_PTR().
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* idev reference was for the route; drop it on failure. */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* takes over the idev reference */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2338
/* dst_ops->gc callback.  Skipped entirely when the last run was less
 * than ip6_rt_gc_min_interval ago and the entry count is within
 * ip6_rt_max_size.  Returns non-zero when the table is still over its
 * size limit after the run.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* ip6_rt_gc_expire is bumped per run (and decayed below) and is
	 * handed to fib6_run_gc() as the gc timeout; it is reset toward
	 * rt_gc_timeout/2 once the table drops under gc_thresh.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponential decay of the gc horizon, scaled by elasticity. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2363
2364 static int ip6_convert_metrics(struct mx6_config *mxc,
2365                                const struct fib6_config *cfg)
2366 {
2367         bool ecn_ca = false;
2368         struct nlattr *nla;
2369         int remaining;
2370         u32 *mp;
2371
2372         if (!cfg->fc_mx)
2373                 return 0;
2374
2375         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2376         if (unlikely(!mp))
2377                 return -ENOMEM;
2378
2379         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2380                 int type = nla_type(nla);
2381                 u32 val;
2382
2383                 if (!type)
2384                         continue;
2385                 if (unlikely(type > RTAX_MAX))
2386                         goto err;
2387
2388                 if (type == RTAX_CC_ALGO) {
2389                         char tmp[TCP_CA_NAME_MAX];
2390
2391                         nla_strlcpy(tmp, nla, sizeof(tmp));
2392                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2393                         if (val == TCP_CA_UNSPEC)
2394                                 goto err;
2395                 } else {
2396                         val = nla_get_u32(nla);
2397                 }
2398                 if (type == RTAX_HOPLIMIT && val > 255)
2399                         val = 255;
2400                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2401                         goto err;
2402
2403                 mp[type - 1] = val;
2404                 __set_bit(type - 1, mxc->mx_valid);
2405         }
2406
2407         if (ecn_ca) {
2408                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2409                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2410         }
2411
2412         mxc->mx = mp;
2413         return 0;
2414  err:
2415         kfree(mp);
2416         return -EINVAL;
2417 }
2418
2419 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2420                                             struct fib6_config *cfg,
2421                                             const struct in6_addr *gw_addr)
2422 {
2423         struct flowi6 fl6 = {
2424                 .flowi6_oif = cfg->fc_ifindex,
2425                 .daddr = *gw_addr,
2426                 .saddr = cfg->fc_prefsrc,
2427         };
2428         struct fib6_table *table;
2429         struct rt6_info *rt;
2430         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2431
2432         table = fib6_get_table(net, cfg->fc_table);
2433         if (!table)
2434                 return NULL;
2435
2436         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2437                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2438
2439         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2440
2441         /* if table lookup failed, fall back to full lookup */
2442         if (rt == net->ipv6.ip6_null_entry) {
2443                 ip6_rt_put(rt);
2444                 rt = NULL;
2445         }
2446
2447         return rt;
2448 }
2449
2450 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2451                                               struct netlink_ext_ack *extack)
2452 {
2453         struct net *net = cfg->fc_nlinfo.nl_net;
2454         struct rt6_info *rt = NULL;
2455         struct net_device *dev = NULL;
2456         struct inet6_dev *idev = NULL;
2457         struct fib6_table *table;
2458         int addr_type;
2459         int err = -EINVAL;
2460
2461         /* RTF_PCPU is an internal flag; can not be set by userspace */
2462         if (cfg->fc_flags & RTF_PCPU) {
2463                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2464                 goto out;
2465         }
2466
2467         if (cfg->fc_dst_len > 128) {
2468                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2469                 goto out;
2470         }
2471         if (cfg->fc_src_len > 128) {
2472                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2473                 goto out;
2474         }
2475 #ifndef CONFIG_IPV6_SUBTREES
2476         if (cfg->fc_src_len) {
2477                 NL_SET_ERR_MSG(extack,
2478                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2479                 goto out;
2480         }
2481 #endif
2482         if (cfg->fc_ifindex) {
2483                 err = -ENODEV;
2484                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2485                 if (!dev)
2486                         goto out;
2487                 idev = in6_dev_get(dev);
2488                 if (!idev)
2489                         goto out;
2490         }
2491
2492         if (cfg->fc_metric == 0)
2493                 cfg->fc_metric = IP6_RT_PRIO_USER;
2494
2495         err = -ENOBUFS;
2496         if (cfg->fc_nlinfo.nlh &&
2497             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2498                 table = fib6_get_table(net, cfg->fc_table);
2499                 if (!table) {
2500                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2501                         table = fib6_new_table(net, cfg->fc_table);
2502                 }
2503         } else {
2504                 table = fib6_new_table(net, cfg->fc_table);
2505         }
2506
2507         if (!table)
2508                 goto out;
2509
2510         rt = ip6_dst_alloc(net, NULL,
2511                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2512
2513         if (!rt) {
2514                 err = -ENOMEM;
2515                 goto out;
2516         }
2517
2518         if (cfg->fc_flags & RTF_EXPIRES)
2519                 rt6_set_expires(rt, jiffies +
2520                                 clock_t_to_jiffies(cfg->fc_expires));
2521         else
2522                 rt6_clean_expires(rt);
2523
2524         if (cfg->fc_protocol == RTPROT_UNSPEC)
2525                 cfg->fc_protocol = RTPROT_BOOT;
2526         rt->rt6i_protocol = cfg->fc_protocol;
2527
2528         addr_type = ipv6_addr_type(&cfg->fc_dst);
2529
2530         if (addr_type & IPV6_ADDR_MULTICAST)
2531                 rt->dst.input = ip6_mc_input;
2532         else if (cfg->fc_flags & RTF_LOCAL)
2533                 rt->dst.input = ip6_input;
2534         else
2535                 rt->dst.input = ip6_forward;
2536
2537         rt->dst.output = ip6_output;
2538
2539         if (cfg->fc_encap) {
2540                 struct lwtunnel_state *lwtstate;
2541
2542                 err = lwtunnel_build_state(cfg->fc_encap_type,
2543                                            cfg->fc_encap, AF_INET6, cfg,
2544                                            &lwtstate, extack);
2545                 if (err)
2546                         goto out;
2547                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2548                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2549                         rt->dst.lwtstate->orig_output = rt->dst.output;
2550                         rt->dst.output = lwtunnel_output;
2551                 }
2552                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2553                         rt->dst.lwtstate->orig_input = rt->dst.input;
2554                         rt->dst.input = lwtunnel_input;
2555                 }
2556         }
2557
2558         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2559         rt->rt6i_dst.plen = cfg->fc_dst_len;
2560         if (rt->rt6i_dst.plen == 128)
2561                 rt->dst.flags |= DST_HOST;
2562
2563 #ifdef CONFIG_IPV6_SUBTREES
2564         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2565         rt->rt6i_src.plen = cfg->fc_src_len;
2566 #endif
2567
2568         rt->rt6i_metric = cfg->fc_metric;
2569
2570         /* We cannot add true routes via loopback here,
2571            they would result in kernel looping; promote them to reject routes
2572          */
2573         if ((cfg->fc_flags & RTF_REJECT) ||
2574             (dev && (dev->flags & IFF_LOOPBACK) &&
2575              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2576              !(cfg->fc_flags & RTF_LOCAL))) {
2577                 /* hold loopback dev/idev if we haven't done so. */
2578                 if (dev != net->loopback_dev) {
2579                         if (dev) {
2580                                 dev_put(dev);
2581                                 in6_dev_put(idev);
2582                         }
2583                         dev = net->loopback_dev;
2584                         dev_hold(dev);
2585                         idev = in6_dev_get(dev);
2586                         if (!idev) {
2587                                 err = -ENODEV;
2588                                 goto out;
2589                         }
2590                 }
2591                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2592                 switch (cfg->fc_type) {
2593                 case RTN_BLACKHOLE:
2594                         rt->dst.error = -EINVAL;
2595                         rt->dst.output = dst_discard_out;
2596                         rt->dst.input = dst_discard;
2597                         break;
2598                 case RTN_PROHIBIT:
2599                         rt->dst.error = -EACCES;
2600                         rt->dst.output = ip6_pkt_prohibit_out;
2601                         rt->dst.input = ip6_pkt_prohibit;
2602                         break;
2603                 case RTN_THROW:
2604                 case RTN_UNREACHABLE:
2605                 default:
2606                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2607                                         : (cfg->fc_type == RTN_UNREACHABLE)
2608                                         ? -EHOSTUNREACH : -ENETUNREACH;
2609                         rt->dst.output = ip6_pkt_discard_out;
2610                         rt->dst.input = ip6_pkt_discard;
2611                         break;
2612                 }
2613                 goto install_route;
2614         }
2615
2616         if (cfg->fc_flags & RTF_GATEWAY) {
2617                 const struct in6_addr *gw_addr;
2618                 int gwa_type;
2619
2620                 gw_addr = &cfg->fc_gateway;
2621                 gwa_type = ipv6_addr_type(gw_addr);
2622
2623                 /* if gw_addr is local we will fail to detect this in case
2624                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2625                  * will return already-added prefix route via interface that
2626                  * prefix route was assigned to, which might be non-loopback.
2627                  */
2628                 err = -EINVAL;
2629                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2630                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2631                                             dev : NULL, 0, 0)) {
2632                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2633                         goto out;
2634                 }
2635                 rt->rt6i_gateway = *gw_addr;
2636
2637                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2638                         struct rt6_info *grt = NULL;
2639
2640                         /* IPv6 strictly inhibits using not link-local
2641                            addresses as nexthop address.
2642                            Otherwise, router will not able to send redirects.
2643                            It is very good, but in some (rare!) circumstances
2644                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2645                            some exceptions. --ANK
2646                            We allow IPv4-mapped nexthops to support RFC4798-type
2647                            addressing
2648                          */
2649                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2650                                           IPV6_ADDR_MAPPED))) {
2651                                 NL_SET_ERR_MSG(extack,
2652                                                "Invalid gateway address");
2653                                 goto out;
2654                         }
2655
2656                         if (cfg->fc_table) {
2657                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2658
2659                                 if (grt) {
2660                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2661                                             (dev && dev != grt->dst.dev)) {
2662                                                 ip6_rt_put(grt);
2663                                                 grt = NULL;
2664                                         }
2665                                 }
2666                         }
2667
2668                         if (!grt)
2669                                 grt = rt6_lookup(net, gw_addr, NULL,
2670                                                  cfg->fc_ifindex, 1);
2671
2672                         err = -EHOSTUNREACH;
2673                         if (!grt)
2674                                 goto out;
2675                         if (dev) {
2676                                 if (dev != grt->dst.dev) {
2677                                         ip6_rt_put(grt);
2678                                         goto out;
2679                                 }
2680                         } else {
2681                                 dev = grt->dst.dev;
2682                                 idev = grt->rt6i_idev;
2683                                 dev_hold(dev);
2684                                 in6_dev_hold(grt->rt6i_idev);
2685                         }
2686                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2687                                 err = 0;
2688                         ip6_rt_put(grt);
2689
2690                         if (err)
2691                                 goto out;
2692                 }
2693                 err = -EINVAL;
2694                 if (!dev) {
2695                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2696                         goto out;
2697                 } else if (dev->flags & IFF_LOOPBACK) {
2698                         NL_SET_ERR_MSG(extack,
2699                                        "Egress device can not be loopback device for this route");
2700                         goto out;
2701                 }
2702         }
2703
2704         err = -ENODEV;
2705         if (!dev)
2706                 goto out;
2707
2708         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2709                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2710                         NL_SET_ERR_MSG(extack, "Invalid source address");
2711                         err = -EINVAL;
2712                         goto out;
2713                 }
2714                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2715                 rt->rt6i_prefsrc.plen = 128;
2716         } else
2717                 rt->rt6i_prefsrc.plen = 0;
2718
2719         rt->rt6i_flags = cfg->fc_flags;
2720
2721 install_route:
2722         rt->dst.dev = dev;
2723         rt->rt6i_idev = idev;
2724         rt->rt6i_table = table;
2725
2726         cfg->fc_nlinfo.nl_net = dev_net(dev);
2727
2728         return rt;
2729 out:
2730         if (dev)
2731                 dev_put(dev);
2732         if (idev)
2733                 in6_dev_put(idev);
2734         if (rt)
2735                 dst_release_immediate(&rt->dst);
2736
2737         return ERR_PTR(err);
2738 }
2739
2740 int ip6_route_add(struct fib6_config *cfg,
2741                   struct netlink_ext_ack *extack)
2742 {
2743         struct mx6_config mxc = { .mx = NULL, };
2744         struct rt6_info *rt;
2745         int err;
2746
2747         rt = ip6_route_info_create(cfg, extack);
2748         if (IS_ERR(rt)) {
2749                 err = PTR_ERR(rt);
2750                 rt = NULL;
2751                 goto out;
2752         }
2753
2754         err = ip6_convert_metrics(&mxc, cfg);
2755         if (err)
2756                 goto out;
2757
2758         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2759
2760         kfree(mxc.mx);
2761
2762         return err;
2763 out:
2764         if (rt)
2765                 dst_release_immediate(&rt->dst);
2766
2767         return err;
2768 }
2769
2770 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2771 {
2772         int err;
2773         struct fib6_table *table;
2774         struct net *net = dev_net(rt->dst.dev);
2775
2776         if (rt == net->ipv6.ip6_null_entry) {
2777                 err = -ENOENT;
2778                 goto out;
2779         }
2780
2781         table = rt->rt6i_table;
2782         spin_lock_bh(&table->tb6_lock);
2783         err = fib6_del(rt, info);
2784         spin_unlock_bh(&table->tb6_lock);
2785
2786 out:
2787         ip6_rt_put(rt);
2788         return err;
2789 }
2790
2791 int ip6_del_rt(struct rt6_info *rt)
2792 {
2793         struct nl_info info = {
2794                 .nl_net = dev_net(rt->dst.dev),
2795         };
2796         return __ip6_del_rt(rt, &info);
2797 }
2798
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its ECMP
 * siblings as well.  Tries to emit a single RTM_DELROUTE notification
 * that covers every hop; if that message can be built, the per-route
 * notifications from fib6_del() are suppressed via info->skip_notify.
 *
 * Consumes the caller's reference on @rt on all paths.  Returns 0 on
 * success or a negative errno (-ENOENT for the sentinel null entry or
 * from fib6_del()).
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* fall back to per-route notifications if the
			 * combined message cannot be filled in
			 */
			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* delete every sibling hop; stop on the first failure
		 * (already-deleted siblings stay deleted)
		 */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* send the combined notification only after the table lock is
	 * dropped
	 */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2850
/* Delete the route matching @cfg (destination/source prefix, and
 * optionally device, gateway, metric and protocol).  With RTF_CACHE
 * set, the match is performed against cached exception routes instead.
 *
 * Returns 0 on success, -ESRCH when no matching route exists, or an
 * error from the underlying delete helpers.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* for RTF_CACHE, locate the exact fib6 node (no prefix match)
	 * so the cached exceptions hanging off it can be searched
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		/* iterator macro walks the node's routes, binding rt */
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* route is being freed concurrently; give up */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
2907
/* Process a received NDISC Redirect (RFC 4861 section 8) for the route
 * @dst that carried the triggering packet.
 *
 * Validates the message (length, non-multicast destination, link-local
 * unicast target unless the redirect is on-link, ND options), updates
 * the neighbour cache for the new first hop, and installs a cached
 * RTF_DYNAMIC exception route pointing at it.  Invalid or unwanted
 * redirects are silently dropped (with a ratelimited debug message).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* optlen = bytes of ND options after the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == destination means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast router
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers, and interfaces configured to ignore redirects, drop */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* optional target link-layer address option */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() takes
	 * care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* let interested parties (e.g. offload drivers) know the path
	 * changed
	 */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3025
3026 /*
3027  *      Misc support functions
3028  */
3029
/* Link @rt to its parent route @from: clear per-route expiry, take a
 * reference on @from and share its metrics.
 *
 * Ordering matters: the reference is taken before dst.from is
 * published.  @from must not itself be chained to another route.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3039
/* Initialize @rt as a copy of @ort: duplicate its handlers, addresses,
 * flags and table membership, taking references on the shared idev and
 * lwtunnel state, and chain it to @ort via rt6_set_from() so metrics
 * are shared.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	/* takes a reference on ort and shares its metrics */
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3061
3062 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route previously installed from an RA Route Information
 * option: same prefix, gateway and device, flagged RTF_ROUTEINFO and
 * RTF_GATEWAY.
 *
 * Returns the route with a reference held, or NULL if not found.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* iterator macro walks the node's routes, binding rt */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3097
3098 static struct rt6_info *rt6_add_route_info(struct net *net,
3099                                            const struct in6_addr *prefix, int prefixlen,
3100                                            const struct in6_addr *gwaddr,
3101                                            struct net_device *dev,
3102                                            unsigned int pref)
3103 {
3104         struct fib6_config cfg = {
3105                 .fc_metric      = IP6_RT_PRIO_USER,
3106                 .fc_ifindex     = dev->ifindex,
3107                 .fc_dst_len     = prefixlen,
3108                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3109                                   RTF_UP | RTF_PREF(pref),
3110                 .fc_protocol = RTPROT_RA,
3111                 .fc_nlinfo.portid = 0,
3112                 .fc_nlinfo.nlh = NULL,
3113                 .fc_nlinfo.nl_net = net,
3114         };
3115
3116         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3117         cfg.fc_dst = *prefix;
3118         cfg.fc_gateway = *gwaddr;
3119
3120         /* We should treat it as a default route if prefix length is 0. */
3121         if (!prefixlen)
3122                 cfg.fc_flags |= RTF_DEFAULT;
3123
3124         ip6_route_add(&cfg, NULL);
3125
3126         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3127 }
3128 #endif
3129
/* Find the autoconfigured (RA-learned) default route via gateway @addr
 * on @dev.  Returns the route with a reference held, or NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* iterator macro walks the root node's routes, binding rt;
	 * rt is NULL when the walk completes without a break
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3152
3153 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3154                                      struct net_device *dev,
3155                                      unsigned int pref)
3156 {
3157         struct fib6_config cfg = {
3158                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3159                 .fc_metric      = IP6_RT_PRIO_USER,
3160                 .fc_ifindex     = dev->ifindex,
3161                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3162                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3163                 .fc_protocol = RTPROT_RA,
3164                 .fc_nlinfo.portid = 0,
3165                 .fc_nlinfo.nlh = NULL,
3166                 .fc_nlinfo.nl_net = dev_net(dev),
3167         };
3168
3169         cfg.fc_gateway = *gwaddr;
3170
3171         if (!ip6_route_add(&cfg, NULL)) {
3172                 struct fib6_table *table;
3173
3174                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3175                 if (table)
3176                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3177         }
3178
3179         return rt6_get_dflt_router(gwaddr, dev);
3180 }
3181
/* Remove every RA-learned default route from @table, except on
 * interfaces with accept_ra == 2 (which keep RA routes even while
 * forwarding).
 *
 * Deletion cannot happen under rcu_read_lock(), so each hit drops the
 * lock, deletes, and restarts the walk from the top; the walk also
 * restarts when the hold fails (route already being freed).
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	/* iterator macro walks the root node's routes, binding rt */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				/* consumes the reference just taken */
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3204
3205 void rt6_purge_dflt_routers(struct net *net)
3206 {
3207         struct fib6_table *table;
3208         struct hlist_head *head;
3209         unsigned int h;
3210
3211         rcu_read_lock();
3212
3213         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3214                 head = &net->ipv6.fib_table_hash[h];
3215                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3216                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3217                                 __rt6_purge_dflt_routers(table);
3218                 }
3219         }
3220
3221         rcu_read_unlock();
3222 }
3223
3224 static void rtmsg_to_fib6_config(struct net *net,
3225                                  struct in6_rtmsg *rtmsg,
3226                                  struct fib6_config *cfg)
3227 {
3228         memset(cfg, 0, sizeof(*cfg));
3229
3230         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3231                          : RT6_TABLE_MAIN;
3232         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3233         cfg->fc_metric = rtmsg->rtmsg_metric;
3234         cfg->fc_expires = rtmsg->rtmsg_info;
3235         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3236         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3237         cfg->fc_flags = rtmsg->rtmsg_flags;
3238
3239         cfg->fc_nlinfo.nl_net = net;
3240
3241         cfg->fc_dst = rtmsg->rtmsg_dst;
3242         cfg->fc_src = rtmsg->rtmsg_src;
3243         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3244 }
3245
3246 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3247 {
3248         struct fib6_config cfg;
3249         struct in6_rtmsg rtmsg;
3250         int err;
3251
3252         switch (cmd) {
3253         case SIOCADDRT:         /* Add a route */
3254         case SIOCDELRT:         /* Delete a route */
3255                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3256                         return -EPERM;
3257                 err = copy_from_user(&rtmsg, arg,
3258                                      sizeof(struct in6_rtmsg));
3259                 if (err)
3260                         return -EFAULT;
3261
3262                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3263
3264                 rtnl_lock();
3265                 switch (cmd) {
3266                 case SIOCADDRT:
3267                         err = ip6_route_add(&cfg, NULL);
3268                         break;
3269                 case SIOCDELRT:
3270                         err = ip6_route_del(&cfg, NULL);
3271                         break;
3272                 default:
3273                         err = -EINVAL;
3274                 }
3275                 rtnl_unlock();
3276
3277                 return err;
3278         }
3279
3280         return -EINVAL;
3281 }
3282
3283 /*
3284  *      Drop the packet on the floor
3285  */
3286
/* Common drop path for reject-type routes: bump the appropriate SNMP
 * counter, send an ICMPv6 destination-unreachable with @code, and free
 * the packet.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		/* unspecified destinations count as address errors,
		 * not routing failures
		 */
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3309
/* dst.input handler for discard routes: drop as "no route" (input). */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3314
/* dst.output handler for discard routes: drop as "no route" (output). */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3320
/* dst.input handler for prohibit routes: drop as "administratively
 * prohibited" (input).
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3325
/* dst.output handler for prohibit routes: drop as "administratively
 * prohibited" (output).
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3331
3332 /*
3333  *      Allocate a dst for local (unicast / anycast) address.
3334  */
3335
/* Allocate a host route for a local unicast or anycast address
 * (RTF_LOCAL or RTF_ANYCAST).  The route delivers locally via
 * ip6_input and is placed in the local table (or the l3mdev table if
 * the device is enslaved).
 *
 * Takes a reference on @idev.  Returns the route or ERR_PTR(-ENOMEM).
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* host route: gateway mirrors the address, plen is 128 */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3371
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict walk to this device; NULL = all */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* preferred-source address being removed */
};
3378
3379 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3380 {
3381         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3382         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3383         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3384
3385         if (((void *)rt->dst.dev == dev || !dev) &&
3386             rt != net->ipv6.ip6_null_entry &&
3387             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3388                 spin_lock_bh(&rt6_exception_lock);
3389                 /* remove prefsrc entry */
3390                 rt->rt6i_prefsrc.plen = 0;
3391                 /* need to update cache as well */
3392                 rt6_exceptions_remove_prefsrc(rt);
3393                 spin_unlock_bh(&rt6_exception_lock);
3394         }
3395         return 0;
3396 }
3397
3398 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3399 {
3400         struct net *net = dev_net(ifp->idev->dev);
3401         struct arg_dev_net_ip adni = {
3402                 .dev = ifp->idev->dev,
3403                 .net = net,
3404                 .addr = &ifp->addr,
3405         };
3406         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3407 }
3408
3409 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3410
/* Remove routers and update dst entries when a gateway turns into a host.
 * fib6_clean_all() callback: returning -1 tells the walker to delete the
 * route; RA-learned default routes via the demoted gateway are removed.
 */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3429
/* Drop RA-learned router routes via @gateway across all FIB tables; called
 * when NDISC learns the gateway is actually a host.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3434
/* argument bundle for the fib6_ifdown() walk */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches all */
	struct net *net;	/* namespace being cleaned */
};
3439
3440 /* called with write lock held for table with rt */
3441 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3442 {
3443         const struct arg_dev_net *adn = arg;
3444         const struct net_device *dev = adn->dev;
3445
3446         if ((rt->dst.dev == dev || !dev) &&
3447             rt != adn->net->ipv6.ip6_null_entry &&
3448             (rt->rt6i_nsiblings == 0 ||
3449              (dev && netdev_unregistering(dev)) ||
3450              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3451                 return -1;
3452
3453         return 0;
3454 }
3455
3456 void rt6_ifdown(struct net *net, struct net_device *dev)
3457 {
3458         struct arg_dev_net adn = {
3459                 .dev = dev,
3460                 .net = net,
3461         };
3462
3463         fib6_clean_all(net, fib6_ifdown, &adn);
3464         if (dev)
3465                 rt6_uncached_list_flush_dev(net, dev);
3466 }
3467
/* argument bundle for the rt6_mtu_change_route() walk */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3472
/* fib6_clean_all() callback: propagate a device MTU change into the routes
 * (and their cached exception entries) that sit on that device.  Always
 * returns 0 — nothing is deleted by this walk.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		/* rt6_exception_lock serialises metric/exception updates */
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
		/* cached (exception) routes carry their own PMTU copies */
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3516
3517 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3518 {
3519         struct rt6_mtu_change_arg arg = {
3520                 .dev = dev,
3521                 .mtu = mtu,
3522         };
3523
3524         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3525 }
3526
3527 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3528         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3529         [RTA_OIF]               = { .type = NLA_U32 },
3530         [RTA_IIF]               = { .type = NLA_U32 },
3531         [RTA_PRIORITY]          = { .type = NLA_U32 },
3532         [RTA_METRICS]           = { .type = NLA_NESTED },
3533         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3534         [RTA_PREF]              = { .type = NLA_U8 },
3535         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3536         [RTA_ENCAP]             = { .type = NLA_NESTED },
3537         [RTA_EXPIRES]           = { .type = NLA_U32 },
3538         [RTA_UID]               = { .type = NLA_U32 },
3539         [RTA_MARK]              = { .type = NLA_U32 },
3540 };
3541
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * fib6_config.  Fills *cfg from the rtmsg header and the RTA_* attributes;
 * returns 0 on success or a negative errno (-EINVAL for malformed
 * dst/src attribute lengths, or whatever the parse helpers report).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* all reject-style route types share RTF_REJECT; the specific
	 * type is kept in fc_type to pick the dst error code later
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	/* remember the requester so replies/notifications can be routed */
	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* userspace may send only the prefix-length worth of bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE, when present, overrides rtm_table (which is only u8) */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown router preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* an "infinite" timeout means no RTF_EXPIRES at all */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3665
/* per-nexthop bookkeeping while building/installing a multipath route */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route for this nexthop; NULLed once inserted */
	struct fib6_config r_cfg;	/* config used to create it (for rollback delete) */
	struct mx6_config mxc;		/* converted metrics; mxc.mx is kfree'd on cleanup */
	struct list_head next;		/* link on the caller's rt6_nh_list */
};
3672
/* Log every nexthop of a multipath replace that failed part-way, so the
 * admin can audit which routes are actually installed.
 * NOTE(review): pr_fmt() already prefixes "IPv6: ", so this message prints
 * as "IPv6: IPV6: ..." — the literal "IPV6: " looks redundant; confirm and
 * drop it in a separate change.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}
3683
3684 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3685                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3686 {
3687         struct rt6_nh *nh;
3688         int err = -EEXIST;
3689
3690         list_for_each_entry(nh, rt6_nh_list, next) {
3691                 /* check if rt6_info already exists */
3692                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3693                         return err;
3694         }
3695
3696         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3697         if (!nh)
3698                 return -ENOMEM;
3699         nh->rt6_info = rt;
3700         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3701         if (err) {
3702                 kfree(nh);
3703                 return err;
3704         }
3705         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3706         list_add_tail(&nh->next, rt6_nh_list);
3707
3708         return 0;
3709 }
3710
3711 static void ip6_route_mpath_notify(struct rt6_info *rt,
3712                                    struct rt6_info *rt_last,
3713                                    struct nl_info *info,
3714                                    __u16 nlflags)
3715 {
3716         /* if this is an APPEND route, then rt points to the first route
3717          * inserted and rt_last points to last route inserted. Userspace
3718          * wants a consistent dump of the route which starts at the first
3719          * nexthop. Since sibling routes are always added at the end of
3720          * the list, find the first sibling of the last route appended
3721          */
3722         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3723                 rt = list_first_entry(&rt_last->rt6i_siblings,
3724                                       struct rt6_info,
3725                                       rt6i_siblings);
3726         }
3727
3728         if (rt)
3729                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3730 }
3731
/* Install a multipath (RTA_MULTIPATH) route.  Two phases:
 * 1) parse each rtnexthop into its own rt6_info, queued on rt6_nh_list;
 * 2) insert them one by one with per-route notifications suppressed, then
 *    send a single RTM_NEWROUTE covering the whole route.
 * On partial failure, routes that did get inserted are deleted again and
 * a notification for them is still sent so the subsequent delete
 * notifications stay coherent for userspace.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the shared config, then apply per-nexthop
		 * overrides (ifindex, gateway, encap)
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* list did not take ownership; drop our route */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3862
/* Delete each nexthop listed in an RTA_MULTIPATH attribute.  Deletion is
 * best-effort: every entry is attempted, and the last error (if any) is
 * returned; 0 means all deletes succeeded.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config = shared config + ifindex/gateway */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3900
3901 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3902                               struct netlink_ext_ack *extack)
3903 {
3904         struct fib6_config cfg;
3905         int err;
3906
3907         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3908         if (err < 0)
3909                 return err;
3910
3911         if (cfg.fc_mp)
3912                 return ip6_route_multipath_del(&cfg, extack);
3913         else {
3914                 cfg.fc_delete_all_nh = 1;
3915                 return ip6_route_del(&cfg, extack);
3916         }
3917 }
3918
3919 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3920                               struct netlink_ext_ack *extack)
3921 {
3922         struct fib6_config cfg;
3923         int err;
3924
3925         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3926         if (err < 0)
3927                 return err;
3928
3929         if (cfg.fc_mp)
3930                 return ip6_route_multipath_add(&cfg, extack);
3931         else
3932                 return ip6_route_add(&cfg, extack);
3933 }
3934
/* Worst-case netlink message size for dumping @rt, used to size the skb
 * for notifications.  For a multipath route, each sibling contributes one
 * rtnexthop entry (gateway + optional encap) inside RTA_MULTIPATH.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
3964
/* Emit the nexthop portion of a route dump: RTNH_F_* status into *flags,
 * plus RTA_GATEWAY, RTA_OIF (unless @skip_oif — the multipath encoding
 * carries the ifindex in its rtnexthop header instead) and any lwtunnel
 * encap.  Returns 0 or -EMSGSIZE when the skb runs out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	/* a down or carrier-less device marks the nexthop LINKDOWN, and
	 * DEAD as well when such routes are configured to be ignored
	 */
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3996
/* add multipath next hop */
/* Reserve a struct rtnexthop in the skb, fill it for @rt (hops, ifindex,
 * flags), append the nested attributes via rt6_nexthop_info(), then patch
 * rtnh_len to cover header plus attributes.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: the ifindex above replaces RTA_OIF */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4023
/* Serialise one rt6_info into an rtnetlink message on @skb.
 * @dst/@src:  when non-NULL (RTM_GETROUTE replies) they override the
 *             route's own prefix and force a /128 length;
 * @iif:       input interface for replies to input-path queries (0 = none).
 * Returns 0, or -EMSGSIZE with the partial message cancelled.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map the dst error of a reject route back to the RTN_* type the
	 * user configured (see rtm_to_fib6_config / ip6_route_info_create)
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are answered by the mroute code */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* a PMTU learned for this cached route overrides the MTU metric */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4177
4178 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4179 {
4180         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4181         struct net *net = arg->net;
4182
4183         if (rt == net->ipv6.ip6_null_entry)
4184                 return 0;
4185
4186         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4187                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4188
4189                 /* user wants prefix routes only */
4190                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4191                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4192                         /* success since this is not a prefix route */
4193                         return 1;
4194                 }
4195         }
4196
4197         return rt6_fill_node(net,
4198                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4199                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4200                      NLM_F_MULTI);
4201 }
4202
/* RTM_GETROUTE doit handler: resolve a route for the flow described by
 * the netlink attributes and unicast the answer back to the requester.
 *
 * When RTM_F_FIB_MATCH is set, the matching FIB entry is looked up
 * directly instead of the dst the stack would actually use.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* Undersized address attributes are rejected with -EINVAL. */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	/* Without an explicit RTA_UID, output lookups use the caller's
	 * uid; input-side lookups use INVALID_UID.
	 */
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* Input path: simulate reception on the given interface.
		 * RCU protects the device pointer across the lookup.
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		/* Output path lookup. */
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* The null entry is a sentinel, not a real route; report its
	 * stored error instead of dumping it.
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Ownership of the route reference moves to the skb here, so the
	 * kfree_skb() on the error path below also releases the route.
	 */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4328
4329 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4330                      unsigned int nlm_flags)
4331 {
4332         struct sk_buff *skb;
4333         struct net *net = info->nl_net;
4334         u32 seq;
4335         int err;
4336
4337         err = -ENOBUFS;
4338         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4339
4340         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4341         if (!skb)
4342                 goto errout;
4343
4344         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4345                                 event, info->portid, seq, nlm_flags);
4346         if (err < 0) {
4347                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4348                 WARN_ON(err == -EMSGSIZE);
4349                 kfree_skb(skb);
4350                 goto errout;
4351         }
4352         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4353                     info->nlh, gfp_any());
4354         return;
4355 errout:
4356         if (err < 0)
4357                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4358 }
4359
4360 static int ip6_route_dev_notify(struct notifier_block *this,
4361                                 unsigned long event, void *ptr)
4362 {
4363         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4364         struct net *net = dev_net(dev);
4365
4366         if (!(dev->flags & IFF_LOOPBACK))
4367                 return NOTIFY_OK;
4368
4369         if (event == NETDEV_REGISTER) {
4370                 net->ipv6.ip6_null_entry->dst.dev = dev;
4371                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4372 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4373                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4374                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4375                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4376                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4377 #endif
4378          } else if (event == NETDEV_UNREGISTER &&
4379                     dev->reg_state != NETREG_UNREGISTERED) {
4380                 /* NETDEV_UNREGISTER could be fired for multiple times by
4381                  * netdev_wait_allrefs(). Make sure we only call this once.
4382                  */
4383                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4384 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4385                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4386                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4387 #endif
4388         }
4389
4390         return NOTIFY_OK;
4391 }
4392
4393 /*
4394  *      /proc
4395  */
4396
4397 #ifdef CONFIG_PROC_FS
4398
/* File operations for /proc/net/ipv6_route (seq_file opened by
 * ipv6_route_open, defined elsewhere in this file).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4406
4407 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4408 {
4409         struct net *net = (struct net *)seq->private;
4410         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4411                    net->ipv6.rt6_stats->fib_nodes,
4412                    net->ipv6.rt6_stats->fib_route_nodes,
4413                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4414                    net->ipv6.rt6_stats->fib_rt_entries,
4415                    net->ipv6.rt6_stats->fib_rt_cache,
4416                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4417                    net->ipv6.rt6_stats->fib_discarded_routes);
4418
4419         return 0;
4420 }
4421
/* Open handler for /proc/net/rt6_stats: single-shot seq_file bound to
 * the owning network namespace.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4426
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4434 #endif  /* CONFIG_PROC_FS */
4435
4436 #ifdef CONFIG_SYSCTL
4437
4438 static
4439 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4440                               void __user *buffer, size_t *lenp, loff_t *ppos)
4441 {
4442         struct net *net;
4443         int delay;
4444         if (!write)
4445                 return -EINVAL;
4446
4447         net = (struct net *)ctl->extra1;
4448         delay = net->ipv6.sysctl.flush_delay;
4449         proc_dointvec(ctl, write, buffer, lenp, ppos);
4450         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4451         return 0;
4452 }
4453
/* Template for the per-namespace net.ipv6.route.* sysctls. The ->data
 * pointers reference init_net here and are rewritten for each namespace
 * -- by array index -- in ipv6_route_sysctl_init(), so the entry order
 * below must stay in sync with that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Millisecond view of the same gc_min_interval value. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4527
/* Clone the sysctl template for a new namespace and point each entry's
 * ->data at the namespace-private variable. The indices below must
 * match the entry order of ipv6_route_table_template[].
 *
 * Returns the freshly allocated table (freed by the caller when the
 * namespace goes away) or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler needs the netns */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users. NULLing the
		 * first procname terminates the table at entry 0, hiding
		 * the whole set from non-init user namespaces.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
4556 #endif
4557
/* Per-netns setup of the IPv6 routing core: copies the dst_ops and the
 * special route templates into the new namespace and seeds the routing
 * sysctl defaults.
 *
 * Returns 0 on success or -ENOMEM, unwinding any partial allocations.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Each namespace gets private copies of the template routes so
	 * their dst/idev pointers can be rewired independently.
	 */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the net.ipv6.route.* sysctls; see
	 * ipv6_route_table_template above.
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: free in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4630
4631 static void __net_exit ip6_route_net_exit(struct net *net)
4632 {
4633         kfree(net->ipv6.ip6_null_entry);
4634 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4635         kfree(net->ipv6.ip6_prohibit_entry);
4636         kfree(net->ipv6.ip6_blk_hole_entry);
4637 #endif
4638         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4639 }
4640
4641 static int __net_init ip6_route_net_init_late(struct net *net)
4642 {
4643 #ifdef CONFIG_PROC_FS
4644         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4645         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4646 #endif
4647         return 0;
4648 }
4649
4650 static void __net_exit ip6_route_net_exit_late(struct net *net)
4651 {
4652 #ifdef CONFIG_PROC_FS
4653         remove_proc_entry("ipv6_route", net->proc_net);
4654         remove_proc_entry("rt6_stats", net->proc_net);
4655 #endif
4656 }
4657
/* Core per-netns hooks: special routes, dst ops, sysctl defaults. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4662
4663 static int __net_init ipv6_inetpeer_init(struct net *net)
4664 {
4665         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4666
4667         if (!bp)
4668                 return -ENOMEM;
4669         inet_peer_base_init(bp);
4670         net->ipv6.peers = bp;
4671         return 0;
4672 }
4673
4674 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4675 {
4676         struct inet_peer_base *bp = net->ipv6.peers;
4677
4678         net->ipv6.peers = NULL;
4679         inetpeer_invalidate_tree(bp);
4680         kfree(bp);
4681 }
4682
/* Per-netns hooks for the IPv6 inetpeer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4687
/* Late per-netns hooks: /proc/net entries only. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4692
/* Priority is kept below ADDRCONF_NOTIFY_PRIORITY so that addrconf
 * sees device events before this handler does.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4697
/* Point init_net's special routes at its loopback device and take the
 * idev references that ip6_route_dev_notify() would normally take on
 * NETDEV_REGISTER.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4712
/* One-time initialisation of the IPv6 routing layer: dst kmem cache,
 * pernet subsystems, FIB, xfrm, policy rules, rtnetlink doit handlers,
 * the netdevice notifier and the per-cpu uncached-route lists.
 *
 * Failures unwind in reverse order through the labels at the bottom.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same cache as regular rt6_infos. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Initialise the per-cpu uncached-route lists. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, reverse order of the registrations above. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4794
/* Counterpart of ip6_route_init(): tear everything down in strict
 * reverse order of registration.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}