Merge tag 'spdx-5.2-rc3-1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh...
[linux-2.6-block.git] / net / ipv6 / route.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      Linux INET6 implementation
4  *      FIB front-end.
5  *
6  *      Authors:
7  *      Pedro Roque             <roque@di.fc.ul.pt>
8  */
9
10 /*      Changes:
11  *
12  *      YOSHIFUJI Hideaki @USAGI
13  *              reworked default router selection.
14  *              - respect outgoing interface
15  *              - select from (probably) reachable routers (i.e.
16  *              routers in REACHABLE, STALE, DELAY or PROBE states).
17  *              - always select the same router if it is (probably)
18  *              reachable.  otherwise, round-robin the list.
19  *      Ville Nuorvala
20  *              Fixed routing subtrees.
21  */
22
23 #define pr_fmt(fmt) "IPv6: " fmt
24
25 #include <linux/capability.h>
26 #include <linux/errno.h>
27 #include <linux/export.h>
28 #include <linux/types.h>
29 #include <linux/times.h>
30 #include <linux/socket.h>
31 #include <linux/sockios.h>
32 #include <linux/net.h>
33 #include <linux/route.h>
34 #include <linux/netdevice.h>
35 #include <linux/in6.h>
36 #include <linux/mroute6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39 #include <linux/proc_fs.h>
40 #include <linux/seq_file.h>
41 #include <linux/nsproxy.h>
42 #include <linux/slab.h>
43 #include <linux/jhash.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/dst_metadata.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58 #include <net/rtnh.h>
59 #include <net/lwtunnel.h>
60 #include <net/ip_tunnels.h>
61 #include <net/l3mdev.h>
62 #include <net/ip.h>
63 #include <linux/uaccess.h>
64
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68
69 static int ip6_rt_type_to_error(u8 fib6_type);
70
71 #define CREATE_TRACE_POINTS
72 #include <trace/events/fib6.h>
73 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
74 #undef CREATE_TRACE_POINTS
75
/*
 * Result of evaluating a nexthop's neighbour state. The negative values
 * travel through the same int return channel as the non-negative route
 * scores produced by rt6_score_route().
 */
enum rt6_nud_state {
	/* find_match() rejects the nexthop outright */
	RT6_NUD_FAIL_HARD	= -3,
	/* neighbour is NUD_FAILED (only produced with CONFIG_IPV6_ROUTER_PREF) */
	RT6_NUD_FAIL_PROBE	= -2,
	/* find_match() turns this into the lowest score and asks for round-robin */
	RT6_NUD_FAIL_DO_RR	= -1,
	RT6_NUD_SUCCEED		= 1,
};
82
83 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
84 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
85 static unsigned int      ip6_mtu(const struct dst_entry *dst);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void             ip6_dst_destroy(struct dst_entry *);
88 static void             ip6_dst_ifdown(struct dst_entry *,
89                                        struct net_device *dev, int how);
90 static int               ip6_dst_gc(struct dst_ops *ops);
91
92 static int              ip6_pkt_discard(struct sk_buff *skb);
93 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static int              ip6_pkt_prohibit(struct sk_buff *skb);
95 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
96 static void             ip6_link_failure(struct sk_buff *skb);
97 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
98                                            struct sk_buff *skb, u32 mtu);
99 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100                                         struct sk_buff *skb);
101 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
102                            int strict);
103 static size_t rt6_nlmsg_size(struct fib6_info *rt);
104 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
105                          struct fib6_info *rt, struct dst_entry *dst,
106                          struct in6_addr *dest, struct in6_addr *src,
107                          int iif, int type, u32 portid, u32 seq,
108                          unsigned int flags);
109 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
110                                            const struct in6_addr *daddr,
111                                            const struct in6_addr *saddr);
112
113 #ifdef CONFIG_IPV6_ROUTE_INFO
114 static struct fib6_info *rt6_add_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev,
118                                            unsigned int pref);
119 static struct fib6_info *rt6_get_route_info(struct net *net,
120                                            const struct in6_addr *prefix, int prefixlen,
121                                            const struct in6_addr *gwaddr,
122                                            struct net_device *dev);
123 #endif
124
125 struct uncached_list {
126         spinlock_t              lock;
127         struct list_head        head;
128 };
129
130 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
131
132 void rt6_uncached_list_add(struct rt6_info *rt)
133 {
134         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135
136         rt->rt6i_uncached_list = ul;
137
138         spin_lock_bh(&ul->lock);
139         list_add_tail(&rt->rt6i_uncached, &ul->head);
140         spin_unlock_bh(&ul->lock);
141 }
142
143 void rt6_uncached_list_del(struct rt6_info *rt)
144 {
145         if (!list_empty(&rt->rt6i_uncached)) {
146                 struct uncached_list *ul = rt->rt6i_uncached_list;
147                 struct net *net = dev_net(rt->dst.dev);
148
149                 spin_lock_bh(&ul->lock);
150                 list_del(&rt->rt6i_uncached);
151                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
152                 spin_unlock_bh(&ul->lock);
153         }
154 }
155
156 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 {
158         struct net_device *loopback_dev = net->loopback_dev;
159         int cpu;
160
161         if (dev == loopback_dev)
162                 return;
163
164         for_each_possible_cpu(cpu) {
165                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
166                 struct rt6_info *rt;
167
168                 spin_lock_bh(&ul->lock);
169                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
170                         struct inet6_dev *rt_idev = rt->rt6i_idev;
171                         struct net_device *rt_dev = rt->dst.dev;
172
173                         if (rt_idev->dev == dev) {
174                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
175                                 in6_dev_put(rt_idev);
176                         }
177
178                         if (rt_dev == dev) {
179                                 rt->dst.dev = loopback_dev;
180                                 dev_hold(rt->dst.dev);
181                                 dev_put(rt_dev);
182                         }
183                 }
184                 spin_unlock_bh(&ul->lock);
185         }
186 }
187
188 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
189                                              struct sk_buff *skb,
190                                              const void *daddr)
191 {
192         if (!ipv6_addr_any(p))
193                 return (const void *) p;
194         else if (skb)
195                 return &ipv6_hdr(skb)->daddr;
196         return daddr;
197 }
198
199 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
200                                    struct net_device *dev,
201                                    struct sk_buff *skb,
202                                    const void *daddr)
203 {
204         struct neighbour *n;
205
206         daddr = choose_neigh_daddr(gw, skb, daddr);
207         n = __ipv6_neigh_lookup(dev, daddr);
208         if (n)
209                 return n;
210
211         n = neigh_create(&nd_tbl, daddr, dev);
212         return IS_ERR(n) ? NULL : n;
213 }
214
215 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
216                                               struct sk_buff *skb,
217                                               const void *daddr)
218 {
219         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
220
221         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
222 }
223
224 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
225 {
226         struct net_device *dev = dst->dev;
227         struct rt6_info *rt = (struct rt6_info *)dst;
228
229         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
230         if (!daddr)
231                 return;
232         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
233                 return;
234         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
235                 return;
236         __ipv6_confirm_neigh(dev, daddr);
237 }
238
239 static struct dst_ops ip6_dst_ops_template = {
240         .family                 =       AF_INET6,
241         .gc                     =       ip6_dst_gc,
242         .gc_thresh              =       1024,
243         .check                  =       ip6_dst_check,
244         .default_advmss         =       ip6_default_advmss,
245         .mtu                    =       ip6_mtu,
246         .cow_metrics            =       dst_cow_metrics_generic,
247         .destroy                =       ip6_dst_destroy,
248         .ifdown                 =       ip6_dst_ifdown,
249         .negative_advice        =       ip6_negative_advice,
250         .link_failure           =       ip6_link_failure,
251         .update_pmtu            =       ip6_rt_update_pmtu,
252         .redirect               =       rt6_do_redirect,
253         .local_out              =       __ip6_local_out,
254         .neigh_lookup           =       ip6_dst_neigh_lookup,
255         .confirm_neigh          =       ip6_confirm_neigh,
256 };
257
258 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
259 {
260         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
261
262         return mtu ? : dst->dev->mtu;
263 }
264
265 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
266                                          struct sk_buff *skb, u32 mtu)
267 {
268 }
269
/* redirect callback for blackhole dsts: deliberately does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* intentionally empty */
}
274
275 static struct dst_ops ip6_dst_blackhole_ops = {
276         .family                 =       AF_INET6,
277         .destroy                =       ip6_dst_destroy,
278         .check                  =       ip6_dst_check,
279         .mtu                    =       ip6_blackhole_mtu,
280         .default_advmss         =       ip6_default_advmss,
281         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
282         .redirect               =       ip6_rt_blackhole_redirect,
283         .cow_metrics            =       dst_cow_metrics_generic,
284         .neigh_lookup           =       ip6_dst_neigh_lookup,
285 };
286
287 static const u32 ip6_template_metrics[RTAX_MAX] = {
288         [RTAX_HOPLIMIT - 1] = 0,
289 };
290
291 static const struct fib6_info fib6_null_entry_template = {
292         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
293         .fib6_protocol  = RTPROT_KERNEL,
294         .fib6_metric    = ~(u32)0,
295         .fib6_ref       = REFCOUNT_INIT(1),
296         .fib6_type      = RTN_UNREACHABLE,
297         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
298 };
299
300 static const struct rt6_info ip6_null_entry_template = {
301         .dst = {
302                 .__refcnt       = ATOMIC_INIT(1),
303                 .__use          = 1,
304                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
305                 .error          = -ENETUNREACH,
306                 .input          = ip6_pkt_discard,
307                 .output         = ip6_pkt_discard_out,
308         },
309         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
310 };
311
312 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
313
314 static const struct rt6_info ip6_prohibit_entry_template = {
315         .dst = {
316                 .__refcnt       = ATOMIC_INIT(1),
317                 .__use          = 1,
318                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
319                 .error          = -EACCES,
320                 .input          = ip6_pkt_prohibit,
321                 .output         = ip6_pkt_prohibit_out,
322         },
323         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
324 };
325
326 static const struct rt6_info ip6_blk_hole_entry_template = {
327         .dst = {
328                 .__refcnt       = ATOMIC_INIT(1),
329                 .__use          = 1,
330                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
331                 .error          = -EINVAL,
332                 .input          = dst_discard,
333                 .output         = dst_discard_out,
334         },
335         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
336 };
337
338 #endif
339
340 static void rt6_info_init(struct rt6_info *rt)
341 {
342         struct dst_entry *dst = &rt->dst;
343
344         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
345         INIT_LIST_HEAD(&rt->rt6i_uncached);
346 }
347
348 /* allocate dst with ip6_dst_ops */
349 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
350                                int flags)
351 {
352         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
353                                         1, DST_OBSOLETE_FORCE_CHK, flags);
354
355         if (rt) {
356                 rt6_info_init(rt);
357                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
358         }
359
360         return rt;
361 }
362 EXPORT_SYMBOL(ip6_dst_alloc);
363
364 static void ip6_dst_destroy(struct dst_entry *dst)
365 {
366         struct rt6_info *rt = (struct rt6_info *)dst;
367         struct fib6_info *from;
368         struct inet6_dev *idev;
369
370         ip_dst_metrics_put(dst);
371         rt6_uncached_list_del(rt);
372
373         idev = rt->rt6i_idev;
374         if (idev) {
375                 rt->rt6i_idev = NULL;
376                 in6_dev_put(idev);
377         }
378
379         from = xchg((__force struct fib6_info **)&rt->from, NULL);
380         fib6_info_release(from);
381 }
382
383 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384                            int how)
385 {
386         struct rt6_info *rt = (struct rt6_info *)dst;
387         struct inet6_dev *idev = rt->rt6i_idev;
388         struct net_device *loopback_dev =
389                 dev_net(dev)->loopback_dev;
390
391         if (idev && idev->dev != loopback_dev) {
392                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
393                 if (loopback_idev) {
394                         rt->rt6i_idev = loopback_idev;
395                         in6_dev_put(idev);
396                 }
397         }
398 }
399
400 static bool __rt6_check_expired(const struct rt6_info *rt)
401 {
402         if (rt->rt6i_flags & RTF_EXPIRES)
403                 return time_after(jiffies, rt->dst.expires);
404         else
405                 return false;
406 }
407
408 static bool rt6_check_expired(const struct rt6_info *rt)
409 {
410         struct fib6_info *from;
411
412         from = rcu_dereference(rt->from);
413
414         if (rt->rt6i_flags & RTF_EXPIRES) {
415                 if (time_after(jiffies, rt->dst.expires))
416                         return true;
417         } else if (from) {
418                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
419                         fib6_check_expired(from);
420         }
421         return false;
422 }
423
424 void fib6_select_path(const struct net *net, struct fib6_result *res,
425                       struct flowi6 *fl6, int oif, bool have_oif_match,
426                       const struct sk_buff *skb, int strict)
427 {
428         struct fib6_info *sibling, *next_sibling;
429         struct fib6_info *match = res->f6i;
430
431         if (!match->fib6_nsiblings || have_oif_match)
432                 goto out;
433
434         /* We might have already computed the hash for ICMPv6 errors. In such
435          * case it will always be non-zero. Otherwise now is the time to do it.
436          */
437         if (!fl6->mp_hash)
438                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
439
440         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
441                 goto out;
442
443         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
444                                  fib6_siblings) {
445                 const struct fib6_nh *nh = &sibling->fib6_nh;
446                 int nh_upper_bound;
447
448                 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
449                 if (fl6->mp_hash > nh_upper_bound)
450                         continue;
451                 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
452                         break;
453                 match = sibling;
454                 break;
455         }
456
457 out:
458         res->f6i = match;
459         res->nh = &match->fib6_nh;
460 }
461
462 /*
463  *      Route lookup. rcu_read_lock() should be held.
464  */
465
466 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
467                                const struct in6_addr *saddr, int oif, int flags)
468 {
469         const struct net_device *dev;
470
471         if (nh->fib_nh_flags & RTNH_F_DEAD)
472                 return false;
473
474         dev = nh->fib_nh_dev;
475         if (oif) {
476                 if (dev->ifindex == oif)
477                         return true;
478         } else {
479                 if (ipv6_chk_addr(net, saddr, dev,
480                                   flags & RT6_LOOKUP_F_IFACE))
481                         return true;
482         }
483
484         return false;
485 }
486
487 static void rt6_device_match(struct net *net, struct fib6_result *res,
488                              const struct in6_addr *saddr, int oif, int flags)
489 {
490         struct fib6_info *f6i = res->f6i;
491         struct fib6_info *spf6i;
492         struct fib6_nh *nh;
493
494         if (!oif && ipv6_addr_any(saddr)) {
495                 nh = &f6i->fib6_nh;
496                 if (!(nh->fib_nh_flags & RTNH_F_DEAD))
497                         goto out;
498         }
499
500         for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
501                 nh = &spf6i->fib6_nh;
502                 if (__rt6_device_match(net, nh, saddr, oif, flags)) {
503                         res->f6i = spf6i;
504                         goto out;
505                 }
506         }
507
508         if (oif && flags & RT6_LOOKUP_F_IFACE) {
509                 res->f6i = net->ipv6.fib6_null_entry;
510                 nh = &res->f6i->fib6_nh;
511                 goto out;
512         }
513
514         nh = &f6i->fib6_nh;
515         if (nh->fib_nh_flags & RTNH_F_DEAD) {
516                 res->f6i = net->ipv6.fib6_null_entry;
517                 nh = &res->f6i->fib6_nh;
518         }
519 out:
520         res->nh = nh;
521         res->fib6_type = res->f6i->fib6_type;
522         res->fib6_flags = res->f6i->fib6_flags;
523 }
524
525 #ifdef CONFIG_IPV6_ROUTER_PREF
526 struct __rt6_probe_work {
527         struct work_struct work;
528         struct in6_addr target;
529         struct net_device *dev;
530 };
531
532 static void rt6_probe_deferred(struct work_struct *w)
533 {
534         struct in6_addr mcaddr;
535         struct __rt6_probe_work *work =
536                 container_of(w, struct __rt6_probe_work, work);
537
538         addrconf_addr_solict_mult(&work->target, &mcaddr);
539         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
540         dev_put(work->dev);
541         kfree(work);
542 }
543
544 static void rt6_probe(struct fib6_nh *fib6_nh)
545 {
546         struct __rt6_probe_work *work = NULL;
547         const struct in6_addr *nh_gw;
548         struct neighbour *neigh;
549         struct net_device *dev;
550         struct inet6_dev *idev;
551
552         /*
553          * Okay, this does not seem to be appropriate
554          * for now, however, we need to check if it
555          * is really so; aka Router Reachability Probing.
556          *
557          * Router Reachability Probe MUST be rate-limited
558          * to no more than one per minute.
559          */
560         if (fib6_nh->fib_nh_gw_family)
561                 return;
562
563         nh_gw = &fib6_nh->fib_nh_gw6;
564         dev = fib6_nh->fib_nh_dev;
565         rcu_read_lock_bh();
566         idev = __in6_dev_get(dev);
567         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
568         if (neigh) {
569                 if (neigh->nud_state & NUD_VALID)
570                         goto out;
571
572                 write_lock(&neigh->lock);
573                 if (!(neigh->nud_state & NUD_VALID) &&
574                     time_after(jiffies,
575                                neigh->updated + idev->cnf.rtr_probe_interval)) {
576                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
577                         if (work)
578                                 __neigh_set_probe_once(neigh);
579                 }
580                 write_unlock(&neigh->lock);
581         } else if (time_after(jiffies, fib6_nh->last_probe +
582                                        idev->cnf.rtr_probe_interval)) {
583                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
584         }
585
586         if (work) {
587                 fib6_nh->last_probe = jiffies;
588                 INIT_WORK(&work->work, rt6_probe_deferred);
589                 work->target = *nh_gw;
590                 dev_hold(dev);
591                 work->dev = dev;
592                 schedule_work(&work->work);
593         }
594
595 out:
596         rcu_read_unlock_bh();
597 }
598 #else
/* Without CONFIG_IPV6_ROUTER_PREF no router probing is done. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
	/* intentionally empty */
}
602 #endif
603
604 /*
605  * Default Router Selection (RFC 2461 6.3.6)
606  */
607 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
608 {
609         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
610         struct neighbour *neigh;
611
612         rcu_read_lock_bh();
613         neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
614                                           &fib6_nh->fib_nh_gw6);
615         if (neigh) {
616                 read_lock(&neigh->lock);
617                 if (neigh->nud_state & NUD_VALID)
618                         ret = RT6_NUD_SUCCEED;
619 #ifdef CONFIG_IPV6_ROUTER_PREF
620                 else if (!(neigh->nud_state & NUD_FAILED))
621                         ret = RT6_NUD_SUCCEED;
622                 else
623                         ret = RT6_NUD_FAIL_PROBE;
624 #endif
625                 read_unlock(&neigh->lock);
626         } else {
627                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
628                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
629         }
630         rcu_read_unlock_bh();
631
632         return ret;
633 }
634
635 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
636                            int strict)
637 {
638         int m = 0;
639
640         if (!oif || nh->fib_nh_dev->ifindex == oif)
641                 m = 2;
642
643         if (!m && (strict & RT6_LOOKUP_F_IFACE))
644                 return RT6_NUD_FAIL_HARD;
645 #ifdef CONFIG_IPV6_ROUTER_PREF
646         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
647 #endif
648         if ((strict & RT6_LOOKUP_F_REACHABLE) &&
649             !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
650                 int n = rt6_check_neigh(nh);
651                 if (n < 0)
652                         return n;
653         }
654         return m;
655 }
656
657 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
658                        int oif, int strict, int *mpri, bool *do_rr)
659 {
660         bool match_do_rr = false;
661         bool rc = false;
662         int m;
663
664         if (nh->fib_nh_flags & RTNH_F_DEAD)
665                 goto out;
666
667         if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
668             nh->fib_nh_flags & RTNH_F_LINKDOWN &&
669             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
670                 goto out;
671
672         m = rt6_score_route(nh, fib6_flags, oif, strict);
673         if (m == RT6_NUD_FAIL_DO_RR) {
674                 match_do_rr = true;
675                 m = 0; /* lowest valid score */
676         } else if (m == RT6_NUD_FAIL_HARD) {
677                 goto out;
678         }
679
680         if (strict & RT6_LOOKUP_F_REACHABLE)
681                 rt6_probe(nh);
682
683         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
684         if (m > *mpri) {
685                 *do_rr = match_do_rr;
686                 *mpri = m;
687                 rc = true;
688         }
689 out:
690         return rc;
691 }
692
693 static void __find_rr_leaf(struct fib6_info *f6i_start,
694                            struct fib6_info *nomatch, u32 metric,
695                            struct fib6_result *res, struct fib6_info **cont,
696                            int oif, int strict, bool *do_rr, int *mpri)
697 {
698         struct fib6_info *f6i;
699
700         for (f6i = f6i_start;
701              f6i && f6i != nomatch;
702              f6i = rcu_dereference(f6i->fib6_next)) {
703                 struct fib6_nh *nh;
704
705                 if (cont && f6i->fib6_metric != metric) {
706                         *cont = f6i;
707                         return;
708                 }
709
710                 if (fib6_check_expired(f6i))
711                         continue;
712
713                 nh = &f6i->fib6_nh;
714                 if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
715                         res->f6i = f6i;
716                         res->nh = nh;
717                         res->fib6_flags = f6i->fib6_flags;
718                         res->fib6_type = f6i->fib6_type;
719                 }
720         }
721 }
722
723 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
724                          struct fib6_info *rr_head, int oif, int strict,
725                          bool *do_rr, struct fib6_result *res)
726 {
727         u32 metric = rr_head->fib6_metric;
728         struct fib6_info *cont = NULL;
729         int mpri = -1;
730
731         __find_rr_leaf(rr_head, NULL, metric, res, &cont,
732                        oif, strict, do_rr, &mpri);
733
734         __find_rr_leaf(leaf, rr_head, metric, res, &cont,
735                        oif, strict, do_rr, &mpri);
736
737         if (res->f6i || !cont)
738                 return;
739
740         __find_rr_leaf(cont, NULL, metric, res, NULL,
741                        oif, strict, do_rr, &mpri);
742 }
743
744 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
745                        struct fib6_result *res, int strict)
746 {
747         struct fib6_info *leaf = rcu_dereference(fn->leaf);
748         struct fib6_info *rt0;
749         bool do_rr = false;
750         int key_plen;
751
752         /* make sure this function or its helpers sets f6i */
753         res->f6i = NULL;
754
755         if (!leaf || leaf == net->ipv6.fib6_null_entry)
756                 goto out;
757
758         rt0 = rcu_dereference(fn->rr_ptr);
759         if (!rt0)
760                 rt0 = leaf;
761
762         /* Double check to make sure fn is not an intermediate node
763          * and fn->leaf does not points to its child's leaf
764          * (This might happen if all routes under fn are deleted from
765          * the tree and fib6_repair_tree() is called on the node.)
766          */
767         key_plen = rt0->fib6_dst.plen;
768 #ifdef CONFIG_IPV6_SUBTREES
769         if (rt0->fib6_src.plen)
770                 key_plen = rt0->fib6_src.plen;
771 #endif
772         if (fn->fn_bit != key_plen)
773                 goto out;
774
775         find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
776         if (do_rr) {
777                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
778
779                 /* no entries matched; do round-robin */
780                 if (!next || next->fib6_metric != rt0->fib6_metric)
781                         next = leaf;
782
783                 if (next != rt0) {
784                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
785                         /* make sure next is not being deleted from the tree */
786                         if (next->fib6_node)
787                                 rcu_assign_pointer(fn->rr_ptr, next);
788                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
789                 }
790         }
791
792 out:
793         if (!res->f6i) {
794                 res->f6i = net->ipv6.fib6_null_entry;
795                 res->nh = &res->f6i->fib6_nh;
796                 res->fib6_flags = res->f6i->fib6_flags;
797                 res->fib6_type = res->f6i->fib6_type;
798         }
799 }
800
801 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
802 {
803         return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
804                res->nh->fib_nh_gw_family;
805 }
806
807 #ifdef CONFIG_IPV6_ROUTE_INFO
808 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
809                   const struct in6_addr *gwaddr)
810 {
811         struct net *net = dev_net(dev);
812         struct route_info *rinfo = (struct route_info *) opt;
813         struct in6_addr prefix_buf, *prefix;
814         unsigned int pref;
815         unsigned long lifetime;
816         struct fib6_info *rt;
817
818         if (len < sizeof(struct route_info)) {
819                 return -EINVAL;
820         }
821
822         /* Sanity check for prefix_len and length */
823         if (rinfo->length > 3) {
824                 return -EINVAL;
825         } else if (rinfo->prefix_len > 128) {
826                 return -EINVAL;
827         } else if (rinfo->prefix_len > 64) {
828                 if (rinfo->length < 2) {
829                         return -EINVAL;
830                 }
831         } else if (rinfo->prefix_len > 0) {
832                 if (rinfo->length < 1) {
833                         return -EINVAL;
834                 }
835         }
836
837         pref = rinfo->route_pref;
838         if (pref == ICMPV6_ROUTER_PREF_INVALID)
839                 return -EINVAL;
840
841         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
842
843         if (rinfo->length == 3)
844                 prefix = (struct in6_addr *)rinfo->prefix;
845         else {
846                 /* this function is safe */
847                 ipv6_addr_prefix(&prefix_buf,
848                                  (struct in6_addr *)rinfo->prefix,
849                                  rinfo->prefix_len);
850                 prefix = &prefix_buf;
851         }
852
853         if (rinfo->prefix_len == 0)
854                 rt = rt6_get_dflt_router(net, gwaddr, dev);
855         else
856                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
857                                         gwaddr, dev);
858
859         if (rt && !lifetime) {
860                 ip6_del_rt(net, rt);
861                 rt = NULL;
862         }
863
864         if (!rt && lifetime)
865                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
866                                         dev, pref);
867         else if (rt)
868                 rt->fib6_flags = RTF_ROUTEINFO |
869                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
870
871         if (rt) {
872                 if (!addrconf_finite_timeout(lifetime))
873                         fib6_clean_expires(rt);
874                 else
875                         fib6_set_expires(rt, jiffies + HZ * lifetime);
876
877                 fib6_info_release(rt);
878         }
879         return 0;
880 }
881 #endif
882
883 /*
884  *      Misc support functions
885  */
886
887 /* called with rcu_lock held */
888 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
889 {
890         struct net_device *dev = res->nh->fib_nh_dev;
891
892         if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
893                 /* for copies of local routes, dst->dev needs to be the
894                  * device if it is a master device, the master device if
895                  * device is enslaved, and the loopback as the default
896                  */
897                 if (netif_is_l3_slave(dev) &&
898                     !rt6_need_strict(&res->f6i->fib6_dst.addr))
899                         dev = l3mdev_master_dev_rcu(dev);
900                 else if (!netif_is_l3_master(dev))
901                         dev = dev_net(dev)->loopback_dev;
902                 /* last case is netif_is_l3_master(dev) is true in which
903                  * case we want dev returned to be dev
904                  */
905         }
906
907         return dev;
908 }
909
910 static const int fib6_prop[RTN_MAX + 1] = {
911         [RTN_UNSPEC]    = 0,
912         [RTN_UNICAST]   = 0,
913         [RTN_LOCAL]     = 0,
914         [RTN_BROADCAST] = 0,
915         [RTN_ANYCAST]   = 0,
916         [RTN_MULTICAST] = 0,
917         [RTN_BLACKHOLE] = -EINVAL,
918         [RTN_UNREACHABLE] = -EHOSTUNREACH,
919         [RTN_PROHIBIT]  = -EACCES,
920         [RTN_THROW]     = -EAGAIN,
921         [RTN_NAT]       = -EINVAL,
922         [RTN_XRESOLVE]  = -EINVAL,
923 };
924
925 static int ip6_rt_type_to_error(u8 fib6_type)
926 {
927         return fib6_prop[fib6_type];
928 }
929
930 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
931 {
932         unsigned short flags = 0;
933
934         if (rt->dst_nocount)
935                 flags |= DST_NOCOUNT;
936         if (rt->dst_nopolicy)
937                 flags |= DST_NOPOLICY;
938         if (rt->dst_host)
939                 flags |= DST_HOST;
940
941         return flags;
942 }
943
944 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
945 {
946         rt->dst.error = ip6_rt_type_to_error(fib6_type);
947
948         switch (fib6_type) {
949         case RTN_BLACKHOLE:
950                 rt->dst.output = dst_discard_out;
951                 rt->dst.input = dst_discard;
952                 break;
953         case RTN_PROHIBIT:
954                 rt->dst.output = ip6_pkt_prohibit_out;
955                 rt->dst.input = ip6_pkt_prohibit;
956                 break;
957         case RTN_THROW:
958         case RTN_UNREACHABLE:
959         default:
960                 rt->dst.output = ip6_pkt_discard_out;
961                 rt->dst.input = ip6_pkt_discard;
962                 break;
963         }
964 }
965
966 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
967 {
968         struct fib6_info *f6i = res->f6i;
969
970         if (res->fib6_flags & RTF_REJECT) {
971                 ip6_rt_init_dst_reject(rt, res->fib6_type);
972                 return;
973         }
974
975         rt->dst.error = 0;
976         rt->dst.output = ip6_output;
977
978         if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
979                 rt->dst.input = ip6_input;
980         } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
981                 rt->dst.input = ip6_mc_input;
982         } else {
983                 rt->dst.input = ip6_forward;
984         }
985
986         if (res->nh->fib_nh_lws) {
987                 rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
988                 lwtunnel_set_redirect(&rt->dst);
989         }
990
991         rt->dst.lastuse = jiffies;
992 }
993
994 /* Caller must already hold reference to @from */
995 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
996 {
997         rt->rt6i_flags &= ~RTF_EXPIRES;
998         rcu_assign_pointer(rt->from, from);
999         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1000 }
1001
1002 /* Caller must already hold reference to f6i in result */
1003 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1004 {
1005         const struct fib6_nh *nh = res->nh;
1006         const struct net_device *dev = nh->fib_nh_dev;
1007         struct fib6_info *f6i = res->f6i;
1008
1009         ip6_rt_init_dst(rt, res);
1010
1011         rt->rt6i_dst = f6i->fib6_dst;
1012         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1013         rt->rt6i_flags = res->fib6_flags;
1014         if (nh->fib_nh_gw_family) {
1015                 rt->rt6i_gateway = nh->fib_nh_gw6;
1016                 rt->rt6i_flags |= RTF_GATEWAY;
1017         }
1018         rt6_set_from(rt, f6i);
1019 #ifdef CONFIG_IPV6_SUBTREES
1020         rt->rt6i_src = f6i->fib6_src;
1021 #endif
1022 }
1023
1024 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1025                                         struct in6_addr *saddr)
1026 {
1027         struct fib6_node *pn, *sn;
1028         while (1) {
1029                 if (fn->fn_flags & RTN_TL_ROOT)
1030                         return NULL;
1031                 pn = rcu_dereference(fn->parent);
1032                 sn = FIB6_SUBTREE(pn);
1033                 if (sn && sn != fn)
1034                         fn = fib6_node_lookup(sn, NULL, saddr);
1035                 else
1036                         fn = pn;
1037                 if (fn->fn_flags & RTN_RTINFO)
1038                         return fn;
1039         }
1040 }
1041
1042 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1043 {
1044         struct rt6_info *rt = *prt;
1045
1046         if (dst_hold_safe(&rt->dst))
1047                 return true;
1048         if (net) {
1049                 rt = net->ipv6.ip6_null_entry;
1050                 dst_hold(&rt->dst);
1051         } else {
1052                 rt = NULL;
1053         }
1054         *prt = rt;
1055         return false;
1056 }
1057
1058 /* called with rcu_lock held */
1059 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1060 {
1061         struct net_device *dev = res->nh->fib_nh_dev;
1062         struct fib6_info *f6i = res->f6i;
1063         unsigned short flags;
1064         struct rt6_info *nrt;
1065
1066         if (!fib6_info_hold_safe(f6i))
1067                 goto fallback;
1068
1069         flags = fib6_info_dst_flags(f6i);
1070         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1071         if (!nrt) {
1072                 fib6_info_release(f6i);
1073                 goto fallback;
1074         }
1075
1076         ip6_rt_copy_init(nrt, res);
1077         return nrt;
1078
1079 fallback:
1080         nrt = dev_net(dev)->ipv6.ip6_null_entry;
1081         dst_hold(&nrt->dst);
1082         return nrt;
1083 }
1084
1085 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1086                                              struct fib6_table *table,
1087                                              struct flowi6 *fl6,
1088                                              const struct sk_buff *skb,
1089                                              int flags)
1090 {
1091         struct fib6_result res = {};
1092         struct fib6_node *fn;
1093         struct rt6_info *rt;
1094
1095         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1096                 flags &= ~RT6_LOOKUP_F_IFACE;
1097
1098         rcu_read_lock();
1099         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1100 restart:
1101         res.f6i = rcu_dereference(fn->leaf);
1102         if (!res.f6i)
1103                 res.f6i = net->ipv6.fib6_null_entry;
1104         else
1105                 rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1106                                  flags);
1107
1108         if (res.f6i == net->ipv6.fib6_null_entry) {
1109                 fn = fib6_backtrack(fn, &fl6->saddr);
1110                 if (fn)
1111                         goto restart;
1112
1113                 rt = net->ipv6.ip6_null_entry;
1114                 dst_hold(&rt->dst);
1115                 goto out;
1116         }
1117
1118         fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1119                          fl6->flowi6_oif != 0, skb, flags);
1120
1121         /* Search through exception table */
1122         rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1123         if (rt) {
1124                 if (ip6_hold_safe(net, &rt))
1125                         dst_use_noref(&rt->dst, jiffies);
1126         } else {
1127                 rt = ip6_create_rt_rcu(&res);
1128         }
1129
1130 out:
1131         trace_fib6_table_lookup(net, &res, table, fl6);
1132
1133         rcu_read_unlock();
1134
1135         return rt;
1136 }
1137
1138 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1139                                    const struct sk_buff *skb, int flags)
1140 {
1141         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1142 }
1143 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1144
1145 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1146                             const struct in6_addr *saddr, int oif,
1147                             const struct sk_buff *skb, int strict)
1148 {
1149         struct flowi6 fl6 = {
1150                 .flowi6_oif = oif,
1151                 .daddr = *daddr,
1152         };
1153         struct dst_entry *dst;
1154         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1155
1156         if (saddr) {
1157                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1158                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1159         }
1160
1161         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1162         if (dst->error == 0)
1163                 return (struct rt6_info *) dst;
1164
1165         dst_release(dst);
1166
1167         return NULL;
1168 }
1169 EXPORT_SYMBOL(rt6_lookup);
1170
1171 /* ip6_ins_rt is called with FREE table->tb6_lock.
1172  * It takes new route entry, the addition fails by any reason the
1173  * route is released.
1174  * Caller must hold dst before calling it.
1175  */
1176
1177 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1178                         struct netlink_ext_ack *extack)
1179 {
1180         int err;
1181         struct fib6_table *table;
1182
1183         table = rt->fib6_table;
1184         spin_lock_bh(&table->tb6_lock);
1185         err = fib6_add(&table->tb6_root, rt, info, extack);
1186         spin_unlock_bh(&table->tb6_lock);
1187
1188         return err;
1189 }
1190
1191 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1192 {
1193         struct nl_info info = { .nl_net = net, };
1194
1195         return __ip6_ins_rt(rt, &info, NULL);
1196 }
1197
1198 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1199                                            const struct in6_addr *daddr,
1200                                            const struct in6_addr *saddr)
1201 {
1202         struct fib6_info *f6i = res->f6i;
1203         struct net_device *dev;
1204         struct rt6_info *rt;
1205
1206         /*
1207          *      Clone the route.
1208          */
1209
1210         if (!fib6_info_hold_safe(f6i))
1211                 return NULL;
1212
1213         dev = ip6_rt_get_dev_rcu(res);
1214         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1215         if (!rt) {
1216                 fib6_info_release(f6i);
1217                 return NULL;
1218         }
1219
1220         ip6_rt_copy_init(rt, res);
1221         rt->rt6i_flags |= RTF_CACHE;
1222         rt->dst.flags |= DST_HOST;
1223         rt->rt6i_dst.addr = *daddr;
1224         rt->rt6i_dst.plen = 128;
1225
1226         if (!rt6_is_gw_or_nonexthop(res)) {
1227                 if (f6i->fib6_dst.plen != 128 &&
1228                     ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1229                         rt->rt6i_flags |= RTF_ANYCAST;
1230 #ifdef CONFIG_IPV6_SUBTREES
1231                 if (rt->rt6i_src.plen && saddr) {
1232                         rt->rt6i_src.addr = *saddr;
1233                         rt->rt6i_src.plen = 128;
1234                 }
1235 #endif
1236         }
1237
1238         return rt;
1239 }
1240
1241 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1242 {
1243         struct fib6_info *f6i = res->f6i;
1244         unsigned short flags = fib6_info_dst_flags(f6i);
1245         struct net_device *dev;
1246         struct rt6_info *pcpu_rt;
1247
1248         if (!fib6_info_hold_safe(f6i))
1249                 return NULL;
1250
1251         rcu_read_lock();
1252         dev = ip6_rt_get_dev_rcu(res);
1253         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1254         rcu_read_unlock();
1255         if (!pcpu_rt) {
1256                 fib6_info_release(f6i);
1257                 return NULL;
1258         }
1259         ip6_rt_copy_init(pcpu_rt, res);
1260         pcpu_rt->rt6i_flags |= RTF_PCPU;
1261         return pcpu_rt;
1262 }
1263
1264 /* It should be called with rcu_read_lock() acquired */
1265 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1266 {
1267         struct rt6_info *pcpu_rt, **p;
1268
1269         p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1270         pcpu_rt = *p;
1271
1272         if (pcpu_rt)
1273                 ip6_hold_safe(NULL, &pcpu_rt);
1274
1275         return pcpu_rt;
1276 }
1277
1278 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1279                                             const struct fib6_result *res)
1280 {
1281         struct rt6_info *pcpu_rt, *prev, **p;
1282
1283         pcpu_rt = ip6_rt_pcpu_alloc(res);
1284         if (!pcpu_rt) {
1285                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1286                 return net->ipv6.ip6_null_entry;
1287         }
1288
1289         dst_hold(&pcpu_rt->dst);
1290         p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1291         prev = cmpxchg(p, NULL, pcpu_rt);
1292         BUG_ON(prev);
1293
1294         if (res->f6i->fib6_destroying) {
1295                 struct fib6_info *from;
1296
1297                 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1298                 fib6_info_release(from);
1299         }
1300
1301         return pcpu_rt;
1302 }
1303
/* exception hash table implementation
 *
 * rt6_exception_lock is taken with spin_lock_bh() by the code below that
 * inserts into, removes from or flushes the per-route exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1307
1308 /* Remove rt6_ex from hash table and free the memory
1309  * Caller must hold rt6_exception_lock
1310  */
1311 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1312                                  struct rt6_exception *rt6_ex)
1313 {
1314         struct fib6_info *from;
1315         struct net *net;
1316
1317         if (!bucket || !rt6_ex)
1318                 return;
1319
1320         net = dev_net(rt6_ex->rt6i->dst.dev);
1321         net->ipv6.rt6_stats->fib_rt_cache--;
1322
1323         /* purge completely the exception to allow releasing the held resources:
1324          * some [sk] cache may keep the dst around for unlimited time
1325          */
1326         from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1327         fib6_info_release(from);
1328         dst_dev_put(&rt6_ex->rt6i->dst);
1329
1330         hlist_del_rcu(&rt6_ex->hlist);
1331         dst_release(&rt6_ex->rt6i->dst);
1332         kfree_rcu(rt6_ex, rcu);
1333         WARN_ON_ONCE(!bucket->depth);
1334         bucket->depth--;
1335 }
1336
1337 /* Remove oldest rt6_ex in bucket and free the memory
1338  * Caller must hold rt6_exception_lock
1339  */
1340 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1341 {
1342         struct rt6_exception *rt6_ex, *oldest = NULL;
1343
1344         if (!bucket)
1345                 return;
1346
1347         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1348                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1349                         oldest = rt6_ex;
1350         }
1351         rt6_remove_exception(bucket, oldest);
1352 }
1353
1354 static u32 rt6_exception_hash(const struct in6_addr *dst,
1355                               const struct in6_addr *src)
1356 {
1357         static u32 seed __read_mostly;
1358         u32 val;
1359
1360         net_get_random_once(&seed, sizeof(seed));
1361         val = jhash(dst, sizeof(*dst), seed);
1362
1363 #ifdef CONFIG_IPV6_SUBTREES
1364         if (src)
1365                 val = jhash(src, sizeof(*src), val);
1366 #endif
1367         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1368 }
1369
1370 /* Helper function to find the cached rt in the hash table
1371  * and update bucket pointer to point to the bucket for this
1372  * (daddr, saddr) pair
1373  * Caller must hold rt6_exception_lock
1374  */
1375 static struct rt6_exception *
1376 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1377                               const struct in6_addr *daddr,
1378                               const struct in6_addr *saddr)
1379 {
1380         struct rt6_exception *rt6_ex;
1381         u32 hval;
1382
1383         if (!(*bucket) || !daddr)
1384                 return NULL;
1385
1386         hval = rt6_exception_hash(daddr, saddr);
1387         *bucket += hval;
1388
1389         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1390                 struct rt6_info *rt6 = rt6_ex->rt6i;
1391                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1392
1393 #ifdef CONFIG_IPV6_SUBTREES
1394                 if (matched && saddr)
1395                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1396 #endif
1397                 if (matched)
1398                         return rt6_ex;
1399         }
1400         return NULL;
1401 }
1402
1403 /* Helper function to find the cached rt in the hash table
1404  * and update bucket pointer to point to the bucket for this
1405  * (daddr, saddr) pair
1406  * Caller must hold rcu_read_lock()
1407  */
1408 static struct rt6_exception *
1409 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1410                          const struct in6_addr *daddr,
1411                          const struct in6_addr *saddr)
1412 {
1413         struct rt6_exception *rt6_ex;
1414         u32 hval;
1415
1416         WARN_ON_ONCE(!rcu_read_lock_held());
1417
1418         if (!(*bucket) || !daddr)
1419                 return NULL;
1420
1421         hval = rt6_exception_hash(daddr, saddr);
1422         *bucket += hval;
1423
1424         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1425                 struct rt6_info *rt6 = rt6_ex->rt6i;
1426                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1427
1428 #ifdef CONFIG_IPV6_SUBTREES
1429                 if (matched && saddr)
1430                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1431 #endif
1432                 if (matched)
1433                         return rt6_ex;
1434         }
1435         return NULL;
1436 }
1437
1438 static unsigned int fib6_mtu(const struct fib6_result *res)
1439 {
1440         const struct fib6_nh *nh = res->nh;
1441         unsigned int mtu;
1442
1443         if (res->f6i->fib6_pmtu) {
1444                 mtu = res->f6i->fib6_pmtu;
1445         } else {
1446                 struct net_device *dev = nh->fib_nh_dev;
1447                 struct inet6_dev *idev;
1448
1449                 rcu_read_lock();
1450                 idev = __in6_dev_get(dev);
1451                 mtu = idev->cnf.mtu6;
1452                 rcu_read_unlock();
1453         }
1454
1455         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1456
1457         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1458 }
1459
1460 static int rt6_insert_exception(struct rt6_info *nrt,
1461                                 const struct fib6_result *res)
1462 {
1463         struct net *net = dev_net(nrt->dst.dev);
1464         struct rt6_exception_bucket *bucket;
1465         struct in6_addr *src_key = NULL;
1466         struct rt6_exception *rt6_ex;
1467         struct fib6_info *f6i = res->f6i;
1468         int err = 0;
1469
1470         spin_lock_bh(&rt6_exception_lock);
1471
1472         if (f6i->exception_bucket_flushed) {
1473                 err = -EINVAL;
1474                 goto out;
1475         }
1476
1477         bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
1478                                         lockdep_is_held(&rt6_exception_lock));
1479         if (!bucket) {
1480                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1481                                  GFP_ATOMIC);
1482                 if (!bucket) {
1483                         err = -ENOMEM;
1484                         goto out;
1485                 }
1486                 rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
1487         }
1488
1489 #ifdef CONFIG_IPV6_SUBTREES
1490         /* fib6_src.plen != 0 indicates f6i is in subtree
1491          * and exception table is indexed by a hash of
1492          * both fib6_dst and fib6_src.
1493          * Otherwise, the exception table is indexed by
1494          * a hash of only fib6_dst.
1495          */
1496         if (f6i->fib6_src.plen)
1497                 src_key = &nrt->rt6i_src.addr;
1498 #endif
1499         /* rt6_mtu_change() might lower mtu on f6i.
1500          * Only insert this exception route if its mtu
1501          * is less than f6i's mtu value.
1502          */
1503         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1504                 err = -EINVAL;
1505                 goto out;
1506         }
1507
1508         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1509                                                src_key);
1510         if (rt6_ex)
1511                 rt6_remove_exception(bucket, rt6_ex);
1512
1513         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1514         if (!rt6_ex) {
1515                 err = -ENOMEM;
1516                 goto out;
1517         }
1518         rt6_ex->rt6i = nrt;
1519         rt6_ex->stamp = jiffies;
1520         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1521         bucket->depth++;
1522         net->ipv6.rt6_stats->fib_rt_cache++;
1523
1524         if (bucket->depth > FIB6_MAX_DEPTH)
1525                 rt6_exception_remove_oldest(bucket);
1526
1527 out:
1528         spin_unlock_bh(&rt6_exception_lock);
1529
1530         /* Update fn->fn_sernum to invalidate all cached dst */
1531         if (!err) {
1532                 spin_lock_bh(&f6i->fib6_table->tb6_lock);
1533                 fib6_update_sernum(net, f6i);
1534                 spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1535                 fib6_force_start_gc(net);
1536         }
1537
1538         return err;
1539 }
1540
1541 void rt6_flush_exceptions(struct fib6_info *rt)
1542 {
1543         struct rt6_exception_bucket *bucket;
1544         struct rt6_exception *rt6_ex;
1545         struct hlist_node *tmp;
1546         int i;
1547
1548         spin_lock_bh(&rt6_exception_lock);
1549         /* Prevent rt6_insert_exception() to recreate the bucket list */
1550         rt->exception_bucket_flushed = 1;
1551
1552         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1553                                     lockdep_is_held(&rt6_exception_lock));
1554         if (!bucket)
1555                 goto out;
1556
1557         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1558                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1559                         rt6_remove_exception(bucket, rt6_ex);
1560                 WARN_ON_ONCE(bucket->depth);
1561                 bucket++;
1562         }
1563
1564 out:
1565         spin_unlock_bh(&rt6_exception_lock);
1566 }
1567
1568 /* Find cached rt in the hash table inside passed in rt
1569  * Caller has to hold rcu_read_lock()
1570  */
1571 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1572                                            const struct in6_addr *daddr,
1573                                            const struct in6_addr *saddr)
1574 {
1575         const struct in6_addr *src_key = NULL;
1576         struct rt6_exception_bucket *bucket;
1577         struct rt6_exception *rt6_ex;
1578         struct rt6_info *ret = NULL;
1579
1580 #ifdef CONFIG_IPV6_SUBTREES
1581         /* fib6i_src.plen != 0 indicates f6i is in subtree
1582          * and exception table is indexed by a hash of
1583          * both fib6_dst and fib6_src.
1584          * However, the src addr used to create the hash
1585          * might not be exactly the passed in saddr which
1586          * is a /128 addr from the flow.
1587          * So we need to use f6i->fib6_src to redo lookup
1588          * if the passed in saddr does not find anything.
1589          * (See the logic in ip6_rt_cache_alloc() on how
1590          * rt->rt6i_src is updated.)
1591          */
1592         if (res->f6i->fib6_src.plen)
1593                 src_key = saddr;
1594 find_ex:
1595 #endif
1596         bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
1597         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1598
1599         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1600                 ret = rt6_ex->rt6i;
1601
1602 #ifdef CONFIG_IPV6_SUBTREES
1603         /* Use fib6_src as src_key and redo lookup */
1604         if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1605                 src_key = &res->f6i->fib6_src.addr;
1606                 goto find_ex;
1607         }
1608 #endif
1609
1610         return ret;
1611 }
1612
1613 /* Remove the passed in cached rt from the hash table that contains it */
1614 static int rt6_remove_exception_rt(struct rt6_info *rt)
1615 {
1616         struct rt6_exception_bucket *bucket;
1617         struct in6_addr *src_key = NULL;
1618         struct rt6_exception *rt6_ex;
1619         struct fib6_info *from;
1620         int err;
1621
1622         from = rcu_dereference(rt->from);
1623         if (!from ||
1624             !(rt->rt6i_flags & RTF_CACHE))
1625                 return -EINVAL;
1626
1627         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1628                 return -ENOENT;
1629
1630         spin_lock_bh(&rt6_exception_lock);
1631         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1632                                     lockdep_is_held(&rt6_exception_lock));
1633 #ifdef CONFIG_IPV6_SUBTREES
1634         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1635          * and exception table is indexed by a hash of
1636          * both rt6i_dst and rt6i_src.
1637          * Otherwise, the exception table is indexed by
1638          * a hash of only rt6i_dst.
1639          */
1640         if (from->fib6_src.plen)
1641                 src_key = &rt->rt6i_src.addr;
1642 #endif
1643         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1644                                                &rt->rt6i_dst.addr,
1645                                                src_key);
1646         if (rt6_ex) {
1647                 rt6_remove_exception(bucket, rt6_ex);
1648                 err = 0;
1649         } else {
1650                 err = -ENOENT;
1651         }
1652
1653         spin_unlock_bh(&rt6_exception_lock);
1654         return err;
1655 }
1656
1657 /* Find rt6_ex which contains the passed in rt cache and
1658  * refresh its stamp
1659  */
1660 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1661 {
1662         struct rt6_exception_bucket *bucket;
1663         struct in6_addr *src_key = NULL;
1664         struct rt6_exception *rt6_ex;
1665         struct fib6_info *from;
1666
1667         rcu_read_lock();
1668         from = rcu_dereference(rt->from);
1669         if (!from || !(rt->rt6i_flags & RTF_CACHE))
1670                 goto unlock;
1671
1672         bucket = rcu_dereference(from->rt6i_exception_bucket);
1673
1674 #ifdef CONFIG_IPV6_SUBTREES
1675         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1676          * and exception table is indexed by a hash of
1677          * both rt6i_dst and rt6i_src.
1678          * Otherwise, the exception table is indexed by
1679          * a hash of only rt6i_dst.
1680          */
1681         if (from->fib6_src.plen)
1682                 src_key = &rt->rt6i_src.addr;
1683 #endif
1684         rt6_ex = __rt6_find_exception_rcu(&bucket,
1685                                           &rt->rt6i_dst.addr,
1686                                           src_key);
1687         if (rt6_ex)
1688                 rt6_ex->stamp = jiffies;
1689
1690 unlock:
1691         rcu_read_unlock();
1692 }
1693
1694 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1695                                          struct rt6_info *rt, int mtu)
1696 {
1697         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1698          * lowest MTU in the path: always allow updating the route PMTU to
1699          * reflect PMTU decreases.
1700          *
1701          * If the new MTU is higher, and the route PMTU is equal to the local
1702          * MTU, this means the old MTU is the lowest in the path, so allow
1703          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1704          * handle this.
1705          */
1706
1707         if (dst_mtu(&rt->dst) >= mtu)
1708                 return true;
1709
1710         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1711                 return true;
1712
1713         return false;
1714 }
1715
1716 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1717                                        struct fib6_info *rt, int mtu)
1718 {
1719         struct rt6_exception_bucket *bucket;
1720         struct rt6_exception *rt6_ex;
1721         int i;
1722
1723         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1724                                         lockdep_is_held(&rt6_exception_lock));
1725
1726         if (!bucket)
1727                 return;
1728
1729         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1730                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1731                         struct rt6_info *entry = rt6_ex->rt6i;
1732
1733                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1734                          * route), the metrics of its rt->from have already
1735                          * been updated.
1736                          */
1737                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1738                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1739                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1740                 }
1741                 bucket++;
1742         }
1743 }
1744
/* Both flags together: a cached (exception) route that goes via a gateway. */
#define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1746
1747 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1748                                         struct in6_addr *gateway)
1749 {
1750         struct rt6_exception_bucket *bucket;
1751         struct rt6_exception *rt6_ex;
1752         struct hlist_node *tmp;
1753         int i;
1754
1755         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1756                 return;
1757
1758         spin_lock_bh(&rt6_exception_lock);
1759         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1760                                      lockdep_is_held(&rt6_exception_lock));
1761
1762         if (bucket) {
1763                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1764                         hlist_for_each_entry_safe(rt6_ex, tmp,
1765                                                   &bucket->chain, hlist) {
1766                                 struct rt6_info *entry = rt6_ex->rt6i;
1767
1768                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1769                                     RTF_CACHE_GATEWAY &&
1770                                     ipv6_addr_equal(gateway,
1771                                                     &entry->rt6i_gateway)) {
1772                                         rt6_remove_exception(bucket, rt6_ex);
1773                                 }
1774                         }
1775                         bucket++;
1776                 }
1777         }
1778
1779         spin_unlock_bh(&rt6_exception_lock);
1780 }
1781
1782 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1783                                       struct rt6_exception *rt6_ex,
1784                                       struct fib6_gc_args *gc_args,
1785                                       unsigned long now)
1786 {
1787         struct rt6_info *rt = rt6_ex->rt6i;
1788
1789         /* we are pruning and obsoleting aged-out and non gateway exceptions
1790          * even if others have still references to them, so that on next
1791          * dst_check() such references can be dropped.
1792          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1793          * expired, independently from their aging, as per RFC 8201 section 4
1794          */
1795         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1796                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1797                         RT6_TRACE("aging clone %p\n", rt);
1798                         rt6_remove_exception(bucket, rt6_ex);
1799                         return;
1800                 }
1801         } else if (time_after(jiffies, rt->dst.expires)) {
1802                 RT6_TRACE("purging expired route %p\n", rt);
1803                 rt6_remove_exception(bucket, rt6_ex);
1804                 return;
1805         }
1806
1807         if (rt->rt6i_flags & RTF_GATEWAY) {
1808                 struct neighbour *neigh;
1809                 __u8 neigh_flags = 0;
1810
1811                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1812                 if (neigh)
1813                         neigh_flags = neigh->flags;
1814
1815                 if (!(neigh_flags & NTF_ROUTER)) {
1816                         RT6_TRACE("purging route %p via non-router but gateway\n",
1817                                   rt);
1818                         rt6_remove_exception(bucket, rt6_ex);
1819                         return;
1820                 }
1821         }
1822
1823         gc_args->more++;
1824 }
1825
1826 void rt6_age_exceptions(struct fib6_info *rt,
1827                         struct fib6_gc_args *gc_args,
1828                         unsigned long now)
1829 {
1830         struct rt6_exception_bucket *bucket;
1831         struct rt6_exception *rt6_ex;
1832         struct hlist_node *tmp;
1833         int i;
1834
1835         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1836                 return;
1837
1838         rcu_read_lock_bh();
1839         spin_lock(&rt6_exception_lock);
1840         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1841                                     lockdep_is_held(&rt6_exception_lock));
1842
1843         if (bucket) {
1844                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1845                         hlist_for_each_entry_safe(rt6_ex, tmp,
1846                                                   &bucket->chain, hlist) {
1847                                 rt6_age_examine_exception(bucket, rt6_ex,
1848                                                           gc_args, now);
1849                         }
1850                         bucket++;
1851                 }
1852         }
1853         spin_unlock(&rt6_exception_lock);
1854         rcu_read_unlock_bh();
1855 }
1856
1857 /* must be called with rcu lock held */
1858 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
1859                       struct flowi6 *fl6, struct fib6_result *res, int strict)
1860 {
1861         struct fib6_node *fn, *saved_fn;
1862
1863         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1864         saved_fn = fn;
1865
1866         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1867                 oif = 0;
1868
1869 redo_rt6_select:
1870         rt6_select(net, fn, oif, res, strict);
1871         if (res->f6i == net->ipv6.fib6_null_entry) {
1872                 fn = fib6_backtrack(fn, &fl6->saddr);
1873                 if (fn)
1874                         goto redo_rt6_select;
1875                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1876                         /* also consider unreachable route */
1877                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1878                         fn = saved_fn;
1879                         goto redo_rt6_select;
1880                 }
1881         }
1882
1883         trace_fib6_table_lookup(net, res, table, fl6);
1884
1885         return 0;
1886 }
1887
1888 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1889                                int oif, struct flowi6 *fl6,
1890                                const struct sk_buff *skb, int flags)
1891 {
1892         struct fib6_result res = {};
1893         struct rt6_info *rt;
1894         int strict = 0;
1895
1896         strict |= flags & RT6_LOOKUP_F_IFACE;
1897         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1898         if (net->ipv6.devconf_all->forwarding == 0)
1899                 strict |= RT6_LOOKUP_F_REACHABLE;
1900
1901         rcu_read_lock();
1902
1903         fib6_table_lookup(net, table, oif, fl6, &res, strict);
1904         if (res.f6i == net->ipv6.fib6_null_entry) {
1905                 rt = net->ipv6.ip6_null_entry;
1906                 rcu_read_unlock();
1907                 dst_hold(&rt->dst);
1908                 return rt;
1909         }
1910
1911         fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1912
1913         /*Search through exception table */
1914         rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1915         if (rt) {
1916                 if (ip6_hold_safe(net, &rt))
1917                         dst_use_noref(&rt->dst, jiffies);
1918
1919                 rcu_read_unlock();
1920                 return rt;
1921         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1922                             !res.nh->fib_nh_gw_family)) {
1923                 /* Create a RTF_CACHE clone which will not be
1924                  * owned by the fib6 tree.  It is for the special case where
1925                  * the daddr in the skb during the neighbor look-up is different
1926                  * from the fl6->daddr used to look-up route here.
1927                  */
1928                 struct rt6_info *uncached_rt;
1929
1930                 uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
1931
1932                 rcu_read_unlock();
1933
1934                 if (uncached_rt) {
1935                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1936                          * No need for another dst_hold()
1937                          */
1938                         rt6_uncached_list_add(uncached_rt);
1939                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1940                 } else {
1941                         uncached_rt = net->ipv6.ip6_null_entry;
1942                         dst_hold(&uncached_rt->dst);
1943                 }
1944
1945                 return uncached_rt;
1946         } else {
1947                 /* Get a percpu copy */
1948
1949                 struct rt6_info *pcpu_rt;
1950
1951                 local_bh_disable();
1952                 pcpu_rt = rt6_get_pcpu_route(&res);
1953
1954                 if (!pcpu_rt)
1955                         pcpu_rt = rt6_make_pcpu_route(net, &res);
1956
1957                 local_bh_enable();
1958                 rcu_read_unlock();
1959
1960                 return pcpu_rt;
1961         }
1962 }
1963 EXPORT_SYMBOL_GPL(ip6_pol_route);
1964
1965 static struct rt6_info *ip6_pol_route_input(struct net *net,
1966                                             struct fib6_table *table,
1967                                             struct flowi6 *fl6,
1968                                             const struct sk_buff *skb,
1969                                             int flags)
1970 {
1971         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1972 }
1973
1974 struct dst_entry *ip6_route_input_lookup(struct net *net,
1975                                          struct net_device *dev,
1976                                          struct flowi6 *fl6,
1977                                          const struct sk_buff *skb,
1978                                          int flags)
1979 {
1980         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1981                 flags |= RT6_LOOKUP_F_IFACE;
1982
1983         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1984 }
1985 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1986
1987 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1988                                   struct flow_keys *keys,
1989                                   struct flow_keys *flkeys)
1990 {
1991         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1992         const struct ipv6hdr *key_iph = outer_iph;
1993         struct flow_keys *_flkeys = flkeys;
1994         const struct ipv6hdr *inner_iph;
1995         const struct icmp6hdr *icmph;
1996         struct ipv6hdr _inner_iph;
1997         struct icmp6hdr _icmph;
1998
1999         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2000                 goto out;
2001
2002         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2003                                    sizeof(_icmph), &_icmph);
2004         if (!icmph)
2005                 goto out;
2006
2007         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2008             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2009             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2010             icmph->icmp6_type != ICMPV6_PARAMPROB)
2011                 goto out;
2012
2013         inner_iph = skb_header_pointer(skb,
2014                                        skb_transport_offset(skb) + sizeof(*icmph),
2015                                        sizeof(_inner_iph), &_inner_iph);
2016         if (!inner_iph)
2017                 goto out;
2018
2019         key_iph = inner_iph;
2020         _flkeys = NULL;
2021 out:
2022         if (_flkeys) {
2023                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2024                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2025                 keys->tags.flow_label = _flkeys->tags.flow_label;
2026                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
2027         } else {
2028                 keys->addrs.v6addrs.src = key_iph->saddr;
2029                 keys->addrs.v6addrs.dst = key_iph->daddr;
2030                 keys->tags.flow_label = ip6_flowlabel(key_iph);
2031                 keys->basic.ip_proto = key_iph->nexthdr;
2032         }
2033 }
2034
2035 /* if skb is set it will be used and fl6 can be NULL */
2036 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2037                        const struct sk_buff *skb, struct flow_keys *flkeys)
2038 {
2039         struct flow_keys hash_keys;
2040         u32 mhash;
2041
2042         switch (ip6_multipath_hash_policy(net)) {
2043         case 0:
2044                 memset(&hash_keys, 0, sizeof(hash_keys));
2045                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2046                 if (skb) {
2047                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2048                 } else {
2049                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2050                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2051                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2052                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2053                 }
2054                 break;
2055         case 1:
2056                 if (skb) {
2057                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2058                         struct flow_keys keys;
2059
2060                         /* short-circuit if we already have L4 hash present */
2061                         if (skb->l4_hash)
2062                                 return skb_get_hash_raw(skb) >> 1;
2063
2064                         memset(&hash_keys, 0, sizeof(hash_keys));
2065
2066                         if (!flkeys) {
2067                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2068                                 flkeys = &keys;
2069                         }
2070                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2071                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2072                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2073                         hash_keys.ports.src = flkeys->ports.src;
2074                         hash_keys.ports.dst = flkeys->ports.dst;
2075                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2076                 } else {
2077                         memset(&hash_keys, 0, sizeof(hash_keys));
2078                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2079                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2080                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2081                         hash_keys.ports.src = fl6->fl6_sport;
2082                         hash_keys.ports.dst = fl6->fl6_dport;
2083                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2084                 }
2085                 break;
2086         }
2087         mhash = flow_hash_from_keys(&hash_keys);
2088
2089         return mhash >> 1;
2090 }
2091
2092 void ip6_route_input(struct sk_buff *skb)
2093 {
2094         const struct ipv6hdr *iph = ipv6_hdr(skb);
2095         struct net *net = dev_net(skb->dev);
2096         int flags = RT6_LOOKUP_F_HAS_SADDR;
2097         struct ip_tunnel_info *tun_info;
2098         struct flowi6 fl6 = {
2099                 .flowi6_iif = skb->dev->ifindex,
2100                 .daddr = iph->daddr,
2101                 .saddr = iph->saddr,
2102                 .flowlabel = ip6_flowinfo(iph),
2103                 .flowi6_mark = skb->mark,
2104                 .flowi6_proto = iph->nexthdr,
2105         };
2106         struct flow_keys *flkeys = NULL, _flkeys;
2107
2108         tun_info = skb_tunnel_info(skb);
2109         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2110                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2111
2112         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2113                 flkeys = &_flkeys;
2114
2115         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2116                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2117         skb_dst_drop(skb);
2118         skb_dst_set(skb,
2119                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2120 }
2121
2122 static struct rt6_info *ip6_pol_route_output(struct net *net,
2123                                              struct fib6_table *table,
2124                                              struct flowi6 *fl6,
2125                                              const struct sk_buff *skb,
2126                                              int flags)
2127 {
2128         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2129 }
2130
2131 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2132                                          struct flowi6 *fl6, int flags)
2133 {
2134         bool any_src;
2135
2136         if (ipv6_addr_type(&fl6->daddr) &
2137             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2138                 struct dst_entry *dst;
2139
2140                 dst = l3mdev_link_scope_lookup(net, fl6);
2141                 if (dst)
2142                         return dst;
2143         }
2144
2145         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2146
2147         any_src = ipv6_addr_any(&fl6->saddr);
2148         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2149             (fl6->flowi6_oif && any_src))
2150                 flags |= RT6_LOOKUP_F_IFACE;
2151
2152         if (!any_src)
2153                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2154         else if (sk)
2155                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2156
2157         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2158 }
2159 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2160
2161 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2162 {
2163         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2164         struct net_device *loopback_dev = net->loopback_dev;
2165         struct dst_entry *new = NULL;
2166
2167         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2168                        DST_OBSOLETE_DEAD, 0);
2169         if (rt) {
2170                 rt6_info_init(rt);
2171                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2172
2173                 new = &rt->dst;
2174                 new->__use = 1;
2175                 new->input = dst_discard;
2176                 new->output = dst_discard_out;
2177
2178                 dst_copy_metrics(new, &ort->dst);
2179
2180                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2181                 rt->rt6i_gateway = ort->rt6i_gateway;
2182                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2183
2184                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2185 #ifdef CONFIG_IPV6_SUBTREES
2186                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2187 #endif
2188         }
2189
2190         dst_release(dst_orig);
2191         return new ? new : ERR_PTR(-ENOMEM);
2192 }
2193
2194 /*
2195  *      Destination cache support functions
2196  */
2197
2198 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2199 {
2200         u32 rt_cookie = 0;
2201
2202         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2203                 return false;
2204
2205         if (fib6_check_expired(f6i))
2206                 return false;
2207
2208         return true;
2209 }
2210
2211 static struct dst_entry *rt6_check(struct rt6_info *rt,
2212                                    struct fib6_info *from,
2213                                    u32 cookie)
2214 {
2215         u32 rt_cookie = 0;
2216
2217         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2218             rt_cookie != cookie)
2219                 return NULL;
2220
2221         if (rt6_check_expired(rt))
2222                 return NULL;
2223
2224         return &rt->dst;
2225 }
2226
2227 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2228                                             struct fib6_info *from,
2229                                             u32 cookie)
2230 {
2231         if (!__rt6_check_expired(rt) &&
2232             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2233             fib6_check(from, cookie))
2234                 return &rt->dst;
2235         else
2236                 return NULL;
2237 }
2238
2239 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2240 {
2241         struct dst_entry *dst_ret;
2242         struct fib6_info *from;
2243         struct rt6_info *rt;
2244
2245         rt = container_of(dst, struct rt6_info, dst);
2246
2247         rcu_read_lock();
2248
2249         /* All IPV6 dsts are created with ->obsolete set to the value
2250          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2251          * into this function always.
2252          */
2253
2254         from = rcu_dereference(rt->from);
2255
2256         if (from && (rt->rt6i_flags & RTF_PCPU ||
2257             unlikely(!list_empty(&rt->rt6i_uncached))))
2258                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2259         else
2260                 dst_ret = rt6_check(rt, from, cookie);
2261
2262         rcu_read_unlock();
2263
2264         return dst_ret;
2265 }
2266
2267 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2268 {
2269         struct rt6_info *rt = (struct rt6_info *) dst;
2270
2271         if (rt) {
2272                 if (rt->rt6i_flags & RTF_CACHE) {
2273                         rcu_read_lock();
2274                         if (rt6_check_expired(rt)) {
2275                                 rt6_remove_exception_rt(rt);
2276                                 dst = NULL;
2277                         }
2278                         rcu_read_unlock();
2279                 } else {
2280                         dst_release(dst);
2281                         dst = NULL;
2282                 }
2283         }
2284         return dst;
2285 }
2286
2287 static void ip6_link_failure(struct sk_buff *skb)
2288 {
2289         struct rt6_info *rt;
2290
2291         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2292
2293         rt = (struct rt6_info *) skb_dst(skb);
2294         if (rt) {
2295                 rcu_read_lock();
2296                 if (rt->rt6i_flags & RTF_CACHE) {
2297                         rt6_remove_exception_rt(rt);
2298                 } else {
2299                         struct fib6_info *from;
2300                         struct fib6_node *fn;
2301
2302                         from = rcu_dereference(rt->from);
2303                         if (from) {
2304                                 fn = rcu_dereference(from->fib6_node);
2305                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2306                                         fn->fn_sernum = -1;
2307                         }
2308                 }
2309                 rcu_read_unlock();
2310         }
2311 }
2312
2313 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2314 {
2315         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2316                 struct fib6_info *from;
2317
2318                 rcu_read_lock();
2319                 from = rcu_dereference(rt0->from);
2320                 if (from)
2321                         rt0->dst.expires = from->expires;
2322                 rcu_read_unlock();
2323         }
2324
2325         dst_set_expires(&rt0->dst, timeout);
2326         rt0->rt6i_flags |= RTF_EXPIRES;
2327 }
2328
2329 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2330 {
2331         struct net *net = dev_net(rt->dst.dev);
2332
2333         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2334         rt->rt6i_flags |= RTF_MODIFIED;
2335         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2336 }
2337
2338 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2339 {
2340         return !(rt->rt6i_flags & RTF_CACHE) &&
2341                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2342 }
2343
2344 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2345                                  const struct ipv6hdr *iph, u32 mtu)
2346 {
2347         const struct in6_addr *daddr, *saddr;
2348         struct rt6_info *rt6 = (struct rt6_info *)dst;
2349
2350         if (dst_metric_locked(dst, RTAX_MTU))
2351                 return;
2352
2353         if (iph) {
2354                 daddr = &iph->daddr;
2355                 saddr = &iph->saddr;
2356         } else if (sk) {
2357                 daddr = &sk->sk_v6_daddr;
2358                 saddr = &inet6_sk(sk)->saddr;
2359         } else {
2360                 daddr = NULL;
2361                 saddr = NULL;
2362         }
2363         dst_confirm_neigh(dst, daddr);
2364         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2365         if (mtu >= dst_mtu(dst))
2366                 return;
2367
2368         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2369                 rt6_do_update_pmtu(rt6, mtu);
2370                 /* update rt6_ex->stamp for cache */
2371                 if (rt6->rt6i_flags & RTF_CACHE)
2372                         rt6_update_exception_stamp_rt(rt6);
2373         } else if (daddr) {
2374                 struct fib6_result res = {};
2375                 struct rt6_info *nrt6;
2376
2377                 rcu_read_lock();
2378                 res.f6i = rcu_dereference(rt6->from);
2379                 if (!res.f6i) {
2380                         rcu_read_unlock();
2381                         return;
2382                 }
2383                 res.nh = &res.f6i->fib6_nh;
2384                 res.fib6_flags = res.f6i->fib6_flags;
2385                 res.fib6_type = res.f6i->fib6_type;
2386
2387                 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2388                 if (nrt6) {
2389                         rt6_do_update_pmtu(nrt6, mtu);
2390                         if (rt6_insert_exception(nrt6, &res))
2391                                 dst_release_immediate(&nrt6->dst);
2392                 }
2393                 rcu_read_unlock();
2394         }
2395 }
2396
2397 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2398                                struct sk_buff *skb, u32 mtu)
2399 {
2400         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2401 }
2402
2403 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2404                      int oif, u32 mark, kuid_t uid)
2405 {
2406         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2407         struct dst_entry *dst;
2408         struct flowi6 fl6 = {
2409                 .flowi6_oif = oif,
2410                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2411                 .daddr = iph->daddr,
2412                 .saddr = iph->saddr,
2413                 .flowlabel = ip6_flowinfo(iph),
2414                 .flowi6_uid = uid,
2415         };
2416
2417         dst = ip6_route_output(net, NULL, &fl6);
2418         if (!dst->error)
2419                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2420         dst_release(dst);
2421 }
2422 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2423
2424 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2425 {
2426         int oif = sk->sk_bound_dev_if;
2427         struct dst_entry *dst;
2428
2429         if (!oif && skb->dev)
2430                 oif = l3mdev_master_ifindex(skb->dev);
2431
2432         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2433
2434         dst = __sk_dst_get(sk);
2435         if (!dst || !dst->obsolete ||
2436             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2437                 return;
2438
2439         bh_lock_sock(sk);
2440         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2441                 ip6_datagram_dst_update(sk, false);
2442         bh_unlock_sock(sk);
2443 }
2444 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2445
2446 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2447                            const struct flowi6 *fl6)
2448 {
2449 #ifdef CONFIG_IPV6_SUBTREES
2450         struct ipv6_pinfo *np = inet6_sk(sk);
2451 #endif
2452
2453         ip6_dst_store(sk, dst,
2454                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2455                       &sk->sk_v6_daddr : NULL,
2456 #ifdef CONFIG_IPV6_SUBTREES
2457                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2458                       &np->saddr :
2459 #endif
2460                       NULL);
2461 }
2462
2463 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2464                                   struct flowi6 *fl6,
2465                                   const struct in6_addr *gw,
2466                                   struct rt6_info **ret)
2467 {
2468         const struct fib6_nh *nh = res->nh;
2469
2470         if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2471             fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2472                 return false;
2473
2474         /* rt_cache's gateway might be different from its 'parent'
2475          * in the case of an ip redirect.
2476          * So we keep searching in the exception table if the gateway
2477          * is different.
2478          */
2479         if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2480                 struct rt6_info *rt_cache;
2481
2482                 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2483                 if (rt_cache &&
2484                     ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2485                         *ret = rt_cache;
2486                         return true;
2487                 }
2488                 return false;
2489         }
2490         return true;
2491 }
2492
2493 /* Handle redirects */
2494 struct ip6rd_flowi {
2495         struct flowi6 fl6;
2496         struct in6_addr gateway;
2497 };
2498
2499 static struct rt6_info *__ip6_route_redirect(struct net *net,
2500                                              struct fib6_table *table,
2501                                              struct flowi6 *fl6,
2502                                              const struct sk_buff *skb,
2503                                              int flags)
2504 {
2505         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2506         struct rt6_info *ret = NULL;
2507         struct fib6_result res = {};
2508         struct fib6_info *rt;
2509         struct fib6_node *fn;
2510
2511         /* l3mdev_update_flow overrides oif if the device is enslaved; in
2512          * this case we must match on the real ingress device, so reset it
2513          */
2514         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2515                 fl6->flowi6_oif = skb->dev->ifindex;
2516
2517         /* Get the "current" route for this destination and
2518          * check if the redirect has come from appropriate router.
2519          *
2520          * RFC 4861 specifies that redirects should only be
2521          * accepted if they come from the nexthop to the target.
2522          * Due to the way the routes are chosen, this notion
2523          * is a bit fuzzy and one might need to check all possible
2524          * routes.
2525          */
2526
2527         rcu_read_lock();
2528         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2529 restart:
2530         for_each_fib6_node_rt_rcu(fn) {
2531                 res.f6i = rt;
2532                 res.nh = &rt->fib6_nh;
2533
2534                 if (fib6_check_expired(rt))
2535                         continue;
2536                 if (rt->fib6_flags & RTF_REJECT)
2537                         break;
2538                 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2539                         goto out;
2540         }
2541
2542         if (!rt)
2543                 rt = net->ipv6.fib6_null_entry;
2544         else if (rt->fib6_flags & RTF_REJECT) {
2545                 ret = net->ipv6.ip6_null_entry;
2546                 goto out;
2547         }
2548
2549         if (rt == net->ipv6.fib6_null_entry) {
2550                 fn = fib6_backtrack(fn, &fl6->saddr);
2551                 if (fn)
2552                         goto restart;
2553         }
2554
2555         res.f6i = rt;
2556         res.nh = &rt->fib6_nh;
2557 out:
2558         if (ret) {
2559                 ip6_hold_safe(net, &ret);
2560         } else {
2561                 res.fib6_flags = res.f6i->fib6_flags;
2562                 res.fib6_type = res.f6i->fib6_type;
2563                 ret = ip6_create_rt_rcu(&res);
2564         }
2565
2566         rcu_read_unlock();
2567
2568         trace_fib6_table_lookup(net, &res, table, fl6);
2569         return ret;
2570 };
2571
2572 static struct dst_entry *ip6_route_redirect(struct net *net,
2573                                             const struct flowi6 *fl6,
2574                                             const struct sk_buff *skb,
2575                                             const struct in6_addr *gateway)
2576 {
2577         int flags = RT6_LOOKUP_F_HAS_SADDR;
2578         struct ip6rd_flowi rdfl;
2579
2580         rdfl.fl6 = *fl6;
2581         rdfl.gateway = *gateway;
2582
2583         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2584                                 flags, __ip6_route_redirect);
2585 }
2586
2587 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2588                   kuid_t uid)
2589 {
2590         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2591         struct dst_entry *dst;
2592         struct flowi6 fl6 = {
2593                 .flowi6_iif = LOOPBACK_IFINDEX,
2594                 .flowi6_oif = oif,
2595                 .flowi6_mark = mark,
2596                 .daddr = iph->daddr,
2597                 .saddr = iph->saddr,
2598                 .flowlabel = ip6_flowinfo(iph),
2599                 .flowi6_uid = uid,
2600         };
2601
2602         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2603         rt6_do_redirect(dst, NULL, skb);
2604         dst_release(dst);
2605 }
2606 EXPORT_SYMBOL_GPL(ip6_redirect);
2607
2608 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2609 {
2610         const struct ipv6hdr *iph = ipv6_hdr(skb);
2611         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2612         struct dst_entry *dst;
2613         struct flowi6 fl6 = {
2614                 .flowi6_iif = LOOPBACK_IFINDEX,
2615                 .flowi6_oif = oif,
2616                 .daddr = msg->dest,
2617                 .saddr = iph->daddr,
2618                 .flowi6_uid = sock_net_uid(net, NULL),
2619         };
2620
2621         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2622         rt6_do_redirect(dst, NULL, skb);
2623         dst_release(dst);
2624 }
2625
2626 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2627 {
2628         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2629                      sk->sk_uid);
2630 }
2631 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2632
2633 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2634 {
2635         struct net_device *dev = dst->dev;
2636         unsigned int mtu = dst_mtu(dst);
2637         struct net *net = dev_net(dev);
2638
2639         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2640
2641         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2642                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2643
2644         /*
2645          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2646          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2647          * IPV6_MAXPLEN is also valid and means: "any MSS,
2648          * rely only on pmtu discovery"
2649          */
2650         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2651                 mtu = IPV6_MAXPLEN;
2652         return mtu;
2653 }
2654
2655 static unsigned int ip6_mtu(const struct dst_entry *dst)
2656 {
2657         struct inet6_dev *idev;
2658         unsigned int mtu;
2659
2660         mtu = dst_metric_raw(dst, RTAX_MTU);
2661         if (mtu)
2662                 goto out;
2663
2664         mtu = IPV6_MIN_MTU;
2665
2666         rcu_read_lock();
2667         idev = __in6_dev_get(dst->dev);
2668         if (idev)
2669                 mtu = idev->cnf.mtu6;
2670         rcu_read_unlock();
2671
2672 out:
2673         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2674
2675         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2676 }
2677
2678 /* MTU selection:
2679  * 1. mtu on route is locked - use it
2680  * 2. mtu from nexthop exception
2681  * 3. mtu from egress device
2682  *
2683  * based on ip6_dst_mtu_forward and exception logic of
2684  * rt6_find_cached_rt; called with rcu_read_lock
2685  */
2686 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2687                       const struct in6_addr *daddr,
2688                       const struct in6_addr *saddr)
2689 {
2690         const struct fib6_nh *nh = res->nh;
2691         struct fib6_info *f6i = res->f6i;
2692         struct inet6_dev *idev;
2693         struct rt6_info *rt;
2694         u32 mtu = 0;
2695
2696         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2697                 mtu = f6i->fib6_pmtu;
2698                 if (mtu)
2699                         goto out;
2700         }
2701
2702         rt = rt6_find_cached_rt(res, daddr, saddr);
2703         if (unlikely(rt)) {
2704                 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2705         } else {
2706                 struct net_device *dev = nh->fib_nh_dev;
2707
2708                 mtu = IPV6_MIN_MTU;
2709                 idev = __in6_dev_get(dev);
2710                 if (idev && idev->cnf.mtu6 > mtu)
2711                         mtu = idev->cnf.mtu6;
2712         }
2713
2714         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2715 out:
2716         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2717 }
2718
2719 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2720                                   struct flowi6 *fl6)
2721 {
2722         struct dst_entry *dst;
2723         struct rt6_info *rt;
2724         struct inet6_dev *idev = in6_dev_get(dev);
2725         struct net *net = dev_net(dev);
2726
2727         if (unlikely(!idev))
2728                 return ERR_PTR(-ENODEV);
2729
2730         rt = ip6_dst_alloc(net, dev, 0);
2731         if (unlikely(!rt)) {
2732                 in6_dev_put(idev);
2733                 dst = ERR_PTR(-ENOMEM);
2734                 goto out;
2735         }
2736
2737         rt->dst.flags |= DST_HOST;
2738         rt->dst.input = ip6_input;
2739         rt->dst.output  = ip6_output;
2740         rt->rt6i_gateway  = fl6->daddr;
2741         rt->rt6i_dst.addr = fl6->daddr;
2742         rt->rt6i_dst.plen = 128;
2743         rt->rt6i_idev     = idev;
2744         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2745
2746         /* Add this dst into uncached_list so that rt6_disable_ip() can
2747          * do proper release of the net_device
2748          */
2749         rt6_uncached_list_add(rt);
2750         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2751
2752         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2753
2754 out:
2755         return dst;
2756 }
2757
2758 static int ip6_dst_gc(struct dst_ops *ops)
2759 {
2760         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2761         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2762         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2763         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2764         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2765         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2766         int entries;
2767
2768         entries = dst_entries_get_fast(ops);
2769         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2770             entries <= rt_max_size)
2771                 goto out;
2772
2773         net->ipv6.ip6_rt_gc_expire++;
2774         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2775         entries = dst_entries_get_slow(ops);
2776         if (entries < ops->gc_thresh)
2777                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2778 out:
2779         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2780         return entries > rt_max_size;
2781 }
2782
2783 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2784                                             struct fib6_config *cfg,
2785                                             const struct in6_addr *gw_addr,
2786                                             u32 tbid, int flags)
2787 {
2788         struct flowi6 fl6 = {
2789                 .flowi6_oif = cfg->fc_ifindex,
2790                 .daddr = *gw_addr,
2791                 .saddr = cfg->fc_prefsrc,
2792         };
2793         struct fib6_table *table;
2794         struct rt6_info *rt;
2795
2796         table = fib6_get_table(net, tbid);
2797         if (!table)
2798                 return NULL;
2799
2800         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2801                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2802
2803         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2804         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2805
2806         /* if table lookup failed, fall back to full lookup */
2807         if (rt == net->ipv6.ip6_null_entry) {
2808                 ip6_rt_put(rt);
2809                 rt = NULL;
2810         }
2811
2812         return rt;
2813 }
2814
2815 static int ip6_route_check_nh_onlink(struct net *net,
2816                                      struct fib6_config *cfg,
2817                                      const struct net_device *dev,
2818                                      struct netlink_ext_ack *extack)
2819 {
2820         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2821         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2822         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2823         struct fib6_info *from;
2824         struct rt6_info *grt;
2825         int err;
2826
2827         err = 0;
2828         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2829         if (grt) {
2830                 rcu_read_lock();
2831                 from = rcu_dereference(grt->from);
2832                 if (!grt->dst.error &&
2833                     /* ignore match if it is the default route */
2834                     from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2835                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2836                         NL_SET_ERR_MSG(extack,
2837                                        "Nexthop has invalid gateway or device mismatch");
2838                         err = -EINVAL;
2839                 }
2840                 rcu_read_unlock();
2841
2842                 ip6_rt_put(grt);
2843         }
2844
2845         return err;
2846 }
2847
2848 static int ip6_route_check_nh(struct net *net,
2849                               struct fib6_config *cfg,
2850                               struct net_device **_dev,
2851                               struct inet6_dev **idev)
2852 {
2853         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2854         struct net_device *dev = _dev ? *_dev : NULL;
2855         struct rt6_info *grt = NULL;
2856         int err = -EHOSTUNREACH;
2857
2858         if (cfg->fc_table) {
2859                 int flags = RT6_LOOKUP_F_IFACE;
2860
2861                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2862                                           cfg->fc_table, flags);
2863                 if (grt) {
2864                         if (grt->rt6i_flags & RTF_GATEWAY ||
2865                             (dev && dev != grt->dst.dev)) {
2866                                 ip6_rt_put(grt);
2867                                 grt = NULL;
2868                         }
2869                 }
2870         }
2871
2872         if (!grt)
2873                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2874
2875         if (!grt)
2876                 goto out;
2877
2878         if (dev) {
2879                 if (dev != grt->dst.dev) {
2880                         ip6_rt_put(grt);
2881                         goto out;
2882                 }
2883         } else {
2884                 *_dev = dev = grt->dst.dev;
2885                 *idev = grt->rt6i_idev;
2886                 dev_hold(dev);
2887                 in6_dev_hold(grt->rt6i_idev);
2888         }
2889
2890         if (!(grt->rt6i_flags & RTF_GATEWAY))
2891                 err = 0;
2892
2893         ip6_rt_put(grt);
2894
2895 out:
2896         return err;
2897 }
2898
2899 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2900                            struct net_device **_dev, struct inet6_dev **idev,
2901                            struct netlink_ext_ack *extack)
2902 {
2903         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2904         int gwa_type = ipv6_addr_type(gw_addr);
2905         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2906         const struct net_device *dev = *_dev;
2907         bool need_addr_check = !dev;
2908         int err = -EINVAL;
2909
2910         /* if gw_addr is local we will fail to detect this in case
2911          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2912          * will return already-added prefix route via interface that
2913          * prefix route was assigned to, which might be non-loopback.
2914          */
2915         if (dev &&
2916             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2917                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2918                 goto out;
2919         }
2920
2921         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2922                 /* IPv6 strictly inhibits using not link-local
2923                  * addresses as nexthop address.
2924                  * Otherwise, router will not able to send redirects.
2925                  * It is very good, but in some (rare!) circumstances
2926                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2927                  * some exceptions. --ANK
2928                  * We allow IPv4-mapped nexthops to support RFC4798-type
2929                  * addressing
2930                  */
2931                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2932                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2933                         goto out;
2934                 }
2935
2936                 if (cfg->fc_flags & RTNH_F_ONLINK)
2937                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2938                 else
2939                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2940
2941                 if (err)
2942                         goto out;
2943         }
2944
2945         /* reload in case device was changed */
2946         dev = *_dev;
2947
2948         err = -EINVAL;
2949         if (!dev) {
2950                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2951                 goto out;
2952         } else if (dev->flags & IFF_LOOPBACK) {
2953                 NL_SET_ERR_MSG(extack,
2954                                "Egress device can not be loopback device for this route");
2955                 goto out;
2956         }
2957
2958         /* if we did not check gw_addr above, do so now that the
2959          * egress device has been resolved.
2960          */
2961         if (need_addr_check &&
2962             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2963                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2964                 goto out;
2965         }
2966
2967         err = 0;
2968 out:
2969         return err;
2970 }
2971
2972 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2973 {
2974         if ((flags & RTF_REJECT) ||
2975             (dev && (dev->flags & IFF_LOOPBACK) &&
2976              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2977              !(flags & RTF_LOCAL)))
2978                 return true;
2979
2980         return false;
2981 }
2982
/* Initialise the nexthop @fib6_nh from the route configuration @cfg.
 *
 * Resolves the device named by cfg->fc_ifindex (if any), applies the
 * RTNH_F_ONLINK checks, moves reject-type configurations onto the
 * namespace's loopback device, validates the gateway when RTF_GATEWAY is
 * set, and finally records the device and its ifindex in @fib6_nh.
 *
 * Returns 0 on success or a negative errno.  On failure the nexthop's
 * lwtunnel state is put and cleared and the device reference held at
 * that point is dropped; on success the device reference is kept for
 * fib6_nh->fib_nh_dev.  The inet6_dev reference is dropped on every
 * path.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack)
{
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        int addr_type;
        int err;

        fib6_nh->fib_nh_family = AF_INET6;

        /* -ENODEV is the result of every failure in the next two blocks
         * that does not set err explicitly.
         */
        err = -ENODEV;
        if (cfg->fc_ifindex) {
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_flags & RTNH_F_ONLINK) {
                if (!dev) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop device required for onlink");
                        goto out;
                }

                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        err = -ENETDOWN;
                        goto out;
                }

                fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
        }

        fib6_nh->fib_nh_weight = 1;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* swap the references taken above for loopback ones */
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                /* reject routes skip the gateway, device-state and
                 * fib_nh_common_init() steps below
                 */
                goto set_dev;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                /* may fill in dev/idev when no ifindex was given */
                err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
                if (err)
                        goto out;

                fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
                fib6_nh->fib_nh_gw_family = AF_INET6;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        if (idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
                err = -EACCES;
                goto out;
        }

        if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                err = -ENETDOWN;
                goto out;
        }

        /* local and anycast routes are never marked link-down */
        if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
            !netif_carrier_ok(dev))
                fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

        err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
                                 cfg->fc_encap_type, cfg, gfp_flags, extack);
        if (err)
                goto out;
set_dev:
        fib6_nh->fib_nh_dev = dev;
        fib6_nh->fib_nh_oif = dev->ifindex;
        err = 0;
out:
        /* the idev reference is only needed during setup */
        if (idev)
                in6_dev_put(idev);

        if (err) {
                lwtstate_put(fib6_nh->fib_nh_lws);
                fib6_nh->fib_nh_lws = NULL;
                if (dev)
                        dev_put(dev);
        }

        return err;
}
3094
/* Release @fib6_nh by handing its embedded nh_common to
 * fib_nh_common_release().
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
        fib_nh_common_release(&fib6_nh->nh_common);
}
3099
/* Build a fib6_info from the route configuration @cfg.
 *
 * Validates the user-supplied flags, type and prefix lengths, finds or
 * creates the target table, allocates the entry, fills in metrics,
 * expiry, protocol, prefixes and the nexthop, and checks the preferred
 * source address.  Nothing is inserted into a table here; ip6_route_add()
 * does that with the returned entry.
 *
 * Returns the new entry, or ERR_PTR() with a negative errno.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
                                              gfp_t gfp_flags,
                                              struct netlink_ext_ack *extack)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;
        int err = -EINVAL;
        int addr_type;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
                goto out;
        }

        /* RTF_CACHE is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_CACHE) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
                goto out;
        }

        if (cfg->fc_type > RTN_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid route type");
                goto out;
        }

        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
        }
        if (cfg->fc_src_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid source address length");
                goto out;
        }
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len) {
                NL_SET_ERR_MSG(extack,
                               "Specifying source address requires IPV6_SUBTREES to be enabled");
                goto out;
        }
#endif

        /* A netlink request without NLM_F_CREATE first looks for an
         * existing table; if there is none, a warning is printed and the
         * table is created anyway.  -ENOBUFS is returned when no table
         * can be obtained.
         */
        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        err = -ENOMEM;
        rt = fib6_info_alloc(gfp_flags);
        if (!rt)
                goto out;

        rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
                                               extack);
        if (IS_ERR(rt->fib6_metrics)) {
                err = PTR_ERR(rt->fib6_metrics);
                /* Do not leave garbage there. */
                rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
                goto out;
        }

        if (cfg->fc_flags & RTF_ADDRCONF)
                rt->dst_nocount = true;

        if (cfg->fc_flags & RTF_EXPIRES)
                fib6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                fib6_clean_expires(rt);

        /* note: the default protocol is written back into cfg as well */
        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->fib6_protocol = cfg->fc_protocol;

        rt->fib6_table = table;
        rt->fib6_metric = cfg->fc_metric;
        rt->fib6_type = cfg->fc_type;
        /* RTF_GATEWAY is masked out of the stored flags; the gateway
         * itself is recorded on the nexthop by fib6_nh_init()
         */
        rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

        ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->fib6_dst.plen = cfg->fc_dst_len;
        if (rt->fib6_dst.plen == 128)
                rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->fib6_src.plen = cfg->fc_src_len;
#endif
        err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
        if (err)
                goto out;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
                rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

        /* a preferred source is accepted only if ipv6_chk_addr() finds it
         * on the nexthop device
         */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                struct net_device *dev = fib6_info_nh_dev(rt);

                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid source address");
                        err = -EINVAL;
                        goto out;
                }
                rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
                rt->fib6_prefsrc.plen = 128;
        } else
                rt->fib6_prefsrc.plen = 0;

        return rt;
out:
        /* NOTE(review): rt is still NULL on the early exits above; this
         * relies on fib6_info_release() accepting NULL - confirm.
         */
        fib6_info_release(rt);
        return ERR_PTR(err);
}
3228
3229 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3230                   struct netlink_ext_ack *extack)
3231 {
3232         struct fib6_info *rt;
3233         int err;
3234
3235         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3236         if (IS_ERR(rt))
3237                 return PTR_ERR(rt);
3238
3239         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3240         fib6_info_release(rt);
3241
3242         return err;
3243 }
3244
3245 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3246 {
3247         struct net *net = info->nl_net;
3248         struct fib6_table *table;
3249         int err;
3250
3251         if (rt == net->ipv6.fib6_null_entry) {
3252                 err = -ENOENT;
3253                 goto out;
3254         }
3255
3256         table = rt->fib6_table;
3257         spin_lock_bh(&table->tb6_lock);
3258         err = fib6_del(rt, info);
3259         spin_unlock_bh(&table->tb6_lock);
3260
3261 out:
3262         fib6_info_release(rt);
3263         return err;
3264 }
3265
3266 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3267 {
3268         struct nl_info info = { .nl_net = net };
3269
3270         return __ip6_del_rt(rt, &info);
3271 }
3272
/* Delete @rt and, when it has siblings and cfg->fc_delete_all_nh is set,
 * every sibling as well, all under the table lock.
 *
 * For the multi-nexthop case one RTM_DELROUTE message is built before
 * anything is deleted; if that succeeds, info->skip_notify is set and
 * the message is sent once the lock has been dropped.
 *
 * fib6_info_release() is called on @rt on every path.  Returns -ENOENT
 * for the namespace's fib6_null_entry, otherwise the result of the
 * fib6_del() call that ended the operation.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
        struct nl_info *info = &cfg->fc_nlinfo;
        struct net *net = info->nl_net;
        struct sk_buff *skb = NULL;
        struct fib6_table *table;
        int err = -ENOENT;

        if (rt == net->ipv6.fib6_null_entry)
                goto out_put;
        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);

        if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
                struct fib6_info *sibling, *next_sibling;

                /* prefer to send a single notification with all hops */
                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
                if (skb) {
                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

                        if (rt6_fill_node(net, skb, rt, NULL,
                                          NULL, NULL, 0, RTM_DELROUTE,
                                          info->portid, seq, 0) < 0) {
                                /* skip_notify stays unset in this case */
                                kfree_skb(skb);
                                skb = NULL;
                        } else
                                info->skip_notify = 1;
                }

                /* the first failing sibling aborts the operation; rt itself
                 * is then left in place
                 */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings,
                                         fib6_siblings) {
                        err = fib6_del(sibling, info);
                        if (err)
                                goto out_unlock;
                }
        }

        err = fib6_del(rt, info);
out_unlock:
        spin_unlock_bh(&table->tb6_lock);
out_put:
        fib6_info_release(rt);

        /* the combined message is sent even when a deletion failed */
        if (skb) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, gfp_any());
        }
        return err;
}
3324
3325 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3326 {
3327         int rc = -ESRCH;
3328
3329         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3330                 goto out;
3331
3332         if (cfg->fc_flags & RTF_GATEWAY &&
3333             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3334                 goto out;
3335
3336         rc = rt6_remove_exception_rt(rt);
3337 out:
3338         return rc;
3339 }
3340
/* Delete the route described by @cfg.
 *
 * With RTF_CACHE set, only cached routes are considered: for each route
 * of the located node the matching cached entry is looked up and passed
 * to ip6_del_cached_rt().  Otherwise the first route matching the
 * device, gateway, metric and protocol filters in @cfg (each applied
 * only when specified) is deleted, together with its siblings unless a
 * gateway was given.
 *
 * Returns -ESRCH when the table or a matching route does not exist,
 * otherwise the result of the deletion.
 */
static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
{
        struct rt6_info *rt_cache;
        struct fib6_table *table;
        struct fib6_info *rt;
        struct fib6_node *fn;
        int err = -ESRCH;

        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
        if (!table) {
                NL_SET_ERR_MSG(extack, "FIB table does not exist");
                return err;
        }

        rcu_read_lock();

        /* NOTE(review): the last argument is false when RTF_CACHE is
         * requested - presumably it relaxes the match so that cached
         * entries can be found; confirm against fib6_locate().
         */
        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
                         &cfg->fc_src, cfg->fc_src_len,
                         !(cfg->fc_flags & RTF_CACHE));

        if (fn) {
                for_each_fib6_node_rt_rcu(fn) {
                        struct fib6_nh *nh;

                        if (cfg->fc_flags & RTF_CACHE) {
                                struct fib6_result res = {
                                        .f6i = rt,
                                };
                                int rc;

                                rt_cache = rt6_find_cached_rt(&res,
                                                              &cfg->fc_dst,
                                                              &cfg->fc_src);
                                if (rt_cache) {
                                        rc = ip6_del_cached_rt(rt_cache, cfg);
                                        /* -ESRCH means "did not match":
                                         * keep looking at the next route
                                         */
                                        if (rc != -ESRCH) {
                                                rcu_read_unlock();
                                                return rc;
                                        }
                                }
                                continue;
                        }

                        nh = &rt->fib6_nh;
                        if (cfg->fc_ifindex &&
                            (!nh->fib_nh_dev ||
                             nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
                                continue;
                        if (cfg->fc_flags & RTF_GATEWAY &&
                            !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
                                continue;
                        if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
                                continue;
                        if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
                                continue;
                        /* take a reference before leaving the RCU section;
                         * the delete helpers below release it
                         */
                        if (!fib6_info_hold_safe(rt))
                                continue;
                        rcu_read_unlock();

                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);

                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
        rcu_read_unlock();

        return err;
}
3413
3414 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3415 {
3416         struct netevent_redirect netevent;
3417         struct rt6_info *rt, *nrt = NULL;
3418         struct fib6_result res = {};
3419         struct ndisc_options ndopts;
3420         struct inet6_dev *in6_dev;
3421         struct neighbour *neigh;
3422         struct rd_msg *msg;
3423         int optlen, on_link;
3424         u8 *lladdr;
3425
3426         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3427         optlen -= sizeof(*msg);
3428
3429         if (optlen < 0) {
3430                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3431                 return;
3432         }
3433
3434         msg = (struct rd_msg *)icmp6_hdr(skb);
3435
3436         if (ipv6_addr_is_multicast(&msg->dest)) {
3437                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3438                 return;
3439         }
3440
3441         on_link = 0;
3442         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3443                 on_link = 1;
3444         } else if (ipv6_addr_type(&msg->target) !=
3445                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3446                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3447                 return;
3448         }
3449
3450         in6_dev = __in6_dev_get(skb->dev);
3451         if (!in6_dev)
3452                 return;
3453         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3454                 return;
3455
3456         /* RFC2461 8.1:
3457          *      The IP source address of the Redirect MUST be the same as the current
3458          *      first-hop router for the specified ICMP Destination Address.
3459          */
3460
3461         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3462                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3463                 return;
3464         }
3465
3466         lladdr = NULL;
3467         if (ndopts.nd_opts_tgt_lladdr) {
3468                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3469                                              skb->dev);
3470                 if (!lladdr) {
3471                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3472                         return;
3473                 }
3474         }
3475
3476         rt = (struct rt6_info *) dst;
3477         if (rt->rt6i_flags & RTF_REJECT) {
3478                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3479                 return;
3480         }
3481
3482         /* Redirect received -> path was valid.
3483          * Look, redirects are sent only in response to data packets,
3484          * so that this nexthop apparently is reachable. --ANK
3485          */
3486         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3487
3488         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3489         if (!neigh)
3490                 return;
3491
3492         /*
3493          *      We have finally decided to accept it.
3494          */
3495
3496         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3497                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3498                      NEIGH_UPDATE_F_OVERRIDE|
3499                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3500                                      NEIGH_UPDATE_F_ISROUTER)),
3501                      NDISC_REDIRECT, &ndopts);
3502
3503         rcu_read_lock();
3504         res.f6i = rcu_dereference(rt->from);
3505         if (!res.f6i)
3506                 goto out;
3507
3508         res.nh = &res.f6i->fib6_nh;
3509         res.fib6_flags = res.f6i->fib6_flags;
3510         res.fib6_type = res.f6i->fib6_type;
3511         nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3512         if (!nrt)
3513                 goto out;
3514
3515         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3516         if (on_link)
3517                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3518
3519         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3520
3521         /* rt6_insert_exception() will take care of duplicated exceptions */
3522         if (rt6_insert_exception(nrt, &res)) {
3523                 dst_release_immediate(&nrt->dst);
3524                 goto out;
3525         }
3526
3527         netevent.old = &rt->dst;
3528         netevent.new = &nrt->dst;
3529         netevent.daddr = &msg->dest;
3530         netevent.neigh = neigh;
3531         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3532
3533 out:
3534         rcu_read_unlock();
3535         neigh_release(neigh);
3536 }
3537
3538 #ifdef CONFIG_IPV6_ROUTE_INFO
3539 static struct fib6_info *rt6_get_route_info(struct net *net,
3540                                            const struct in6_addr *prefix, int prefixlen,
3541                                            const struct in6_addr *gwaddr,
3542                                            struct net_device *dev)
3543 {
3544         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3545         int ifindex = dev->ifindex;
3546         struct fib6_node *fn;
3547         struct fib6_info *rt = NULL;
3548         struct fib6_table *table;
3549
3550         table = fib6_get_table(net, tb_id);
3551         if (!table)
3552                 return NULL;
3553
3554         rcu_read_lock();
3555         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3556         if (!fn)
3557                 goto out;
3558
3559         for_each_fib6_node_rt_rcu(fn) {
3560                 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3561                         continue;
3562                 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3563                     !rt->fib6_nh.fib_nh_gw_family)
3564                         continue;
3565                 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3566                         continue;
3567                 if (!fib6_info_hold_safe(rt))
3568                         continue;
3569                 break;
3570         }
3571 out:
3572         rcu_read_unlock();
3573         return rt;
3574 }
3575
3576 static struct fib6_info *rt6_add_route_info(struct net *net,
3577                                            const struct in6_addr *prefix, int prefixlen,
3578                                            const struct in6_addr *gwaddr,
3579                                            struct net_device *dev,
3580                                            unsigned int pref)
3581 {
3582         struct fib6_config cfg = {
3583                 .fc_metric      = IP6_RT_PRIO_USER,
3584                 .fc_ifindex     = dev->ifindex,
3585                 .fc_dst_len     = prefixlen,
3586                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3587                                   RTF_UP | RTF_PREF(pref),
3588                 .fc_protocol = RTPROT_RA,
3589                 .fc_type = RTN_UNICAST,
3590                 .fc_nlinfo.portid = 0,
3591                 .fc_nlinfo.nlh = NULL,
3592                 .fc_nlinfo.nl_net = net,
3593         };
3594
3595         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3596         cfg.fc_dst = *prefix;
3597         cfg.fc_gateway = *gwaddr;
3598
3599         /* We should treat it as a default route if prefix length is 0. */
3600         if (!prefixlen)
3601                 cfg.fc_flags |= RTF_DEFAULT;
3602
3603         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3604
3605         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3606 }
3607 #endif
3608
3609 struct fib6_info *rt6_get_dflt_router(struct net *net,
3610                                      const struct in6_addr *addr,
3611                                      struct net_device *dev)
3612 {
3613         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3614         struct fib6_info *rt;
3615         struct fib6_table *table;
3616
3617         table = fib6_get_table(net, tb_id);
3618         if (!table)
3619                 return NULL;
3620
3621         rcu_read_lock();
3622         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3623                 struct fib6_nh *nh = &rt->fib6_nh;
3624
3625                 if (dev == nh->fib_nh_dev &&
3626                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3627                     ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3628                         break;
3629         }
3630         if (rt && !fib6_info_hold_safe(rt))
3631                 rt = NULL;
3632         rcu_read_unlock();
3633         return rt;
3634 }
3635
3636 struct fib6_info *rt6_add_dflt_router(struct net *net,
3637                                      const struct in6_addr *gwaddr,
3638                                      struct net_device *dev,
3639                                      unsigned int pref)
3640 {
3641         struct fib6_config cfg = {
3642                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3643                 .fc_metric      = IP6_RT_PRIO_USER,
3644                 .fc_ifindex     = dev->ifindex,
3645                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3646                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3647                 .fc_protocol = RTPROT_RA,
3648                 .fc_type = RTN_UNICAST,
3649                 .fc_nlinfo.portid = 0,
3650                 .fc_nlinfo.nlh = NULL,
3651                 .fc_nlinfo.nl_net = net,
3652         };
3653
3654         cfg.fc_gateway = *gwaddr;
3655
3656         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3657                 struct fib6_table *table;
3658
3659                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3660                 if (table)
3661                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3662         }
3663
3664         return rt6_get_dflt_router(net, gwaddr, dev);
3665 }
3666
3667 static void __rt6_purge_dflt_routers(struct net *net,
3668                                      struct fib6_table *table)
3669 {
3670         struct fib6_info *rt;
3671
3672 restart:
3673         rcu_read_lock();
3674         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3675                 struct net_device *dev = fib6_info_nh_dev(rt);
3676                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3677
3678                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3679                     (!idev || idev->cnf.accept_ra != 2) &&
3680                     fib6_info_hold_safe(rt)) {
3681                         rcu_read_unlock();
3682                         ip6_del_rt(net, rt);
3683                         goto restart;
3684                 }
3685         }
3686         rcu_read_unlock();
3687
3688         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3689 }
3690
3691 void rt6_purge_dflt_routers(struct net *net)
3692 {
3693         struct fib6_table *table;
3694         struct hlist_head *head;
3695         unsigned int h;
3696
3697         rcu_read_lock();
3698
3699         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3700                 head = &net->ipv6.fib_table_hash[h];
3701                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3702                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3703                                 __rt6_purge_dflt_routers(net, table);
3704                 }
3705         }
3706
3707         rcu_read_unlock();
3708 }
3709
3710 static void rtmsg_to_fib6_config(struct net *net,
3711                                  struct in6_rtmsg *rtmsg,
3712                                  struct fib6_config *cfg)
3713 {
3714         *cfg = (struct fib6_config){
3715                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3716                          : RT6_TABLE_MAIN,
3717                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3718                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3719                 .fc_expires = rtmsg->rtmsg_info,
3720                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3721                 .fc_src_len = rtmsg->rtmsg_src_len,
3722                 .fc_flags = rtmsg->rtmsg_flags,
3723                 .fc_type = rtmsg->rtmsg_type,
3724
3725                 .fc_nlinfo.nl_net = net,
3726
3727                 .fc_dst = rtmsg->rtmsg_dst,
3728                 .fc_src = rtmsg->rtmsg_src,
3729                 .fc_gateway = rtmsg->rtmsg_gateway,
3730         };
3731 }
3732
3733 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3734 {
3735         struct fib6_config cfg;
3736         struct in6_rtmsg rtmsg;
3737         int err;
3738
3739         switch (cmd) {
3740         case SIOCADDRT:         /* Add a route */
3741         case SIOCDELRT:         /* Delete a route */
3742                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3743                         return -EPERM;
3744                 err = copy_from_user(&rtmsg, arg,
3745                                      sizeof(struct in6_rtmsg));
3746                 if (err)
3747                         return -EFAULT;
3748
3749                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3750
3751                 rtnl_lock();
3752                 switch (cmd) {
3753                 case SIOCADDRT:
3754                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3755                         break;
3756                 case SIOCDELRT:
3757                         err = ip6_route_del(&cfg, NULL);
3758                         break;
3759                 default:
3760                         err = -EINVAL;
3761                 }
3762                 rtnl_unlock();
3763
3764                 return err;
3765         }
3766
3767         return -EINVAL;
3768 }
3769
3770 /*
3771  *      Drop the packet on the floor
3772  */
3773
3774 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3775 {
3776         struct dst_entry *dst = skb_dst(skb);
3777         struct net *net = dev_net(dst->dev);
3778         struct inet6_dev *idev;
3779         int type;
3780
3781         if (netif_is_l3_master(skb->dev) &&
3782             dst->dev == net->loopback_dev)
3783                 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3784         else
3785                 idev = ip6_dst_idev(dst);
3786
3787         switch (ipstats_mib_noroutes) {
3788         case IPSTATS_MIB_INNOROUTES:
3789                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3790                 if (type == IPV6_ADDR_ANY) {
3791                         IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3792                         break;
3793                 }
3794                 /* FALLTHROUGH */
3795         case IPSTATS_MIB_OUTNOROUTES:
3796                 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3797                 break;
3798         }
3799
3800         /* Start over by dropping the dst for l3mdev case */
3801         if (netif_is_l3_master(skb->dev))
3802                 skb_dst_drop(skb);
3803
3804         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3805         kfree_skb(skb);
3806         return 0;
3807 }
3808
3809 static int ip6_pkt_discard(struct sk_buff *skb)
3810 {
3811         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3812 }
3813
3814 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3815 {
3816         skb->dev = skb_dst(skb)->dev;
3817         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3818 }
3819
3820 static int ip6_pkt_prohibit(struct sk_buff *skb)
3821 {
3822         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3823 }
3824
3825 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3826 {
3827         skb->dev = skb_dst(skb)->dev;
3828         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3829 }
3830
3831 /*
3832  *      Allocate a dst for local (unicast / anycast) address.
3833  */
3834
3835 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3836                                      struct inet6_dev *idev,
3837                                      const struct in6_addr *addr,
3838                                      bool anycast, gfp_t gfp_flags)
3839 {
3840         struct fib6_config cfg = {
3841                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3842                 .fc_ifindex = idev->dev->ifindex,
3843                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3844                 .fc_dst = *addr,
3845                 .fc_dst_len = 128,
3846                 .fc_protocol = RTPROT_KERNEL,
3847                 .fc_nlinfo.nl_net = net,
3848                 .fc_ignore_dev_down = true,
3849         };
3850
3851         if (anycast) {
3852                 cfg.fc_type = RTN_ANYCAST;
3853                 cfg.fc_flags |= RTF_ANYCAST;
3854         } else {
3855                 cfg.fc_type = RTN_LOCAL;
3856                 cfg.fc_flags |= RTF_LOCAL;
3857         }
3858
3859         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3860 }
3861
/*
 * remove deleted ip from prefsrc entries
 *
 * Argument block handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device	*dev;	/* restrict to this device; NULL matches any */
	struct net		*net;	/* namespace whose null entry is skipped */
	struct in6_addr		*addr;	/* the address that went away */
};
3868
3869 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3870 {
3871         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3872         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3873         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3874
3875         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3876             rt != net->ipv6.fib6_null_entry &&
3877             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3878                 spin_lock_bh(&rt6_exception_lock);
3879                 /* remove prefsrc entry */
3880                 rt->fib6_prefsrc.plen = 0;
3881                 spin_unlock_bh(&rt6_exception_lock);
3882         }
3883         return 0;
3884 }
3885
3886 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3887 {
3888         struct net *net = dev_net(ifp->idev->dev);
3889         struct arg_dev_net_ip adni = {
3890                 .dev = ifp->idev->dev,
3891                 .net = net,
3892                 .addr = &ifp->addr,
3893         };
3894         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3895 }
3896
3897 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3898
3899 /* Remove routers and update dst entries when gateway turn into host. */
3900 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3901 {
3902         struct in6_addr *gateway = (struct in6_addr *)arg;
3903
3904         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3905             rt->fib6_nh.fib_nh_gw_family &&
3906             ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3907                 return -1;
3908         }
3909
3910         /* Further clean up cached routes in exception table.
3911          * This is needed because cached route may have a different
3912          * gateway than its 'parent' in the case of an ip redirect.
3913          */
3914         rt6_exceptions_clean_tohost(rt, gateway);
3915
3916         return 0;
3917 }
3918
3919 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3920 {
3921         fib6_clean_all(net, fib6_clean_tohost, gateway);
3922 }
3923
/*
 * Argument block for the device up/down FIB walkers.  Only one member of
 * the union is meaningful per walk: nh_flags for fib6_ifup(), event for
 * fib6_ifdown().
 */
struct arg_netdev_event {
	const struct net_device	*dev;
	union {
		unsigned char	nh_flags;	/* nexthop flags to clear */
		unsigned long	event;		/* NETDEV_* event code */
	};
};
3931
3932 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3933 {
3934         struct fib6_info *iter;
3935         struct fib6_node *fn;
3936
3937         fn = rcu_dereference_protected(rt->fib6_node,
3938                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3939         iter = rcu_dereference_protected(fn->leaf,
3940                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3941         while (iter) {
3942                 if (iter->fib6_metric == rt->fib6_metric &&
3943                     rt6_qualify_for_ecmp(iter))
3944                         return iter;
3945                 iter = rcu_dereference_protected(iter->fib6_next,
3946                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3947         }
3948
3949         return NULL;
3950 }
3951
3952 static bool rt6_is_dead(const struct fib6_info *rt)
3953 {
3954         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3955             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3956              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3957                 return true;
3958
3959         return false;
3960 }
3961
3962 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3963 {
3964         struct fib6_info *iter;
3965         int total = 0;
3966
3967         if (!rt6_is_dead(rt))
3968                 total += rt->fib6_nh.fib_nh_weight;
3969
3970         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3971                 if (!rt6_is_dead(iter))
3972                         total += iter->fib6_nh.fib_nh_weight;
3973         }
3974
3975         return total;
3976 }
3977
3978 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3979 {
3980         int upper_bound = -1;
3981
3982         if (!rt6_is_dead(rt)) {
3983                 *weight += rt->fib6_nh.fib_nh_weight;
3984                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3985                                                     total) - 1;
3986         }
3987         atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3988 }
3989
3990 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3991 {
3992         struct fib6_info *iter;
3993         int weight = 0;
3994
3995         rt6_upper_bound_set(rt, &weight, total);
3996
3997         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3998                 rt6_upper_bound_set(iter, &weight, total);
3999 }
4000
4001 void rt6_multipath_rebalance(struct fib6_info *rt)
4002 {
4003         struct fib6_info *first;
4004         int total;
4005
4006         /* In case the entire multipath route was marked for flushing,
4007          * then there is no need to rebalance upon the removal of every
4008          * sibling route.
4009          */
4010         if (!rt->fib6_nsiblings || rt->should_flush)
4011                 return;
4012
4013         /* During lookup routes are evaluated in order, so we need to
4014          * make sure upper bounds are assigned from the first sibling
4015          * onwards.
4016          */
4017         first = rt6_multipath_first_sibling(rt);
4018         if (WARN_ON_ONCE(!first))
4019                 return;
4020
4021         total = rt6_multipath_total_weight(first);
4022         rt6_multipath_upper_bound_set(first, total);
4023 }
4024
4025 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4026 {
4027         const struct arg_netdev_event *arg = p_arg;
4028         struct net *net = dev_net(arg->dev);
4029
4030         if (rt != net->ipv6.fib6_null_entry &&
4031             rt->fib6_nh.fib_nh_dev == arg->dev) {
4032                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4033                 fib6_update_sernum_upto_root(net, rt);
4034                 rt6_multipath_rebalance(rt);
4035         }
4036
4037         return 0;
4038 }
4039
4040 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4041 {
4042         struct arg_netdev_event arg = {
4043                 .dev = dev,
4044                 {
4045                         .nh_flags = nh_flags,
4046                 },
4047         };
4048
4049         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4050                 arg.nh_flags |= RTNH_F_LINKDOWN;
4051
4052         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4053 }
4054
4055 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4056                                    const struct net_device *dev)
4057 {
4058         struct fib6_info *iter;
4059
4060         if (rt->fib6_nh.fib_nh_dev == dev)
4061                 return true;
4062         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4063                 if (iter->fib6_nh.fib_nh_dev == dev)
4064                         return true;
4065
4066         return false;
4067 }
4068
4069 static void rt6_multipath_flush(struct fib6_info *rt)
4070 {
4071         struct fib6_info *iter;
4072
4073         rt->should_flush = 1;
4074         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4075                 iter->should_flush = 1;
4076 }
4077
4078 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4079                                              const struct net_device *down_dev)
4080 {
4081         struct fib6_info *iter;
4082         unsigned int dead = 0;
4083
4084         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4085             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4086                 dead++;
4087         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4088                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4089                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4090                         dead++;
4091
4092         return dead;
4093 }
4094
4095 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4096                                        const struct net_device *dev,
4097                                        unsigned char nh_flags)
4098 {
4099         struct fib6_info *iter;
4100
4101         if (rt->fib6_nh.fib_nh_dev == dev)
4102                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4103         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4104                 if (iter->fib6_nh.fib_nh_dev == dev)
4105                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4106 }
4107
4108 /* called with write lock held for table with rt */
4109 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4110 {
4111         const struct arg_netdev_event *arg = p_arg;
4112         const struct net_device *dev = arg->dev;
4113         struct net *net = dev_net(dev);
4114
4115         if (rt == net->ipv6.fib6_null_entry)
4116                 return 0;
4117
4118         switch (arg->event) {
4119         case NETDEV_UNREGISTER:
4120                 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4121         case NETDEV_DOWN:
4122                 if (rt->should_flush)
4123                         return -1;
4124                 if (!rt->fib6_nsiblings)
4125                         return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4126                 if (rt6_multipath_uses_dev(rt, dev)) {
4127                         unsigned int count;
4128
4129                         count = rt6_multipath_dead_count(rt, dev);
4130                         if (rt->fib6_nsiblings + 1 == count) {
4131                                 rt6_multipath_flush(rt);
4132                                 return -1;
4133                         }
4134                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4135                                                    RTNH_F_LINKDOWN);
4136                         fib6_update_sernum(net, rt);
4137                         rt6_multipath_rebalance(rt);
4138                 }
4139                 return -2;
4140         case NETDEV_CHANGE:
4141                 if (rt->fib6_nh.fib_nh_dev != dev ||
4142                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4143                         break;
4144                 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4145                 rt6_multipath_rebalance(rt);
4146                 break;
4147         }
4148
4149         return 0;
4150 }
4151
4152 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4153 {
4154         struct arg_netdev_event arg = {
4155                 .dev = dev,
4156                 {
4157                         .event = event,
4158                 },
4159         };
4160         struct net *net = dev_net(dev);
4161
4162         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4163                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4164         else
4165                 fib6_clean_all(net, fib6_ifdown, &arg);
4166 }
4167
4168 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4169 {
4170         rt6_sync_down_dev(dev, event);
4171         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4172         neigh_ifdown(&nd_tbl, dev);
4173 }
4174
/* Argument block handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device	*dev;	/* device whose MTU changed */
	unsigned int		mtu;	/* the new MTU */
};
4179
4180 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4181 {
4182         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4183         struct inet6_dev *idev;
4184
4185         /* In IPv6 pmtu discovery is not optional,
4186            so that RTAX_MTU lock cannot disable it.
4187            We still use this lock to block changes
4188            caused by addrconf/ndisc.
4189         */
4190
4191         idev = __in6_dev_get(arg->dev);
4192         if (!idev)
4193                 return 0;
4194
4195         /* For administrative MTU increase, there is no way to discover
4196            IPv6 PMTU increase, so PMTU increase should be updated here.
4197            Since RFC 1981 doesn't include administrative MTU increase
4198            update PMTU increase is a MUST. (i.e. jumbo frame)
4199          */
4200         if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4201             !fib6_metric_locked(rt, RTAX_MTU)) {
4202                 u32 mtu = rt->fib6_pmtu;
4203
4204                 if (mtu >= arg->mtu ||
4205                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4206                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4207
4208                 spin_lock_bh(&rt6_exception_lock);
4209                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4210                 spin_unlock_bh(&rt6_exception_lock);
4211         }
4212         return 0;
4213 }
4214
4215 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4216 {
4217         struct rt6_mtu_change_arg arg = {
4218                 .dev = dev,
4219                 .mtu = mtu,
4220         };
4221
4222         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4223 }
4224
4225 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4226         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4227         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4228         [RTA_OIF]               = { .type = NLA_U32 },
4229         [RTA_IIF]               = { .type = NLA_U32 },
4230         [RTA_PRIORITY]          = { .type = NLA_U32 },
4231         [RTA_METRICS]           = { .type = NLA_NESTED },
4232         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4233         [RTA_PREF]              = { .type = NLA_U8 },
4234         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4235         [RTA_ENCAP]             = { .type = NLA_NESTED },
4236         [RTA_EXPIRES]           = { .type = NLA_U32 },
4237         [RTA_UID]               = { .type = NLA_U32 },
4238         [RTA_MARK]              = { .type = NLA_U32 },
4239         [RTA_TABLE]             = { .type = NLA_U32 },
4240         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4241         [RTA_SPORT]             = { .type = NLA_U16 },
4242         [RTA_DPORT]             = { .type = NLA_U16 },
4243 };
4244
4245 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4246                               struct fib6_config *cfg,
4247                               struct netlink_ext_ack *extack)
4248 {
4249         struct rtmsg *rtm;
4250         struct nlattr *tb[RTA_MAX+1];
4251         unsigned int pref;
4252         int err;
4253
4254         err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4255                                      rtm_ipv6_policy, extack);
4256         if (err < 0)
4257                 goto errout;
4258
4259         err = -EINVAL;
4260         rtm = nlmsg_data(nlh);
4261
4262         *cfg = (struct fib6_config){
4263                 .fc_table = rtm->rtm_table,
4264                 .fc_dst_len = rtm->rtm_dst_len,
4265                 .fc_src_len = rtm->rtm_src_len,
4266                 .fc_flags = RTF_UP,
4267                 .fc_protocol = rtm->rtm_protocol,
4268                 .fc_type = rtm->rtm_type,
4269
4270                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4271                 .fc_nlinfo.nlh = nlh,
4272                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4273         };
4274
4275         if (rtm->rtm_type == RTN_UNREACHABLE ||
4276             rtm->rtm_type == RTN_BLACKHOLE ||
4277             rtm->rtm_type == RTN_PROHIBIT ||
4278             rtm->rtm_type == RTN_THROW)
4279                 cfg->fc_flags |= RTF_REJECT;
4280
4281         if (rtm->rtm_type == RTN_LOCAL)
4282                 cfg->fc_flags |= RTF_LOCAL;
4283
4284         if (rtm->rtm_flags & RTM_F_CLONED)
4285                 cfg->fc_flags |= RTF_CACHE;
4286
4287         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4288
4289         if (tb[RTA_GATEWAY]) {
4290                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4291                 cfg->fc_flags |= RTF_GATEWAY;
4292         }
4293         if (tb[RTA_VIA]) {
4294                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4295                 goto errout;
4296         }
4297
4298         if (tb[RTA_DST]) {
4299                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4300
4301                 if (nla_len(tb[RTA_DST]) < plen)
4302                         goto errout;
4303
4304                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4305         }
4306
4307         if (tb[RTA_SRC]) {
4308                 int plen = (rtm->rtm_src_len + 7) >> 3;
4309
4310                 if (nla_len(tb[RTA_SRC]) < plen)
4311                         goto errout;
4312
4313                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4314         }
4315
4316         if (tb[RTA_PREFSRC])
4317                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4318
4319         if (tb[RTA_OIF])
4320                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4321
4322         if (tb[RTA_PRIORITY])
4323                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4324
4325         if (tb[RTA_METRICS]) {
4326                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4327                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4328         }
4329
4330         if (tb[RTA_TABLE])
4331                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4332
4333         if (tb[RTA_MULTIPATH]) {
4334                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4335                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4336
4337                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4338                                                      cfg->fc_mp_len, extack);
4339                 if (err < 0)
4340                         goto errout;
4341         }
4342
4343         if (tb[RTA_PREF]) {
4344                 pref = nla_get_u8(tb[RTA_PREF]);
4345                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4346                     pref != ICMPV6_ROUTER_PREF_HIGH)
4347                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4348                 cfg->fc_flags |= RTF_PREF(pref);
4349         }
4350
4351         if (tb[RTA_ENCAP])
4352                 cfg->fc_encap = tb[RTA_ENCAP];
4353
4354         if (tb[RTA_ENCAP_TYPE]) {
4355                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4356
4357                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4358                 if (err < 0)
4359                         goto errout;
4360         }
4361
4362         if (tb[RTA_EXPIRES]) {
4363                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4364
4365                 if (addrconf_finite_timeout(timeout)) {
4366                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4367                         cfg->fc_flags |= RTF_EXPIRES;
4368                 }
4369         }
4370
4371         err = 0;
4372 errout:
4373         return err;
4374 }
4375
/* One nexthop of a multipath add request, queued on a local list while
 * ip6_route_multipath_add() creates and then inserts the routes.
 */
4376 struct rt6_nh {
             /* route created for this nexthop; set to NULL after the insert attempt */
4377         struct fib6_info *fib6_info;
             /* per-nexthop copy of the config, reused to delete the route on rollback */
4378         struct fib6_config r_cfg;
             /* linkage on the caller's nexthop list */
4379         struct list_head next;
4380 };
4381
4382 static int ip6_route_info_append(struct net *net,
4383                                  struct list_head *rt6_nh_list,
4384                                  struct fib6_info *rt,
4385                                  struct fib6_config *r_cfg)
4386 {
4387         struct rt6_nh *nh;
4388         int err = -EEXIST;
4389
4390         list_for_each_entry(nh, rt6_nh_list, next) {
4391                 /* check if fib6_info already exists */
4392                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4393                         return err;
4394         }
4395
4396         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4397         if (!nh)
4398                 return -ENOMEM;
4399         nh->fib6_info = rt;
4400         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4401         list_add_tail(&nh->next, rt6_nh_list);
4402
4403         return 0;
4404 }
4405
4406 static void ip6_route_mpath_notify(struct fib6_info *rt,
4407                                    struct fib6_info *rt_last,
4408                                    struct nl_info *info,
4409                                    __u16 nlflags)
4410 {
4411         /* if this is an APPEND route, then rt points to the first route
4412          * inserted and rt_last points to last route inserted. Userspace
4413          * wants a consistent dump of the route which starts at the first
4414          * nexthop. Since sibling routes are always added at the end of
4415          * the list, find the first sibling of the last route appended
4416          */
4417         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4418                 rt = list_first_entry(&rt_last->fib6_siblings,
4419                                       struct fib6_info,
4420                                       fib6_siblings);
4421         }
4422
4423         if (rt)
4424                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4425 }
4426
4427 static int ip6_route_multipath_add(struct fib6_config *cfg,
4428                                    struct netlink_ext_ack *extack)
4429 {
4430         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4431         struct nl_info *info = &cfg->fc_nlinfo;
4432         struct fib6_config r_cfg;
4433         struct rtnexthop *rtnh;
4434         struct fib6_info *rt;
4435         struct rt6_nh *err_nh;
4436         struct rt6_nh *nh, *nh_safe;
4437         __u16 nlflags;
4438         int remaining;
4439         int attrlen;
4440         int err = 1;
4441         int nhn = 0;
4442         int replace = (cfg->fc_nlinfo.nlh &&
4443                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4444         LIST_HEAD(rt6_nh_list);
4445
4446         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4447         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4448                 nlflags |= NLM_F_APPEND;
4449
4450         remaining = cfg->fc_mp_len;
4451         rtnh = (struct rtnexthop *)cfg->fc_mp;
4452
4453         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4454          * fib6_info structs per nexthop
4455          */
4456         while (rtnh_ok(rtnh, remaining)) {
4457                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4458                 if (rtnh->rtnh_ifindex)
4459                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4460
4461                 attrlen = rtnh_attrlen(rtnh);
4462                 if (attrlen > 0) {
4463                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4464
4465                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4466                         if (nla) {
4467                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4468                                 r_cfg.fc_flags |= RTF_GATEWAY;
4469                         }
4470                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4471                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4472                         if (nla)
4473                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4474                 }
4475
4476                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4477                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4478                 if (IS_ERR(rt)) {
4479                         err = PTR_ERR(rt);
4480                         rt = NULL;
4481                         goto cleanup;
4482                 }
4483                 if (!rt6_qualify_for_ecmp(rt)) {
4484                         err = -EINVAL;
4485                         NL_SET_ERR_MSG(extack,
4486                                        "Device only routes can not be added for IPv6 using the multipath API.");
4487                         fib6_info_release(rt);
4488                         goto cleanup;
4489                 }
4490
4491                 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4492
4493                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4494                                             rt, &r_cfg);
4495                 if (err) {
4496                         fib6_info_release(rt);
4497                         goto cleanup;
4498                 }
4499
4500                 rtnh = rtnh_next(rtnh, &remaining);
4501         }
4502
4503         /* for add and replace send one notification with all nexthops.
4504          * Skip the notification in fib6_add_rt2node and send one with
4505          * the full route when done
4506          */
4507         info->skip_notify = 1;
4508
4509         err_nh = NULL;
4510         list_for_each_entry(nh, &rt6_nh_list, next) {
4511                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4512                 fib6_info_release(nh->fib6_info);
4513
4514                 if (!err) {
4515                         /* save reference to last route successfully inserted */
4516                         rt_last = nh->fib6_info;
4517
4518                         /* save reference to first route for notification */
4519                         if (!rt_notif)
4520                                 rt_notif = nh->fib6_info;
4521                 }
4522
4523                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4524                 nh->fib6_info = NULL;
4525                 if (err) {
4526                         if (replace && nhn)
4527                                 NL_SET_ERR_MSG_MOD(extack,
4528                                                    "multipath route replace failed (check consistency of installed routes)");
4529                         err_nh = nh;
4530                         goto add_errout;
4531                 }
4532
4533                 /* Because each route is added like a single route we remove
4534                  * these flags after the first nexthop: if there is a collision,
4535                  * we have already failed to add the first nexthop:
4536                  * fib6_add_rt2node() has rejected it; when replacing, old
4537                  * nexthops have been replaced by first new, the rest should
4538                  * be added to it.
4539                  */
4540                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4541                                                      NLM_F_REPLACE);
4542                 nhn++;
4543         }
4544
4545         /* success ... tell user about new route */
4546         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4547         goto cleanup;
4548
4549 add_errout:
4550         /* send notification for routes that were added so that
4551          * the delete notifications sent by ip6_route_del are
4552          * coherent
4553          */
4554         if (rt_notif)
4555                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4556
4557         /* Delete routes that were already added */
4558         list_for_each_entry(nh, &rt6_nh_list, next) {
4559                 if (err_nh == nh)
4560                         break;
4561                 ip6_route_del(&nh->r_cfg, extack);
4562         }
4563
4564 cleanup:
4565         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4566                 if (nh->fib6_info)
4567                         fib6_info_release(nh->fib6_info);
4568                 list_del(&nh->next);
4569                 kfree(nh);
4570         }
4571
4572         return err;
4573 }
4574
4575 static int ip6_route_multipath_del(struct fib6_config *cfg,
4576                                    struct netlink_ext_ack *extack)
4577 {
4578         struct fib6_config r_cfg;
4579         struct rtnexthop *rtnh;
4580         int remaining;
4581         int attrlen;
4582         int err = 1, last_err = 0;
4583
4584         remaining = cfg->fc_mp_len;
4585         rtnh = (struct rtnexthop *)cfg->fc_mp;
4586
4587         /* Parse a Multipath Entry */
4588         while (rtnh_ok(rtnh, remaining)) {
4589                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4590                 if (rtnh->rtnh_ifindex)
4591                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4592
4593                 attrlen = rtnh_attrlen(rtnh);
4594                 if (attrlen > 0) {
4595                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4596
4597                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4598                         if (nla) {
4599                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4600                                 r_cfg.fc_flags |= RTF_GATEWAY;
4601                         }
4602                 }
4603                 err = ip6_route_del(&r_cfg, extack);
4604                 if (err)
4605                         last_err = err;
4606
4607                 rtnh = rtnh_next(rtnh, &remaining);
4608         }
4609
4610         return last_err;
4611 }
4612
4613 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4614                               struct netlink_ext_ack *extack)
4615 {
4616         struct fib6_config cfg;
4617         int err;
4618
4619         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4620         if (err < 0)
4621                 return err;
4622
4623         if (cfg.fc_mp)
4624                 return ip6_route_multipath_del(&cfg, extack);
4625         else {
4626                 cfg.fc_delete_all_nh = 1;
4627                 return ip6_route_del(&cfg, extack);
4628         }
4629 }
4630
4631 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4632                               struct netlink_ext_ack *extack)
4633 {
4634         struct fib6_config cfg;
4635         int err;
4636
4637         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4638         if (err < 0)
4639                 return err;
4640
4641         if (cfg.fc_metric == 0)
4642                 cfg.fc_metric = IP6_RT_PRIO_USER;
4643
4644         if (cfg.fc_mp)
4645                 return ip6_route_multipath_add(&cfg, extack);
4646         else
4647                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4648 }
4649
4650 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4651 {
4652         int nexthop_len = 0;
4653
4654         if (rt->fib6_nsiblings) {
4655                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4656                             + NLA_ALIGN(sizeof(struct rtnexthop))
4657                             + nla_total_size(16) /* RTA_GATEWAY */
4658                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4659
4660                 nexthop_len *= rt->fib6_nsiblings;
4661         }
4662
4663         return NLMSG_ALIGN(sizeof(struct rtmsg))
4664                + nla_total_size(16) /* RTA_SRC */
4665                + nla_total_size(16) /* RTA_DST */
4666                + nla_total_size(16) /* RTA_GATEWAY */
4667                + nla_total_size(16) /* RTA_PREFSRC */
4668                + nla_total_size(4) /* RTA_TABLE */
4669                + nla_total_size(4) /* RTA_IIF */
4670                + nla_total_size(4) /* RTA_OIF */
4671                + nla_total_size(4) /* RTA_PRIORITY */
4672                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4673                + nla_total_size(sizeof(struct rta_cacheinfo))
4674                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4675                + nla_total_size(1) /* RTA_PREF */
4676                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4677                + nexthop_len;
4678 }
4679
/*
 * Fill @skb with one rtnetlink route message of @type describing @rt.
 *
 * When @dst is non-NULL, the prefixes, flags, gateway, output device,
 * metrics and expiry are taken from the rt6_info behind it; otherwise they
 * come from the fib6_info @rt. Table, route type and protocol always come
 * from @rt.
 *
 * A non-NULL @dest is emitted as a /128 RTA_DST; a non-NULL @src as a /128
 * RTA_SRC (only with CONFIG_IPV6_SUBTREES). A non-zero @iif is emitted as
 * RTA_IIF, except for multicast destinations with CONFIG_IPV6_MROUTE, which
 * are handed to ip6mr_get_route().
 *
 * Returns 0 on success, or -EMSGSIZE if the header or an attribute could
 * not be added (or ip6mr_get_route() failed); after the header has been
 * added, failures cancel the partial message.
 */
4680 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4681                          struct fib6_info *rt, struct dst_entry *dst,
4682                          struct in6_addr *dest, struct in6_addr *src,
4683                          int iif, int type, u32 portid, u32 seq,
4684                          unsigned int flags)
4685 {
4686         struct rt6_info *rt6 = (struct rt6_info *)dst;
4687         struct rt6key *rt6_dst, *rt6_src;
4688         u32 *pmetrics, table, rt6_flags;
4689         struct nlmsghdr *nlh;
4690         struct rtmsg *rtm;
4691         long expires = 0;
4692
4693         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4694         if (!nlh)
4695                 return -EMSGSIZE;
4696
             /* choose where keys and flags are read from: the dst if one was
              * passed in, else the FIB entry
              */
4697         if (rt6) {
4698                 rt6_dst = &rt6->rt6i_dst;
4699                 rt6_src = &rt6->rt6i_src;
4700                 rt6_flags = rt6->rt6i_flags;
4701         } else {
4702                 rt6_dst = &rt->fib6_dst;
4703                 rt6_src = &rt->fib6_src;
4704                 rt6_flags = rt->fib6_flags;
4705         }
4706
4707         rtm = nlmsg_data(nlh);
4708         rtm->rtm_family = AF_INET6;
4709         rtm->rtm_dst_len = rt6_dst->plen;
4710         rtm->rtm_src_len = rt6_src->plen;
4711         rtm->rtm_tos = 0;
4712         if (rt->fib6_table)
4713                 table = rt->fib6_table->tb6_id;
4714         else
4715                 table = RT6_TABLE_UNSPEC;
             /* table ids of 256 and above are reported as RT_TABLE_COMPAT in
              * the header; the full id is always carried in RTA_TABLE
              */
4716         rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4717         if (nla_put_u32(skb, RTA_TABLE, table))
4718                 goto nla_put_failure;
4719
4720         rtm->rtm_type = rt->fib6_type;
4721         rtm->rtm_flags = 0;
4722         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4723         rtm->rtm_protocol = rt->fib6_protocol;
4724
4725         if (rt6_flags & RTF_CACHE)
4726                 rtm->rtm_flags |= RTM_F_CLONED;
4727
             /* an explicit @dest overrides the route prefix and is reported
              * as a host (/128) destination
              */
4728         if (dest) {
4729                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4730                         goto nla_put_failure;
4731                 rtm->rtm_dst_len = 128;
4732         } else if (rtm->rtm_dst_len)
4733                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4734                         goto nla_put_failure;
4735 #ifdef CONFIG_IPV6_SUBTREES
4736         if (src) {
4737                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4738                         goto nla_put_failure;
4739                 rtm->rtm_src_len = 128;
4740         } else if (rtm->rtm_src_len &&
4741                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4742                 goto nla_put_failure;
4743 #endif
4744         if (iif) {
4745 #ifdef CONFIG_IPV6_MROUTE
4746                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4747                         int err = ip6mr_get_route(net, skb, rtm, portid);
4748
                             /* NOTE(review): this returns without calling
                              * nlmsg_end(); presumably ip6mr_get_route() has
                              * taken care of the reply in this case - confirm
                              */
4749                         if (err == 0)
4750                                 return 0;
4751                         if (err < 0)
4752                                 goto nla_put_failure;
4753                 } else
4754 #endif
4755                         if (nla_put_u32(skb, RTA_IIF, iif))
4756                                 goto nla_put_failure;
4757         } else if (dest) {
                     /* output lookup: report the source address selected for
                      * @dest, if one could be determined
                      */
4758                 struct in6_addr saddr_buf;
4759                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4760                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4761                         goto nla_put_failure;
4762         }
4763
             /* the route's configured preferred source; note this is emitted
              * in addition to any RTA_PREFSRC added by the branch above
              */
4764         if (rt->fib6_prefsrc.plen) {
4765                 struct in6_addr saddr_buf;
4766                 saddr_buf = rt->fib6_prefsrc.addr;
4767                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4768                         goto nla_put_failure;
4769         }
4770
4771         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4772         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4773                 goto nla_put_failure;
4774
4775         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4776                 goto nla_put_failure;
4777
4778         /* For multipath routes, walk the siblings list and add
4779          * each as a nexthop within RTA_MULTIPATH.
4780          */
4781         if (rt6) {
                     /* a dst describes a single path: gateway and device only */
4782                 if (rt6_flags & RTF_GATEWAY &&
4783                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4784                         goto nla_put_failure;
4785
4786                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4787                         goto nla_put_failure;
4788         } else if (rt->fib6_nsiblings) {
4789                 struct fib6_info *sibling, *next_sibling;
4790                 struct nlattr *mp;
4791
4792                 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4793                 if (!mp)
4794                         goto nla_put_failure;
4795
                     /* the route itself is the first nexthop in the nest,
                      * followed by each of its siblings
                      */
4796                 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4797                                     rt->fib6_nh.fib_nh_weight) < 0)
4798                         goto nla_put_failure;
4799
4800                 list_for_each_entry_safe(sibling, next_sibling,
4801                                          &rt->fib6_siblings, fib6_siblings) {
4802                         if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4803                                             sibling->fib6_nh.fib_nh_weight) < 0)
4804                                 goto nla_put_failure;
4805                 }
4806
4807                 nla_nest_end(skb, mp);
4808         } else {
4809                 unsigned char nh_flags = 0;
4810
                     /* single nexthop: flags reported through nh_flags are
                      * merged into the header flags below
                      */
4811                 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4812                                      &nh_flags, false) < 0)
4813                         goto nla_put_failure;
4814
4815                 rtm->rtm_flags |= nh_flags;
4816         }
4817
             /* turn the stored expiry into a value relative to now (jiffies) */
4818         if (rt6_flags & RTF_EXPIRES) {
4819                 expires = dst ? dst->expires : rt->expires;
4820                 expires -= jiffies;
4821         }
4822
4823         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4824                 goto nla_put_failure;
4825
4826         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4827                 goto nla_put_failure;
4828
4829
4830         nlmsg_end(skb, nlh);
4831         return 0;
4832
4833 nla_put_failure:
             /* discard everything added since nlmsg_put() */
4834         nlmsg_cancel(skb, nlh);
4835         return -EMSGSIZE;
4836 }
4837
4838 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4839                                const struct net_device *dev)
4840 {
4841         if (f6i->fib6_nh.fib_nh_dev == dev)
4842                 return true;
4843
4844         if (f6i->fib6_nsiblings) {
4845                 struct fib6_info *sibling, *next_sibling;
4846
4847                 list_for_each_entry_safe(sibling, next_sibling,
4848                                          &f6i->fib6_siblings, fib6_siblings) {
4849                         if (sibling->fib6_nh.fib_nh_dev == dev)
4850                                 return true;
4851                 }
4852         }
4853
4854         return false;
4855 }
4856
4857 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4858 {
4859         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4860         struct fib_dump_filter *filter = &arg->filter;
4861         unsigned int flags = NLM_F_MULTI;
4862         struct net *net = arg->net;
4863
4864         if (rt == net->ipv6.fib6_null_entry)
4865                 return 0;
4866
4867         if ((filter->flags & RTM_F_PREFIX) &&
4868             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4869                 /* success since this is not a prefix route */
4870                 return 1;
4871         }
4872         if (filter->filter_set) {
4873                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4874                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4875                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4876                         return 1;
4877                 }
4878                 flags |= NLM_F_DUMP_FILTERED;
4879         }
4880
4881         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4882                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4883                              arg->cb->nlh->nlmsg_seq, flags);
4884 }
4885
4886 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4887                                         const struct nlmsghdr *nlh,
4888                                         struct nlattr **tb,
4889                                         struct netlink_ext_ack *extack)
4890 {
4891         struct rtmsg *rtm;
4892         int i, err;
4893
4894         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4895                 NL_SET_ERR_MSG_MOD(extack,
4896                                    "Invalid header for get route request");
4897                 return -EINVAL;
4898         }
4899
4900         if (!netlink_strict_get_check(skb))
4901                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4902                                               rtm_ipv6_policy, extack);
4903
4904         rtm = nlmsg_data(nlh);
4905         if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4906             (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4907             rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4908             rtm->rtm_type) {
4909                 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4910                 return -EINVAL;
4911         }
4912         if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4913                 NL_SET_ERR_MSG_MOD(extack,
4914                                    "Invalid flags for get route request");
4915                 return -EINVAL;
4916         }
4917
4918         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4919                                             rtm_ipv6_policy, extack);
4920         if (err)
4921                 return err;
4922
4923         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4924             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4925                 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4926                 return -EINVAL;
4927         }
4928
4929         for (i = 0; i <= RTA_MAX; i++) {
4930                 if (!tb[i])
4931                         continue;
4932
4933                 switch (i) {
4934                 case RTA_SRC:
4935                 case RTA_DST:
4936                 case RTA_IIF:
4937                 case RTA_OIF:
4938                 case RTA_MARK:
4939                 case RTA_UID:
4940                 case RTA_SPORT:
4941                 case RTA_DPORT:
4942                 case RTA_IP_PROTO:
4943                         break;
4944                 default:
4945                         NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4946                         return -EINVAL;
4947                 }
4948         }
4949
4950         return 0;
4951 }
4952
4953 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4954                               struct netlink_ext_ack *extack)
4955 {
4956         struct net *net = sock_net(in_skb->sk);
4957         struct nlattr *tb[RTA_MAX+1];
4958         int err, iif = 0, oif = 0;
4959         struct fib6_info *from;
4960         struct dst_entry *dst;
4961         struct rt6_info *rt;
4962         struct sk_buff *skb;
4963         struct rtmsg *rtm;
4964         struct flowi6 fl6 = {};
4965         bool fibmatch;
4966
4967         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4968         if (err < 0)
4969                 goto errout;
4970
4971         err = -EINVAL;
4972         rtm = nlmsg_data(nlh);
4973         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4974         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4975
4976         if (tb[RTA_SRC]) {
4977                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4978                         goto errout;
4979
4980                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4981         }
4982
4983         if (tb[RTA_DST]) {
4984                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4985                         goto errout;
4986
4987                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4988         }
4989
4990         if (tb[RTA_IIF])
4991                 iif = nla_get_u32(tb[RTA_IIF]);
4992
4993         if (tb[RTA_OIF])
4994                 oif = nla_get_u32(tb[RTA_OIF]);
4995
4996         if (tb[RTA_MARK])
4997                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4998
4999         if (tb[RTA_UID])
5000                 fl6.flowi6_uid = make_kuid(current_user_ns(),
5001                                            nla_get_u32(tb[RTA_UID]));
5002         else
5003                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5004
5005         if (tb[RTA_SPORT])
5006                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5007
5008         if (tb[RTA_DPORT])
5009                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5010
5011         if (tb[RTA_IP_PROTO]) {
5012                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5013                                                   &fl6.flowi6_proto, AF_INET6,
5014                                                   extack);
5015                 if (err)
5016                         goto errout;
5017         }
5018
5019         if (iif) {
5020                 struct net_device *dev;
5021                 int flags = 0;
5022
5023                 rcu_read_lock();
5024
5025                 dev = dev_get_by_index_rcu(net, iif);
5026                 if (!dev) {
5027                         rcu_read_unlock();
5028                         err = -ENODEV;
5029                         goto errout;
5030                 }
5031
5032                 fl6.flowi6_iif = iif;
5033
5034                 if (!ipv6_addr_any(&fl6.saddr))
5035                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5036
5037                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5038
5039                 rcu_read_unlock();
5040         } else {
5041                 fl6.flowi6_oif = oif;
5042
5043                 dst = ip6_route_output(net, NULL, &fl6);
5044         }
5045
5046
5047         rt = container_of(dst, struct rt6_info, dst);
5048         if (rt->dst.error) {
5049                 err = rt->dst.error;
5050                 ip6_rt_put(rt);
5051                 goto errout;
5052         }
5053
5054         if (rt == net->ipv6.ip6_null_entry) {
5055                 err = rt->dst.error;
5056                 ip6_rt_put(rt);
5057                 goto errout;
5058         }
5059
5060         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5061         if (!skb) {
5062                 ip6_rt_put(rt);
5063                 err = -ENOBUFS;
5064                 goto errout;
5065         }
5066
5067         skb_dst_set(skb, &rt->dst);
5068
5069         rcu_read_lock();
5070         from = rcu_dereference(rt->from);
5071         if (from) {
5072                 if (fibmatch)
5073                         err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5074                                             iif, RTM_NEWROUTE,
5075                                             NETLINK_CB(in_skb).portid,
5076                                             nlh->nlmsg_seq, 0);
5077                 else
5078                         err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5079                                             &fl6.saddr, iif, RTM_NEWROUTE,
5080                                             NETLINK_CB(in_skb).portid,
5081                                             nlh->nlmsg_seq, 0);
5082         } else {
5083                 err = -ENETUNREACH;
5084         }
5085         rcu_read_unlock();
5086
5087         if (err < 0) {
5088                 kfree_skb(skb);
5089                 goto errout;
5090         }
5091
5092         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5093 errout:
5094         return err;
5095 }
5096
5097 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5098                      unsigned int nlm_flags)
5099 {
5100         struct sk_buff *skb;
5101         struct net *net = info->nl_net;
5102         u32 seq;
5103         int err;
5104
5105         err = -ENOBUFS;
5106         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5107
5108         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5109         if (!skb)
5110                 goto errout;
5111
5112         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5113                             event, info->portid, seq, nlm_flags);
5114         if (err < 0) {
5115                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5116                 WARN_ON(err == -EMSGSIZE);
5117                 kfree_skb(skb);
5118                 goto errout;
5119         }
5120         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5121                     info->nlh, gfp_any());
5122         return;
5123 errout:
5124         if (err < 0)
5125                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5126 }
5127
5128 static int ip6_route_dev_notify(struct notifier_block *this,
5129                                 unsigned long event, void *ptr)
5130 {
5131         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5132         struct net *net = dev_net(dev);
5133
5134         if (!(dev->flags & IFF_LOOPBACK))
5135                 return NOTIFY_OK;
5136
5137         if (event == NETDEV_REGISTER) {
5138                 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5139                 net->ipv6.ip6_null_entry->dst.dev = dev;
5140                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5142                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5143                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5144                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5145                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5146 #endif
5147          } else if (event == NETDEV_UNREGISTER &&
5148                     dev->reg_state != NETREG_UNREGISTERED) {
5149                 /* NETDEV_UNREGISTER could be fired for multiple times by
5150                  * netdev_wait_allrefs(). Make sure we only call this once.
5151                  */
5152                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5153 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5154                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5155                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5156 #endif
5157         }
5158
5159         return NOTIFY_OK;
5160 }
5161
5162 /*
5163  *      /proc
5164  */
5165
5166 #ifdef CONFIG_PROC_FS
5167 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5168 {
5169         struct net *net = (struct net *)seq->private;
5170         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5171                    net->ipv6.rt6_stats->fib_nodes,
5172                    net->ipv6.rt6_stats->fib_route_nodes,
5173                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5174                    net->ipv6.rt6_stats->fib_rt_entries,
5175                    net->ipv6.rt6_stats->fib_rt_cache,
5176                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5177                    net->ipv6.rt6_stats->fib_discarded_routes);
5178
5179         return 0;
5180 }
5181 #endif  /* CONFIG_PROC_FS */
5182
5183 #ifdef CONFIG_SYSCTL
5184
5185 static
5186 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5187                               void __user *buffer, size_t *lenp, loff_t *ppos)
5188 {
5189         struct net *net;
5190         int delay;
5191         int ret;
5192         if (!write)
5193                 return -EINVAL;
5194
5195         net = (struct net *)ctl->extra1;
5196         delay = net->ipv6.sysctl.flush_delay;
5197         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5198         if (ret)
5199                 return ret;
5200
5201         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5202         return 0;
5203 }
5204
5205 static int zero;
5206 static int one = 1;
5207
5208 static struct ctl_table ipv6_route_table_template[] = {
5209         {
5210                 .procname       =       "flush",
5211                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5212                 .maxlen         =       sizeof(int),
5213                 .mode           =       0200,
5214                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5215         },
5216         {
5217                 .procname       =       "gc_thresh",
5218                 .data           =       &ip6_dst_ops_template.gc_thresh,
5219                 .maxlen         =       sizeof(int),
5220                 .mode           =       0644,
5221                 .proc_handler   =       proc_dointvec,
5222         },
5223         {
5224                 .procname       =       "max_size",
5225                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5226                 .maxlen         =       sizeof(int),
5227                 .mode           =       0644,
5228                 .proc_handler   =       proc_dointvec,
5229         },
5230         {
5231                 .procname       =       "gc_min_interval",
5232                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5233                 .maxlen         =       sizeof(int),
5234                 .mode           =       0644,
5235                 .proc_handler   =       proc_dointvec_jiffies,
5236         },
5237         {
5238                 .procname       =       "gc_timeout",
5239                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5240                 .maxlen         =       sizeof(int),
5241                 .mode           =       0644,
5242                 .proc_handler   =       proc_dointvec_jiffies,
5243         },
5244         {
5245                 .procname       =       "gc_interval",
5246                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5247                 .maxlen         =       sizeof(int),
5248                 .mode           =       0644,
5249                 .proc_handler   =       proc_dointvec_jiffies,
5250         },
5251         {
5252                 .procname       =       "gc_elasticity",
5253                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5254                 .maxlen         =       sizeof(int),
5255                 .mode           =       0644,
5256                 .proc_handler   =       proc_dointvec,
5257         },
5258         {
5259                 .procname       =       "mtu_expires",
5260                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5261                 .maxlen         =       sizeof(int),
5262                 .mode           =       0644,
5263                 .proc_handler   =       proc_dointvec_jiffies,
5264         },
5265         {
5266                 .procname       =       "min_adv_mss",
5267                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5268                 .maxlen         =       sizeof(int),
5269                 .mode           =       0644,
5270                 .proc_handler   =       proc_dointvec,
5271         },
5272         {
5273                 .procname       =       "gc_min_interval_ms",
5274                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5275                 .maxlen         =       sizeof(int),
5276                 .mode           =       0644,
5277                 .proc_handler   =       proc_dointvec_ms_jiffies,
5278         },
5279         {
5280                 .procname       =       "skip_notify_on_dev_down",
5281                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5282                 .maxlen         =       sizeof(int),
5283                 .mode           =       0644,
5284                 .proc_handler   =       proc_dointvec,
5285                 .extra1         =       &zero,
5286                 .extra2         =       &one,
5287         },
5288         { }
5289 };
5290
5291 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5292 {
5293         struct ctl_table *table;
5294
5295         table = kmemdup(ipv6_route_table_template,
5296                         sizeof(ipv6_route_table_template),
5297                         GFP_KERNEL);
5298
5299         if (table) {
5300                 table[0].data = &net->ipv6.sysctl.flush_delay;
5301                 table[0].extra1 = net;
5302                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5303                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5304                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5305                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5306                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5307                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5308                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5309                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5310                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5311                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5312
5313                 /* Don't export sysctls to unprivileged users */
5314                 if (net->user_ns != &init_user_ns)
5315                         table[0].procname = NULL;
5316         }
5317
5318         return table;
5319 }
5320 #endif
5321
5322 static int __net_init ip6_route_net_init(struct net *net)
5323 {
5324         int ret = -ENOMEM;
5325
5326         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5327                sizeof(net->ipv6.ip6_dst_ops));
5328
5329         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5330                 goto out_ip6_dst_ops;
5331
5332         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5333                                             sizeof(*net->ipv6.fib6_null_entry),
5334                                             GFP_KERNEL);
5335         if (!net->ipv6.fib6_null_entry)
5336                 goto out_ip6_dst_entries;
5337
5338         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5339                                            sizeof(*net->ipv6.ip6_null_entry),
5340                                            GFP_KERNEL);
5341         if (!net->ipv6.ip6_null_entry)
5342                 goto out_fib6_null_entry;
5343         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5344         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5345                          ip6_template_metrics, true);
5346
5347 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5348         net->ipv6.fib6_has_custom_rules = false;
5349         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5350                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5351                                                GFP_KERNEL);
5352         if (!net->ipv6.ip6_prohibit_entry)
5353                 goto out_ip6_null_entry;
5354         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5355         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5356                          ip6_template_metrics, true);
5357
5358         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5359                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5360                                                GFP_KERNEL);
5361         if (!net->ipv6.ip6_blk_hole_entry)
5362                 goto out_ip6_prohibit_entry;
5363         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5364         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5365                          ip6_template_metrics, true);
5366 #endif
5367
5368         net->ipv6.sysctl.flush_delay = 0;
5369         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5370         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5371         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5372         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5373         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5374         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5375         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5376         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5377
5378         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5379
5380         ret = 0;
5381 out:
5382         return ret;
5383
5384 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5385 out_ip6_prohibit_entry:
5386         kfree(net->ipv6.ip6_prohibit_entry);
5387 out_ip6_null_entry:
5388         kfree(net->ipv6.ip6_null_entry);
5389 #endif
5390 out_fib6_null_entry:
5391         kfree(net->ipv6.fib6_null_entry);
5392 out_ip6_dst_entries:
5393         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5394 out_ip6_dst_ops:
5395         goto out;
5396 }
5397
5398 static void __net_exit ip6_route_net_exit(struct net *net)
5399 {
5400         kfree(net->ipv6.fib6_null_entry);
5401         kfree(net->ipv6.ip6_null_entry);
5402 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5403         kfree(net->ipv6.ip6_prohibit_entry);
5404         kfree(net->ipv6.ip6_blk_hole_entry);
5405 #endif
5406         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5407 }
5408
5409 static int __net_init ip6_route_net_init_late(struct net *net)
5410 {
5411 #ifdef CONFIG_PROC_FS
5412         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5413                         sizeof(struct ipv6_route_iter));
5414         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5415                         rt6_stats_seq_show, NULL);
5416 #endif
5417         return 0;
5418 }
5419
5420 static void __net_exit ip6_route_net_exit_late(struct net *net)
5421 {
5422 #ifdef CONFIG_PROC_FS
5423         remove_proc_entry("ipv6_route", net->proc_net);
5424         remove_proc_entry("rt6_stats", net->proc_net);
5425 #endif
5426 }
5427
5428 static struct pernet_operations ip6_route_net_ops = {
5429         .init = ip6_route_net_init,
5430         .exit = ip6_route_net_exit,
5431 };
5432
5433 static int __net_init ipv6_inetpeer_init(struct net *net)
5434 {
5435         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5436
5437         if (!bp)
5438                 return -ENOMEM;
5439         inet_peer_base_init(bp);
5440         net->ipv6.peers = bp;
5441         return 0;
5442 }
5443
5444 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5445 {
5446         struct inet_peer_base *bp = net->ipv6.peers;
5447
5448         net->ipv6.peers = NULL;
5449         inetpeer_invalidate_tree(bp);
5450         kfree(bp);
5451 }
5452
5453 static struct pernet_operations ipv6_inetpeer_ops = {
5454         .init   =       ipv6_inetpeer_init,
5455         .exit   =       ipv6_inetpeer_exit,
5456 };
5457
5458 static struct pernet_operations ip6_route_net_late_ops = {
5459         .init = ip6_route_net_init_late,
5460         .exit = ip6_route_net_exit_late,
5461 };
5462
5463 static struct notifier_block ip6_route_dev_notifier = {
5464         .notifier_call = ip6_route_dev_notify,
5465         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5466 };
5467
5468 void __init ip6_route_init_special_entries(void)
5469 {
5470         /* Registering of the loopback is done before this portion of code,
5471          * the loopback reference in rt6_info will not be taken, do it
5472          * manually for init_net */
5473         init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5474         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5475         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5476   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5477         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5478         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5479         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5480         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5481   #endif
5482 }
5483
5484 int __init ip6_route_init(void)
5485 {
5486         int ret;
5487         int cpu;
5488
5489         ret = -ENOMEM;
5490         ip6_dst_ops_template.kmem_cachep =
5491                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5492                                   SLAB_HWCACHE_ALIGN, NULL);
5493         if (!ip6_dst_ops_template.kmem_cachep)
5494                 goto out;
5495
5496         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5497         if (ret)
5498                 goto out_kmem_cache;
5499
5500         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5501         if (ret)
5502                 goto out_dst_entries;
5503
5504         ret = register_pernet_subsys(&ip6_route_net_ops);
5505         if (ret)
5506                 goto out_register_inetpeer;
5507
5508         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5509
5510         ret = fib6_init();
5511         if (ret)
5512                 goto out_register_subsys;
5513
5514         ret = xfrm6_init();
5515         if (ret)
5516                 goto out_fib6_init;
5517
5518         ret = fib6_rules_init();
5519         if (ret)
5520                 goto xfrm6_init;
5521
5522         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5523         if (ret)
5524                 goto fib6_rules_init;
5525
5526         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5527                                    inet6_rtm_newroute, NULL, 0);
5528         if (ret < 0)
5529                 goto out_register_late_subsys;
5530
5531         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5532                                    inet6_rtm_delroute, NULL, 0);
5533         if (ret < 0)
5534                 goto out_register_late_subsys;
5535
5536         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5537                                    inet6_rtm_getroute, NULL,
5538                                    RTNL_FLAG_DOIT_UNLOCKED);
5539         if (ret < 0)
5540                 goto out_register_late_subsys;
5541
5542         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5543         if (ret)
5544                 goto out_register_late_subsys;
5545
5546         for_each_possible_cpu(cpu) {
5547                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5548
5549                 INIT_LIST_HEAD(&ul->head);
5550                 spin_lock_init(&ul->lock);
5551         }
5552
5553 out:
5554         return ret;
5555
5556 out_register_late_subsys:
5557         rtnl_unregister_all(PF_INET6);
5558         unregister_pernet_subsys(&ip6_route_net_late_ops);
5559 fib6_rules_init:
5560         fib6_rules_cleanup();
5561 xfrm6_init:
5562         xfrm6_fini();
5563 out_fib6_init:
5564         fib6_gc_cleanup();
5565 out_register_subsys:
5566         unregister_pernet_subsys(&ip6_route_net_ops);
5567 out_register_inetpeer:
5568         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5569 out_dst_entries:
5570         dst_entries_destroy(&ip6_dst_blackhole_ops);
5571 out_kmem_cache:
5572         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5573         goto out;
5574 }
5575
5576 void ip6_route_cleanup(void)
5577 {
5578         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5579         unregister_pernet_subsys(&ip6_route_net_late_ops);
5580         fib6_rules_cleanup();
5581         xfrm6_fini();
5582         fib6_gc_cleanup();
5583         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5584         unregister_pernet_subsys(&ip6_route_net_ops);
5585         dst_entries_destroy(&ip6_dst_blackhole_ops);
5586         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5587 }