Merge tag 'wireless-drivers-next-for-davem-2017-06-12' of git://git.kernel.org/pub...
[linux-block.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119
120 struct uncached_list {
121         spinlock_t              lock;
122         struct list_head        head;
123 };
124
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130
131         rt->dst.flags |= DST_NOCACHE;
132         rt->rt6i_uncached_list = ul;
133
134         spin_lock_bh(&ul->lock);
135         list_add_tail(&rt->rt6i_uncached, &ul->head);
136         spin_unlock_bh(&ul->lock);
137 }
138
139 static void rt6_uncached_list_del(struct rt6_info *rt)
140 {
141         if (!list_empty(&rt->rt6i_uncached)) {
142                 struct uncached_list *ul = rt->rt6i_uncached_list;
143
144                 spin_lock_bh(&ul->lock);
145                 list_del(&rt->rt6i_uncached);
146                 spin_unlock_bh(&ul->lock);
147         }
148 }
149
150 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
151 {
152         struct net_device *loopback_dev = net->loopback_dev;
153         int cpu;
154
155         if (dev == loopback_dev)
156                 return;
157
158         for_each_possible_cpu(cpu) {
159                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
160                 struct rt6_info *rt;
161
162                 spin_lock_bh(&ul->lock);
163                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
164                         struct inet6_dev *rt_idev = rt->rt6i_idev;
165                         struct net_device *rt_dev = rt->dst.dev;
166
167                         if (rt_idev->dev == dev) {
168                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
169                                 in6_dev_put(rt_idev);
170                         }
171
172                         if (rt_dev == dev) {
173                                 rt->dst.dev = loopback_dev;
174                                 dev_hold(rt->dst.dev);
175                                 dev_put(rt_dev);
176                         }
177                 }
178                 spin_unlock_bh(&ul->lock);
179         }
180 }
181
182 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
183 {
184         return dst_metrics_write_ptr(rt->dst.from);
185 }
186
187 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190
191         if (rt->rt6i_flags & RTF_PCPU)
192                 return rt6_pcpu_cow_metrics(rt);
193         else if (rt->rt6i_flags & RTF_CACHE)
194                 return NULL;
195         else
196                 return dst_cow_metrics_generic(dst, old);
197 }
198
199 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
200                                              struct sk_buff *skb,
201                                              const void *daddr)
202 {
203         struct in6_addr *p = &rt->rt6i_gateway;
204
205         if (!ipv6_addr_any(p))
206                 return (const void *) p;
207         else if (skb)
208                 return &ipv6_hdr(skb)->daddr;
209         return daddr;
210 }
211
212 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
213                                           struct sk_buff *skb,
214                                           const void *daddr)
215 {
216         struct rt6_info *rt = (struct rt6_info *) dst;
217         struct neighbour *n;
218
219         daddr = choose_neigh_daddr(rt, skb, daddr);
220         n = __ipv6_neigh_lookup(dst->dev, daddr);
221         if (n)
222                 return n;
223         return neigh_create(&nd_tbl, daddr, dst->dev);
224 }
225
226 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 {
228         struct net_device *dev = dst->dev;
229         struct rt6_info *rt = (struct rt6_info *)dst;
230
231         daddr = choose_neigh_daddr(rt, NULL, daddr);
232         if (!daddr)
233                 return;
234         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235                 return;
236         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237                 return;
238         __ipv6_confirm_neigh(dev, daddr);
239 }
240
241 static struct dst_ops ip6_dst_ops_template = {
242         .family                 =       AF_INET6,
243         .gc                     =       ip6_dst_gc,
244         .gc_thresh              =       1024,
245         .check                  =       ip6_dst_check,
246         .default_advmss         =       ip6_default_advmss,
247         .mtu                    =       ip6_mtu,
248         .cow_metrics            =       ipv6_cow_metrics,
249         .destroy                =       ip6_dst_destroy,
250         .ifdown                 =       ip6_dst_ifdown,
251         .negative_advice        =       ip6_negative_advice,
252         .link_failure           =       ip6_link_failure,
253         .update_pmtu            =       ip6_rt_update_pmtu,
254         .redirect               =       rt6_do_redirect,
255         .local_out              =       __ip6_local_out,
256         .neigh_lookup           =       ip6_neigh_lookup,
257         .confirm_neigh          =       ip6_confirm_neigh,
258 };
259
260 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 {
262         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263
264         return mtu ? : dst->dev->mtu;
265 }
266
267 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
268                                          struct sk_buff *skb, u32 mtu)
269 {
270 }
271
272 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
273                                       struct sk_buff *skb)
274 {
275 }
276
277 static struct dst_ops ip6_dst_blackhole_ops = {
278         .family                 =       AF_INET6,
279         .destroy                =       ip6_dst_destroy,
280         .check                  =       ip6_dst_check,
281         .mtu                    =       ip6_blackhole_mtu,
282         .default_advmss         =       ip6_default_advmss,
283         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
284         .redirect               =       ip6_rt_blackhole_redirect,
285         .cow_metrics            =       dst_cow_metrics_generic,
286         .neigh_lookup           =       ip6_neigh_lookup,
287 };
288
289 static const u32 ip6_template_metrics[RTAX_MAX] = {
290         [RTAX_HOPLIMIT - 1] = 0,
291 };
292
293 static const struct rt6_info ip6_null_entry_template = {
294         .dst = {
295                 .__refcnt       = ATOMIC_INIT(1),
296                 .__use          = 1,
297                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
298                 .error          = -ENETUNREACH,
299                 .input          = ip6_pkt_discard,
300                 .output         = ip6_pkt_discard_out,
301         },
302         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
303         .rt6i_protocol  = RTPROT_KERNEL,
304         .rt6i_metric    = ~(u32) 0,
305         .rt6i_ref       = ATOMIC_INIT(1),
306 };
307
308 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
309
310 static const struct rt6_info ip6_prohibit_entry_template = {
311         .dst = {
312                 .__refcnt       = ATOMIC_INIT(1),
313                 .__use          = 1,
314                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
315                 .error          = -EACCES,
316                 .input          = ip6_pkt_prohibit,
317                 .output         = ip6_pkt_prohibit_out,
318         },
319         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
320         .rt6i_protocol  = RTPROT_KERNEL,
321         .rt6i_metric    = ~(u32) 0,
322         .rt6i_ref       = ATOMIC_INIT(1),
323 };
324
325 static const struct rt6_info ip6_blk_hole_entry_template = {
326         .dst = {
327                 .__refcnt       = ATOMIC_INIT(1),
328                 .__use          = 1,
329                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
330                 .error          = -EINVAL,
331                 .input          = dst_discard,
332                 .output         = dst_discard_out,
333         },
334         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
335         .rt6i_protocol  = RTPROT_KERNEL,
336         .rt6i_metric    = ~(u32) 0,
337         .rt6i_ref       = ATOMIC_INIT(1),
338 };
339
340 #endif
341
342 static void rt6_info_init(struct rt6_info *rt)
343 {
344         struct dst_entry *dst = &rt->dst;
345
346         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
347         INIT_LIST_HEAD(&rt->rt6i_siblings);
348         INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
350
351 /* allocate dst with ip6_dst_ops */
352 static struct rt6_info *__ip6_dst_alloc(struct net *net,
353                                         struct net_device *dev,
354                                         int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         0, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt)
360                 rt6_info_init(rt);
361
362         return rt;
363 }
364
365 struct rt6_info *ip6_dst_alloc(struct net *net,
366                                struct net_device *dev,
367                                int flags)
368 {
369         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
370
371         if (rt) {
372                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
373                 if (rt->rt6i_pcpu) {
374                         int cpu;
375
376                         for_each_possible_cpu(cpu) {
377                                 struct rt6_info **p;
378
379                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
380                                 /* no one shares rt */
381                                 *p =  NULL;
382                         }
383                 } else {
384                         dst_destroy((struct dst_entry *)rt);
385                         return NULL;
386                 }
387         }
388
389         return rt;
390 }
391 EXPORT_SYMBOL(ip6_dst_alloc);
392
393 static void ip6_dst_destroy(struct dst_entry *dst)
394 {
395         struct rt6_info *rt = (struct rt6_info *)dst;
396         struct dst_entry *from = dst->from;
397         struct inet6_dev *idev;
398
399         dst_destroy_metrics_generic(dst);
400         free_percpu(rt->rt6i_pcpu);
401         rt6_uncached_list_del(rt);
402
403         idev = rt->rt6i_idev;
404         if (idev) {
405                 rt->rt6i_idev = NULL;
406                 in6_dev_put(idev);
407         }
408
409         dst->from = NULL;
410         dst_release(from);
411 }
412
413 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
414                            int how)
415 {
416         struct rt6_info *rt = (struct rt6_info *)dst;
417         struct inet6_dev *idev = rt->rt6i_idev;
418         struct net_device *loopback_dev =
419                 dev_net(dev)->loopback_dev;
420
421         if (dev != loopback_dev) {
422                 if (idev && idev->dev == dev) {
423                         struct inet6_dev *loopback_idev =
424                                 in6_dev_get(loopback_dev);
425                         if (loopback_idev) {
426                                 rt->rt6i_idev = loopback_idev;
427                                 in6_dev_put(idev);
428                         }
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
448         }
449         return false;
450 }
451
452 /* Multipath route selection:
453  *   Hash based function using packet header and flowlabel.
454  * Adapted from fib_info_hashfn()
455  */
456 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
457                                const struct flowi6 *fl6)
458 {
459         return get_hash_from_flowi6(fl6) % candidate_count;
460 }
461
462 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
463                                              struct flowi6 *fl6, int oif,
464                                              int strict)
465 {
466         struct rt6_info *sibling, *next_sibling;
467         int route_choosen;
468
469         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
470         /* Don't change the route, if route_choosen == 0
471          * (siblings does not include ourself)
472          */
473         if (route_choosen)
474                 list_for_each_entry_safe(sibling, next_sibling,
475                                 &match->rt6i_siblings, rt6i_siblings) {
476                         route_choosen--;
477                         if (route_choosen == 0) {
478                                 if (rt6_score_route(sibling, oif, strict) < 0)
479                                         break;
480                                 match = sibling;
481                                 break;
482                         }
483                 }
484         return match;
485 }
486
487 /*
488  *      Route lookup. Any table->tb6_lock is implied.
489  */
490
491 static inline struct rt6_info *rt6_device_match(struct net *net,
492                                                     struct rt6_info *rt,
493                                                     const struct in6_addr *saddr,
494                                                     int oif,
495                                                     int flags)
496 {
497         struct rt6_info *local = NULL;
498         struct rt6_info *sprt;
499
500         if (!oif && ipv6_addr_any(saddr))
501                 goto out;
502
503         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
504                 struct net_device *dev = sprt->dst.dev;
505
506                 if (oif) {
507                         if (dev->ifindex == oif)
508                                 return sprt;
509                         if (dev->flags & IFF_LOOPBACK) {
510                                 if (!sprt->rt6i_idev ||
511                                     sprt->rt6i_idev->dev->ifindex != oif) {
512                                         if (flags & RT6_LOOKUP_F_IFACE)
513                                                 continue;
514                                         if (local &&
515                                             local->rt6i_idev->dev->ifindex == oif)
516                                                 continue;
517                                 }
518                                 local = sprt;
519                         }
520                 } else {
521                         if (ipv6_chk_addr(net, saddr, dev,
522                                           flags & RT6_LOOKUP_F_IFACE))
523                                 return sprt;
524                 }
525         }
526
527         if (oif) {
528                 if (local)
529                         return local;
530
531                 if (flags & RT6_LOOKUP_F_IFACE)
532                         return net->ipv6.ip6_null_entry;
533         }
534 out:
535         return rt;
536 }
537
538 #ifdef CONFIG_IPV6_ROUTER_PREF
539 struct __rt6_probe_work {
540         struct work_struct work;
541         struct in6_addr target;
542         struct net_device *dev;
543 };
544
545 static void rt6_probe_deferred(struct work_struct *w)
546 {
547         struct in6_addr mcaddr;
548         struct __rt6_probe_work *work =
549                 container_of(w, struct __rt6_probe_work, work);
550
551         addrconf_addr_solict_mult(&work->target, &mcaddr);
552         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
553         dev_put(work->dev);
554         kfree(work);
555 }
556
557 static void rt6_probe(struct rt6_info *rt)
558 {
559         struct __rt6_probe_work *work;
560         struct neighbour *neigh;
561         /*
562          * Okay, this does not seem to be appropriate
563          * for now, however, we need to check if it
564          * is really so; aka Router Reachability Probing.
565          *
566          * Router Reachability Probe MUST be rate-limited
567          * to no more than one per minute.
568          */
569         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
570                 return;
571         rcu_read_lock_bh();
572         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
573         if (neigh) {
574                 if (neigh->nud_state & NUD_VALID)
575                         goto out;
576
577                 work = NULL;
578                 write_lock(&neigh->lock);
579                 if (!(neigh->nud_state & NUD_VALID) &&
580                     time_after(jiffies,
581                                neigh->updated +
582                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
583                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
584                         if (work)
585                                 __neigh_set_probe_once(neigh);
586                 }
587                 write_unlock(&neigh->lock);
588         } else {
589                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590         }
591
592         if (work) {
593                 INIT_WORK(&work->work, rt6_probe_deferred);
594                 work->target = rt->rt6i_gateway;
595                 dev_hold(rt->dst.dev);
596                 work->dev = rt->dst.dev;
597                 schedule_work(&work->work);
598         }
599
600 out:
601         rcu_read_unlock_bh();
602 }
603 #else
604 static inline void rt6_probe(struct rt6_info *rt)
605 {
606 }
607 #endif
608
609 /*
610  * Default Router Selection (RFC 2461 6.3.6)
611  */
612 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
613 {
614         struct net_device *dev = rt->dst.dev;
615         if (!oif || dev->ifindex == oif)
616                 return 2;
617         if ((dev->flags & IFF_LOOPBACK) &&
618             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
619                 return 1;
620         return 0;
621 }
622
623 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
624 {
625         struct neighbour *neigh;
626         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
627
628         if (rt->rt6i_flags & RTF_NONEXTHOP ||
629             !(rt->rt6i_flags & RTF_GATEWAY))
630                 return RT6_NUD_SUCCEED;
631
632         rcu_read_lock_bh();
633         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
634         if (neigh) {
635                 read_lock(&neigh->lock);
636                 if (neigh->nud_state & NUD_VALID)
637                         ret = RT6_NUD_SUCCEED;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639                 else if (!(neigh->nud_state & NUD_FAILED))
640                         ret = RT6_NUD_SUCCEED;
641                 else
642                         ret = RT6_NUD_FAIL_PROBE;
643 #endif
644                 read_unlock(&neigh->lock);
645         } else {
646                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
647                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
648         }
649         rcu_read_unlock_bh();
650
651         return ret;
652 }
653
654 static int rt6_score_route(struct rt6_info *rt, int oif,
655                            int strict)
656 {
657         int m;
658
659         m = rt6_check_dev(rt, oif);
660         if (!m && (strict & RT6_LOOKUP_F_IFACE))
661                 return RT6_NUD_FAIL_HARD;
662 #ifdef CONFIG_IPV6_ROUTER_PREF
663         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
664 #endif
665         if (strict & RT6_LOOKUP_F_REACHABLE) {
666                 int n = rt6_check_neigh(rt);
667                 if (n < 0)
668                         return n;
669         }
670         return m;
671 }
672
673 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
674                                    int *mpri, struct rt6_info *match,
675                                    bool *do_rr)
676 {
677         int m;
678         bool match_do_rr = false;
679         struct inet6_dev *idev = rt->rt6i_idev;
680         struct net_device *dev = rt->dst.dev;
681
682         if (dev && !netif_carrier_ok(dev) &&
683             idev->cnf.ignore_routes_with_linkdown &&
684             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
685                 goto out;
686
687         if (rt6_check_expired(rt))
688                 goto out;
689
690         m = rt6_score_route(rt, oif, strict);
691         if (m == RT6_NUD_FAIL_DO_RR) {
692                 match_do_rr = true;
693                 m = 0; /* lowest valid score */
694         } else if (m == RT6_NUD_FAIL_HARD) {
695                 goto out;
696         }
697
698         if (strict & RT6_LOOKUP_F_REACHABLE)
699                 rt6_probe(rt);
700
701         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
702         if (m > *mpri) {
703                 *do_rr = match_do_rr;
704                 *mpri = m;
705                 match = rt;
706         }
707 out:
708         return match;
709 }
710
711 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
712                                      struct rt6_info *rr_head,
713                                      u32 metric, int oif, int strict,
714                                      bool *do_rr)
715 {
716         struct rt6_info *rt, *match, *cont;
717         int mpri = -1;
718
719         match = NULL;
720         cont = NULL;
721         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
722                 if (rt->rt6i_metric != metric) {
723                         cont = rt;
724                         break;
725                 }
726
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728         }
729
730         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
731                 if (rt->rt6i_metric != metric) {
732                         cont = rt;
733                         break;
734                 }
735
736                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
737         }
738
739         if (match || !cont)
740                 return match;
741
742         for (rt = cont; rt; rt = rt->dst.rt6_next)
743                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
744
745         return match;
746 }
747
748 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
749 {
750         struct rt6_info *match, *rt0;
751         struct net *net;
752         bool do_rr = false;
753
754         rt0 = fn->rr_ptr;
755         if (!rt0)
756                 fn->rr_ptr = rt0 = fn->leaf;
757
758         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
759                              &do_rr);
760
761         if (do_rr) {
762                 struct rt6_info *next = rt0->dst.rt6_next;
763
764                 /* no entries matched; do round-robin */
765                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
766                         next = fn->leaf;
767
768                 if (next != rt0)
769                         fn->rr_ptr = next;
770         }
771
772         net = dev_net(rt0->dst.dev);
773         return match ? match : net->ipv6.ip6_null_entry;
774 }
775
776 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
777 {
778         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
779 }
780
781 #ifdef CONFIG_IPV6_ROUTE_INFO
782 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
783                   const struct in6_addr *gwaddr)
784 {
785         struct net *net = dev_net(dev);
786         struct route_info *rinfo = (struct route_info *) opt;
787         struct in6_addr prefix_buf, *prefix;
788         unsigned int pref;
789         unsigned long lifetime;
790         struct rt6_info *rt;
791
792         if (len < sizeof(struct route_info)) {
793                 return -EINVAL;
794         }
795
796         /* Sanity check for prefix_len and length */
797         if (rinfo->length > 3) {
798                 return -EINVAL;
799         } else if (rinfo->prefix_len > 128) {
800                 return -EINVAL;
801         } else if (rinfo->prefix_len > 64) {
802                 if (rinfo->length < 2) {
803                         return -EINVAL;
804                 }
805         } else if (rinfo->prefix_len > 0) {
806                 if (rinfo->length < 1) {
807                         return -EINVAL;
808                 }
809         }
810
811         pref = rinfo->route_pref;
812         if (pref == ICMPV6_ROUTER_PREF_INVALID)
813                 return -EINVAL;
814
815         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
816
817         if (rinfo->length == 3)
818                 prefix = (struct in6_addr *)rinfo->prefix;
819         else {
820                 /* this function is safe */
821                 ipv6_addr_prefix(&prefix_buf,
822                                  (struct in6_addr *)rinfo->prefix,
823                                  rinfo->prefix_len);
824                 prefix = &prefix_buf;
825         }
826
827         if (rinfo->prefix_len == 0)
828                 rt = rt6_get_dflt_router(gwaddr, dev);
829         else
830                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
831                                         gwaddr, dev);
832
833         if (rt && !lifetime) {
834                 ip6_del_rt(rt);
835                 rt = NULL;
836         }
837
838         if (!rt && lifetime)
839                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
840                                         dev, pref);
841         else if (rt)
842                 rt->rt6i_flags = RTF_ROUTEINFO |
843                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
844
845         if (rt) {
846                 if (!addrconf_finite_timeout(lifetime))
847                         rt6_clean_expires(rt);
848                 else
849                         rt6_set_expires(rt, jiffies + HZ * lifetime);
850
851                 ip6_rt_put(rt);
852         }
853         return 0;
854 }
855 #endif
856
857 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
858                                         struct in6_addr *saddr)
859 {
860         struct fib6_node *pn;
861         while (1) {
862                 if (fn->fn_flags & RTN_TL_ROOT)
863                         return NULL;
864                 pn = fn->parent;
865                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
866                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
867                 else
868                         fn = pn;
869                 if (fn->fn_flags & RTN_RTINFO)
870                         return fn;
871         }
872 }
873
874 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
875                                              struct fib6_table *table,
876                                              struct flowi6 *fl6, int flags)
877 {
878         struct fib6_node *fn;
879         struct rt6_info *rt;
880
881         read_lock_bh(&table->tb6_lock);
882         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
883 restart:
884         rt = fn->leaf;
885         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
886         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
887                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
888         if (rt == net->ipv6.ip6_null_entry) {
889                 fn = fib6_backtrack(fn, &fl6->saddr);
890                 if (fn)
891                         goto restart;
892         }
893         dst_use(&rt->dst, jiffies);
894         read_unlock_bh(&table->tb6_lock);
895
896         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
897
898         return rt;
899
900 }
901
902 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
903                                     int flags)
904 {
905         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
906 }
907 EXPORT_SYMBOL_GPL(ip6_route_lookup);
908
909 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
910                             const struct in6_addr *saddr, int oif, int strict)
911 {
912         struct flowi6 fl6 = {
913                 .flowi6_oif = oif,
914                 .daddr = *daddr,
915         };
916         struct dst_entry *dst;
917         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
918
919         if (saddr) {
920                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
921                 flags |= RT6_LOOKUP_F_HAS_SADDR;
922         }
923
924         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
925         if (dst->error == 0)
926                 return (struct rt6_info *) dst;
927
928         dst_release(dst);
929
930         return NULL;
931 }
932 EXPORT_SYMBOL(rt6_lookup);
933
934 /* ip6_ins_rt is called with FREE table->tb6_lock.
935    It takes new route entry, the addition fails by any reason the
936    route is freed. In any case, if caller does not hold it, it may
937    be destroyed.
938  */
939
940 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
941                         struct mx6_config *mxc,
942                         struct netlink_ext_ack *extack)
943 {
944         int err;
945         struct fib6_table *table;
946
947         table = rt->rt6i_table;
948         write_lock_bh(&table->tb6_lock);
949         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
950         write_unlock_bh(&table->tb6_lock);
951
952         return err;
953 }
954
955 int ip6_ins_rt(struct rt6_info *rt)
956 {
957         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
958         struct mx6_config mxc = { .mx = NULL, };
959
960         return __ip6_ins_rt(rt, &info, &mxc, NULL);
961 }
962
963 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
964                                            const struct in6_addr *daddr,
965                                            const struct in6_addr *saddr)
966 {
967         struct rt6_info *rt;
968
969         /*
970          *      Clone the route.
971          */
972
973         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
974                 ort = (struct rt6_info *)ort->dst.from;
975
976         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
977
978         if (!rt)
979                 return NULL;
980
981         ip6_rt_copy_init(rt, ort);
982         rt->rt6i_flags |= RTF_CACHE;
983         rt->rt6i_metric = 0;
984         rt->dst.flags |= DST_HOST;
985         rt->rt6i_dst.addr = *daddr;
986         rt->rt6i_dst.plen = 128;
987
988         if (!rt6_is_gw_or_nonexthop(ort)) {
989                 if (ort->rt6i_dst.plen != 128 &&
990                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
991                         rt->rt6i_flags |= RTF_ANYCAST;
992 #ifdef CONFIG_IPV6_SUBTREES
993                 if (rt->rt6i_src.plen && saddr) {
994                         rt->rt6i_src.addr = *saddr;
995                         rt->rt6i_src.plen = 128;
996                 }
997 #endif
998         }
999
1000         return rt;
1001 }
1002
1003 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1004 {
1005         struct rt6_info *pcpu_rt;
1006
1007         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1008                                   rt->dst.dev, rt->dst.flags);
1009
1010         if (!pcpu_rt)
1011                 return NULL;
1012         ip6_rt_copy_init(pcpu_rt, rt);
1013         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1014         pcpu_rt->rt6i_flags |= RTF_PCPU;
1015         return pcpu_rt;
1016 }
1017
1018 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1019 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1020 {
1021         struct rt6_info *pcpu_rt, **p;
1022
1023         p = this_cpu_ptr(rt->rt6i_pcpu);
1024         pcpu_rt = *p;
1025
1026         if (pcpu_rt) {
1027                 dst_hold(&pcpu_rt->dst);
1028                 rt6_dst_from_metrics_check(pcpu_rt);
1029         }
1030         return pcpu_rt;
1031 }
1032
1033 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1034 {
1035         struct fib6_table *table = rt->rt6i_table;
1036         struct rt6_info *pcpu_rt, *prev, **p;
1037
1038         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1039         if (!pcpu_rt) {
1040                 struct net *net = dev_net(rt->dst.dev);
1041
1042                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1043                 return net->ipv6.ip6_null_entry;
1044         }
1045
1046         read_lock_bh(&table->tb6_lock);
1047         if (rt->rt6i_pcpu) {
1048                 p = this_cpu_ptr(rt->rt6i_pcpu);
1049                 prev = cmpxchg(p, NULL, pcpu_rt);
1050                 if (prev) {
1051                         /* If someone did it before us, return prev instead */
1052                         dst_destroy(&pcpu_rt->dst);
1053                         pcpu_rt = prev;
1054                 }
1055         } else {
1056                 /* rt has been removed from the fib6 tree
1057                  * before we have a chance to acquire the read_lock.
1058                  * In this case, don't brother to create a pcpu rt
1059                  * since rt is going away anyway.  The next
1060                  * dst_check() will trigger a re-lookup.
1061                  */
1062                 dst_destroy(&pcpu_rt->dst);
1063                 pcpu_rt = rt;
1064         }
1065         dst_hold(&pcpu_rt->dst);
1066         rt6_dst_from_metrics_check(pcpu_rt);
1067         read_unlock_bh(&table->tb6_lock);
1068         return pcpu_rt;
1069 }
1070
1071 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1072                                int oif, struct flowi6 *fl6, int flags)
1073 {
1074         struct fib6_node *fn, *saved_fn;
1075         struct rt6_info *rt;
1076         int strict = 0;
1077
1078         strict |= flags & RT6_LOOKUP_F_IFACE;
1079         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1080         if (net->ipv6.devconf_all->forwarding == 0)
1081                 strict |= RT6_LOOKUP_F_REACHABLE;
1082
1083         read_lock_bh(&table->tb6_lock);
1084
1085         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1086         saved_fn = fn;
1087
1088         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1089                 oif = 0;
1090
1091 redo_rt6_select:
1092         rt = rt6_select(fn, oif, strict);
1093         if (rt->rt6i_nsiblings)
1094                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1095         if (rt == net->ipv6.ip6_null_entry) {
1096                 fn = fib6_backtrack(fn, &fl6->saddr);
1097                 if (fn)
1098                         goto redo_rt6_select;
1099                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1100                         /* also consider unreachable route */
1101                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1102                         fn = saved_fn;
1103                         goto redo_rt6_select;
1104                 }
1105         }
1106
1107
1108         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1109                 dst_use(&rt->dst, jiffies);
1110                 read_unlock_bh(&table->tb6_lock);
1111
1112                 rt6_dst_from_metrics_check(rt);
1113
1114                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1115                 return rt;
1116         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1117                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1118                 /* Create a RTF_CACHE clone which will not be
1119                  * owned by the fib6 tree.  It is for the special case where
1120                  * the daddr in the skb during the neighbor look-up is different
1121                  * from the fl6->daddr used to look-up route here.
1122                  */
1123
1124                 struct rt6_info *uncached_rt;
1125
1126                 dst_use(&rt->dst, jiffies);
1127                 read_unlock_bh(&table->tb6_lock);
1128
1129                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1130                 dst_release(&rt->dst);
1131
1132                 if (uncached_rt)
1133                         rt6_uncached_list_add(uncached_rt);
1134                 else
1135                         uncached_rt = net->ipv6.ip6_null_entry;
1136
1137                 dst_hold(&uncached_rt->dst);
1138
1139                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1140                 return uncached_rt;
1141
1142         } else {
1143                 /* Get a percpu copy */
1144
1145                 struct rt6_info *pcpu_rt;
1146
1147                 rt->dst.lastuse = jiffies;
1148                 rt->dst.__use++;
1149                 pcpu_rt = rt6_get_pcpu_route(rt);
1150
1151                 if (pcpu_rt) {
1152                         read_unlock_bh(&table->tb6_lock);
1153                 } else {
1154                         /* We have to do the read_unlock first
1155                          * because rt6_make_pcpu_route() may trigger
1156                          * ip6_dst_gc() which will take the write_lock.
1157                          */
1158                         dst_hold(&rt->dst);
1159                         read_unlock_bh(&table->tb6_lock);
1160                         pcpu_rt = rt6_make_pcpu_route(rt);
1161                         dst_release(&rt->dst);
1162                 }
1163
1164                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1165                 return pcpu_rt;
1166
1167         }
1168 }
1169 EXPORT_SYMBOL_GPL(ip6_pol_route);
1170
1171 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1172                                             struct flowi6 *fl6, int flags)
1173 {
1174         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1175 }
1176
1177 struct dst_entry *ip6_route_input_lookup(struct net *net,
1178                                          struct net_device *dev,
1179                                          struct flowi6 *fl6, int flags)
1180 {
1181         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1182                 flags |= RT6_LOOKUP_F_IFACE;
1183
1184         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1185 }
1186 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1187
1188 void ip6_route_input(struct sk_buff *skb)
1189 {
1190         const struct ipv6hdr *iph = ipv6_hdr(skb);
1191         struct net *net = dev_net(skb->dev);
1192         int flags = RT6_LOOKUP_F_HAS_SADDR;
1193         struct ip_tunnel_info *tun_info;
1194         struct flowi6 fl6 = {
1195                 .flowi6_iif = skb->dev->ifindex,
1196                 .daddr = iph->daddr,
1197                 .saddr = iph->saddr,
1198                 .flowlabel = ip6_flowinfo(iph),
1199                 .flowi6_mark = skb->mark,
1200                 .flowi6_proto = iph->nexthdr,
1201         };
1202
1203         tun_info = skb_tunnel_info(skb);
1204         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1205                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1206         skb_dst_drop(skb);
1207         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1208 }
1209
1210 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1211                                              struct flowi6 *fl6, int flags)
1212 {
1213         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1214 }
1215
1216 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1217                                          struct flowi6 *fl6, int flags)
1218 {
1219         bool any_src;
1220
1221         if (rt6_need_strict(&fl6->daddr)) {
1222                 struct dst_entry *dst;
1223
1224                 dst = l3mdev_link_scope_lookup(net, fl6);
1225                 if (dst)
1226                         return dst;
1227         }
1228
1229         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1230
1231         any_src = ipv6_addr_any(&fl6->saddr);
1232         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1233             (fl6->flowi6_oif && any_src))
1234                 flags |= RT6_LOOKUP_F_IFACE;
1235
1236         if (!any_src)
1237                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1238         else if (sk)
1239                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1240
1241         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1242 }
1243 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1244
1245 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1246 {
1247         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1248         struct dst_entry *new = NULL;
1249
1250         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1251         if (rt) {
1252                 rt6_info_init(rt);
1253
1254                 new = &rt->dst;
1255                 new->__use = 1;
1256                 new->input = dst_discard;
1257                 new->output = dst_discard_out;
1258
1259                 dst_copy_metrics(new, &ort->dst);
1260                 rt->rt6i_idev = ort->rt6i_idev;
1261                 if (rt->rt6i_idev)
1262                         in6_dev_hold(rt->rt6i_idev);
1263
1264                 rt->rt6i_gateway = ort->rt6i_gateway;
1265                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1266                 rt->rt6i_metric = 0;
1267
1268                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1269 #ifdef CONFIG_IPV6_SUBTREES
1270                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1271 #endif
1272
1273                 dst_free(new);
1274         }
1275
1276         dst_release(dst_orig);
1277         return new ? new : ERR_PTR(-ENOMEM);
1278 }
1279
1280 /*
1281  *      Destination cache support functions
1282  */
1283
1284 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1285 {
1286         if (rt->dst.from &&
1287             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1288                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1289 }
1290
1291 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1292 {
1293         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1294                 return NULL;
1295
1296         if (rt6_check_expired(rt))
1297                 return NULL;
1298
1299         return &rt->dst;
1300 }
1301
1302 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1303 {
1304         if (!__rt6_check_expired(rt) &&
1305             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1306             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1307                 return &rt->dst;
1308         else
1309                 return NULL;
1310 }
1311
1312 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1313 {
1314         struct rt6_info *rt;
1315
1316         rt = (struct rt6_info *) dst;
1317
1318         /* All IPV6 dsts are created with ->obsolete set to the value
1319          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1320          * into this function always.
1321          */
1322
1323         rt6_dst_from_metrics_check(rt);
1324
1325         if (rt->rt6i_flags & RTF_PCPU ||
1326             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1327                 return rt6_dst_from_check(rt, cookie);
1328         else
1329                 return rt6_check(rt, cookie);
1330 }
1331
1332 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1333 {
1334         struct rt6_info *rt = (struct rt6_info *) dst;
1335
1336         if (rt) {
1337                 if (rt->rt6i_flags & RTF_CACHE) {
1338                         if (rt6_check_expired(rt)) {
1339                                 ip6_del_rt(rt);
1340                                 dst = NULL;
1341                         }
1342                 } else {
1343                         dst_release(dst);
1344                         dst = NULL;
1345                 }
1346         }
1347         return dst;
1348 }
1349
1350 static void ip6_link_failure(struct sk_buff *skb)
1351 {
1352         struct rt6_info *rt;
1353
1354         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1355
1356         rt = (struct rt6_info *) skb_dst(skb);
1357         if (rt) {
1358                 if (rt->rt6i_flags & RTF_CACHE) {
1359                         dst_hold(&rt->dst);
1360                         ip6_del_rt(rt);
1361                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1362                         rt->rt6i_node->fn_sernum = -1;
1363                 }
1364         }
1365 }
1366
1367 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1368 {
1369         struct net *net = dev_net(rt->dst.dev);
1370
1371         rt->rt6i_flags |= RTF_MODIFIED;
1372         rt->rt6i_pmtu = mtu;
1373         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1374 }
1375
1376 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1377 {
1378         return !(rt->rt6i_flags & RTF_CACHE) &&
1379                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1380 }
1381
1382 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1383                                  const struct ipv6hdr *iph, u32 mtu)
1384 {
1385         const struct in6_addr *daddr, *saddr;
1386         struct rt6_info *rt6 = (struct rt6_info *)dst;
1387
1388         if (rt6->rt6i_flags & RTF_LOCAL)
1389                 return;
1390
1391         if (dst_metric_locked(dst, RTAX_MTU))
1392                 return;
1393
1394         if (iph) {
1395                 daddr = &iph->daddr;
1396                 saddr = &iph->saddr;
1397         } else if (sk) {
1398                 daddr = &sk->sk_v6_daddr;
1399                 saddr = &inet6_sk(sk)->saddr;
1400         } else {
1401                 daddr = NULL;
1402                 saddr = NULL;
1403         }
1404         dst_confirm_neigh(dst, daddr);
1405         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1406         if (mtu >= dst_mtu(dst))
1407                 return;
1408
1409         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1410                 rt6_do_update_pmtu(rt6, mtu);
1411         } else if (daddr) {
1412                 struct rt6_info *nrt6;
1413
1414                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1415                 if (nrt6) {
1416                         rt6_do_update_pmtu(nrt6, mtu);
1417
1418                         /* ip6_ins_rt(nrt6) will bump the
1419                          * rt6->rt6i_node->fn_sernum
1420                          * which will fail the next rt6_check() and
1421                          * invalidate the sk->sk_dst_cache.
1422                          */
1423                         ip6_ins_rt(nrt6);
1424                 }
1425         }
1426 }
1427
1428 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1429                                struct sk_buff *skb, u32 mtu)
1430 {
1431         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1432 }
1433
1434 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1435                      int oif, u32 mark, kuid_t uid)
1436 {
1437         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1438         struct dst_entry *dst;
1439         struct flowi6 fl6;
1440
1441         memset(&fl6, 0, sizeof(fl6));
1442         fl6.flowi6_oif = oif;
1443         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1444         fl6.daddr = iph->daddr;
1445         fl6.saddr = iph->saddr;
1446         fl6.flowlabel = ip6_flowinfo(iph);
1447         fl6.flowi6_uid = uid;
1448
1449         dst = ip6_route_output(net, NULL, &fl6);
1450         if (!dst->error)
1451                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1452         dst_release(dst);
1453 }
1454 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1455
1456 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1457 {
1458         struct dst_entry *dst;
1459
1460         ip6_update_pmtu(skb, sock_net(sk), mtu,
1461                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1462
1463         dst = __sk_dst_get(sk);
1464         if (!dst || !dst->obsolete ||
1465             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1466                 return;
1467
1468         bh_lock_sock(sk);
1469         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1470                 ip6_datagram_dst_update(sk, false);
1471         bh_unlock_sock(sk);
1472 }
1473 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1474
1475 /* Handle redirects */
1476 struct ip6rd_flowi {
1477         struct flowi6 fl6;
1478         struct in6_addr gateway;
1479 };
1480
1481 static struct rt6_info *__ip6_route_redirect(struct net *net,
1482                                              struct fib6_table *table,
1483                                              struct flowi6 *fl6,
1484                                              int flags)
1485 {
1486         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1487         struct rt6_info *rt;
1488         struct fib6_node *fn;
1489
1490         /* Get the "current" route for this destination and
1491          * check if the redirect has come from appropriate router.
1492          *
1493          * RFC 4861 specifies that redirects should only be
1494          * accepted if they come from the nexthop to the target.
1495          * Due to the way the routes are chosen, this notion
1496          * is a bit fuzzy and one might need to check all possible
1497          * routes.
1498          */
1499
1500         read_lock_bh(&table->tb6_lock);
1501         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1502 restart:
1503         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1504                 if (rt6_check_expired(rt))
1505                         continue;
1506                 if (rt->dst.error)
1507                         break;
1508                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1509                         continue;
1510                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1511                         continue;
1512                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1513                         continue;
1514                 break;
1515         }
1516
1517         if (!rt)
1518                 rt = net->ipv6.ip6_null_entry;
1519         else if (rt->dst.error) {
1520                 rt = net->ipv6.ip6_null_entry;
1521                 goto out;
1522         }
1523
1524         if (rt == net->ipv6.ip6_null_entry) {
1525                 fn = fib6_backtrack(fn, &fl6->saddr);
1526                 if (fn)
1527                         goto restart;
1528         }
1529
1530 out:
1531         dst_hold(&rt->dst);
1532
1533         read_unlock_bh(&table->tb6_lock);
1534
1535         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1536         return rt;
1537 };
1538
1539 static struct dst_entry *ip6_route_redirect(struct net *net,
1540                                         const struct flowi6 *fl6,
1541                                         const struct in6_addr *gateway)
1542 {
1543         int flags = RT6_LOOKUP_F_HAS_SADDR;
1544         struct ip6rd_flowi rdfl;
1545
1546         rdfl.fl6 = *fl6;
1547         rdfl.gateway = *gateway;
1548
1549         return fib6_rule_lookup(net, &rdfl.fl6,
1550                                 flags, __ip6_route_redirect);
1551 }
1552
1553 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1554                   kuid_t uid)
1555 {
1556         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1557         struct dst_entry *dst;
1558         struct flowi6 fl6;
1559
1560         memset(&fl6, 0, sizeof(fl6));
1561         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1562         fl6.flowi6_oif = oif;
1563         fl6.flowi6_mark = mark;
1564         fl6.daddr = iph->daddr;
1565         fl6.saddr = iph->saddr;
1566         fl6.flowlabel = ip6_flowinfo(iph);
1567         fl6.flowi6_uid = uid;
1568
1569         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1570         rt6_do_redirect(dst, NULL, skb);
1571         dst_release(dst);
1572 }
1573 EXPORT_SYMBOL_GPL(ip6_redirect);
1574
1575 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1576                             u32 mark)
1577 {
1578         const struct ipv6hdr *iph = ipv6_hdr(skb);
1579         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1580         struct dst_entry *dst;
1581         struct flowi6 fl6;
1582
1583         memset(&fl6, 0, sizeof(fl6));
1584         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1585         fl6.flowi6_oif = oif;
1586         fl6.flowi6_mark = mark;
1587         fl6.daddr = msg->dest;
1588         fl6.saddr = iph->daddr;
1589         fl6.flowi6_uid = sock_net_uid(net, NULL);
1590
1591         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1592         rt6_do_redirect(dst, NULL, skb);
1593         dst_release(dst);
1594 }
1595
1596 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1597 {
1598         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1599                      sk->sk_uid);
1600 }
1601 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1602
1603 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1604 {
1605         struct net_device *dev = dst->dev;
1606         unsigned int mtu = dst_mtu(dst);
1607         struct net *net = dev_net(dev);
1608
1609         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1610
1611         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1612                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1613
1614         /*
1615          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1616          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1617          * IPV6_MAXPLEN is also valid and means: "any MSS,
1618          * rely only on pmtu discovery"
1619          */
1620         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1621                 mtu = IPV6_MAXPLEN;
1622         return mtu;
1623 }
1624
1625 static unsigned int ip6_mtu(const struct dst_entry *dst)
1626 {
1627         const struct rt6_info *rt = (const struct rt6_info *)dst;
1628         unsigned int mtu = rt->rt6i_pmtu;
1629         struct inet6_dev *idev;
1630
1631         if (mtu)
1632                 goto out;
1633
1634         mtu = dst_metric_raw(dst, RTAX_MTU);
1635         if (mtu)
1636                 goto out;
1637
1638         mtu = IPV6_MIN_MTU;
1639
1640         rcu_read_lock();
1641         idev = __in6_dev_get(dst->dev);
1642         if (idev)
1643                 mtu = idev->cnf.mtu6;
1644         rcu_read_unlock();
1645
1646 out:
1647         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1648
1649         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1650 }
1651
1652 static struct dst_entry *icmp6_dst_gc_list;
1653 static DEFINE_SPINLOCK(icmp6_dst_lock);
1654
1655 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1656                                   struct flowi6 *fl6)
1657 {
1658         struct dst_entry *dst;
1659         struct rt6_info *rt;
1660         struct inet6_dev *idev = in6_dev_get(dev);
1661         struct net *net = dev_net(dev);
1662
1663         if (unlikely(!idev))
1664                 return ERR_PTR(-ENODEV);
1665
1666         rt = ip6_dst_alloc(net, dev, 0);
1667         if (unlikely(!rt)) {
1668                 in6_dev_put(idev);
1669                 dst = ERR_PTR(-ENOMEM);
1670                 goto out;
1671         }
1672
1673         rt->dst.flags |= DST_HOST;
1674         rt->dst.output  = ip6_output;
1675         atomic_set(&rt->dst.__refcnt, 1);
1676         rt->rt6i_gateway  = fl6->daddr;
1677         rt->rt6i_dst.addr = fl6->daddr;
1678         rt->rt6i_dst.plen = 128;
1679         rt->rt6i_idev     = idev;
1680         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1681
1682         spin_lock_bh(&icmp6_dst_lock);
1683         rt->dst.next = icmp6_dst_gc_list;
1684         icmp6_dst_gc_list = &rt->dst;
1685         spin_unlock_bh(&icmp6_dst_lock);
1686
1687         fib6_force_start_gc(net);
1688
1689         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1690
1691 out:
1692         return dst;
1693 }
1694
1695 int icmp6_dst_gc(void)
1696 {
1697         struct dst_entry *dst, **pprev;
1698         int more = 0;
1699
1700         spin_lock_bh(&icmp6_dst_lock);
1701         pprev = &icmp6_dst_gc_list;
1702
1703         while ((dst = *pprev) != NULL) {
1704                 if (!atomic_read(&dst->__refcnt)) {
1705                         *pprev = dst->next;
1706                         dst_free(dst);
1707                 } else {
1708                         pprev = &dst->next;
1709                         ++more;
1710                 }
1711         }
1712
1713         spin_unlock_bh(&icmp6_dst_lock);
1714
1715         return more;
1716 }
1717
1718 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1719                             void *arg)
1720 {
1721         struct dst_entry *dst, **pprev;
1722
1723         spin_lock_bh(&icmp6_dst_lock);
1724         pprev = &icmp6_dst_gc_list;
1725         while ((dst = *pprev) != NULL) {
1726                 struct rt6_info *rt = (struct rt6_info *) dst;
1727                 if (func(rt, arg)) {
1728                         *pprev = dst->next;
1729                         dst_free(dst);
1730                 } else {
1731                         pprev = &dst->next;
1732                 }
1733         }
1734         spin_unlock_bh(&icmp6_dst_lock);
1735 }
1736
1737 static int ip6_dst_gc(struct dst_ops *ops)
1738 {
1739         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1740         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1741         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1742         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1743         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1744         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1745         int entries;
1746
1747         entries = dst_entries_get_fast(ops);
1748         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1749             entries <= rt_max_size)
1750                 goto out;
1751
1752         net->ipv6.ip6_rt_gc_expire++;
1753         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1754         entries = dst_entries_get_slow(ops);
1755         if (entries < ops->gc_thresh)
1756                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1757 out:
1758         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1759         return entries > rt_max_size;
1760 }
1761
1762 static int ip6_convert_metrics(struct mx6_config *mxc,
1763                                const struct fib6_config *cfg)
1764 {
1765         bool ecn_ca = false;
1766         struct nlattr *nla;
1767         int remaining;
1768         u32 *mp;
1769
1770         if (!cfg->fc_mx)
1771                 return 0;
1772
1773         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1774         if (unlikely(!mp))
1775                 return -ENOMEM;
1776
1777         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1778                 int type = nla_type(nla);
1779                 u32 val;
1780
1781                 if (!type)
1782                         continue;
1783                 if (unlikely(type > RTAX_MAX))
1784                         goto err;
1785
1786                 if (type == RTAX_CC_ALGO) {
1787                         char tmp[TCP_CA_NAME_MAX];
1788
1789                         nla_strlcpy(tmp, nla, sizeof(tmp));
1790                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1791                         if (val == TCP_CA_UNSPEC)
1792                                 goto err;
1793                 } else {
1794                         val = nla_get_u32(nla);
1795                 }
1796                 if (type == RTAX_HOPLIMIT && val > 255)
1797                         val = 255;
1798                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1799                         goto err;
1800
1801                 mp[type - 1] = val;
1802                 __set_bit(type - 1, mxc->mx_valid);
1803         }
1804
1805         if (ecn_ca) {
1806                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1807                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1808         }
1809
1810         mxc->mx = mp;
1811         return 0;
1812  err:
1813         kfree(mp);
1814         return -EINVAL;
1815 }
1816
1817 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1818                                             struct fib6_config *cfg,
1819                                             const struct in6_addr *gw_addr)
1820 {
1821         struct flowi6 fl6 = {
1822                 .flowi6_oif = cfg->fc_ifindex,
1823                 .daddr = *gw_addr,
1824                 .saddr = cfg->fc_prefsrc,
1825         };
1826         struct fib6_table *table;
1827         struct rt6_info *rt;
1828         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1829
1830         table = fib6_get_table(net, cfg->fc_table);
1831         if (!table)
1832                 return NULL;
1833
1834         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1835                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1836
1837         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1838
1839         /* if table lookup failed, fall back to full lookup */
1840         if (rt == net->ipv6.ip6_null_entry) {
1841                 ip6_rt_put(rt);
1842                 rt = NULL;
1843         }
1844
1845         return rt;
1846 }
1847
1848 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1849                                               struct netlink_ext_ack *extack)
1850 {
1851         struct net *net = cfg->fc_nlinfo.nl_net;
1852         struct rt6_info *rt = NULL;
1853         struct net_device *dev = NULL;
1854         struct inet6_dev *idev = NULL;
1855         struct fib6_table *table;
1856         int addr_type;
1857         int err = -EINVAL;
1858
1859         /* RTF_PCPU is an internal flag; can not be set by userspace */
1860         if (cfg->fc_flags & RTF_PCPU) {
1861                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1862                 goto out;
1863         }
1864
1865         if (cfg->fc_dst_len > 128) {
1866                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1867                 goto out;
1868         }
1869         if (cfg->fc_src_len > 128) {
1870                 NL_SET_ERR_MSG(extack, "Invalid source address length");
1871                 goto out;
1872         }
1873 #ifndef CONFIG_IPV6_SUBTREES
1874         if (cfg->fc_src_len) {
1875                 NL_SET_ERR_MSG(extack,
1876                                "Specifying source address requires IPV6_SUBTREES to be enabled");
1877                 goto out;
1878         }
1879 #endif
1880         if (cfg->fc_ifindex) {
1881                 err = -ENODEV;
1882                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1883                 if (!dev)
1884                         goto out;
1885                 idev = in6_dev_get(dev);
1886                 if (!idev)
1887                         goto out;
1888         }
1889
1890         if (cfg->fc_metric == 0)
1891                 cfg->fc_metric = IP6_RT_PRIO_USER;
1892
1893         err = -ENOBUFS;
1894         if (cfg->fc_nlinfo.nlh &&
1895             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1896                 table = fib6_get_table(net, cfg->fc_table);
1897                 if (!table) {
1898                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1899                         table = fib6_new_table(net, cfg->fc_table);
1900                 }
1901         } else {
1902                 table = fib6_new_table(net, cfg->fc_table);
1903         }
1904
1905         if (!table)
1906                 goto out;
1907
1908         rt = ip6_dst_alloc(net, NULL,
1909                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1910
1911         if (!rt) {
1912                 err = -ENOMEM;
1913                 goto out;
1914         }
1915
1916         if (cfg->fc_flags & RTF_EXPIRES)
1917                 rt6_set_expires(rt, jiffies +
1918                                 clock_t_to_jiffies(cfg->fc_expires));
1919         else
1920                 rt6_clean_expires(rt);
1921
1922         if (cfg->fc_protocol == RTPROT_UNSPEC)
1923                 cfg->fc_protocol = RTPROT_BOOT;
1924         rt->rt6i_protocol = cfg->fc_protocol;
1925
1926         addr_type = ipv6_addr_type(&cfg->fc_dst);
1927
1928         if (addr_type & IPV6_ADDR_MULTICAST)
1929                 rt->dst.input = ip6_mc_input;
1930         else if (cfg->fc_flags & RTF_LOCAL)
1931                 rt->dst.input = ip6_input;
1932         else
1933                 rt->dst.input = ip6_forward;
1934
1935         rt->dst.output = ip6_output;
1936
1937         if (cfg->fc_encap) {
1938                 struct lwtunnel_state *lwtstate;
1939
1940                 err = lwtunnel_build_state(cfg->fc_encap_type,
1941                                            cfg->fc_encap, AF_INET6, cfg,
1942                                            &lwtstate, extack);
1943                 if (err)
1944                         goto out;
1945                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1946                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1947                         rt->dst.lwtstate->orig_output = rt->dst.output;
1948                         rt->dst.output = lwtunnel_output;
1949                 }
1950                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1951                         rt->dst.lwtstate->orig_input = rt->dst.input;
1952                         rt->dst.input = lwtunnel_input;
1953                 }
1954         }
1955
1956         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1957         rt->rt6i_dst.plen = cfg->fc_dst_len;
1958         if (rt->rt6i_dst.plen == 128)
1959                 rt->dst.flags |= DST_HOST;
1960
1961 #ifdef CONFIG_IPV6_SUBTREES
1962         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1963         rt->rt6i_src.plen = cfg->fc_src_len;
1964 #endif
1965
1966         rt->rt6i_metric = cfg->fc_metric;
1967
1968         /* We cannot add true routes via loopback here,
1969            they would result in kernel looping; promote them to reject routes
1970          */
1971         if ((cfg->fc_flags & RTF_REJECT) ||
1972             (dev && (dev->flags & IFF_LOOPBACK) &&
1973              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1974              !(cfg->fc_flags & RTF_LOCAL))) {
1975                 /* hold loopback dev/idev if we haven't done so. */
1976                 if (dev != net->loopback_dev) {
1977                         if (dev) {
1978                                 dev_put(dev);
1979                                 in6_dev_put(idev);
1980                         }
1981                         dev = net->loopback_dev;
1982                         dev_hold(dev);
1983                         idev = in6_dev_get(dev);
1984                         if (!idev) {
1985                                 err = -ENODEV;
1986                                 goto out;
1987                         }
1988                 }
1989                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1990                 switch (cfg->fc_type) {
1991                 case RTN_BLACKHOLE:
1992                         rt->dst.error = -EINVAL;
1993                         rt->dst.output = dst_discard_out;
1994                         rt->dst.input = dst_discard;
1995                         break;
1996                 case RTN_PROHIBIT:
1997                         rt->dst.error = -EACCES;
1998                         rt->dst.output = ip6_pkt_prohibit_out;
1999                         rt->dst.input = ip6_pkt_prohibit;
2000                         break;
2001                 case RTN_THROW:
2002                 case RTN_UNREACHABLE:
2003                 default:
2004                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2005                                         : (cfg->fc_type == RTN_UNREACHABLE)
2006                                         ? -EHOSTUNREACH : -ENETUNREACH;
2007                         rt->dst.output = ip6_pkt_discard_out;
2008                         rt->dst.input = ip6_pkt_discard;
2009                         break;
2010                 }
2011                 goto install_route;
2012         }
2013
2014         if (cfg->fc_flags & RTF_GATEWAY) {
2015                 const struct in6_addr *gw_addr;
2016                 int gwa_type;
2017
2018                 gw_addr = &cfg->fc_gateway;
2019                 gwa_type = ipv6_addr_type(gw_addr);
2020
2021                 /* if gw_addr is local we will fail to detect this in case
2022                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2023                  * will return already-added prefix route via interface that
2024                  * prefix route was assigned to, which might be non-loopback.
2025                  */
2026                 err = -EINVAL;
2027                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2028                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2029                                             dev : NULL, 0, 0)) {
2030                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2031                         goto out;
2032                 }
2033                 rt->rt6i_gateway = *gw_addr;
2034
2035                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2036                         struct rt6_info *grt = NULL;
2037
2038                         /* IPv6 strictly inhibits using not link-local
2039                            addresses as nexthop address.
2040                            Otherwise, router will not able to send redirects.
2041                            It is very good, but in some (rare!) circumstances
2042                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2043                            some exceptions. --ANK
2044                            We allow IPv4-mapped nexthops to support RFC4798-type
2045                            addressing
2046                          */
2047                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2048                                           IPV6_ADDR_MAPPED))) {
2049                                 NL_SET_ERR_MSG(extack,
2050                                                "Invalid gateway address");
2051                                 goto out;
2052                         }
2053
2054                         if (cfg->fc_table) {
2055                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2056
2057                                 if (grt) {
2058                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2059                                             (dev && dev != grt->dst.dev)) {
2060                                                 ip6_rt_put(grt);
2061                                                 grt = NULL;
2062                                         }
2063                                 }
2064                         }
2065
2066                         if (!grt)
2067                                 grt = rt6_lookup(net, gw_addr, NULL,
2068                                                  cfg->fc_ifindex, 1);
2069
2070                         err = -EHOSTUNREACH;
2071                         if (!grt)
2072                                 goto out;
2073                         if (dev) {
2074                                 if (dev != grt->dst.dev) {
2075                                         ip6_rt_put(grt);
2076                                         goto out;
2077                                 }
2078                         } else {
2079                                 dev = grt->dst.dev;
2080                                 idev = grt->rt6i_idev;
2081                                 dev_hold(dev);
2082                                 in6_dev_hold(grt->rt6i_idev);
2083                         }
2084                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2085                                 err = 0;
2086                         ip6_rt_put(grt);
2087
2088                         if (err)
2089                                 goto out;
2090                 }
2091                 err = -EINVAL;
2092                 if (!dev) {
2093                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2094                         goto out;
2095                 } else if (dev->flags & IFF_LOOPBACK) {
2096                         NL_SET_ERR_MSG(extack,
2097                                        "Egress device can not be loopback device for this route");
2098                         goto out;
2099                 }
2100         }
2101
2102         err = -ENODEV;
2103         if (!dev)
2104                 goto out;
2105
2106         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2107                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2108                         NL_SET_ERR_MSG(extack, "Invalid source address");
2109                         err = -EINVAL;
2110                         goto out;
2111                 }
2112                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2113                 rt->rt6i_prefsrc.plen = 128;
2114         } else
2115                 rt->rt6i_prefsrc.plen = 0;
2116
2117         rt->rt6i_flags = cfg->fc_flags;
2118
2119 install_route:
2120         rt->dst.dev = dev;
2121         rt->rt6i_idev = idev;
2122         rt->rt6i_table = table;
2123
2124         cfg->fc_nlinfo.nl_net = dev_net(dev);
2125
2126         return rt;
2127 out:
2128         if (dev)
2129                 dev_put(dev);
2130         if (idev)
2131                 in6_dev_put(idev);
2132         if (rt)
2133                 dst_free(&rt->dst);
2134
2135         return ERR_PTR(err);
2136 }
2137
2138 int ip6_route_add(struct fib6_config *cfg,
2139                   struct netlink_ext_ack *extack)
2140 {
2141         struct mx6_config mxc = { .mx = NULL, };
2142         struct rt6_info *rt;
2143         int err;
2144
2145         rt = ip6_route_info_create(cfg, extack);
2146         if (IS_ERR(rt)) {
2147                 err = PTR_ERR(rt);
2148                 rt = NULL;
2149                 goto out;
2150         }
2151
2152         err = ip6_convert_metrics(&mxc, cfg);
2153         if (err)
2154                 goto out;
2155
2156         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2157
2158         kfree(mxc.mx);
2159
2160         return err;
2161 out:
2162         if (rt)
2163                 dst_free(&rt->dst);
2164
2165         return err;
2166 }
2167
2168 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2169 {
2170         int err;
2171         struct fib6_table *table;
2172         struct net *net = dev_net(rt->dst.dev);
2173
2174         if (rt == net->ipv6.ip6_null_entry ||
2175             rt->dst.flags & DST_NOCACHE) {
2176                 err = -ENOENT;
2177                 goto out;
2178         }
2179
2180         table = rt->rt6i_table;
2181         write_lock_bh(&table->tb6_lock);
2182         err = fib6_del(rt, info);
2183         write_unlock_bh(&table->tb6_lock);
2184
2185 out:
2186         ip6_rt_put(rt);
2187         return err;
2188 }
2189
2190 int ip6_del_rt(struct rt6_info *rt)
2191 {
2192         struct nl_info info = {
2193                 .nl_net = dev_net(rt->dst.dev),
2194         };
2195         return __ip6_del_rt(rt, &info);
2196 }
2197
2198 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2199 {
2200         struct nl_info *info = &cfg->fc_nlinfo;
2201         struct net *net = info->nl_net;
2202         struct sk_buff *skb = NULL;
2203         struct fib6_table *table;
2204         int err = -ENOENT;
2205
2206         if (rt == net->ipv6.ip6_null_entry)
2207                 goto out_put;
2208         table = rt->rt6i_table;
2209         write_lock_bh(&table->tb6_lock);
2210
2211         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2212                 struct rt6_info *sibling, *next_sibling;
2213
2214                 /* prefer to send a single notification with all hops */
2215                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2216                 if (skb) {
2217                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2218
2219                         if (rt6_fill_node(net, skb, rt,
2220                                           NULL, NULL, 0, RTM_DELROUTE,
2221                                           info->portid, seq, 0) < 0) {
2222                                 kfree_skb(skb);
2223                                 skb = NULL;
2224                         } else
2225                                 info->skip_notify = 1;
2226                 }
2227
2228                 list_for_each_entry_safe(sibling, next_sibling,
2229                                          &rt->rt6i_siblings,
2230                                          rt6i_siblings) {
2231                         err = fib6_del(sibling, info);
2232                         if (err)
2233                                 goto out_unlock;
2234                 }
2235         }
2236
2237         err = fib6_del(rt, info);
2238 out_unlock:
2239         write_unlock_bh(&table->tb6_lock);
2240 out_put:
2241         ip6_rt_put(rt);
2242
2243         if (skb) {
2244                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2245                             info->nlh, gfp_any());
2246         }
2247         return err;
2248 }
2249
2250 static int ip6_route_del(struct fib6_config *cfg,
2251                          struct netlink_ext_ack *extack)
2252 {
2253         struct fib6_table *table;
2254         struct fib6_node *fn;
2255         struct rt6_info *rt;
2256         int err = -ESRCH;
2257
2258         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2259         if (!table) {
2260                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2261                 return err;
2262         }
2263
2264         read_lock_bh(&table->tb6_lock);
2265
2266         fn = fib6_locate(&table->tb6_root,
2267                          &cfg->fc_dst, cfg->fc_dst_len,
2268                          &cfg->fc_src, cfg->fc_src_len);
2269
2270         if (fn) {
2271                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2272                         if ((rt->rt6i_flags & RTF_CACHE) &&
2273                             !(cfg->fc_flags & RTF_CACHE))
2274                                 continue;
2275                         if (cfg->fc_ifindex &&
2276                             (!rt->dst.dev ||
2277                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2278                                 continue;
2279                         if (cfg->fc_flags & RTF_GATEWAY &&
2280                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2281                                 continue;
2282                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2283                                 continue;
2284                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2285                                 continue;
2286                         dst_hold(&rt->dst);
2287                         read_unlock_bh(&table->tb6_lock);
2288
2289                         /* if gateway was specified only delete the one hop */
2290                         if (cfg->fc_flags & RTF_GATEWAY)
2291                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2292
2293                         return __ip6_del_rt_siblings(rt, cfg);
2294                 }
2295         }
2296         read_unlock_bh(&table->tb6_lock);
2297
2298         return err;
2299 }
2300
2301 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2302 {
2303         struct netevent_redirect netevent;
2304         struct rt6_info *rt, *nrt = NULL;
2305         struct ndisc_options ndopts;
2306         struct inet6_dev *in6_dev;
2307         struct neighbour *neigh;
2308         struct rd_msg *msg;
2309         int optlen, on_link;
2310         u8 *lladdr;
2311
2312         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2313         optlen -= sizeof(*msg);
2314
2315         if (optlen < 0) {
2316                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2317                 return;
2318         }
2319
2320         msg = (struct rd_msg *)icmp6_hdr(skb);
2321
2322         if (ipv6_addr_is_multicast(&msg->dest)) {
2323                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2324                 return;
2325         }
2326
2327         on_link = 0;
2328         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2329                 on_link = 1;
2330         } else if (ipv6_addr_type(&msg->target) !=
2331                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2332                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2333                 return;
2334         }
2335
2336         in6_dev = __in6_dev_get(skb->dev);
2337         if (!in6_dev)
2338                 return;
2339         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2340                 return;
2341
2342         /* RFC2461 8.1:
2343          *      The IP source address of the Redirect MUST be the same as the current
2344          *      first-hop router for the specified ICMP Destination Address.
2345          */
2346
2347         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2348                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2349                 return;
2350         }
2351
2352         lladdr = NULL;
2353         if (ndopts.nd_opts_tgt_lladdr) {
2354                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2355                                              skb->dev);
2356                 if (!lladdr) {
2357                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2358                         return;
2359                 }
2360         }
2361
2362         rt = (struct rt6_info *) dst;
2363         if (rt->rt6i_flags & RTF_REJECT) {
2364                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2365                 return;
2366         }
2367
2368         /* Redirect received -> path was valid.
2369          * Look, redirects are sent only in response to data packets,
2370          * so that this nexthop apparently is reachable. --ANK
2371          */
2372         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2373
2374         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2375         if (!neigh)
2376                 return;
2377
2378         /*
2379          *      We have finally decided to accept it.
2380          */
2381
2382         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2383                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2384                      NEIGH_UPDATE_F_OVERRIDE|
2385                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2386                                      NEIGH_UPDATE_F_ISROUTER)),
2387                      NDISC_REDIRECT, &ndopts);
2388
2389         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2390         if (!nrt)
2391                 goto out;
2392
2393         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2394         if (on_link)
2395                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2396
2397         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2398
2399         if (ip6_ins_rt(nrt))
2400                 goto out;
2401
2402         netevent.old = &rt->dst;
2403         netevent.new = &nrt->dst;
2404         netevent.daddr = &msg->dest;
2405         netevent.neigh = neigh;
2406         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2407
2408         if (rt->rt6i_flags & RTF_CACHE) {
2409                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2410                 ip6_del_rt(rt);
2411         }
2412
2413 out:
2414         neigh_release(neigh);
2415 }
2416
2417 /*
2418  *      Misc support functions
2419  */
2420
2421 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2422 {
2423         BUG_ON(from->dst.from);
2424
2425         rt->rt6i_flags &= ~RTF_EXPIRES;
2426         dst_hold(&from->dst);
2427         rt->dst.from = &from->dst;
2428         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2429 }
2430
2431 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2432 {
2433         rt->dst.input = ort->dst.input;
2434         rt->dst.output = ort->dst.output;
2435         rt->rt6i_dst = ort->rt6i_dst;
2436         rt->dst.error = ort->dst.error;
2437         rt->rt6i_idev = ort->rt6i_idev;
2438         if (rt->rt6i_idev)
2439                 in6_dev_hold(rt->rt6i_idev);
2440         rt->dst.lastuse = jiffies;
2441         rt->rt6i_gateway = ort->rt6i_gateway;
2442         rt->rt6i_flags = ort->rt6i_flags;
2443         rt6_set_from(rt, ort);
2444         rt->rt6i_metric = ort->rt6i_metric;
2445 #ifdef CONFIG_IPV6_SUBTREES
2446         rt->rt6i_src = ort->rt6i_src;
2447 #endif
2448         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2449         rt->rt6i_table = ort->rt6i_table;
2450         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2451 }
2452
2453 #ifdef CONFIG_IPV6_ROUTE_INFO
2454 static struct rt6_info *rt6_get_route_info(struct net *net,
2455                                            const struct in6_addr *prefix, int prefixlen,
2456                                            const struct in6_addr *gwaddr,
2457                                            struct net_device *dev)
2458 {
2459         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2460         int ifindex = dev->ifindex;
2461         struct fib6_node *fn;
2462         struct rt6_info *rt = NULL;
2463         struct fib6_table *table;
2464
2465         table = fib6_get_table(net, tb_id);
2466         if (!table)
2467                 return NULL;
2468
2469         read_lock_bh(&table->tb6_lock);
2470         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2471         if (!fn)
2472                 goto out;
2473
2474         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2475                 if (rt->dst.dev->ifindex != ifindex)
2476                         continue;
2477                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2478                         continue;
2479                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2480                         continue;
2481                 dst_hold(&rt->dst);
2482                 break;
2483         }
2484 out:
2485         read_unlock_bh(&table->tb6_lock);
2486         return rt;
2487 }
2488
2489 static struct rt6_info *rt6_add_route_info(struct net *net,
2490                                            const struct in6_addr *prefix, int prefixlen,
2491                                            const struct in6_addr *gwaddr,
2492                                            struct net_device *dev,
2493                                            unsigned int pref)
2494 {
2495         struct fib6_config cfg = {
2496                 .fc_metric      = IP6_RT_PRIO_USER,
2497                 .fc_ifindex     = dev->ifindex,
2498                 .fc_dst_len     = prefixlen,
2499                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2500                                   RTF_UP | RTF_PREF(pref),
2501                 .fc_nlinfo.portid = 0,
2502                 .fc_nlinfo.nlh = NULL,
2503                 .fc_nlinfo.nl_net = net,
2504         };
2505
2506         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2507         cfg.fc_dst = *prefix;
2508         cfg.fc_gateway = *gwaddr;
2509
2510         /* We should treat it as a default route if prefix length is 0. */
2511         if (!prefixlen)
2512                 cfg.fc_flags |= RTF_DEFAULT;
2513
2514         ip6_route_add(&cfg, NULL);
2515
2516         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2517 }
2518 #endif
2519
2520 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2521 {
2522         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2523         struct rt6_info *rt;
2524         struct fib6_table *table;
2525
2526         table = fib6_get_table(dev_net(dev), tb_id);
2527         if (!table)
2528                 return NULL;
2529
2530         read_lock_bh(&table->tb6_lock);
2531         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2532                 if (dev == rt->dst.dev &&
2533                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2534                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2535                         break;
2536         }
2537         if (rt)
2538                 dst_hold(&rt->dst);
2539         read_unlock_bh(&table->tb6_lock);
2540         return rt;
2541 }
2542
2543 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2544                                      struct net_device *dev,
2545                                      unsigned int pref)
2546 {
2547         struct fib6_config cfg = {
2548                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2549                 .fc_metric      = IP6_RT_PRIO_USER,
2550                 .fc_ifindex     = dev->ifindex,
2551                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2552                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2553                 .fc_nlinfo.portid = 0,
2554                 .fc_nlinfo.nlh = NULL,
2555                 .fc_nlinfo.nl_net = dev_net(dev),
2556         };
2557
2558         cfg.fc_gateway = *gwaddr;
2559
2560         if (!ip6_route_add(&cfg, NULL)) {
2561                 struct fib6_table *table;
2562
2563                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2564                 if (table)
2565                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2566         }
2567
2568         return rt6_get_dflt_router(gwaddr, dev);
2569 }
2570
2571 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2572 {
2573         struct rt6_info *rt;
2574
2575 restart:
2576         read_lock_bh(&table->tb6_lock);
2577         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2578                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2579                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2580                         dst_hold(&rt->dst);
2581                         read_unlock_bh(&table->tb6_lock);
2582                         ip6_del_rt(rt);
2583                         goto restart;
2584                 }
2585         }
2586         read_unlock_bh(&table->tb6_lock);
2587
2588         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2589 }
2590
2591 void rt6_purge_dflt_routers(struct net *net)
2592 {
2593         struct fib6_table *table;
2594         struct hlist_head *head;
2595         unsigned int h;
2596
2597         rcu_read_lock();
2598
2599         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2600                 head = &net->ipv6.fib_table_hash[h];
2601                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2602                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2603                                 __rt6_purge_dflt_routers(table);
2604                 }
2605         }
2606
2607         rcu_read_unlock();
2608 }
2609
2610 static void rtmsg_to_fib6_config(struct net *net,
2611                                  struct in6_rtmsg *rtmsg,
2612                                  struct fib6_config *cfg)
2613 {
2614         memset(cfg, 0, sizeof(*cfg));
2615
2616         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2617                          : RT6_TABLE_MAIN;
2618         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2619         cfg->fc_metric = rtmsg->rtmsg_metric;
2620         cfg->fc_expires = rtmsg->rtmsg_info;
2621         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2622         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2623         cfg->fc_flags = rtmsg->rtmsg_flags;
2624
2625         cfg->fc_nlinfo.nl_net = net;
2626
2627         cfg->fc_dst = rtmsg->rtmsg_dst;
2628         cfg->fc_src = rtmsg->rtmsg_src;
2629         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2630 }
2631
2632 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2633 {
2634         struct fib6_config cfg;
2635         struct in6_rtmsg rtmsg;
2636         int err;
2637
2638         switch (cmd) {
2639         case SIOCADDRT:         /* Add a route */
2640         case SIOCDELRT:         /* Delete a route */
2641                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2642                         return -EPERM;
2643                 err = copy_from_user(&rtmsg, arg,
2644                                      sizeof(struct in6_rtmsg));
2645                 if (err)
2646                         return -EFAULT;
2647
2648                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2649
2650                 rtnl_lock();
2651                 switch (cmd) {
2652                 case SIOCADDRT:
2653                         err = ip6_route_add(&cfg, NULL);
2654                         break;
2655                 case SIOCDELRT:
2656                         err = ip6_route_del(&cfg, NULL);
2657                         break;
2658                 default:
2659                         err = -EINVAL;
2660                 }
2661                 rtnl_unlock();
2662
2663                 return err;
2664         }
2665
2666         return -EINVAL;
2667 }
2668
2669 /*
2670  *      Drop the packet on the floor
2671  */
2672
2673 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2674 {
2675         int type;
2676         struct dst_entry *dst = skb_dst(skb);
2677         switch (ipstats_mib_noroutes) {
2678         case IPSTATS_MIB_INNOROUTES:
2679                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2680                 if (type == IPV6_ADDR_ANY) {
2681                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2682                                       IPSTATS_MIB_INADDRERRORS);
2683                         break;
2684                 }
2685                 /* FALLTHROUGH */
2686         case IPSTATS_MIB_OUTNOROUTES:
2687                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2688                               ipstats_mib_noroutes);
2689                 break;
2690         }
2691         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2692         kfree_skb(skb);
2693         return 0;
2694 }
2695
2696 static int ip6_pkt_discard(struct sk_buff *skb)
2697 {
2698         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2699 }
2700
2701 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2702 {
2703         skb->dev = skb_dst(skb)->dev;
2704         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2705 }
2706
2707 static int ip6_pkt_prohibit(struct sk_buff *skb)
2708 {
2709         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2710 }
2711
2712 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2713 {
2714         skb->dev = skb_dst(skb)->dev;
2715         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2716 }
2717
2718 /*
2719  *      Allocate a dst for local (unicast / anycast) address.
2720  */
2721
2722 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2723                                     const struct in6_addr *addr,
2724                                     bool anycast)
2725 {
2726         u32 tb_id;
2727         struct net *net = dev_net(idev->dev);
2728         struct net_device *dev = net->loopback_dev;
2729         struct rt6_info *rt;
2730
2731         /* use L3 Master device as loopback for host routes if device
2732          * is enslaved and address is not link local or multicast
2733          */
2734         if (!rt6_need_strict(addr))
2735                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2736
2737         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2738         if (!rt)
2739                 return ERR_PTR(-ENOMEM);
2740
2741         in6_dev_hold(idev);
2742
2743         rt->dst.flags |= DST_HOST;
2744         rt->dst.input = ip6_input;
2745         rt->dst.output = ip6_output;
2746         rt->rt6i_idev = idev;
2747
2748         rt->rt6i_protocol = RTPROT_KERNEL;
2749         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2750         if (anycast)
2751                 rt->rt6i_flags |= RTF_ANYCAST;
2752         else
2753                 rt->rt6i_flags |= RTF_LOCAL;
2754
2755         rt->rt6i_gateway  = *addr;
2756         rt->rt6i_dst.addr = *addr;
2757         rt->rt6i_dst.plen = 128;
2758         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2759         rt->rt6i_table = fib6_get_table(net, tb_id);
2760         rt->dst.flags |= DST_NOCACHE;
2761
2762         atomic_set(&rt->dst.__refcnt, 1);
2763
2764         return rt;
2765 }
2766
2767 /* remove deleted ip from prefsrc entries */
2768 struct arg_dev_net_ip {
2769         struct net_device *dev;
2770         struct net *net;
2771         struct in6_addr *addr;
2772 };
2773
2774 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2775 {
2776         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2777         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2778         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2779
2780         if (((void *)rt->dst.dev == dev || !dev) &&
2781             rt != net->ipv6.ip6_null_entry &&
2782             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2783                 /* remove prefsrc entry */
2784                 rt->rt6i_prefsrc.plen = 0;
2785         }
2786         return 0;
2787 }
2788
2789 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2790 {
2791         struct net *net = dev_net(ifp->idev->dev);
2792         struct arg_dev_net_ip adni = {
2793                 .dev = ifp->idev->dev,
2794                 .net = net,
2795                 .addr = &ifp->addr,
2796         };
2797         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2798 }
2799
2800 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2801 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2802
2803 /* Remove routers and update dst entries when gateway turn into host. */
2804 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2805 {
2806         struct in6_addr *gateway = (struct in6_addr *)arg;
2807
2808         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2809              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2810              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2811                 return -1;
2812         }
2813         return 0;
2814 }
2815
2816 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2817 {
2818         fib6_clean_all(net, fib6_clean_tohost, gateway);
2819 }
2820
2821 struct arg_dev_net {
2822         struct net_device *dev;
2823         struct net *net;
2824 };
2825
2826 /* called with write lock held for table with rt */
2827 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2828 {
2829         const struct arg_dev_net *adn = arg;
2830         const struct net_device *dev = adn->dev;
2831
2832         if ((rt->dst.dev == dev || !dev) &&
2833             rt != adn->net->ipv6.ip6_null_entry &&
2834             (rt->rt6i_nsiblings == 0 ||
2835              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2836                 return -1;
2837
2838         return 0;
2839 }
2840
2841 void rt6_ifdown(struct net *net, struct net_device *dev)
2842 {
2843         struct arg_dev_net adn = {
2844                 .dev = dev,
2845                 .net = net,
2846         };
2847
2848         fib6_clean_all(net, fib6_ifdown, &adn);
2849         icmp6_clean_all(fib6_ifdown, &adn);
2850         if (dev)
2851                 rt6_uncached_list_flush_dev(net, dev);
2852 }
2853
2854 struct rt6_mtu_change_arg {
2855         struct net_device *dev;
2856         unsigned int mtu;
2857 };
2858
2859 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2860 {
2861         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2862         struct inet6_dev *idev;
2863
2864         /* In IPv6 pmtu discovery is not optional,
2865            so that RTAX_MTU lock cannot disable it.
2866            We still use this lock to block changes
2867            caused by addrconf/ndisc.
2868         */
2869
2870         idev = __in6_dev_get(arg->dev);
2871         if (!idev)
2872                 return 0;
2873
2874         /* For administrative MTU increase, there is no way to discover
2875            IPv6 PMTU increase, so PMTU increase should be updated here.
2876            Since RFC 1981 doesn't include administrative MTU increase
2877            update PMTU increase is a MUST. (i.e. jumbo frame)
2878          */
2879         /*
2880            If new MTU is less than route PMTU, this new MTU will be the
2881            lowest MTU in the path, update the route PMTU to reflect PMTU
2882            decreases; if new MTU is greater than route PMTU, and the
2883            old MTU is the lowest MTU in the path, update the route PMTU
2884            to reflect the increase. In this case if the other nodes' MTU
2885            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2886            PMTU discovery.
2887          */
2888         if (rt->dst.dev == arg->dev &&
2889             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2890             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2891                 if (rt->rt6i_flags & RTF_CACHE) {
2892                         /* For RTF_CACHE with rt6i_pmtu == 0
2893                          * (i.e. a redirected route),
2894                          * the metrics of its rt->dst.from has already
2895                          * been updated.
2896                          */
2897                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2898                                 rt->rt6i_pmtu = arg->mtu;
2899                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2900                            (dst_mtu(&rt->dst) < arg->mtu &&
2901                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2902                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2903                 }
2904         }
2905         return 0;
2906 }
2907
2908 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2909 {
2910         struct rt6_mtu_change_arg arg = {
2911                 .dev = dev,
2912                 .mtu = mtu,
2913         };
2914
2915         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2916 }
2917
2918 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2919         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2920         [RTA_OIF]               = { .type = NLA_U32 },
2921         [RTA_IIF]               = { .type = NLA_U32 },
2922         [RTA_PRIORITY]          = { .type = NLA_U32 },
2923         [RTA_METRICS]           = { .type = NLA_NESTED },
2924         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2925         [RTA_PREF]              = { .type = NLA_U8 },
2926         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2927         [RTA_ENCAP]             = { .type = NLA_NESTED },
2928         [RTA_EXPIRES]           = { .type = NLA_U32 },
2929         [RTA_UID]               = { .type = NLA_U32 },
2930         [RTA_MARK]              = { .type = NLA_U32 },
2931 };
2932
2933 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2934                               struct fib6_config *cfg,
2935                               struct netlink_ext_ack *extack)
2936 {
2937         struct rtmsg *rtm;
2938         struct nlattr *tb[RTA_MAX+1];
2939         unsigned int pref;
2940         int err;
2941
2942         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2943                           NULL);
2944         if (err < 0)
2945                 goto errout;
2946
2947         err = -EINVAL;
2948         rtm = nlmsg_data(nlh);
2949         memset(cfg, 0, sizeof(*cfg));
2950
2951         cfg->fc_table = rtm->rtm_table;
2952         cfg->fc_dst_len = rtm->rtm_dst_len;
2953         cfg->fc_src_len = rtm->rtm_src_len;
2954         cfg->fc_flags = RTF_UP;
2955         cfg->fc_protocol = rtm->rtm_protocol;
2956         cfg->fc_type = rtm->rtm_type;
2957
2958         if (rtm->rtm_type == RTN_UNREACHABLE ||
2959             rtm->rtm_type == RTN_BLACKHOLE ||
2960             rtm->rtm_type == RTN_PROHIBIT ||
2961             rtm->rtm_type == RTN_THROW)
2962                 cfg->fc_flags |= RTF_REJECT;
2963
2964         if (rtm->rtm_type == RTN_LOCAL)
2965                 cfg->fc_flags |= RTF_LOCAL;
2966
2967         if (rtm->rtm_flags & RTM_F_CLONED)
2968                 cfg->fc_flags |= RTF_CACHE;
2969
2970         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2971         cfg->fc_nlinfo.nlh = nlh;
2972         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2973
2974         if (tb[RTA_GATEWAY]) {
2975                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2976                 cfg->fc_flags |= RTF_GATEWAY;
2977         }
2978
2979         if (tb[RTA_DST]) {
2980                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2981
2982                 if (nla_len(tb[RTA_DST]) < plen)
2983                         goto errout;
2984
2985                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2986         }
2987
2988         if (tb[RTA_SRC]) {
2989                 int plen = (rtm->rtm_src_len + 7) >> 3;
2990
2991                 if (nla_len(tb[RTA_SRC]) < plen)
2992                         goto errout;
2993
2994                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2995         }
2996
2997         if (tb[RTA_PREFSRC])
2998                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2999
3000         if (tb[RTA_OIF])
3001                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3002
3003         if (tb[RTA_PRIORITY])
3004                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3005
3006         if (tb[RTA_METRICS]) {
3007                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3008                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3009         }
3010
3011         if (tb[RTA_TABLE])
3012                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3013
3014         if (tb[RTA_MULTIPATH]) {
3015                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3016                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3017
3018                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3019                                                      cfg->fc_mp_len, extack);
3020                 if (err < 0)
3021                         goto errout;
3022         }
3023
3024         if (tb[RTA_PREF]) {
3025                 pref = nla_get_u8(tb[RTA_PREF]);
3026                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3027                     pref != ICMPV6_ROUTER_PREF_HIGH)
3028                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3029                 cfg->fc_flags |= RTF_PREF(pref);
3030         }
3031
3032         if (tb[RTA_ENCAP])
3033                 cfg->fc_encap = tb[RTA_ENCAP];
3034
3035         if (tb[RTA_ENCAP_TYPE]) {
3036                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3037
3038                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3039                 if (err < 0)
3040                         goto errout;
3041         }
3042
3043         if (tb[RTA_EXPIRES]) {
3044                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3045
3046                 if (addrconf_finite_timeout(timeout)) {
3047                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3048                         cfg->fc_flags |= RTF_EXPIRES;
3049                 }
3050         }
3051
3052         err = 0;
3053 errout:
3054         return err;
3055 }
3056
3057 struct rt6_nh {
3058         struct rt6_info *rt6_info;
3059         struct fib6_config r_cfg;
3060         struct mx6_config mxc;
3061         struct list_head next;
3062 };
3063
3064 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3065 {
3066         struct rt6_nh *nh;
3067
3068         list_for_each_entry(nh, rt6_nh_list, next) {
3069                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3070                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3071                         nh->r_cfg.fc_ifindex);
3072         }
3073 }
3074
3075 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3076                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3077 {
3078         struct rt6_nh *nh;
3079         struct rt6_info *rtnh;
3080         int err = -EEXIST;
3081
3082         list_for_each_entry(nh, rt6_nh_list, next) {
3083                 /* check if rt6_info already exists */
3084                 rtnh = nh->rt6_info;
3085
3086                 if (rtnh->dst.dev == rt->dst.dev &&
3087                     rtnh->rt6i_idev == rt->rt6i_idev &&
3088                     ipv6_addr_equal(&rtnh->rt6i_gateway,
3089                                     &rt->rt6i_gateway))
3090                         return err;
3091         }
3092
3093         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3094         if (!nh)
3095                 return -ENOMEM;
3096         nh->rt6_info = rt;
3097         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3098         if (err) {
3099                 kfree(nh);
3100                 return err;
3101         }
3102         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3103         list_add_tail(&nh->next, rt6_nh_list);
3104
3105         return 0;
3106 }
3107
3108 static void ip6_route_mpath_notify(struct rt6_info *rt,
3109                                    struct rt6_info *rt_last,
3110                                    struct nl_info *info,
3111                                    __u16 nlflags)
3112 {
3113         /* if this is an APPEND route, then rt points to the first route
3114          * inserted and rt_last points to last route inserted. Userspace
3115          * wants a consistent dump of the route which starts at the first
3116          * nexthop. Since sibling routes are always added at the end of
3117          * the list, find the first sibling of the last route appended
3118          */
3119         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3120                 rt = list_first_entry(&rt_last->rt6i_siblings,
3121                                       struct rt6_info,
3122                                       rt6i_siblings);
3123         }
3124
3125         if (rt)
3126                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3127 }
3128
3129 static int ip6_route_multipath_add(struct fib6_config *cfg,
3130                                    struct netlink_ext_ack *extack)
3131 {
3132         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3133         struct nl_info *info = &cfg->fc_nlinfo;
3134         struct fib6_config r_cfg;
3135         struct rtnexthop *rtnh;
3136         struct rt6_info *rt;
3137         struct rt6_nh *err_nh;
3138         struct rt6_nh *nh, *nh_safe;
3139         __u16 nlflags;
3140         int remaining;
3141         int attrlen;
3142         int err = 1;
3143         int nhn = 0;
3144         int replace = (cfg->fc_nlinfo.nlh &&
3145                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3146         LIST_HEAD(rt6_nh_list);
3147
3148         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3149         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3150                 nlflags |= NLM_F_APPEND;
3151
3152         remaining = cfg->fc_mp_len;
3153         rtnh = (struct rtnexthop *)cfg->fc_mp;
3154
3155         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3156          * rt6_info structs per nexthop
3157          */
3158         while (rtnh_ok(rtnh, remaining)) {
3159                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3160                 if (rtnh->rtnh_ifindex)
3161                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3162
3163                 attrlen = rtnh_attrlen(rtnh);
3164                 if (attrlen > 0) {
3165                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3166
3167                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3168                         if (nla) {
3169                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3170                                 r_cfg.fc_flags |= RTF_GATEWAY;
3171                         }
3172                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3173                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3174                         if (nla)
3175                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3176                 }
3177
3178                 rt = ip6_route_info_create(&r_cfg, extack);
3179                 if (IS_ERR(rt)) {
3180                         err = PTR_ERR(rt);
3181                         rt = NULL;
3182                         goto cleanup;
3183                 }
3184
3185                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3186                 if (err) {
3187                         dst_free(&rt->dst);
3188                         goto cleanup;
3189                 }
3190
3191                 rtnh = rtnh_next(rtnh, &remaining);
3192         }
3193
3194         /* for add and replace send one notification with all nexthops.
3195          * Skip the notification in fib6_add_rt2node and send one with
3196          * the full route when done
3197          */
3198         info->skip_notify = 1;
3199
3200         err_nh = NULL;
3201         list_for_each_entry(nh, &rt6_nh_list, next) {
3202                 rt_last = nh->rt6_info;
3203                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3204                 /* save reference to first route for notification */
3205                 if (!rt_notif && !err)
3206                         rt_notif = nh->rt6_info;
3207
3208                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3209                 nh->rt6_info = NULL;
3210                 if (err) {
3211                         if (replace && nhn)
3212                                 ip6_print_replace_route_err(&rt6_nh_list);
3213                         err_nh = nh;
3214                         goto add_errout;
3215                 }
3216
3217                 /* Because each route is added like a single route we remove
3218                  * these flags after the first nexthop: if there is a collision,
3219                  * we have already failed to add the first nexthop:
3220                  * fib6_add_rt2node() has rejected it; when replacing, old
3221                  * nexthops have been replaced by first new, the rest should
3222                  * be added to it.
3223                  */
3224                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3225                                                      NLM_F_REPLACE);
3226                 nhn++;
3227         }
3228
3229         /* success ... tell user about new route */
3230         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3231         goto cleanup;
3232
3233 add_errout:
3234         /* send notification for routes that were added so that
3235          * the delete notifications sent by ip6_route_del are
3236          * coherent
3237          */
3238         if (rt_notif)
3239                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3240
3241         /* Delete routes that were already added */
3242         list_for_each_entry(nh, &rt6_nh_list, next) {
3243                 if (err_nh == nh)
3244                         break;
3245                 ip6_route_del(&nh->r_cfg, extack);
3246         }
3247
3248 cleanup:
3249         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3250                 if (nh->rt6_info)
3251                         dst_free(&nh->rt6_info->dst);
3252                 kfree(nh->mxc.mx);
3253                 list_del(&nh->next);
3254                 kfree(nh);
3255         }
3256
3257         return err;
3258 }
3259
3260 static int ip6_route_multipath_del(struct fib6_config *cfg,
3261                                    struct netlink_ext_ack *extack)
3262 {
3263         struct fib6_config r_cfg;
3264         struct rtnexthop *rtnh;
3265         int remaining;
3266         int attrlen;
3267         int err = 1, last_err = 0;
3268
3269         remaining = cfg->fc_mp_len;
3270         rtnh = (struct rtnexthop *)cfg->fc_mp;
3271
3272         /* Parse a Multipath Entry */
3273         while (rtnh_ok(rtnh, remaining)) {
3274                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3275                 if (rtnh->rtnh_ifindex)
3276                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3277
3278                 attrlen = rtnh_attrlen(rtnh);
3279                 if (attrlen > 0) {
3280                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3281
3282                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3283                         if (nla) {
3284                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3285                                 r_cfg.fc_flags |= RTF_GATEWAY;
3286                         }
3287                 }
3288                 err = ip6_route_del(&r_cfg, extack);
3289                 if (err)
3290                         last_err = err;
3291
3292                 rtnh = rtnh_next(rtnh, &remaining);
3293         }
3294
3295         return last_err;
3296 }
3297
3298 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3299                               struct netlink_ext_ack *extack)
3300 {
3301         struct fib6_config cfg;
3302         int err;
3303
3304         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3305         if (err < 0)
3306                 return err;
3307
3308         if (cfg.fc_mp)
3309                 return ip6_route_multipath_del(&cfg, extack);
3310         else {
3311                 cfg.fc_delete_all_nh = 1;
3312                 return ip6_route_del(&cfg, extack);
3313         }
3314 }
3315
3316 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3317                               struct netlink_ext_ack *extack)
3318 {
3319         struct fib6_config cfg;
3320         int err;
3321
3322         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3323         if (err < 0)
3324                 return err;
3325
3326         if (cfg.fc_mp)
3327                 return ip6_route_multipath_add(&cfg, extack);
3328         else
3329                 return ip6_route_add(&cfg, extack);
3330 }
3331
3332 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3333 {
3334         int nexthop_len = 0;
3335
3336         if (rt->rt6i_nsiblings) {
3337                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3338                             + NLA_ALIGN(sizeof(struct rtnexthop))
3339                             + nla_total_size(16) /* RTA_GATEWAY */
3340                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3341
3342                 nexthop_len *= rt->rt6i_nsiblings;
3343         }
3344
3345         return NLMSG_ALIGN(sizeof(struct rtmsg))
3346                + nla_total_size(16) /* RTA_SRC */
3347                + nla_total_size(16) /* RTA_DST */
3348                + nla_total_size(16) /* RTA_GATEWAY */
3349                + nla_total_size(16) /* RTA_PREFSRC */
3350                + nla_total_size(4) /* RTA_TABLE */
3351                + nla_total_size(4) /* RTA_IIF */
3352                + nla_total_size(4) /* RTA_OIF */
3353                + nla_total_size(4) /* RTA_PRIORITY */
3354                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3355                + nla_total_size(sizeof(struct rta_cacheinfo))
3356                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3357                + nla_total_size(1) /* RTA_PREF */
3358                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3359                + nexthop_len;
3360 }
3361
3362 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3363                             unsigned int *flags, bool skip_oif)
3364 {
3365         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3366                 *flags |= RTNH_F_LINKDOWN;
3367                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3368                         *flags |= RTNH_F_DEAD;
3369         }
3370
3371         if (rt->rt6i_flags & RTF_GATEWAY) {
3372                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3373                         goto nla_put_failure;
3374         }
3375
3376         /* not needed for multipath encoding b/c it has a rtnexthop struct */
3377         if (!skip_oif && rt->dst.dev &&
3378             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3379                 goto nla_put_failure;
3380
3381         if (rt->dst.lwtstate &&
3382             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3383                 goto nla_put_failure;
3384
3385         return 0;
3386
3387 nla_put_failure:
3388         return -EMSGSIZE;
3389 }
3390
3391 /* add multipath next hop */
3392 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3393 {
3394         struct rtnexthop *rtnh;
3395         unsigned int flags = 0;
3396
3397         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3398         if (!rtnh)
3399                 goto nla_put_failure;
3400
3401         rtnh->rtnh_hops = 0;
3402         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3403
3404         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3405                 goto nla_put_failure;
3406
3407         rtnh->rtnh_flags = flags;
3408
3409         /* length of rtnetlink header + attributes */
3410         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3411
3412         return 0;
3413
3414 nla_put_failure:
3415         return -EMSGSIZE;
3416 }
3417
3418 static int rt6_fill_node(struct net *net,
3419                          struct sk_buff *skb, struct rt6_info *rt,
3420                          struct in6_addr *dst, struct in6_addr *src,
3421                          int iif, int type, u32 portid, u32 seq,
3422                          unsigned int flags)
3423 {
3424         u32 metrics[RTAX_MAX];
3425         struct rtmsg *rtm;
3426         struct nlmsghdr *nlh;
3427         long expires;
3428         u32 table;
3429
3430         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3431         if (!nlh)
3432                 return -EMSGSIZE;
3433
3434         rtm = nlmsg_data(nlh);
3435         rtm->rtm_family = AF_INET6;
3436         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3437         rtm->rtm_src_len = rt->rt6i_src.plen;
3438         rtm->rtm_tos = 0;
3439         if (rt->rt6i_table)
3440                 table = rt->rt6i_table->tb6_id;
3441         else
3442                 table = RT6_TABLE_UNSPEC;
3443         rtm->rtm_table = table;
3444         if (nla_put_u32(skb, RTA_TABLE, table))
3445                 goto nla_put_failure;
3446         if (rt->rt6i_flags & RTF_REJECT) {
3447                 switch (rt->dst.error) {
3448                 case -EINVAL:
3449                         rtm->rtm_type = RTN_BLACKHOLE;
3450                         break;
3451                 case -EACCES:
3452                         rtm->rtm_type = RTN_PROHIBIT;
3453                         break;
3454                 case -EAGAIN:
3455                         rtm->rtm_type = RTN_THROW;
3456                         break;
3457                 default:
3458                         rtm->rtm_type = RTN_UNREACHABLE;
3459                         break;
3460                 }
3461         }
3462         else if (rt->rt6i_flags & RTF_LOCAL)
3463                 rtm->rtm_type = RTN_LOCAL;
3464         else if (rt->rt6i_flags & RTF_ANYCAST)
3465                 rtm->rtm_type = RTN_ANYCAST;
3466         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3467                 rtm->rtm_type = RTN_LOCAL;
3468         else
3469                 rtm->rtm_type = RTN_UNICAST;
3470         rtm->rtm_flags = 0;
3471         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3472         rtm->rtm_protocol = rt->rt6i_protocol;
3473         if (rt->rt6i_flags & RTF_DYNAMIC)
3474                 rtm->rtm_protocol = RTPROT_REDIRECT;
3475         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3476                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3477                         rtm->rtm_protocol = RTPROT_RA;
3478                 else
3479                         rtm->rtm_protocol = RTPROT_KERNEL;
3480         }
3481
3482         if (rt->rt6i_flags & RTF_CACHE)
3483                 rtm->rtm_flags |= RTM_F_CLONED;
3484
3485         if (dst) {
3486                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3487                         goto nla_put_failure;
3488                 rtm->rtm_dst_len = 128;
3489         } else if (rtm->rtm_dst_len)
3490                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3491                         goto nla_put_failure;
3492 #ifdef CONFIG_IPV6_SUBTREES
3493         if (src) {
3494                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3495                         goto nla_put_failure;
3496                 rtm->rtm_src_len = 128;
3497         } else if (rtm->rtm_src_len &&
3498                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3499                 goto nla_put_failure;
3500 #endif
3501         if (iif) {
3502 #ifdef CONFIG_IPV6_MROUTE
3503                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3504                         int err = ip6mr_get_route(net, skb, rtm, portid);
3505
3506                         if (err == 0)
3507                                 return 0;
3508                         if (err < 0)
3509                                 goto nla_put_failure;
3510                 } else
3511 #endif
3512                         if (nla_put_u32(skb, RTA_IIF, iif))
3513                                 goto nla_put_failure;
3514         } else if (dst) {
3515                 struct in6_addr saddr_buf;
3516                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3517                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3518                         goto nla_put_failure;
3519         }
3520
3521         if (rt->rt6i_prefsrc.plen) {
3522                 struct in6_addr saddr_buf;
3523                 saddr_buf = rt->rt6i_prefsrc.addr;
3524                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3525                         goto nla_put_failure;
3526         }
3527
3528         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3529         if (rt->rt6i_pmtu)
3530                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3531         if (rtnetlink_put_metrics(skb, metrics) < 0)
3532                 goto nla_put_failure;
3533
3534         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3535                 goto nla_put_failure;
3536
3537         /* For multipath routes, walk the siblings list and add
3538          * each as a nexthop within RTA_MULTIPATH.
3539          */
3540         if (rt->rt6i_nsiblings) {
3541                 struct rt6_info *sibling, *next_sibling;
3542                 struct nlattr *mp;
3543
3544                 mp = nla_nest_start(skb, RTA_MULTIPATH);
3545                 if (!mp)
3546                         goto nla_put_failure;
3547
3548                 if (rt6_add_nexthop(skb, rt) < 0)
3549                         goto nla_put_failure;
3550
3551                 list_for_each_entry_safe(sibling, next_sibling,
3552                                          &rt->rt6i_siblings, rt6i_siblings) {
3553                         if (rt6_add_nexthop(skb, sibling) < 0)
3554                                 goto nla_put_failure;
3555                 }
3556
3557                 nla_nest_end(skb, mp);
3558         } else {
3559                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3560                         goto nla_put_failure;
3561         }
3562
3563         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3564
3565         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3566                 goto nla_put_failure;
3567
3568         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3569                 goto nla_put_failure;
3570
3571
3572         nlmsg_end(skb, nlh);
3573         return 0;
3574
3575 nla_put_failure:
3576         nlmsg_cancel(skb, nlh);
3577         return -EMSGSIZE;
3578 }
3579
3580 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3581 {
3582         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3583         struct net *net = arg->net;
3584
3585         if (rt == net->ipv6.ip6_null_entry)
3586                 return 0;
3587
3588         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3589                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3590
3591                 /* user wants prefix routes only */
3592                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3593                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3594                         /* success since this is not a prefix route */
3595                         return 1;
3596                 }
3597         }
3598
3599         return rt6_fill_node(net,
3600                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3601                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3602                      NLM_F_MULTI);
3603 }
3604
3605 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3606                               struct netlink_ext_ack *extack)
3607 {
3608         struct net *net = sock_net(in_skb->sk);
3609         struct nlattr *tb[RTA_MAX+1];
3610         int err, iif = 0, oif = 0;
3611         struct dst_entry *dst;
3612         struct rt6_info *rt;
3613         struct sk_buff *skb;
3614         struct rtmsg *rtm;
3615         struct flowi6 fl6;
3616         bool fibmatch;
3617
3618         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3619                           extack);
3620         if (err < 0)
3621                 goto errout;
3622
3623         err = -EINVAL;
3624         memset(&fl6, 0, sizeof(fl6));
3625         rtm = nlmsg_data(nlh);
3626         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3627         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3628
3629         if (tb[RTA_SRC]) {
3630                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3631                         goto errout;
3632
3633                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3634         }
3635
3636         if (tb[RTA_DST]) {
3637                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3638                         goto errout;
3639
3640                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3641         }
3642
3643         if (tb[RTA_IIF])
3644                 iif = nla_get_u32(tb[RTA_IIF]);
3645
3646         if (tb[RTA_OIF])
3647                 oif = nla_get_u32(tb[RTA_OIF]);
3648
3649         if (tb[RTA_MARK])
3650                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3651
3652         if (tb[RTA_UID])
3653                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3654                                            nla_get_u32(tb[RTA_UID]));
3655         else
3656                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3657
3658         if (iif) {
3659                 struct net_device *dev;
3660                 int flags = 0;
3661
3662                 dev = __dev_get_by_index(net, iif);
3663                 if (!dev) {
3664                         err = -ENODEV;
3665                         goto errout;
3666                 }
3667
3668                 fl6.flowi6_iif = iif;
3669
3670                 if (!ipv6_addr_any(&fl6.saddr))
3671                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3672
3673                 if (!fibmatch)
3674                         dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3675         } else {
3676                 fl6.flowi6_oif = oif;
3677
3678                 if (!fibmatch)
3679                         dst = ip6_route_output(net, NULL, &fl6);
3680         }
3681
3682         if (fibmatch)
3683                 dst = ip6_route_lookup(net, &fl6, 0);
3684
3685         rt = container_of(dst, struct rt6_info, dst);
3686         if (rt->dst.error) {
3687                 err = rt->dst.error;
3688                 ip6_rt_put(rt);
3689                 goto errout;
3690         }
3691
3692         if (rt == net->ipv6.ip6_null_entry) {
3693                 err = rt->dst.error;
3694                 ip6_rt_put(rt);
3695                 goto errout;
3696         }
3697
3698         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3699         if (!skb) {
3700                 ip6_rt_put(rt);
3701                 err = -ENOBUFS;
3702                 goto errout;
3703         }
3704
3705         skb_dst_set(skb, &rt->dst);
3706         if (fibmatch)
3707                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3708                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3709                                     nlh->nlmsg_seq, 0);
3710         else
3711                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3712                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3713                                     nlh->nlmsg_seq, 0);
3714         if (err < 0) {
3715                 kfree_skb(skb);
3716                 goto errout;
3717         }
3718
3719         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3720 errout:
3721         return err;
3722 }
3723
3724 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3725                      unsigned int nlm_flags)
3726 {
3727         struct sk_buff *skb;
3728         struct net *net = info->nl_net;
3729         u32 seq;
3730         int err;
3731
3732         err = -ENOBUFS;
3733         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3734
3735         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3736         if (!skb)
3737                 goto errout;
3738
3739         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3740                                 event, info->portid, seq, nlm_flags);
3741         if (err < 0) {
3742                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3743                 WARN_ON(err == -EMSGSIZE);
3744                 kfree_skb(skb);
3745                 goto errout;
3746         }
3747         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3748                     info->nlh, gfp_any());
3749         return;
3750 errout:
3751         if (err < 0)
3752                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3753 }
3754
3755 static int ip6_route_dev_notify(struct notifier_block *this,
3756                                 unsigned long event, void *ptr)
3757 {
3758         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3759         struct net *net = dev_net(dev);
3760
3761         if (!(dev->flags & IFF_LOOPBACK))
3762                 return NOTIFY_OK;
3763
3764         if (event == NETDEV_REGISTER) {
3765                 net->ipv6.ip6_null_entry->dst.dev = dev;
3766                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3767 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3768                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3769                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3770                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3771                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3772 #endif
3773          } else if (event == NETDEV_UNREGISTER) {
3774                 in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
3775 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3776                 in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
3777                 in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3778 #endif
3779         }
3780
3781         return NOTIFY_OK;
3782 }
3783
3784 /*
3785  *      /proc
3786  */
3787
3788 #ifdef CONFIG_PROC_FS
3789
3790 static const struct file_operations ipv6_route_proc_fops = {
3791         .owner          = THIS_MODULE,
3792         .open           = ipv6_route_open,
3793         .read           = seq_read,
3794         .llseek         = seq_lseek,
3795         .release        = seq_release_net,
3796 };
3797
3798 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3799 {
3800         struct net *net = (struct net *)seq->private;
3801         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3802                    net->ipv6.rt6_stats->fib_nodes,
3803                    net->ipv6.rt6_stats->fib_route_nodes,
3804                    net->ipv6.rt6_stats->fib_rt_alloc,
3805                    net->ipv6.rt6_stats->fib_rt_entries,
3806                    net->ipv6.rt6_stats->fib_rt_cache,
3807                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3808                    net->ipv6.rt6_stats->fib_discarded_routes);
3809
3810         return 0;
3811 }
3812
3813 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3814 {
3815         return single_open_net(inode, file, rt6_stats_seq_show);
3816 }
3817
3818 static const struct file_operations rt6_stats_seq_fops = {
3819         .owner   = THIS_MODULE,
3820         .open    = rt6_stats_seq_open,
3821         .read    = seq_read,
3822         .llseek  = seq_lseek,
3823         .release = single_release_net,
3824 };
3825 #endif  /* CONFIG_PROC_FS */
3826
3827 #ifdef CONFIG_SYSCTL
3828
3829 static
3830 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3831                               void __user *buffer, size_t *lenp, loff_t *ppos)
3832 {
3833         struct net *net;
3834         int delay;
3835         if (!write)
3836                 return -EINVAL;
3837
3838         net = (struct net *)ctl->extra1;
3839         delay = net->ipv6.sysctl.flush_delay;
3840         proc_dointvec(ctl, write, buffer, lenp, ppos);
3841         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3842         return 0;
3843 }
3844
3845 struct ctl_table ipv6_route_table_template[] = {
3846         {
3847                 .procname       =       "flush",
3848                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3849                 .maxlen         =       sizeof(int),
3850                 .mode           =       0200,
3851                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3852         },
3853         {
3854                 .procname       =       "gc_thresh",
3855                 .data           =       &ip6_dst_ops_template.gc_thresh,
3856                 .maxlen         =       sizeof(int),
3857                 .mode           =       0644,
3858                 .proc_handler   =       proc_dointvec,
3859         },
3860         {
3861                 .procname       =       "max_size",
3862                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3863                 .maxlen         =       sizeof(int),
3864                 .mode           =       0644,
3865                 .proc_handler   =       proc_dointvec,
3866         },
3867         {
3868                 .procname       =       "gc_min_interval",
3869                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3870                 .maxlen         =       sizeof(int),
3871                 .mode           =       0644,
3872                 .proc_handler   =       proc_dointvec_jiffies,
3873         },
3874         {
3875                 .procname       =       "gc_timeout",
3876                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3877                 .maxlen         =       sizeof(int),
3878                 .mode           =       0644,
3879                 .proc_handler   =       proc_dointvec_jiffies,
3880         },
3881         {
3882                 .procname       =       "gc_interval",
3883                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3884                 .maxlen         =       sizeof(int),
3885                 .mode           =       0644,
3886                 .proc_handler   =       proc_dointvec_jiffies,
3887         },
3888         {
3889                 .procname       =       "gc_elasticity",
3890                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3891                 .maxlen         =       sizeof(int),
3892                 .mode           =       0644,
3893                 .proc_handler   =       proc_dointvec,
3894         },
3895         {
3896                 .procname       =       "mtu_expires",
3897                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3898                 .maxlen         =       sizeof(int),
3899                 .mode           =       0644,
3900                 .proc_handler   =       proc_dointvec_jiffies,
3901         },
3902         {
3903                 .procname       =       "min_adv_mss",
3904                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3905                 .maxlen         =       sizeof(int),
3906                 .mode           =       0644,
3907                 .proc_handler   =       proc_dointvec,
3908         },
3909         {
3910                 .procname       =       "gc_min_interval_ms",
3911                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3912                 .maxlen         =       sizeof(int),
3913                 .mode           =       0644,
3914                 .proc_handler   =       proc_dointvec_ms_jiffies,
3915         },
3916         { }
3917 };
3918
3919 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3920 {
3921         struct ctl_table *table;
3922
3923         table = kmemdup(ipv6_route_table_template,
3924                         sizeof(ipv6_route_table_template),
3925                         GFP_KERNEL);
3926
3927         if (table) {
3928                 table[0].data = &net->ipv6.sysctl.flush_delay;
3929                 table[0].extra1 = net;
3930                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3931                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3932                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3933                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3934                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3935                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3936                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3937                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3938                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3939
3940                 /* Don't export sysctls to unprivileged users */
3941                 if (net->user_ns != &init_user_ns)
3942                         table[0].procname = NULL;
3943         }
3944
3945         return table;
3946 }
3947 #endif
3948
3949 static int __net_init ip6_route_net_init(struct net *net)
3950 {
3951         int ret = -ENOMEM;
3952
3953         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3954                sizeof(net->ipv6.ip6_dst_ops));
3955
3956         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3957                 goto out_ip6_dst_ops;
3958
3959         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3960                                            sizeof(*net->ipv6.ip6_null_entry),
3961                                            GFP_KERNEL);
3962         if (!net->ipv6.ip6_null_entry)
3963                 goto out_ip6_dst_entries;
3964         net->ipv6.ip6_null_entry->dst.path =
3965                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3966         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3967         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3968                          ip6_template_metrics, true);
3969
3970 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3971         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3972                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3973                                                GFP_KERNEL);
3974         if (!net->ipv6.ip6_prohibit_entry)
3975                 goto out_ip6_null_entry;
3976         net->ipv6.ip6_prohibit_entry->dst.path =
3977                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3978         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3979         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3980                          ip6_template_metrics, true);
3981
3982         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3983                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3984                                                GFP_KERNEL);
3985         if (!net->ipv6.ip6_blk_hole_entry)
3986                 goto out_ip6_prohibit_entry;
3987         net->ipv6.ip6_blk_hole_entry->dst.path =
3988                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3989         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3990         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3991                          ip6_template_metrics, true);
3992 #endif
3993
3994         net->ipv6.sysctl.flush_delay = 0;
3995         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3996         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3997         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3998         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3999         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4000         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4001         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4002
4003         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4004
4005         ret = 0;
4006 out:
4007         return ret;
4008
4009 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4010 out_ip6_prohibit_entry:
4011         kfree(net->ipv6.ip6_prohibit_entry);
4012 out_ip6_null_entry:
4013         kfree(net->ipv6.ip6_null_entry);
4014 #endif
4015 out_ip6_dst_entries:
4016         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4017 out_ip6_dst_ops:
4018         goto out;
4019 }
4020
4021 static void __net_exit ip6_route_net_exit(struct net *net)
4022 {
4023         kfree(net->ipv6.ip6_null_entry);
4024 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4025         kfree(net->ipv6.ip6_prohibit_entry);
4026         kfree(net->ipv6.ip6_blk_hole_entry);
4027 #endif
4028         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4029 }
4030
4031 static int __net_init ip6_route_net_init_late(struct net *net)
4032 {
4033 #ifdef CONFIG_PROC_FS
4034         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4035         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4036 #endif
4037         return 0;
4038 }
4039
4040 static void __net_exit ip6_route_net_exit_late(struct net *net)
4041 {
4042 #ifdef CONFIG_PROC_FS
4043         remove_proc_entry("ipv6_route", net->proc_net);
4044         remove_proc_entry("rt6_stats", net->proc_net);
4045 #endif
4046 }
4047
4048 static struct pernet_operations ip6_route_net_ops = {
4049         .init = ip6_route_net_init,
4050         .exit = ip6_route_net_exit,
4051 };
4052
4053 static int __net_init ipv6_inetpeer_init(struct net *net)
4054 {
4055         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4056
4057         if (!bp)
4058                 return -ENOMEM;
4059         inet_peer_base_init(bp);
4060         net->ipv6.peers = bp;
4061         return 0;
4062 }
4063
4064 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4065 {
4066         struct inet_peer_base *bp = net->ipv6.peers;
4067
4068         net->ipv6.peers = NULL;
4069         inetpeer_invalidate_tree(bp);
4070         kfree(bp);
4071 }
4072
4073 static struct pernet_operations ipv6_inetpeer_ops = {
4074         .init   =       ipv6_inetpeer_init,
4075         .exit   =       ipv6_inetpeer_exit,
4076 };
4077
4078 static struct pernet_operations ip6_route_net_late_ops = {
4079         .init = ip6_route_net_init_late,
4080         .exit = ip6_route_net_exit_late,
4081 };
4082
4083 static struct notifier_block ip6_route_dev_notifier = {
4084         .notifier_call = ip6_route_dev_notify,
4085         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4086 };
4087
4088 void __init ip6_route_init_special_entries(void)
4089 {
4090         /* Registering of the loopback is done before this portion of code,
4091          * the loopback reference in rt6_info will not be taken, do it
4092          * manually for init_net */
4093         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4094         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4095   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4096         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4097         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4098         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4099         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4100   #endif
4101 }
4102
4103 int __init ip6_route_init(void)
4104 {
4105         int ret;
4106         int cpu;
4107
4108         ret = -ENOMEM;
4109         ip6_dst_ops_template.kmem_cachep =
4110                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4111                                   SLAB_HWCACHE_ALIGN, NULL);
4112         if (!ip6_dst_ops_template.kmem_cachep)
4113                 goto out;
4114
4115         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4116         if (ret)
4117                 goto out_kmem_cache;
4118
4119         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4120         if (ret)
4121                 goto out_dst_entries;
4122
4123         ret = register_pernet_subsys(&ip6_route_net_ops);
4124         if (ret)
4125                 goto out_register_inetpeer;
4126
4127         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4128
4129         ret = fib6_init();
4130         if (ret)
4131                 goto out_register_subsys;
4132
4133         ret = xfrm6_init();
4134         if (ret)
4135                 goto out_fib6_init;
4136
4137         ret = fib6_rules_init();
4138         if (ret)
4139                 goto xfrm6_init;
4140
4141         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4142         if (ret)
4143                 goto fib6_rules_init;
4144
4145         ret = -ENOBUFS;
4146         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4147             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4148             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4149                 goto out_register_late_subsys;
4150
4151         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4152         if (ret)
4153                 goto out_register_late_subsys;
4154
4155         for_each_possible_cpu(cpu) {
4156                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4157
4158                 INIT_LIST_HEAD(&ul->head);
4159                 spin_lock_init(&ul->lock);
4160         }
4161
4162 out:
4163         return ret;
4164
4165 out_register_late_subsys:
4166         unregister_pernet_subsys(&ip6_route_net_late_ops);
4167 fib6_rules_init:
4168         fib6_rules_cleanup();
4169 xfrm6_init:
4170         xfrm6_fini();
4171 out_fib6_init:
4172         fib6_gc_cleanup();
4173 out_register_subsys:
4174         unregister_pernet_subsys(&ip6_route_net_ops);
4175 out_register_inetpeer:
4176         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4177 out_dst_entries:
4178         dst_entries_destroy(&ip6_dst_blackhole_ops);
4179 out_kmem_cache:
4180         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4181         goto out;
4182 }
4183
4184 void ip6_route_cleanup(void)
4185 {
4186         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4187         unregister_pernet_subsys(&ip6_route_net_late_ops);
4188         fib6_rules_cleanup();
4189         xfrm6_fini();
4190         fib6_gc_cleanup();
4191         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4192         unregister_pernet_subsys(&ip6_route_net_ops);
4193         dst_entries_destroy(&ip6_dst_blackhole_ops);
4194         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4195 }