net/ipv6/route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119
120 struct uncached_list {
121         spinlock_t              lock;
122         struct list_head        head;
123 };
124
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126
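/* Uncached (DST_NOCACHE) routes live outside the fib6 tree; track them on a
 * per-cpu list so rt6_uncached_list_flush_dev() can re-point them at the
 * loopback device when their original device goes away.
 */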
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130
131         rt->dst.flags |= DST_NOCACHE;
132         rt->rt6i_uncached_list = ul;
133
134         spin_lock_bh(&ul->lock);
135         list_add_tail(&rt->rt6i_uncached, &ul->head);
136         spin_unlock_bh(&ul->lock);
137 }
138
139 static void rt6_uncached_list_del(struct rt6_info *rt)
140 {
141         if (!list_empty(&rt->rt6i_uncached)) {
142                 struct uncached_list *ul = rt->rt6i_uncached_list;
143
144                 spin_lock_bh(&ul->lock);
145                 list_del(&rt->rt6i_uncached);
146                 spin_unlock_bh(&ul->lock);
147         }
148 }
149
150 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
151 {
152         struct net_device *loopback_dev = net->loopback_dev;
153         int cpu;
154
155         if (dev == loopback_dev)
156                 return;
157
158         for_each_possible_cpu(cpu) {
159                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
160                 struct rt6_info *rt;
161
162                 spin_lock_bh(&ul->lock);
163                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
164                         struct inet6_dev *rt_idev = rt->rt6i_idev;
165                         struct net_device *rt_dev = rt->dst.dev;
166
167                         if (rt_idev->dev == dev) {
168                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
169                                 in6_dev_put(rt_idev);
170                         }
171
172                         if (rt_dev == dev) {
173                                 rt->dst.dev = loopback_dev;
174                                 dev_hold(rt->dst.dev);
175                                 dev_put(rt_dev);
176                         }
177                 }
178                 spin_unlock_bh(&ul->lock);
179         }
180 }
181
182 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
183 {
184         return dst_metrics_write_ptr(rt->dst.from);
185 }
186
187 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190
191         if (rt->rt6i_flags & RTF_PCPU)
192                 return rt6_pcpu_cow_metrics(rt);
193         else if (rt->rt6i_flags & RTF_CACHE)
194                 return NULL;
195         else
196                 return dst_cow_metrics_generic(dst, old);
197 }
198
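/* Address used for the neighbour lookup: the route's gateway when one is
 * set, otherwise the destination from the packet, otherwise the caller's
 * daddr.
 */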
199 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
200                                              struct sk_buff *skb,
201                                              const void *daddr)
202 {
203         struct in6_addr *p = &rt->rt6i_gateway;
204
205         if (!ipv6_addr_any(p))
206                 return (const void *) p;
207         else if (skb)
208                 return &ipv6_hdr(skb)->daddr;
209         return daddr;
210 }
211
212 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
213                                           struct sk_buff *skb,
214                                           const void *daddr)
215 {
216         struct rt6_info *rt = (struct rt6_info *) dst;
217         struct neighbour *n;
218
219         daddr = choose_neigh_daddr(rt, skb, daddr);
220         n = __ipv6_neigh_lookup(dst->dev, daddr);
221         if (n)
222                 return n;
223         return neigh_create(&nd_tbl, daddr, dst->dev);
224 }
225
226 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 {
228         struct net_device *dev = dst->dev;
229         struct rt6_info *rt = (struct rt6_info *)dst;
230
231         daddr = choose_neigh_daddr(rt, NULL, daddr);
232         if (!daddr)
233                 return;
234         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235                 return;
236         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237                 return;
238         __ipv6_confirm_neigh(dev, daddr);
239 }
240
241 static struct dst_ops ip6_dst_ops_template = {
242         .family                 =       AF_INET6,
243         .gc                     =       ip6_dst_gc,
244         .gc_thresh              =       1024,
245         .check                  =       ip6_dst_check,
246         .default_advmss         =       ip6_default_advmss,
247         .mtu                    =       ip6_mtu,
248         .cow_metrics            =       ipv6_cow_metrics,
249         .destroy                =       ip6_dst_destroy,
250         .ifdown                 =       ip6_dst_ifdown,
251         .negative_advice        =       ip6_negative_advice,
252         .link_failure           =       ip6_link_failure,
253         .update_pmtu            =       ip6_rt_update_pmtu,
254         .redirect               =       rt6_do_redirect,
255         .local_out              =       __ip6_local_out,
256         .neigh_lookup           =       ip6_neigh_lookup,
257         .confirm_neigh          =       ip6_confirm_neigh,
258 };
259
260 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 {
262         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263
264         return mtu ? : dst->dev->mtu;
265 }
266
267 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
268                                          struct sk_buff *skb, u32 mtu)
269 {
270 }
271
272 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
273                                       struct sk_buff *skb)
274 {
275 }
276
277 static struct dst_ops ip6_dst_blackhole_ops = {
278         .family                 =       AF_INET6,
279         .destroy                =       ip6_dst_destroy,
280         .check                  =       ip6_dst_check,
281         .mtu                    =       ip6_blackhole_mtu,
282         .default_advmss         =       ip6_default_advmss,
283         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
284         .redirect               =       ip6_rt_blackhole_redirect,
285         .cow_metrics            =       dst_cow_metrics_generic,
286         .neigh_lookup           =       ip6_neigh_lookup,
287 };
288
289 static const u32 ip6_template_metrics[RTAX_MAX] = {
290         [RTAX_HOPLIMIT - 1] = 0,
291 };
292
293 static const struct rt6_info ip6_null_entry_template = {
294         .dst = {
295                 .__refcnt       = ATOMIC_INIT(1),
296                 .__use          = 1,
297                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
298                 .error          = -ENETUNREACH,
299                 .input          = ip6_pkt_discard,
300                 .output         = ip6_pkt_discard_out,
301         },
302         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
303         .rt6i_protocol  = RTPROT_KERNEL,
304         .rt6i_metric    = ~(u32) 0,
305         .rt6i_ref       = ATOMIC_INIT(1),
306 };
307
308 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
309
310 static const struct rt6_info ip6_prohibit_entry_template = {
311         .dst = {
312                 .__refcnt       = ATOMIC_INIT(1),
313                 .__use          = 1,
314                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
315                 .error          = -EACCES,
316                 .input          = ip6_pkt_prohibit,
317                 .output         = ip6_pkt_prohibit_out,
318         },
319         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
320         .rt6i_protocol  = RTPROT_KERNEL,
321         .rt6i_metric    = ~(u32) 0,
322         .rt6i_ref       = ATOMIC_INIT(1),
323 };
324
325 static const struct rt6_info ip6_blk_hole_entry_template = {
326         .dst = {
327                 .__refcnt       = ATOMIC_INIT(1),
328                 .__use          = 1,
329                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
330                 .error          = -EINVAL,
331                 .input          = dst_discard,
332                 .output         = dst_discard_out,
333         },
334         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
335         .rt6i_protocol  = RTPROT_KERNEL,
336         .rt6i_metric    = ~(u32) 0,
337         .rt6i_ref       = ATOMIC_INIT(1),
338 };
339
340 #endif
341
342 static void rt6_info_init(struct rt6_info *rt)
343 {
344         struct dst_entry *dst = &rt->dst;
345
346         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
347         INIT_LIST_HEAD(&rt->rt6i_siblings);
348         INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
350
351 /* allocate dst with ip6_dst_ops */
352 static struct rt6_info *__ip6_dst_alloc(struct net *net,
353                                         struct net_device *dev,
354                                         int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         0, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt)
360                 rt6_info_init(rt);
361
362         return rt;
363 }
364
365 struct rt6_info *ip6_dst_alloc(struct net *net,
366                                struct net_device *dev,
367                                int flags)
368 {
369         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
370
371         if (rt) {
372                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
373                 if (rt->rt6i_pcpu) {
374                         int cpu;
375
376                         for_each_possible_cpu(cpu) {
377                                 struct rt6_info **p;
378
379                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
380                                 /* no one shares rt */
381                                 *p =  NULL;
382                         }
383                 } else {
384                         dst_destroy((struct dst_entry *)rt);
385                         return NULL;
386                 }
387         }
388
389         return rt;
390 }
391 EXPORT_SYMBOL(ip6_dst_alloc);
392
393 static void ip6_dst_destroy(struct dst_entry *dst)
394 {
395         struct rt6_info *rt = (struct rt6_info *)dst;
396         struct dst_entry *from = dst->from;
397         struct inet6_dev *idev;
398
399         dst_destroy_metrics_generic(dst);
400         free_percpu(rt->rt6i_pcpu);
401         rt6_uncached_list_del(rt);
402
403         idev = rt->rt6i_idev;
404         if (idev) {
405                 rt->rt6i_idev = NULL;
406                 in6_dev_put(idev);
407         }
408
409         dst->from = NULL;
410         dst_release(from);
411 }
412
413 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
414                            int how)
415 {
416         struct rt6_info *rt = (struct rt6_info *)dst;
417         struct inet6_dev *idev = rt->rt6i_idev;
418         struct net_device *loopback_dev =
419                 dev_net(dev)->loopback_dev;
420
421         if (dev != loopback_dev) {
422                 if (idev && idev->dev == dev) {
423                         struct inet6_dev *loopback_idev =
424                                 in6_dev_get(loopback_dev);
425                         if (loopback_idev) {
426                                 rt->rt6i_idev = loopback_idev;
427                                 in6_dev_put(idev);
428                         }
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
448         }
449         return false;
450 }
451
452 /* Multipath route selection:
453  *   Hash-based function using the packet header and flow label.
454  * Adapted from fib_info_hashfn()
455  */
456 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
457                                const struct flowi6 *fl6)
458 {
459         return get_hash_from_flowi6(fl6) % candidate_count;
460 }
461
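/* Pick one of the equal-cost siblings based on the flow hash: index 0 keeps
 * 'match' itself, otherwise walk the sibling list to the hashed entry and
 * use it if it scores as usable.
 */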
462 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
463                                              struct flowi6 *fl6, int oif,
464                                              int strict)
465 {
466         struct rt6_info *sibling, *next_sibling;
467         int route_chosen;
468
469         route_chosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
470         /* Don't change the route if route_chosen == 0
471          * (the siblings list does not include ourselves)
472          */
473         if (route_chosen)
474                 list_for_each_entry_safe(sibling, next_sibling,
475                                 &match->rt6i_siblings, rt6i_siblings) {
476                         route_chosen--;
477                         if (route_chosen == 0) {
478                                 if (rt6_score_route(sibling, oif, strict) < 0)
479                                         break;
480                                 match = sibling;
481                                 break;
482                         }
483                 }
484         return match;
485 }
486
487 /*
488  *      Route lookup. The caller is assumed to hold the relevant table->tb6_lock.
489  */
490
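/* Walk the routes hanging off this fib6 node and return the first one whose
 * device matches oif (remembering a loopback route as a fallback), or, when
 * no oif is given, the first one whose device owns saddr.
 */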
491 static inline struct rt6_info *rt6_device_match(struct net *net,
492                                                     struct rt6_info *rt,
493                                                     const struct in6_addr *saddr,
494                                                     int oif,
495                                                     int flags)
496 {
497         struct rt6_info *local = NULL;
498         struct rt6_info *sprt;
499
500         if (!oif && ipv6_addr_any(saddr))
501                 goto out;
502
503         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
504                 struct net_device *dev = sprt->dst.dev;
505
506                 if (oif) {
507                         if (dev->ifindex == oif)
508                                 return sprt;
509                         if (dev->flags & IFF_LOOPBACK) {
510                                 if (!sprt->rt6i_idev ||
511                                     sprt->rt6i_idev->dev->ifindex != oif) {
512                                         if (flags & RT6_LOOKUP_F_IFACE)
513                                                 continue;
514                                         if (local &&
515                                             local->rt6i_idev->dev->ifindex == oif)
516                                                 continue;
517                                 }
518                                 local = sprt;
519                         }
520                 } else {
521                         if (ipv6_chk_addr(net, saddr, dev,
522                                           flags & RT6_LOOKUP_F_IFACE))
523                                 return sprt;
524                 }
525         }
526
527         if (oif) {
528                 if (local)
529                         return local;
530
531                 if (flags & RT6_LOOKUP_F_IFACE)
532                         return net->ipv6.ip6_null_entry;
533         }
534 out:
535         return rt;
536 }
537
538 #ifdef CONFIG_IPV6_ROUTER_PREF
539 struct __rt6_probe_work {
540         struct work_struct work;
541         struct in6_addr target;
542         struct net_device *dev;
543 };
544
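/* Deferred neighbour solicitation: compute the solicited-node multicast
 * address for the target and send an NS from workqueue context.
 */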
545 static void rt6_probe_deferred(struct work_struct *w)
546 {
547         struct in6_addr mcaddr;
548         struct __rt6_probe_work *work =
549                 container_of(w, struct __rt6_probe_work, work);
550
551         addrconf_addr_solict_mult(&work->target, &mcaddr);
552         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
553         dev_put(work->dev);
554         kfree(work);
555 }
556
557 static void rt6_probe(struct rt6_info *rt)
558 {
559         struct __rt6_probe_work *work;
560         struct neighbour *neigh;
561         /*
562          * Okay, this does not seem to be appropriate
563          * for now; however, we need to check whether it
564          * really is, a.k.a. Router Reachability Probing.
565          *
566          * Router Reachability Probe MUST be rate-limited
567          * to no more than one per minute.
568          */
569         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
570                 return;
571         rcu_read_lock_bh();
572         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
573         if (neigh) {
574                 if (neigh->nud_state & NUD_VALID)
575                         goto out;
576
577                 work = NULL;
578                 write_lock(&neigh->lock);
579                 if (!(neigh->nud_state & NUD_VALID) &&
580                     time_after(jiffies,
581                                neigh->updated +
582                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
583                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
584                         if (work)
585                                 __neigh_set_probe_once(neigh);
586                 }
587                 write_unlock(&neigh->lock);
588         } else {
589                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590         }
591
592         if (work) {
593                 INIT_WORK(&work->work, rt6_probe_deferred);
594                 work->target = rt->rt6i_gateway;
595                 dev_hold(rt->dst.dev);
596                 work->dev = rt->dst.dev;
597                 schedule_work(&work->work);
598         }
599
600 out:
601         rcu_read_unlock_bh();
602 }
603 #else
604 static inline void rt6_probe(struct rt6_info *rt)
605 {
606 }
607 #endif
608
609 /*
610  * Default Router Selection (RFC 2461 6.3.6)
611  */
612 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
613 {
614         struct net_device *dev = rt->dst.dev;
615         if (!oif || dev->ifindex == oif)
616                 return 2;
617         if ((dev->flags & IFF_LOOPBACK) &&
618             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
619                 return 1;
620         return 0;
621 }
622
623 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
624 {
625         struct neighbour *neigh;
626         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
627
628         if (rt->rt6i_flags & RTF_NONEXTHOP ||
629             !(rt->rt6i_flags & RTF_GATEWAY))
630                 return RT6_NUD_SUCCEED;
631
632         rcu_read_lock_bh();
633         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
634         if (neigh) {
635                 read_lock(&neigh->lock);
636                 if (neigh->nud_state & NUD_VALID)
637                         ret = RT6_NUD_SUCCEED;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639                 else if (!(neigh->nud_state & NUD_FAILED))
640                         ret = RT6_NUD_SUCCEED;
641                 else
642                         ret = RT6_NUD_FAIL_PROBE;
643 #endif
644                 read_unlock(&neigh->lock);
645         } else {
646                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
647                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
648         }
649         rcu_read_unlock_bh();
650
651         return ret;
652 }
653
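/* Score a route for default router selection: 2 for a direct oif match,
 * 1 for a matching loopback route, plus the decoded router preference;
 * a negative RT6_NUD_* value is returned when the reachability check fails.
 */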
654 static int rt6_score_route(struct rt6_info *rt, int oif,
655                            int strict)
656 {
657         int m;
658
659         m = rt6_check_dev(rt, oif);
660         if (!m && (strict & RT6_LOOKUP_F_IFACE))
661                 return RT6_NUD_FAIL_HARD;
662 #ifdef CONFIG_IPV6_ROUTER_PREF
663         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
664 #endif
665         if (strict & RT6_LOOKUP_F_REACHABLE) {
666                 int n = rt6_check_neigh(rt);
667                 if (n < 0)
668                         return n;
669         }
670         return m;
671 }
672
673 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
674                                    int *mpri, struct rt6_info *match,
675                                    bool *do_rr)
676 {
677         int m;
678         bool match_do_rr = false;
679         struct inet6_dev *idev = rt->rt6i_idev;
680         struct net_device *dev = rt->dst.dev;
681
682         if (dev && !netif_carrier_ok(dev) &&
683             idev->cnf.ignore_routes_with_linkdown &&
684             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
685                 goto out;
686
687         if (rt6_check_expired(rt))
688                 goto out;
689
690         m = rt6_score_route(rt, oif, strict);
691         if (m == RT6_NUD_FAIL_DO_RR) {
692                 match_do_rr = true;
693                 m = 0; /* lowest valid score */
694         } else if (m == RT6_NUD_FAIL_HARD) {
695                 goto out;
696         }
697
698         if (strict & RT6_LOOKUP_F_REACHABLE)
699                 rt6_probe(rt);
700
701         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
702         if (m > *mpri) {
703                 *do_rr = match_do_rr;
704                 *mpri = m;
705                 match = rt;
706         }
707 out:
708         return match;
709 }
710
711 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
712                                      struct rt6_info *rr_head,
713                                      u32 metric, int oif, int strict,
714                                      bool *do_rr)
715 {
716         struct rt6_info *rt, *match, *cont;
717         int mpri = -1;
718
719         match = NULL;
720         cont = NULL;
721         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
722                 if (rt->rt6i_metric != metric) {
723                         cont = rt;
724                         break;
725                 }
726
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728         }
729
730         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
731                 if (rt->rt6i_metric != metric) {
732                         cont = rt;
733                         break;
734                 }
735
736                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
737         }
738
739         if (match || !cont)
740                 return match;
741
742         for (rt = cont; rt; rt = rt->dst.rt6_next)
743                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
744
745         return match;
746 }
747
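/* Round-robin default route selection: score the equal-metric routes
 * starting at fn->rr_ptr and, when the winner asked for round-robin
 * (RT6_NUD_FAIL_DO_RR), advance rr_ptr to the next route of that metric.
 */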
748 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
749 {
750         struct rt6_info *match, *rt0;
751         struct net *net;
752         bool do_rr = false;
753
754         rt0 = fn->rr_ptr;
755         if (!rt0)
756                 fn->rr_ptr = rt0 = fn->leaf;
757
758         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
759                              &do_rr);
760
761         if (do_rr) {
762                 struct rt6_info *next = rt0->dst.rt6_next;
763
764                 /* no entries matched; do round-robin */
765                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
766                         next = fn->leaf;
767
768                 if (next != rt0)
769                         fn->rr_ptr = next;
770         }
771
772         net = dev_net(rt0->dst.dev);
773         return match ? match : net->ipv6.ip6_null_entry;
774 }
775
776 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
777 {
778         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
779 }
780
781 #ifdef CONFIG_IPV6_ROUTE_INFO
782 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
783                   const struct in6_addr *gwaddr)
784 {
785         struct net *net = dev_net(dev);
786         struct route_info *rinfo = (struct route_info *) opt;
787         struct in6_addr prefix_buf, *prefix;
788         unsigned int pref;
789         unsigned long lifetime;
790         struct rt6_info *rt;
791
792         if (len < sizeof(struct route_info)) {
793                 return -EINVAL;
794         }
795
796         /* Sanity check for prefix_len and length */
797         if (rinfo->length > 3) {
798                 return -EINVAL;
799         } else if (rinfo->prefix_len > 128) {
800                 return -EINVAL;
801         } else if (rinfo->prefix_len > 64) {
802                 if (rinfo->length < 2) {
803                         return -EINVAL;
804                 }
805         } else if (rinfo->prefix_len > 0) {
806                 if (rinfo->length < 1) {
807                         return -EINVAL;
808                 }
809         }
810
811         pref = rinfo->route_pref;
812         if (pref == ICMPV6_ROUTER_PREF_INVALID)
813                 return -EINVAL;
814
815         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
816
817         if (rinfo->length == 3)
818                 prefix = (struct in6_addr *)rinfo->prefix;
819         else {
820                 /* this function is safe */
821                 ipv6_addr_prefix(&prefix_buf,
822                                  (struct in6_addr *)rinfo->prefix,
823                                  rinfo->prefix_len);
824                 prefix = &prefix_buf;
825         }
826
827         if (rinfo->prefix_len == 0)
828                 rt = rt6_get_dflt_router(gwaddr, dev);
829         else
830                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
831                                         gwaddr, dev);
832
833         if (rt && !lifetime) {
834                 ip6_del_rt(rt);
835                 rt = NULL;
836         }
837
838         if (!rt && lifetime)
839                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
840                                         dev, pref);
841         else if (rt)
842                 rt->rt6i_flags = RTF_ROUTEINFO |
843                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
844
845         if (rt) {
846                 if (!addrconf_finite_timeout(lifetime))
847                         rt6_clean_expires(rt);
848                 else
849                         rt6_set_expires(rt, jiffies + HZ * lifetime);
850
851                 ip6_rt_put(rt);
852         }
853         return 0;
854 }
855 #endif
856
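/* Walk back up the fib6 tree after a failed lookup, re-descending into any
 * source-address subtree on the way, until a node carrying routes
 * (RTN_RTINFO) is found or the tree root is reached.
 */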
857 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
858                                         struct in6_addr *saddr)
859 {
860         struct fib6_node *pn;
861         while (1) {
862                 if (fn->fn_flags & RTN_TL_ROOT)
863                         return NULL;
864                 pn = fn->parent;
865                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
866                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
867                 else
868                         fn = pn;
869                 if (fn->fn_flags & RTN_RTINFO)
870                         return fn;
871         }
872 }
873
874 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
875                                              struct fib6_table *table,
876                                              struct flowi6 *fl6, int flags)
877 {
878         struct fib6_node *fn;
879         struct rt6_info *rt;
880
881         read_lock_bh(&table->tb6_lock);
882         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
883 restart:
884         rt = fn->leaf;
885         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
886         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
887                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
888         if (rt == net->ipv6.ip6_null_entry) {
889                 fn = fib6_backtrack(fn, &fl6->saddr);
890                 if (fn)
891                         goto restart;
892         }
893         dst_use(&rt->dst, jiffies);
894         read_unlock_bh(&table->tb6_lock);
895
896         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
897
898         return rt;
899
900 }
901
902 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
903                                     int flags)
904 {
905         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
906 }
907 EXPORT_SYMBOL_GPL(ip6_route_lookup);
908
909 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
910                             const struct in6_addr *saddr, int oif, int strict)
911 {
912         struct flowi6 fl6 = {
913                 .flowi6_oif = oif,
914                 .daddr = *daddr,
915         };
916         struct dst_entry *dst;
917         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
918
919         if (saddr) {
920                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
921                 flags |= RT6_LOOKUP_F_HAS_SADDR;
922         }
923
924         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
925         if (dst->error == 0)
926                 return (struct rt6_info *) dst;
927
928         dst_release(dst);
929
930         return NULL;
931 }
932 EXPORT_SYMBOL(rt6_lookup);
933
934 /* ip6_ins_rt is called with table->tb6_lock NOT held; it takes the lock
935    itself. It takes a new route entry; if the addition fails for any
936    reason, the route is freed. In any case, if the caller does not hold
937    a reference, the route may be destroyed.
938  */
939
940 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
941                         struct mx6_config *mxc,
942                         struct netlink_ext_ack *extack)
943 {
944         int err;
945         struct fib6_table *table;
946
947         table = rt->rt6i_table;
948         write_lock_bh(&table->tb6_lock);
949         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
950         write_unlock_bh(&table->tb6_lock);
951
952         return err;
953 }
954
955 int ip6_ins_rt(struct rt6_info *rt)
956 {
957         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
958         struct mx6_config mxc = { .mx = NULL, };
959
960         return __ip6_ins_rt(rt, &info, &mxc, NULL);
961 }
962
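/* Clone a route into a host-specific (/128) RTF_CACHE entry that is not
 * owned by the fib6 tree; used for PMTU exceptions and for destinations
 * that differ from the address the route was looked up with.
 */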
963 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
964                                            const struct in6_addr *daddr,
965                                            const struct in6_addr *saddr)
966 {
967         struct rt6_info *rt;
968
969         /*
970          *      Clone the route.
971          */
972
973         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
974                 ort = (struct rt6_info *)ort->dst.from;
975
976         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
977
978         if (!rt)
979                 return NULL;
980
981         ip6_rt_copy_init(rt, ort);
982         rt->rt6i_flags |= RTF_CACHE;
983         rt->rt6i_metric = 0;
984         rt->dst.flags |= DST_HOST;
985         rt->rt6i_dst.addr = *daddr;
986         rt->rt6i_dst.plen = 128;
987
988         if (!rt6_is_gw_or_nonexthop(ort)) {
989                 if (ort->rt6i_dst.plen != 128 &&
990                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
991                         rt->rt6i_flags |= RTF_ANYCAST;
992 #ifdef CONFIG_IPV6_SUBTREES
993                 if (rt->rt6i_src.plen && saddr) {
994                         rt->rt6i_src.addr = *saddr;
995                         rt->rt6i_src.plen = 128;
996                 }
997 #endif
998         }
999
1000         return rt;
1001 }
1002
1003 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1004 {
1005         struct rt6_info *pcpu_rt;
1006
1007         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1008                                   rt->dst.dev, rt->dst.flags);
1009
1010         if (!pcpu_rt)
1011                 return NULL;
1012         ip6_rt_copy_init(pcpu_rt, rt);
1013         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1014         pcpu_rt->rt6i_flags |= RTF_PCPU;
1015         return pcpu_rt;
1016 }
1017
1018 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1019 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1020 {
1021         struct rt6_info *pcpu_rt, **p;
1022
1023         p = this_cpu_ptr(rt->rt6i_pcpu);
1024         pcpu_rt = *p;
1025
1026         if (pcpu_rt) {
1027                 dst_hold(&pcpu_rt->dst);
1028                 rt6_dst_from_metrics_check(pcpu_rt);
1029         }
1030         return pcpu_rt;
1031 }
1032
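/* Install a per-cpu clone of rt with cmpxchg(); if another context raced
 * and installed one first, use that instead.  If rt has already been
 * unlinked from the tree, return rt itself and let the next dst_check()
 * force a re-lookup.
 */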
1033 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1034 {
1035         struct fib6_table *table = rt->rt6i_table;
1036         struct rt6_info *pcpu_rt, *prev, **p;
1037
1038         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1039         if (!pcpu_rt) {
1040                 struct net *net = dev_net(rt->dst.dev);
1041
1042                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1043                 return net->ipv6.ip6_null_entry;
1044         }
1045
1046         read_lock_bh(&table->tb6_lock);
1047         if (rt->rt6i_pcpu) {
1048                 p = this_cpu_ptr(rt->rt6i_pcpu);
1049                 prev = cmpxchg(p, NULL, pcpu_rt);
1050                 if (prev) {
1051                         /* If someone did it before us, return prev instead */
1052                         dst_destroy(&pcpu_rt->dst);
1053                         pcpu_rt = prev;
1054                 }
1055         } else {
1056                 /* rt has been removed from the fib6 tree
1057                  * before we have a chance to acquire the read_lock.
1058                  * In this case, don't bother to create a pcpu rt
1059                  * since rt is going away anyway.  The next
1060                  * dst_check() will trigger a re-lookup.
1061                  */
1062                 dst_destroy(&pcpu_rt->dst);
1063                 pcpu_rt = rt;
1064         }
1065         dst_hold(&pcpu_rt->dst);
1066         rt6_dst_from_metrics_check(pcpu_rt);
1067         read_unlock_bh(&table->tb6_lock);
1068         return pcpu_rt;
1069 }
1070
1071 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1072                                int oif, struct flowi6 *fl6, int flags)
1073 {
1074         struct fib6_node *fn, *saved_fn;
1075         struct rt6_info *rt;
1076         int strict = 0;
1077
1078         strict |= flags & RT6_LOOKUP_F_IFACE;
1079         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1080         if (net->ipv6.devconf_all->forwarding == 0)
1081                 strict |= RT6_LOOKUP_F_REACHABLE;
1082
1083         read_lock_bh(&table->tb6_lock);
1084
1085         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1086         saved_fn = fn;
1087
1088         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1089                 oif = 0;
1090
1091 redo_rt6_select:
1092         rt = rt6_select(fn, oif, strict);
1093         if (rt->rt6i_nsiblings)
1094                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1095         if (rt == net->ipv6.ip6_null_entry) {
1096                 fn = fib6_backtrack(fn, &fl6->saddr);
1097                 if (fn)
1098                         goto redo_rt6_select;
1099                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1100                         /* also consider unreachable route */
1101                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1102                         fn = saved_fn;
1103                         goto redo_rt6_select;
1104                 }
1105         }
1106
1107
1108         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1109                 dst_use(&rt->dst, jiffies);
1110                 read_unlock_bh(&table->tb6_lock);
1111
1112                 rt6_dst_from_metrics_check(rt);
1113
1114                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1115                 return rt;
1116         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1117                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1118                 /* Create a RTF_CACHE clone which will not be
1119                  * owned by the fib6 tree.  It is for the special case where
1120                  * the daddr in the skb during the neighbor look-up is different
1121                  * from the fl6->daddr used to look up the route here.
1122                  */
1123
1124                 struct rt6_info *uncached_rt;
1125
1126                 dst_use(&rt->dst, jiffies);
1127                 read_unlock_bh(&table->tb6_lock);
1128
1129                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1130                 dst_release(&rt->dst);
1131
1132                 if (uncached_rt)
1133                         rt6_uncached_list_add(uncached_rt);
1134                 else
1135                         uncached_rt = net->ipv6.ip6_null_entry;
1136
1137                 dst_hold(&uncached_rt->dst);
1138
1139                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1140                 return uncached_rt;
1141
1142         } else {
1143                 /* Get a percpu copy */
1144
1145                 struct rt6_info *pcpu_rt;
1146
1147                 rt->dst.lastuse = jiffies;
1148                 rt->dst.__use++;
1149                 pcpu_rt = rt6_get_pcpu_route(rt);
1150
1151                 if (pcpu_rt) {
1152                         read_unlock_bh(&table->tb6_lock);
1153                 } else {
1154                         /* We have to do the read_unlock first
1155                          * because rt6_make_pcpu_route() may trigger
1156                          * ip6_dst_gc() which will take the write_lock.
1157                          */
1158                         dst_hold(&rt->dst);
1159                         read_unlock_bh(&table->tb6_lock);
1160                         pcpu_rt = rt6_make_pcpu_route(rt);
1161                         dst_release(&rt->dst);
1162                 }
1163
1164                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1165                 return pcpu_rt;
1166
1167         }
1168 }
1169 EXPORT_SYMBOL_GPL(ip6_pol_route);
1170
1171 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1172                                             struct flowi6 *fl6, int flags)
1173 {
1174         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1175 }
1176
1177 struct dst_entry *ip6_route_input_lookup(struct net *net,
1178                                          struct net_device *dev,
1179                                          struct flowi6 *fl6, int flags)
1180 {
1181         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1182                 flags |= RT6_LOOKUP_F_IFACE;
1183
1184         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1185 }
1186 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1187
1188 void ip6_route_input(struct sk_buff *skb)
1189 {
1190         const struct ipv6hdr *iph = ipv6_hdr(skb);
1191         struct net *net = dev_net(skb->dev);
1192         int flags = RT6_LOOKUP_F_HAS_SADDR;
1193         struct ip_tunnel_info *tun_info;
1194         struct flowi6 fl6 = {
1195                 .flowi6_iif = skb->dev->ifindex,
1196                 .daddr = iph->daddr,
1197                 .saddr = iph->saddr,
1198                 .flowlabel = ip6_flowinfo(iph),
1199                 .flowi6_mark = skb->mark,
1200                 .flowi6_proto = iph->nexthdr,
1201         };
1202
1203         tun_info = skb_tunnel_info(skb);
1204         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1205                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1206         skb_dst_drop(skb);
1207         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1208 }
1209
1210 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1211                                              struct flowi6 *fl6, int flags)
1212 {
1213         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1214 }
1215
1216 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1217                                          struct flowi6 *fl6, int flags)
1218 {
1219         bool any_src;
1220
1221         if (rt6_need_strict(&fl6->daddr)) {
1222                 struct dst_entry *dst;
1223
1224                 dst = l3mdev_link_scope_lookup(net, fl6);
1225                 if (dst)
1226                         return dst;
1227         }
1228
1229         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1230
1231         any_src = ipv6_addr_any(&fl6->saddr);
1232         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1233             (fl6->flowi6_oif && any_src))
1234                 flags |= RT6_LOOKUP_F_IFACE;
1235
1236         if (!any_src)
1237                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1238         else if (sk)
1239                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1240
1241         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1242 }
1243 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1244
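/* Turn a route into a blackhole dst: copy the metrics and addressing from
 * the original, but discard every packet sent through it (typically used
 * when a non-transmitting dst is needed, e.g. by xfrm).
 */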
1245 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1246 {
1247         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1248         struct dst_entry *new = NULL;
1249
1250         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1251         if (rt) {
1252                 rt6_info_init(rt);
1253
1254                 new = &rt->dst;
1255                 new->__use = 1;
1256                 new->input = dst_discard;
1257                 new->output = dst_discard_out;
1258
1259                 dst_copy_metrics(new, &ort->dst);
1260                 rt->rt6i_idev = ort->rt6i_idev;
1261                 if (rt->rt6i_idev)
1262                         in6_dev_hold(rt->rt6i_idev);
1263
1264                 rt->rt6i_gateway = ort->rt6i_gateway;
1265                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1266                 rt->rt6i_metric = 0;
1267
1268                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1269 #ifdef CONFIG_IPV6_SUBTREES
1270                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1271 #endif
1272
1273                 dst_free(new);
1274         }
1275
1276         dst_release(dst_orig);
1277         return new ? new : ERR_PTR(-ENOMEM);
1278 }
1279
1280 /*
1281  *      Destination cache support functions
1282  */
1283
1284 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1285 {
1286         if (rt->dst.from &&
1287             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1288                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1289 }
1290
1291 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1292 {
1293         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1294                 return NULL;
1295
1296         if (rt6_check_expired(rt))
1297                 return NULL;
1298
1299         return &rt->dst;
1300 }
1301
1302 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1303 {
1304         if (!__rt6_check_expired(rt) &&
1305             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1306             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1307                 return &rt->dst;
1308         else
1309                 return NULL;
1310 }
1311
1312 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1313 {
1314         struct rt6_info *rt;
1315
1316         rt = (struct rt6_info *) dst;
1317
1318         /* All IPV6 dsts are created with ->obsolete set to the value
1319          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1320          * into this function always.
1321          */
1322
1323         rt6_dst_from_metrics_check(rt);
1324
1325         if (rt->rt6i_flags & RTF_PCPU ||
1326             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1327                 return rt6_dst_from_check(rt, cookie);
1328         else
1329                 return rt6_check(rt, cookie);
1330 }
1331
1332 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1333 {
1334         struct rt6_info *rt = (struct rt6_info *) dst;
1335
1336         if (rt) {
1337                 if (rt->rt6i_flags & RTF_CACHE) {
1338                         if (rt6_check_expired(rt)) {
1339                                 ip6_del_rt(rt);
1340                                 dst = NULL;
1341                         }
1342                 } else {
1343                         dst_release(dst);
1344                         dst = NULL;
1345                 }
1346         }
1347         return dst;
1348 }
1349
1350 static void ip6_link_failure(struct sk_buff *skb)
1351 {
1352         struct rt6_info *rt;
1353
1354         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1355
1356         rt = (struct rt6_info *) skb_dst(skb);
1357         if (rt) {
1358                 if (rt->rt6i_flags & RTF_CACHE) {
1359                         dst_hold(&rt->dst);
1360                         ip6_del_rt(rt);
1361                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1362                         rt->rt6i_node->fn_sernum = -1;
1363                 }
1364         }
1365 }
1366
1367 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1368 {
1369         struct net *net = dev_net(rt->dst.dev);
1370
1371         rt->rt6i_flags |= RTF_MODIFIED;
1372         rt->rt6i_pmtu = mtu;
1373         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1374 }
1375
1376 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1377 {
1378         return !(rt->rt6i_flags & RTF_CACHE) &&
1379                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1380 }
1381
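/* Record a new path MTU: update the route in place when it cannot take a
 * cached exception, otherwise create an RTF_CACHE clone for this
 * destination so the stale socket dst gets invalidated by rt6_check().
 */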
1382 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1383                                  const struct ipv6hdr *iph, u32 mtu)
1384 {
1385         const struct in6_addr *daddr, *saddr;
1386         struct rt6_info *rt6 = (struct rt6_info *)dst;
1387
1388         if (rt6->rt6i_flags & RTF_LOCAL)
1389                 return;
1390
1391         if (dst_metric_locked(dst, RTAX_MTU))
1392                 return;
1393
1394         if (iph) {
1395                 daddr = &iph->daddr;
1396                 saddr = &iph->saddr;
1397         } else if (sk) {
1398                 daddr = &sk->sk_v6_daddr;
1399                 saddr = &inet6_sk(sk)->saddr;
1400         } else {
1401                 daddr = NULL;
1402                 saddr = NULL;
1403         }
1404         dst_confirm_neigh(dst, daddr);
1405         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1406         if (mtu >= dst_mtu(dst))
1407                 return;
1408
1409         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1410                 rt6_do_update_pmtu(rt6, mtu);
1411         } else if (daddr) {
1412                 struct rt6_info *nrt6;
1413
1414                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1415                 if (nrt6) {
1416                         rt6_do_update_pmtu(nrt6, mtu);
1417
1418                         /* ip6_ins_rt(nrt6) will bump the
1419                          * rt6->rt6i_node->fn_sernum
1420                          * which will fail the next rt6_check() and
1421                          * invalidate the sk->sk_dst_cache.
1422                          */
1423                         ip6_ins_rt(nrt6);
1424                 }
1425         }
1426 }
1427
1428 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1429                                struct sk_buff *skb, u32 mtu)
1430 {
1431         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1432 }
1433
1434 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1435                      int oif, u32 mark, kuid_t uid)
1436 {
1437         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1438         struct dst_entry *dst;
1439         struct flowi6 fl6;
1440
1441         memset(&fl6, 0, sizeof(fl6));
1442         fl6.flowi6_oif = oif;
1443         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1444         fl6.daddr = iph->daddr;
1445         fl6.saddr = iph->saddr;
1446         fl6.flowlabel = ip6_flowinfo(iph);
1447         fl6.flowi6_uid = uid;
1448
1449         dst = ip6_route_output(net, NULL, &fl6);
1450         if (!dst->error)
1451                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1452         dst_release(dst);
1453 }
1454 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1455
1456 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1457 {
1458         struct dst_entry *dst;
1459
1460         ip6_update_pmtu(skb, sock_net(sk), mtu,
1461                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1462
1463         dst = __sk_dst_get(sk);
1464         if (!dst || !dst->obsolete ||
1465             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1466                 return;
1467
1468         bh_lock_sock(sk);
1469         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1470                 ip6_datagram_dst_update(sk, false);
1471         bh_unlock_sock(sk);
1472 }
1473 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1474
1475 /* Handle redirects */
1476 struct ip6rd_flowi {
1477         struct flowi6 fl6;
1478         struct in6_addr gateway;
1479 };
1480
1481 static struct rt6_info *__ip6_route_redirect(struct net *net,
1482                                              struct fib6_table *table,
1483                                              struct flowi6 *fl6,
1484                                              int flags)
1485 {
1486         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1487         struct rt6_info *rt;
1488         struct fib6_node *fn;
1489
1490         /* Get the "current" route for this destination and
1491          * check that the redirect has come from the appropriate router.
1492          *
1493          * RFC 4861 specifies that redirects should only be
1494          * accepted if they come from the nexthop to the target.
1495          * Due to the way the routes are chosen, this notion
1496          * is a bit fuzzy and one might need to check all possible
1497          * routes.
1498          */
1499
1500         read_lock_bh(&table->tb6_lock);
1501         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1502 restart:
1503         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1504                 if (rt6_check_expired(rt))
1505                         continue;
1506                 if (rt->dst.error)
1507                         break;
1508                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1509                         continue;
1510                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1511                         continue;
1512                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1513                         continue;
1514                 break;
1515         }
1516
1517         if (!rt)
1518                 rt = net->ipv6.ip6_null_entry;
1519         else if (rt->dst.error) {
1520                 rt = net->ipv6.ip6_null_entry;
1521                 goto out;
1522         }
1523
1524         if (rt == net->ipv6.ip6_null_entry) {
1525                 fn = fib6_backtrack(fn, &fl6->saddr);
1526                 if (fn)
1527                         goto restart;
1528         }
1529
1530 out:
1531         dst_hold(&rt->dst);
1532
1533         read_unlock_bh(&table->tb6_lock);
1534
1535         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1536         return rt;
1537 };
1538
1539 static struct dst_entry *ip6_route_redirect(struct net *net,
1540                                         const struct flowi6 *fl6,
1541                                         const struct in6_addr *gateway)
1542 {
1543         int flags = RT6_LOOKUP_F_HAS_SADDR;
1544         struct ip6rd_flowi rdfl;
1545
1546         rdfl.fl6 = *fl6;
1547         rdfl.gateway = *gateway;
1548
1549         return fib6_rule_lookup(net, &rdfl.fl6,
1550                                 flags, __ip6_route_redirect);
1551 }
1552
1553 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1554                   kuid_t uid)
1555 {
1556         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1557         struct dst_entry *dst;
1558         struct flowi6 fl6;
1559
1560         memset(&fl6, 0, sizeof(fl6));
1561         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1562         fl6.flowi6_oif = oif;
1563         fl6.flowi6_mark = mark;
1564         fl6.daddr = iph->daddr;
1565         fl6.saddr = iph->saddr;
1566         fl6.flowlabel = ip6_flowinfo(iph);
1567         fl6.flowi6_uid = uid;
1568
1569         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1570         rt6_do_redirect(dst, NULL, skb);
1571         dst_release(dst);
1572 }
1573 EXPORT_SYMBOL_GPL(ip6_redirect);
1574
1575 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1576                             u32 mark)
1577 {
1578         const struct ipv6hdr *iph = ipv6_hdr(skb);
1579         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1580         struct dst_entry *dst;
1581         struct flowi6 fl6;
1582
1583         memset(&fl6, 0, sizeof(fl6));
1584         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1585         fl6.flowi6_oif = oif;
1586         fl6.flowi6_mark = mark;
1587         fl6.daddr = msg->dest;
1588         fl6.saddr = iph->daddr;
1589         fl6.flowi6_uid = sock_net_uid(net, NULL);
1590
1591         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1592         rt6_do_redirect(dst, NULL, skb);
1593         dst_release(dst);
1594 }
1595
1596 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1597 {
1598         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1599                      sk->sk_uid);
1600 }
1601 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1602
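/* Default advertised MSS: the dst MTU minus the IPv6 and TCP header sizes,
 * never below ip6_rt_min_advmss; IPV6_MAXPLEN means "any MSS, rely on
 * PMTU discovery only".
 */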
1603 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1604 {
1605         struct net_device *dev = dst->dev;
1606         unsigned int mtu = dst_mtu(dst);
1607         struct net *net = dev_net(dev);
1608
1609         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1610
1611         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1612                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1613
1614         /*
1615          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1616          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1617          * IPV6_MAXPLEN is also valid and means: "any MSS,
1618          * rely only on pmtu discovery"
1619          */
1620         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1621                 mtu = IPV6_MAXPLEN;
1622         return mtu;
1623 }
1624
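/* Effective MTU of the route: a cached PMTU if set, else the RTAX_MTU
 * metric, else the device MTU; capped at IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */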
1625 static unsigned int ip6_mtu(const struct dst_entry *dst)
1626 {
1627         const struct rt6_info *rt = (const struct rt6_info *)dst;
1628         unsigned int mtu = rt->rt6i_pmtu;
1629         struct inet6_dev *idev;
1630
1631         if (mtu)
1632                 goto out;
1633
1634         mtu = dst_metric_raw(dst, RTAX_MTU);
1635         if (mtu)
1636                 goto out;
1637
1638         mtu = IPV6_MIN_MTU;
1639
1640         rcu_read_lock();
1641         idev = __in6_dev_get(dst->dev);
1642         if (idev)
1643                 mtu = idev->cnf.mtu6;
1644         rcu_read_unlock();
1645
1646 out:
1647         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1648
1649         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1650 }
1651
1652 static struct dst_entry *icmp6_dst_gc_list;
1653 static DEFINE_SPINLOCK(icmp6_dst_lock);
1654
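/* Allocate a dst entry for sending an ICMPv6 packet to fl6->daddr.  The
 * entry is linked on icmp6_dst_gc_list and reclaimed by icmp6_dst_gc()
 * once its refcount drops to zero.
 */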
1655 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1656                                   struct flowi6 *fl6)
1657 {
1658         struct dst_entry *dst;
1659         struct rt6_info *rt;
1660         struct inet6_dev *idev = in6_dev_get(dev);
1661         struct net *net = dev_net(dev);
1662
1663         if (unlikely(!idev))
1664                 return ERR_PTR(-ENODEV);
1665
1666         rt = ip6_dst_alloc(net, dev, 0);
1667         if (unlikely(!rt)) {
1668                 in6_dev_put(idev);
1669                 dst = ERR_PTR(-ENOMEM);
1670                 goto out;
1671         }
1672
1673         rt->dst.flags |= DST_HOST;
1674         rt->dst.output  = ip6_output;
1675         atomic_set(&rt->dst.__refcnt, 1);
1676         rt->rt6i_gateway  = fl6->daddr;
1677         rt->rt6i_dst.addr = fl6->daddr;
1678         rt->rt6i_dst.plen = 128;
1679         rt->rt6i_idev     = idev;
1680         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1681
1682         spin_lock_bh(&icmp6_dst_lock);
1683         rt->dst.next = icmp6_dst_gc_list;
1684         icmp6_dst_gc_list = &rt->dst;
1685         spin_unlock_bh(&icmp6_dst_lock);
1686
1687         fib6_force_start_gc(net);
1688
1689         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1690
1691 out:
1692         return dst;
1693 }
1694
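/* Reclaim unreferenced entries from icmp6_dst_gc_list; returns the number
 * of entries that are still in use.
 */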
1695 int icmp6_dst_gc(void)
1696 {
1697         struct dst_entry *dst, **pprev;
1698         int more = 0;
1699
1700         spin_lock_bh(&icmp6_dst_lock);
1701         pprev = &icmp6_dst_gc_list;
1702
1703         while ((dst = *pprev) != NULL) {
1704                 if (!atomic_read(&dst->__refcnt)) {
1705                         *pprev = dst->next;
1706                         dst_free(dst);
1707                 } else {
1708                         pprev = &dst->next;
1709                         ++more;
1710                 }
1711         }
1712
1713         spin_unlock_bh(&icmp6_dst_lock);
1714
1715         return more;
1716 }
1717
1718 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1719                             void *arg)
1720 {
1721         struct dst_entry *dst, **pprev;
1722
1723         spin_lock_bh(&icmp6_dst_lock);
1724         pprev = &icmp6_dst_gc_list;
1725         while ((dst = *pprev) != NULL) {
1726                 struct rt6_info *rt = (struct rt6_info *) dst;
1727                 if (func(rt, arg)) {
1728                         *pprev = dst->next;
1729                         dst_free(dst);
1730                 } else {
1731                         pprev = &dst->next;
1732                 }
1733         }
1734         spin_unlock_bh(&icmp6_dst_lock);
1735 }
1736
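/* dst_ops garbage collector: run the fib6 GC when the minimum GC interval
 * has elapsed or the number of cached entries exceeds ip6_rt_max_size.
 */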
1737 static int ip6_dst_gc(struct dst_ops *ops)
1738 {
1739         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1740         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1741         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1742         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1743         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1744         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1745         int entries;
1746
1747         entries = dst_entries_get_fast(ops);
1748         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1749             entries <= rt_max_size)
1750                 goto out;
1751
1752         net->ipv6.ip6_rt_gc_expire++;
1753         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1754         entries = dst_entries_get_slow(ops);
1755         if (entries < ops->gc_thresh)
1756                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1757 out:
1758         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1759         return entries > rt_max_size;
1760 }
1761
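/* Convert the RTA_METRICS attributes in cfg->fc_mx into an RTAX_* metrics
 * array in mxc; the caller is responsible for freeing mxc->mx.
 */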
1762 static int ip6_convert_metrics(struct mx6_config *mxc,
1763                                const struct fib6_config *cfg)
1764 {
1765         bool ecn_ca = false;
1766         struct nlattr *nla;
1767         int remaining;
1768         u32 *mp;
1769
1770         if (!cfg->fc_mx)
1771                 return 0;
1772
1773         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1774         if (unlikely(!mp))
1775                 return -ENOMEM;
1776
1777         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1778                 int type = nla_type(nla);
1779                 u32 val;
1780
1781                 if (!type)
1782                         continue;
1783                 if (unlikely(type > RTAX_MAX))
1784                         goto err;
1785
1786                 if (type == RTAX_CC_ALGO) {
1787                         char tmp[TCP_CA_NAME_MAX];
1788
1789                         nla_strlcpy(tmp, nla, sizeof(tmp));
1790                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1791                         if (val == TCP_CA_UNSPEC)
1792                                 goto err;
1793                 } else {
1794                         val = nla_get_u32(nla);
1795                 }
1796                 if (type == RTAX_HOPLIMIT && val > 255)
1797                         val = 255;
1798                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1799                         goto err;
1800
1801                 mp[type - 1] = val;
1802                 __set_bit(type - 1, mxc->mx_valid);
1803         }
1804
1805         if (ecn_ca) {
1806                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1807                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1808         }
1809
1810         mxc->mx = mp;
1811         return 0;
1812  err:
1813         kfree(mp);
1814         return -EINVAL;
1815 }
1816
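/* Look up the nexthop gateway in the table named by cfg->fc_table only;
 * returns NULL if that table does not exist or the lookup falls through to
 * the null entry.
 */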
1817 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1818                                             struct fib6_config *cfg,
1819                                             const struct in6_addr *gw_addr)
1820 {
1821         struct flowi6 fl6 = {
1822                 .flowi6_oif = cfg->fc_ifindex,
1823                 .daddr = *gw_addr,
1824                 .saddr = cfg->fc_prefsrc,
1825         };
1826         struct fib6_table *table;
1827         struct rt6_info *rt;
1828         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1829
1830         table = fib6_get_table(net, cfg->fc_table);
1831         if (!table)
1832                 return NULL;
1833
1834         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1835                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1836
1837         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1838
1839         /* if table lookup failed, fall back to full lookup */
1840         if (rt == net->ipv6.ip6_null_entry) {
1841                 ip6_rt_put(rt);
1842                 rt = NULL;
1843         }
1844
1845         return rt;
1846 }
1847
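/* Validate a fib6_config and construct the corresponding rt6_info;
 * insertion into the FIB is left to the caller.
 */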
1848 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1849                                               struct netlink_ext_ack *extack)
1850 {
1851         struct net *net = cfg->fc_nlinfo.nl_net;
1852         struct rt6_info *rt = NULL;
1853         struct net_device *dev = NULL;
1854         struct inet6_dev *idev = NULL;
1855         struct fib6_table *table;
1856         int addr_type;
1857         int err = -EINVAL;
1858
1859         /* RTF_PCPU is an internal flag; cannot be set by userspace */
1860         if (cfg->fc_flags & RTF_PCPU) {
1861                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1862                 goto out;
1863         }
1864
1865         if (cfg->fc_dst_len > 128) {
1866                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1867                 goto out;
1868         }
1869         if (cfg->fc_src_len > 128) {
1870                 NL_SET_ERR_MSG(extack, "Invalid source address length");
1871                 goto out;
1872         }
1873 #ifndef CONFIG_IPV6_SUBTREES
1874         if (cfg->fc_src_len) {
1875                 NL_SET_ERR_MSG(extack,
1876                                "Specifying source address requires IPV6_SUBTREES to be enabled");
1877                 goto out;
1878         }
1879 #endif
1880         if (cfg->fc_ifindex) {
1881                 err = -ENODEV;
1882                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1883                 if (!dev)
1884                         goto out;
1885                 idev = in6_dev_get(dev);
1886                 if (!idev)
1887                         goto out;
1888         }
1889
1890         if (cfg->fc_metric == 0)
1891                 cfg->fc_metric = IP6_RT_PRIO_USER;
1892
1893         err = -ENOBUFS;
1894         if (cfg->fc_nlinfo.nlh &&
1895             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1896                 table = fib6_get_table(net, cfg->fc_table);
1897                 if (!table) {
1898                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1899                         table = fib6_new_table(net, cfg->fc_table);
1900                 }
1901         } else {
1902                 table = fib6_new_table(net, cfg->fc_table);
1903         }
1904
1905         if (!table)
1906                 goto out;
1907
1908         rt = ip6_dst_alloc(net, NULL,
1909                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1910
1911         if (!rt) {
1912                 err = -ENOMEM;
1913                 goto out;
1914         }
1915
1916         if (cfg->fc_flags & RTF_EXPIRES)
1917                 rt6_set_expires(rt, jiffies +
1918                                 clock_t_to_jiffies(cfg->fc_expires));
1919         else
1920                 rt6_clean_expires(rt);
1921
1922         if (cfg->fc_protocol == RTPROT_UNSPEC)
1923                 cfg->fc_protocol = RTPROT_BOOT;
1924         rt->rt6i_protocol = cfg->fc_protocol;
1925
1926         addr_type = ipv6_addr_type(&cfg->fc_dst);
1927
1928         if (addr_type & IPV6_ADDR_MULTICAST)
1929                 rt->dst.input = ip6_mc_input;
1930         else if (cfg->fc_flags & RTF_LOCAL)
1931                 rt->dst.input = ip6_input;
1932         else
1933                 rt->dst.input = ip6_forward;
1934
1935         rt->dst.output = ip6_output;
1936
1937         if (cfg->fc_encap) {
1938                 struct lwtunnel_state *lwtstate;
1939
1940                 err = lwtunnel_build_state(cfg->fc_encap_type,
1941                                            cfg->fc_encap, AF_INET6, cfg,
1942                                            &lwtstate, extack);
1943                 if (err)
1944                         goto out;
1945                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1946                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1947                         rt->dst.lwtstate->orig_output = rt->dst.output;
1948                         rt->dst.output = lwtunnel_output;
1949                 }
1950                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1951                         rt->dst.lwtstate->orig_input = rt->dst.input;
1952                         rt->dst.input = lwtunnel_input;
1953                 }
1954         }
1955
1956         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1957         rt->rt6i_dst.plen = cfg->fc_dst_len;
1958         if (rt->rt6i_dst.plen == 128)
1959                 rt->dst.flags |= DST_HOST;
1960
1961 #ifdef CONFIG_IPV6_SUBTREES
1962         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1963         rt->rt6i_src.plen = cfg->fc_src_len;
1964 #endif
1965
1966         rt->rt6i_metric = cfg->fc_metric;
1967
1968         /* We cannot add true routes via loopback here;
1969            they would result in kernel looping, so promote them to reject routes.
1970          */
1971         if ((cfg->fc_flags & RTF_REJECT) ||
1972             (dev && (dev->flags & IFF_LOOPBACK) &&
1973              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1974              !(cfg->fc_flags & RTF_LOCAL))) {
1975                 /* hold loopback dev/idev if we haven't done so. */
1976                 if (dev != net->loopback_dev) {
1977                         if (dev) {
1978                                 dev_put(dev);
1979                                 in6_dev_put(idev);
1980                         }
1981                         dev = net->loopback_dev;
1982                         dev_hold(dev);
1983                         idev = in6_dev_get(dev);
1984                         if (!idev) {
1985                                 err = -ENODEV;
1986                                 goto out;
1987                         }
1988                 }
1989                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1990                 switch (cfg->fc_type) {
1991                 case RTN_BLACKHOLE:
1992                         rt->dst.error = -EINVAL;
1993                         rt->dst.output = dst_discard_out;
1994                         rt->dst.input = dst_discard;
1995                         break;
1996                 case RTN_PROHIBIT:
1997                         rt->dst.error = -EACCES;
1998                         rt->dst.output = ip6_pkt_prohibit_out;
1999                         rt->dst.input = ip6_pkt_prohibit;
2000                         break;
2001                 case RTN_THROW:
2002                 case RTN_UNREACHABLE:
2003                 default:
2004                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2005                                         : (cfg->fc_type == RTN_UNREACHABLE)
2006                                         ? -EHOSTUNREACH : -ENETUNREACH;
2007                         rt->dst.output = ip6_pkt_discard_out;
2008                         rt->dst.input = ip6_pkt_discard;
2009                         break;
2010                 }
2011                 goto install_route;
2012         }
2013
2014         if (cfg->fc_flags & RTF_GATEWAY) {
2015                 const struct in6_addr *gw_addr;
2016                 int gwa_type;
2017
2018                 gw_addr = &cfg->fc_gateway;
2019                 gwa_type = ipv6_addr_type(gw_addr);
2020
2021                 /* If gw_addr is local we may fail to detect that here when the
2022                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
2023                  * will return the already-added prefix route via the interface
2024                  * the prefix route was assigned to, which might be non-loopback.
2025                  */
2026                 err = -EINVAL;
2027                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2028                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2029                                             dev : NULL, 0, 0)) {
2030                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2031                         goto out;
2032                 }
2033                 rt->rt6i_gateway = *gw_addr;
2034
2035                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2036                         struct rt6_info *grt = NULL;
2037
2038                         /* IPv6 strictly inhibits using non-link-local
2039                            addresses as nexthop addresses.
2040                            Otherwise, the router will not be able to send redirects.
2041                            That is generally good, but in some (rare!) circumstances
2042                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2043                            some exceptions. --ANK
2044                            We allow IPv4-mapped nexthops to support RFC 4798-style
2045                            addressing.
2046                          */
2047                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2048                                           IPV6_ADDR_MAPPED))) {
2049                                 NL_SET_ERR_MSG(extack,
2050                                                "Invalid gateway address");
2051                                 goto out;
2052                         }
2053
2054                         if (cfg->fc_table) {
2055                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2056
2057                                 if (grt) {
2058                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2059                                             (dev && dev != grt->dst.dev)) {
2060                                                 ip6_rt_put(grt);
2061                                                 grt = NULL;
2062                                         }
2063                                 }
2064                         }
2065
2066                         if (!grt)
2067                                 grt = rt6_lookup(net, gw_addr, NULL,
2068                                                  cfg->fc_ifindex, 1);
2069
2070                         err = -EHOSTUNREACH;
2071                         if (!grt)
2072                                 goto out;
2073                         if (dev) {
2074                                 if (dev != grt->dst.dev) {
2075                                         ip6_rt_put(grt);
2076                                         goto out;
2077                                 }
2078                         } else {
2079                                 dev = grt->dst.dev;
2080                                 idev = grt->rt6i_idev;
2081                                 dev_hold(dev);
2082                                 in6_dev_hold(grt->rt6i_idev);
2083                         }
2084                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2085                                 err = 0;
2086                         ip6_rt_put(grt);
2087
2088                         if (err)
2089                                 goto out;
2090                 }
2091                 err = -EINVAL;
2092                 if (!dev) {
2093                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2094                         goto out;
2095                 } else if (dev->flags & IFF_LOOPBACK) {
2096                         NL_SET_ERR_MSG(extack,
2097                                        "Egress device can not be loopback device for this route");
2098                         goto out;
2099                 }
2100         }
2101
2102         err = -ENODEV;
2103         if (!dev)
2104                 goto out;
2105
2106         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2107                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2108                         NL_SET_ERR_MSG(extack, "Invalid source address");
2109                         err = -EINVAL;
2110                         goto out;
2111                 }
2112                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2113                 rt->rt6i_prefsrc.plen = 128;
2114         } else
2115                 rt->rt6i_prefsrc.plen = 0;
2116
2117         rt->rt6i_flags = cfg->fc_flags;
2118
2119 install_route:
2120         rt->dst.dev = dev;
2121         rt->rt6i_idev = idev;
2122         rt->rt6i_table = table;
2123
2124         cfg->fc_nlinfo.nl_net = dev_net(dev);
2125
2126         return rt;
2127 out:
2128         if (dev)
2129                 dev_put(dev);
2130         if (idev)
2131                 in6_dev_put(idev);
2132         if (rt)
2133                 dst_free(&rt->dst);
2134
2135         return ERR_PTR(err);
2136 }
2137
2138 int ip6_route_add(struct fib6_config *cfg,
2139                   struct netlink_ext_ack *extack)
2140 {
2141         struct mx6_config mxc = { .mx = NULL, };
2142         struct rt6_info *rt;
2143         int err;
2144
2145         rt = ip6_route_info_create(cfg, extack);
2146         if (IS_ERR(rt)) {
2147                 err = PTR_ERR(rt);
2148                 rt = NULL;
2149                 goto out;
2150         }
2151
2152         err = ip6_convert_metrics(&mxc, cfg);
2153         if (err)
2154                 goto out;
2155
2156         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2157
2158         kfree(mxc.mx);
2159
2160         return err;
2161 out:
2162         if (rt)
2163                 dst_free(&rt->dst);
2164
2165         return err;
2166 }
2167
2168 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2169 {
2170         int err;
2171         struct fib6_table *table;
2172         struct net *net = dev_net(rt->dst.dev);
2173
2174         if (rt == net->ipv6.ip6_null_entry ||
2175             rt->dst.flags & DST_NOCACHE) {
2176                 err = -ENOENT;
2177                 goto out;
2178         }
2179
2180         table = rt->rt6i_table;
2181         write_lock_bh(&table->tb6_lock);
2182         err = fib6_del(rt, info);
2183         write_unlock_bh(&table->tb6_lock);
2184
2185 out:
2186         ip6_rt_put(rt);
2187         return err;
2188 }
2189
2190 int ip6_del_rt(struct rt6_info *rt)
2191 {
2192         struct nl_info info = {
2193                 .nl_net = dev_net(rt->dst.dev),
2194         };
2195         return __ip6_del_rt(rt, &info);
2196 }
2197
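/* Delete a multipath route: when fc_delete_all_nh is set, remove every
 * sibling nexthop as well and send a single RTM_DELROUTE notification
 * that covers all hops.
 */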
2198 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2199 {
2200         struct nl_info *info = &cfg->fc_nlinfo;
2201         struct net *net = info->nl_net;
2202         struct sk_buff *skb = NULL;
2203         struct fib6_table *table;
2204         int err = -ENOENT;
2205
2206         if (rt == net->ipv6.ip6_null_entry)
2207                 goto out_put;
2208         table = rt->rt6i_table;
2209         write_lock_bh(&table->tb6_lock);
2210
2211         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2212                 struct rt6_info *sibling, *next_sibling;
2213
2214                 /* prefer to send a single notification with all hops */
2215                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2216                 if (skb) {
2217                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2218
2219                         if (rt6_fill_node(net, skb, rt,
2220                                           NULL, NULL, 0, RTM_DELROUTE,
2221                                           info->portid, seq, 0) < 0) {
2222                                 kfree_skb(skb);
2223                                 skb = NULL;
2224                         } else
2225                                 info->skip_notify = 1;
2226                 }
2227
2228                 list_for_each_entry_safe(sibling, next_sibling,
2229                                          &rt->rt6i_siblings,
2230                                          rt6i_siblings) {
2231                         err = fib6_del(sibling, info);
2232                         if (err)
2233                                 goto out_unlock;
2234                 }
2235         }
2236
2237         err = fib6_del(rt, info);
2238 out_unlock:
2239         write_unlock_bh(&table->tb6_lock);
2240 out_put:
2241         ip6_rt_put(rt);
2242
2243         if (skb) {
2244                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2245                             info->nlh, gfp_any());
2246         }
2247         return err;
2248 }
2249
2250 static int ip6_route_del(struct fib6_config *cfg,
2251                          struct netlink_ext_ack *extack)
2252 {
2253         struct fib6_table *table;
2254         struct fib6_node *fn;
2255         struct rt6_info *rt;
2256         int err = -ESRCH;
2257
2258         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2259         if (!table) {
2260                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2261                 return err;
2262         }
2263
2264         read_lock_bh(&table->tb6_lock);
2265
2266         fn = fib6_locate(&table->tb6_root,
2267                          &cfg->fc_dst, cfg->fc_dst_len,
2268                          &cfg->fc_src, cfg->fc_src_len);
2269
2270         if (fn) {
2271                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2272                         if ((rt->rt6i_flags & RTF_CACHE) &&
2273                             !(cfg->fc_flags & RTF_CACHE))
2274                                 continue;
2275                         if (cfg->fc_ifindex &&
2276                             (!rt->dst.dev ||
2277                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2278                                 continue;
2279                         if (cfg->fc_flags & RTF_GATEWAY &&
2280                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2281                                 continue;
2282                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2283                                 continue;
2284                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2285                                 continue;
2286                         dst_hold(&rt->dst);
2287                         read_unlock_bh(&table->tb6_lock);
2288
2289                         /* if a gateway was specified, only delete the one hop */
2290                         if (cfg->fc_flags & RTF_GATEWAY)
2291                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2292
2293                         return __ip6_del_rt_siblings(rt, cfg);
2294                 }
2295         }
2296         read_unlock_bh(&table->tb6_lock);
2297
2298         return err;
2299 }
2300
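/* Process an ICMPv6 Redirect: sanity-check the message, update the
 * neighbour cache for the target and install an RTF_CACHE clone pointing
 * at the new nexthop.
 */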
2301 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2302 {
2303         struct netevent_redirect netevent;
2304         struct rt6_info *rt, *nrt = NULL;
2305         struct ndisc_options ndopts;
2306         struct inet6_dev *in6_dev;
2307         struct neighbour *neigh;
2308         struct rd_msg *msg;
2309         int optlen, on_link;
2310         u8 *lladdr;
2311
2312         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2313         optlen -= sizeof(*msg);
2314
2315         if (optlen < 0) {
2316                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2317                 return;
2318         }
2319
2320         msg = (struct rd_msg *)icmp6_hdr(skb);
2321
2322         if (ipv6_addr_is_multicast(&msg->dest)) {
2323                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2324                 return;
2325         }
2326
2327         on_link = 0;
2328         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2329                 on_link = 1;
2330         } else if (ipv6_addr_type(&msg->target) !=
2331                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2332                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2333                 return;
2334         }
2335
2336         in6_dev = __in6_dev_get(skb->dev);
2337         if (!in6_dev)
2338                 return;
2339         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2340                 return;
2341
2342         /* RFC2461 8.1:
2343          *      The IP source address of the Redirect MUST be the same as the current
2344          *      first-hop router for the specified ICMP Destination Address.
2345          */
2346
2347         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2348                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2349                 return;
2350         }
2351
2352         lladdr = NULL;
2353         if (ndopts.nd_opts_tgt_lladdr) {
2354                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2355                                              skb->dev);
2356                 if (!lladdr) {
2357                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2358                         return;
2359                 }
2360         }
2361
2362         rt = (struct rt6_info *) dst;
2363         if (rt->rt6i_flags & RTF_REJECT) {
2364                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2365                 return;
2366         }
2367
2368         /* Redirect received -> path was valid.
2369          * Redirects are sent only in response to data packets,
2370          * so this nexthop is apparently reachable. --ANK
2371          */
2372         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2373
2374         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2375         if (!neigh)
2376                 return;
2377
2378         /*
2379          *      We have finally decided to accept it.
2380          */
2381
2382         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2383                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2384                      NEIGH_UPDATE_F_OVERRIDE|
2385                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2386                                      NEIGH_UPDATE_F_ISROUTER)),
2387                      NDISC_REDIRECT, &ndopts);
2388
2389         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2390         if (!nrt)
2391                 goto out;
2392
2393         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2394         if (on_link)
2395                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2396
2397         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2398
2399         if (ip6_ins_rt(nrt))
2400                 goto out;
2401
2402         netevent.old = &rt->dst;
2403         netevent.new = &nrt->dst;
2404         netevent.daddr = &msg->dest;
2405         netevent.neigh = neigh;
2406         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2407
2408         if (rt->rt6i_flags & RTF_CACHE) {
2409                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2410                 ip6_del_rt(rt);
2411         }
2412
2413 out:
2414         neigh_release(neigh);
2415 }
2416
2417 /*
2418  *      Misc support functions
2419  */
2420
2421 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2422 {
2423         BUG_ON(from->dst.from);
2424
2425         rt->rt6i_flags &= ~RTF_EXPIRES;
2426         dst_hold(&from->dst);
2427         rt->dst.from = &from->dst;
2428         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2429 }
2430
2431 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2432 {
2433         rt->dst.input = ort->dst.input;
2434         rt->dst.output = ort->dst.output;
2435         rt->rt6i_dst = ort->rt6i_dst;
2436         rt->dst.error = ort->dst.error;
2437         rt->rt6i_idev = ort->rt6i_idev;
2438         if (rt->rt6i_idev)
2439                 in6_dev_hold(rt->rt6i_idev);
2440         rt->dst.lastuse = jiffies;
2441         rt->rt6i_gateway = ort->rt6i_gateway;
2442         rt->rt6i_flags = ort->rt6i_flags;
2443         rt6_set_from(rt, ort);
2444         rt->rt6i_metric = ort->rt6i_metric;
2445 #ifdef CONFIG_IPV6_SUBTREES
2446         rt->rt6i_src = ort->rt6i_src;
2447 #endif
2448         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2449         rt->rt6i_table = ort->rt6i_table;
2450         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2451 }
2452
2453 #ifdef CONFIG_IPV6_ROUTE_INFO
2454 static struct rt6_info *rt6_get_route_info(struct net *net,
2455                                            const struct in6_addr *prefix, int prefixlen,
2456                                            const struct in6_addr *gwaddr,
2457                                            struct net_device *dev)
2458 {
2459         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2460         int ifindex = dev->ifindex;
2461         struct fib6_node *fn;
2462         struct rt6_info *rt = NULL;
2463         struct fib6_table *table;
2464
2465         table = fib6_get_table(net, tb_id);
2466         if (!table)
2467                 return NULL;
2468
2469         read_lock_bh(&table->tb6_lock);
2470         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2471         if (!fn)
2472                 goto out;
2473
2474         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2475                 if (rt->dst.dev->ifindex != ifindex)
2476                         continue;
2477                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2478                         continue;
2479                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2480                         continue;
2481                 dst_hold(&rt->dst);
2482                 break;
2483         }
2484 out:
2485         read_unlock_bh(&table->tb6_lock);
2486         return rt;
2487 }
2488
2489 static struct rt6_info *rt6_add_route_info(struct net *net,
2490                                            const struct in6_addr *prefix, int prefixlen,
2491                                            const struct in6_addr *gwaddr,
2492                                            struct net_device *dev,
2493                                            unsigned int pref)
2494 {
2495         struct fib6_config cfg = {
2496                 .fc_metric      = IP6_RT_PRIO_USER,
2497                 .fc_ifindex     = dev->ifindex,
2498                 .fc_dst_len     = prefixlen,
2499                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2500                                   RTF_UP | RTF_PREF(pref),
2501                 .fc_nlinfo.portid = 0,
2502                 .fc_nlinfo.nlh = NULL,
2503                 .fc_nlinfo.nl_net = net,
2504         };
2505
2506         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2507         cfg.fc_dst = *prefix;
2508         cfg.fc_gateway = *gwaddr;
2509
2510         /* We should treat it as a default route if prefix length is 0. */
2511         if (!prefixlen)
2512                 cfg.fc_flags |= RTF_DEFAULT;
2513
2514         ip6_route_add(&cfg, NULL);
2515
2516         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2517 }
2518 #endif
2519
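/* Find the RTF_ADDRCONF|RTF_DEFAULT route through @addr on @dev, if one is
 * installed in the device's default-route table.
 */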
2520 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2521 {
2522         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2523         struct rt6_info *rt;
2524         struct fib6_table *table;
2525
2526         table = fib6_get_table(dev_net(dev), tb_id);
2527         if (!table)
2528                 return NULL;
2529
2530         read_lock_bh(&table->tb6_lock);
2531         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2532                 if (dev == rt->dst.dev &&
2533                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2534                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2535                         break;
2536         }
2537         if (rt)
2538                 dst_hold(&rt->dst);
2539         read_unlock_bh(&table->tb6_lock);
2540         return rt;
2541 }
2542
2543 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2544                                      struct net_device *dev,
2545                                      unsigned int pref)
2546 {
2547         struct fib6_config cfg = {
2548                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2549                 .fc_metric      = IP6_RT_PRIO_USER,
2550                 .fc_ifindex     = dev->ifindex,
2551                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2552                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2553                 .fc_nlinfo.portid = 0,
2554                 .fc_nlinfo.nlh = NULL,
2555                 .fc_nlinfo.nl_net = dev_net(dev),
2556         };
2557
2558         cfg.fc_gateway = *gwaddr;
2559
2560         if (!ip6_route_add(&cfg, NULL)) {
2561                 struct fib6_table *table;
2562
2563                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2564                 if (table)
2565                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2566         }
2567
2568         return rt6_get_dflt_router(gwaddr, dev);
2569 }
2570
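/* Remove addrconf/default routes from @table, except on interfaces
 * configured with accept_ra == 2.
 */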
2571 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2572 {
2573         struct rt6_info *rt;
2574
2575 restart:
2576         read_lock_bh(&table->tb6_lock);
2577         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2578                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2579                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2580                         dst_hold(&rt->dst);
2581                         read_unlock_bh(&table->tb6_lock);
2582                         ip6_del_rt(rt);
2583                         goto restart;
2584                 }
2585         }
2586         read_unlock_bh(&table->tb6_lock);
2587
2588         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2589 }
2590
2591 void rt6_purge_dflt_routers(struct net *net)
2592 {
2593         struct fib6_table *table;
2594         struct hlist_head *head;
2595         unsigned int h;
2596
2597         rcu_read_lock();
2598
2599         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2600                 head = &net->ipv6.fib_table_hash[h];
2601                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2602                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2603                                 __rt6_purge_dflt_routers(table);
2604                 }
2605         }
2606
2607         rcu_read_unlock();
2608 }
2609
2610 static void rtmsg_to_fib6_config(struct net *net,
2611                                  struct in6_rtmsg *rtmsg,
2612                                  struct fib6_config *cfg)
2613 {
2614         memset(cfg, 0, sizeof(*cfg));
2615
2616         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2617                          : RT6_TABLE_MAIN;
2618         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2619         cfg->fc_metric = rtmsg->rtmsg_metric;
2620         cfg->fc_expires = rtmsg->rtmsg_info;
2621         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2622         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2623         cfg->fc_flags = rtmsg->rtmsg_flags;
2624
2625         cfg->fc_nlinfo.nl_net = net;
2626
2627         cfg->fc_dst = rtmsg->rtmsg_dst;
2628         cfg->fc_src = rtmsg->rtmsg_src;
2629         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2630 }
2631
2632 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2633 {
2634         struct fib6_config cfg;
2635         struct in6_rtmsg rtmsg;
2636         int err;
2637
2638         switch (cmd) {
2639         case SIOCADDRT:         /* Add a route */
2640         case SIOCDELRT:         /* Delete a route */
2641                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2642                         return -EPERM;
2643                 err = copy_from_user(&rtmsg, arg,
2644                                      sizeof(struct in6_rtmsg));
2645                 if (err)
2646                         return -EFAULT;
2647
2648                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2649
2650                 rtnl_lock();
2651                 switch (cmd) {
2652                 case SIOCADDRT:
2653                         err = ip6_route_add(&cfg, NULL);
2654                         break;
2655                 case SIOCDELRT:
2656                         err = ip6_route_del(&cfg, NULL);
2657                         break;
2658                 default:
2659                         err = -EINVAL;
2660                 }
2661                 rtnl_unlock();
2662
2663                 return err;
2664         }
2665
2666         return -EINVAL;
2667 }
2668
2669 /*
2670  *      Drop the packet on the floor
2671  */
2672
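/* Account the failure in the relevant SNMP MIB counter and send an ICMPv6
 * Destination Unreachable with the given code before freeing the skb.
 */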
2673 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2674 {
2675         int type;
2676         struct dst_entry *dst = skb_dst(skb);
2677         switch (ipstats_mib_noroutes) {
2678         case IPSTATS_MIB_INNOROUTES:
2679                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2680                 if (type == IPV6_ADDR_ANY) {
2681                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2682                                       IPSTATS_MIB_INADDRERRORS);
2683                         break;
2684                 }
2685                 /* FALLTHROUGH */
2686         case IPSTATS_MIB_OUTNOROUTES:
2687                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2688                               ipstats_mib_noroutes);
2689                 break;
2690         }
2691         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2692         kfree_skb(skb);
2693         return 0;
2694 }
2695
2696 static int ip6_pkt_discard(struct sk_buff *skb)
2697 {
2698         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2699 }
2700
2701 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2702 {
2703         skb->dev = skb_dst(skb)->dev;
2704         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2705 }
2706
2707 static int ip6_pkt_prohibit(struct sk_buff *skb)
2708 {
2709         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2710 }
2711
2712 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2713 {
2714         skb->dev = skb_dst(skb)->dev;
2715         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2716 }
2717
2718 /*
2719  *      Allocate a dst for local (unicast / anycast) address.
2720  */
2721
2722 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2723                                     const struct in6_addr *addr,
2724                                     bool anycast)
2725 {
2726         u32 tb_id;
2727         struct net *net = dev_net(idev->dev);
2728         struct net_device *dev = net->loopback_dev;
2729         struct rt6_info *rt;
2730
2731         /* use the L3 master device as loopback for host routes if the device
2732          * is enslaved and the address is not link-local or multicast
2733          */
2734         if (!rt6_need_strict(addr))
2735                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2736
2737         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2738         if (!rt)
2739                 return ERR_PTR(-ENOMEM);
2740
2741         in6_dev_hold(idev);
2742
2743         rt->dst.flags |= DST_HOST;
2744         rt->dst.input = ip6_input;
2745         rt->dst.output = ip6_output;
2746         rt->rt6i_idev = idev;
2747
2748         rt->rt6i_protocol = RTPROT_KERNEL;
2749         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2750         if (anycast)
2751                 rt->rt6i_flags |= RTF_ANYCAST;
2752         else
2753                 rt->rt6i_flags |= RTF_LOCAL;
2754
2755         rt->rt6i_gateway  = *addr;
2756         rt->rt6i_dst.addr = *addr;
2757         rt->rt6i_dst.plen = 128;
2758         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2759         rt->rt6i_table = fib6_get_table(net, tb_id);
2760         rt->dst.flags |= DST_NOCACHE;
2761
2762         atomic_set(&rt->dst.__refcnt, 1);
2763
2764         return rt;
2765 }
2766
2767 /* remove a deleted IP address from prefsrc entries */
2768 struct arg_dev_net_ip {
2769         struct net_device *dev;
2770         struct net *net;
2771         struct in6_addr *addr;
2772 };
2773
2774 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2775 {
2776         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2777         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2778         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2779
2780         if (((void *)rt->dst.dev == dev || !dev) &&
2781             rt != net->ipv6.ip6_null_entry &&
2782             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2783                 /* remove prefsrc entry */
2784                 rt->rt6i_prefsrc.plen = 0;
2785         }
2786         return 0;
2787 }
2788
2789 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2790 {
2791         struct net *net = dev_net(ifp->idev->dev);
2792         struct arg_dev_net_ip adni = {
2793                 .dev = ifp->idev->dev,
2794                 .net = net,
2795                 .addr = &ifp->addr,
2796         };
2797         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2798 }
2799
2800 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2801 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2802
2803 /* Remove routers and update dst entries when a gateway turns into a host. */
2804 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2805 {
2806         struct in6_addr *gateway = (struct in6_addr *)arg;
2807
2808         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2809              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2810              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2811                 return -1;
2812         }
2813         return 0;
2814 }
2815
2816 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2817 {
2818         fib6_clean_all(net, fib6_clean_tohost, gateway);
2819 }
2820
2821 struct arg_dev_net {
2822         struct net_device *dev;
2823         struct net *net;
2824 };
2825
2826 /* called with write lock held for table with rt */
2827 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2828 {
2829         const struct arg_dev_net *adn = arg;
2830         const struct net_device *dev = adn->dev;
2831
2832         if ((rt->dst.dev == dev || !dev) &&
2833             rt != adn->net->ipv6.ip6_null_entry &&
2834             (rt->rt6i_nsiblings == 0 ||
2835              (dev && netdev_unregistering(dev)) ||
2836              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2837                 return -1;
2838
2839         return 0;
2840 }
2841
2842 void rt6_ifdown(struct net *net, struct net_device *dev)
2843 {
2844         struct arg_dev_net adn = {
2845                 .dev = dev,
2846                 .net = net,
2847         };
2848
2849         fib6_clean_all(net, fib6_ifdown, &adn);
2850         icmp6_clean_all(fib6_ifdown, &adn);
2851         if (dev)
2852                 rt6_uncached_list_flush_dev(net, dev);
2853 }
2854
2855 struct rt6_mtu_change_arg {
2856         struct net_device *dev;
2857         unsigned int mtu;
2858 };
2859
2860 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2861 {
2862         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2863         struct inet6_dev *idev;
2864
2865         /* In IPv6, PMTU discovery is not optional,
2866            so the RTAX_MTU lock cannot disable it.
2867            We still use this lock to block changes
2868            caused by addrconf/ndisc.
2869         */
2870
2871         idev = __in6_dev_get(arg->dev);
2872         if (!idev)
2873                 return 0;
2874
2875         /* For an administrative MTU increase there is no way to discover
2876            an IPv6 PMTU increase, so the PMTU must be updated here.
2877            Since RFC 1981 doesn't cover administrative MTU increases
2878            (e.g. jumbo frames), updating the PMTU on increase is a MUST.
2879          */
2880         /*
2881            If the new MTU is less than the route PMTU, this new MTU will be
2882            the lowest MTU in the path; update the route PMTU to reflect the
2883            decrease.  If the new MTU is greater than the route PMTU, and the
2884            old MTU was the lowest MTU in the path, update the route PMTU to
2885            reflect the increase.  In that case, if another node's MTU is now
2886            the lowest in the path, a Packet Too Big message will trigger
2887            PMTU discovery again.
2888          */
2889         if (rt->dst.dev == arg->dev &&
2890             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2891             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2892                 if (rt->rt6i_flags & RTF_CACHE) {
2893                         /* For RTF_CACHE with rt6i_pmtu == 0
2894                          * (i.e. a redirected route),
2895                          * the metrics of its rt->dst.from have already
2896                          * been updated.
2897                          */
2898                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2899                                 rt->rt6i_pmtu = arg->mtu;
2900                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2901                            (dst_mtu(&rt->dst) < arg->mtu &&
2902                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2903                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2904                 }
2905         }
2906         return 0;
2907 }
2908
2909 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2910 {
2911         struct rt6_mtu_change_arg arg = {
2912                 .dev = dev,
2913                 .mtu = mtu,
2914         };
2915
2916         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2917 }
2918
2919 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2920         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2921         [RTA_OIF]               = { .type = NLA_U32 },
2922         [RTA_IIF]               = { .type = NLA_U32 },
2923         [RTA_PRIORITY]          = { .type = NLA_U32 },
2924         [RTA_METRICS]           = { .type = NLA_NESTED },
2925         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2926         [RTA_PREF]              = { .type = NLA_U8 },
2927         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2928         [RTA_ENCAP]             = { .type = NLA_NESTED },
2929         [RTA_EXPIRES]           = { .type = NLA_U32 },
2930         [RTA_UID]               = { .type = NLA_U32 },
2931         [RTA_MARK]              = { .type = NLA_U32 },
2932 };
2933
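/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink request into a fib6_config. */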
2934 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2935                               struct fib6_config *cfg,
2936                               struct netlink_ext_ack *extack)
2937 {
2938         struct rtmsg *rtm;
2939         struct nlattr *tb[RTA_MAX+1];
2940         unsigned int pref;
2941         int err;
2942
2943         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2944                           NULL);
2945         if (err < 0)
2946                 goto errout;
2947
2948         err = -EINVAL;
2949         rtm = nlmsg_data(nlh);
2950         memset(cfg, 0, sizeof(*cfg));
2951
2952         cfg->fc_table = rtm->rtm_table;
2953         cfg->fc_dst_len = rtm->rtm_dst_len;
2954         cfg->fc_src_len = rtm->rtm_src_len;
2955         cfg->fc_flags = RTF_UP;
2956         cfg->fc_protocol = rtm->rtm_protocol;
2957         cfg->fc_type = rtm->rtm_type;
2958
2959         if (rtm->rtm_type == RTN_UNREACHABLE ||
2960             rtm->rtm_type == RTN_BLACKHOLE ||
2961             rtm->rtm_type == RTN_PROHIBIT ||
2962             rtm->rtm_type == RTN_THROW)
2963                 cfg->fc_flags |= RTF_REJECT;
2964
2965         if (rtm->rtm_type == RTN_LOCAL)
2966                 cfg->fc_flags |= RTF_LOCAL;
2967
2968         if (rtm->rtm_flags & RTM_F_CLONED)
2969                 cfg->fc_flags |= RTF_CACHE;
2970
2971         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2972         cfg->fc_nlinfo.nlh = nlh;
2973         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2974
2975         if (tb[RTA_GATEWAY]) {
2976                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2977                 cfg->fc_flags |= RTF_GATEWAY;
2978         }
2979
2980         if (tb[RTA_DST]) {
2981                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2982
2983                 if (nla_len(tb[RTA_DST]) < plen)
2984                         goto errout;
2985
2986                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2987         }
2988
2989         if (tb[RTA_SRC]) {
2990                 int plen = (rtm->rtm_src_len + 7) >> 3;
2991
2992                 if (nla_len(tb[RTA_SRC]) < plen)
2993                         goto errout;
2994
2995                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2996         }
2997
2998         if (tb[RTA_PREFSRC])
2999                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3000
3001         if (tb[RTA_OIF])
3002                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3003
3004         if (tb[RTA_PRIORITY])
3005                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3006
3007         if (tb[RTA_METRICS]) {
3008                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3009                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3010         }
3011
3012         if (tb[RTA_TABLE])
3013                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3014
3015         if (tb[RTA_MULTIPATH]) {
3016                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3017                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3018
3019                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3020                                                      cfg->fc_mp_len, extack);
3021                 if (err < 0)
3022                         goto errout;
3023         }
3024
3025         if (tb[RTA_PREF]) {
3026                 pref = nla_get_u8(tb[RTA_PREF]);
3027                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3028                     pref != ICMPV6_ROUTER_PREF_HIGH)
3029                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3030                 cfg->fc_flags |= RTF_PREF(pref);
3031         }
3032
3033         if (tb[RTA_ENCAP])
3034                 cfg->fc_encap = tb[RTA_ENCAP];
3035
3036         if (tb[RTA_ENCAP_TYPE]) {
3037                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3038
3039                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3040                 if (err < 0)
3041                         goto errout;
3042         }
3043
3044         if (tb[RTA_EXPIRES]) {
3045                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3046
3047                 if (addrconf_finite_timeout(timeout)) {
3048                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3049                         cfg->fc_flags |= RTF_EXPIRES;
3050                 }
3051         }
3052
3053         err = 0;
3054 errout:
3055         return err;
3056 }
3057
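/* Bookkeeping for one nexthop while a multipath route is being built:
 * the rt6_info created for it, the per-nexthop config it came from and
 * its converted metrics. Entries live on rt6_nh_list until they are
 * inserted into the FIB or cleaned up on error.
 */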
3058 struct rt6_nh {
3059         struct rt6_info *rt6_info;
3060         struct fib6_config r_cfg;
3061         struct mx6_config mxc;
3062         struct list_head next;
3063 };
3064
3065 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3066 {
3067         struct rt6_nh *nh;
3068
3069         list_for_each_entry(nh, rt6_nh_list, next) {
3070                 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3071                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3072                         nh->r_cfg.fc_ifindex);
3073         }
3074 }
3075
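/* Queue one nexthop on rt6_nh_list. Returns -EEXIST if an equivalent
 * nexthop (same device, same inet6_dev and same gateway) is already on
 * the list, otherwise converts the metrics and appends a new rt6_nh.
 */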
3076 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3077                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3078 {
3079         struct rt6_nh *nh;
3080         struct rt6_info *rtnh;
3081         int err = -EEXIST;
3082
3083         list_for_each_entry(nh, rt6_nh_list, next) {
3084                 /* check if rt6_info already exists */
3085                 rtnh = nh->rt6_info;
3086
3087                 if (rtnh->dst.dev == rt->dst.dev &&
3088                     rtnh->rt6i_idev == rt->rt6i_idev &&
3089                     ipv6_addr_equal(&rtnh->rt6i_gateway,
3090                                     &rt->rt6i_gateway))
3091                         return err;
3092         }
3093
3094         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3095         if (!nh)
3096                 return -ENOMEM;
3097         nh->rt6_info = rt;
3098         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3099         if (err) {
3100                 kfree(nh);
3101                 return err;
3102         }
3103         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3104         list_add_tail(&nh->next, rt6_nh_list);
3105
3106         return 0;
3107 }
3108
3109 static void ip6_route_mpath_notify(struct rt6_info *rt,
3110                                    struct rt6_info *rt_last,
3111                                    struct nl_info *info,
3112                                    __u16 nlflags)
3113 {
3114         /* if this is an APPEND route, then rt points to the first route
3115          * inserted and rt_last points to last route inserted. Userspace
3116          * wants a consistent dump of the route which starts at the first
3117          * nexthop. Since sibling routes are always added at the end of
3118          * the list, find the first sibling of the last route appended
3119          */
3120         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3121                 rt = list_first_entry(&rt_last->rt6i_siblings,
3122                                       struct rt6_info,
3123                                       rt6i_siblings);
3124         }
3125
3126         if (rt)
3127                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3128 }
3129
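/* Add a multipath route: one rt6_info is created and inserted per
 * rtnexthop found in RTA_MULTIPATH, and a single notification covering
 * all nexthops is sent at the end. If a later nexthop fails to insert,
 * the ones already added are deleted again (add_errout below).
 * Illustrative example of a request that takes this path:
 *   ip -6 route add 2001:db8::/64 \
 *        nexthop via fe80::1 dev eth0 nexthop via fe80::2 dev eth1
 */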
3130 static int ip6_route_multipath_add(struct fib6_config *cfg,
3131                                    struct netlink_ext_ack *extack)
3132 {
3133         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3134         struct nl_info *info = &cfg->fc_nlinfo;
3135         struct fib6_config r_cfg;
3136         struct rtnexthop *rtnh;
3137         struct rt6_info *rt;
3138         struct rt6_nh *err_nh;
3139         struct rt6_nh *nh, *nh_safe;
3140         __u16 nlflags;
3141         int remaining;
3142         int attrlen;
3143         int err = 1;
3144         int nhn = 0;
3145         int replace = (cfg->fc_nlinfo.nlh &&
3146                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3147         LIST_HEAD(rt6_nh_list);
3148
3149         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3150         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3151                 nlflags |= NLM_F_APPEND;
3152
3153         remaining = cfg->fc_mp_len;
3154         rtnh = (struct rtnexthop *)cfg->fc_mp;
3155
3156         /* Parse the RTA_MULTIPATH attribute and build a list
3157          * (rt6_nh_list) with one rt6_info per nexthop.
3158          */
3159         while (rtnh_ok(rtnh, remaining)) {
3160                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3161                 if (rtnh->rtnh_ifindex)
3162                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3163
3164                 attrlen = rtnh_attrlen(rtnh);
3165                 if (attrlen > 0) {
3166                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3167
3168                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3169                         if (nla) {
3170                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3171                                 r_cfg.fc_flags |= RTF_GATEWAY;
3172                         }
3173                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3174                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3175                         if (nla)
3176                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3177                 }
3178
3179                 rt = ip6_route_info_create(&r_cfg, extack);
3180                 if (IS_ERR(rt)) {
3181                         err = PTR_ERR(rt);
3182                         rt = NULL;
3183                         goto cleanup;
3184                 }
3185
3186                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3187                 if (err) {
3188                         dst_free(&rt->dst);
3189                         goto cleanup;
3190                 }
3191
3192                 rtnh = rtnh_next(rtnh, &remaining);
3193         }
3194
3195         /* for add and replace send one notification with all nexthops.
3196          * Skip the notification in fib6_add_rt2node and send one with
3197          * the full route when done
3198          */
3199         info->skip_notify = 1;
3200
3201         err_nh = NULL;
3202         list_for_each_entry(nh, &rt6_nh_list, next) {
3203                 rt_last = nh->rt6_info;
3204                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3205                 /* save reference to first route for notification */
3206                 if (!rt_notif && !err)
3207                         rt_notif = nh->rt6_info;
3208
3209                 /* nh->rt6_info is used or freed at this point, reset to NULL */
3210                 nh->rt6_info = NULL;
3211                 if (err) {
3212                         if (replace && nhn)
3213                                 ip6_print_replace_route_err(&rt6_nh_list);
3214                         err_nh = nh;
3215                         goto add_errout;
3216                 }
3217
3218                 /* Because each nexthop is added as a separate route, NLM_F_EXCL
3219                  * and NLM_F_REPLACE only apply to the first one: on a collision,
3220                  * fib6_add_rt2node() has already rejected the first nexthop;
3221                  * when replacing, the old nexthops have been replaced by the
3222                  * first new one, so the remaining nexthops only need to be
3223                  * appended to it.
3224                  */
3225                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3226                                                      NLM_F_REPLACE);
3227                 nhn++;
3228         }
3229
3230         /* success ... tell user about new route */
3231         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3232         goto cleanup;
3233
3234 add_errout:
3235         /* send notification for routes that were added so that
3236          * the delete notifications sent by ip6_route_del are
3237          * coherent
3238          */
3239         if (rt_notif)
3240                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3241
3242         /* Delete routes that were already added */
3243         list_for_each_entry(nh, &rt6_nh_list, next) {
3244                 if (err_nh == nh)
3245                         break;
3246                 ip6_route_del(&nh->r_cfg, extack);
3247         }
3248
3249 cleanup:
3250         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3251                 if (nh->rt6_info)
3252                         dst_free(&nh->rt6_info->dst);
3253                 kfree(nh->mxc.mx);
3254                 list_del(&nh->next);
3255                 kfree(nh);
3256         }
3257
3258         return err;
3259 }
3260
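/* Delete a multipath route nexthop by nexthop: each rtnexthop in
 * RTA_MULTIPATH is handed to ip6_route_del() individually, and the last
 * error (if any) is returned, so a partial delete is possible.
 */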
3261 static int ip6_route_multipath_del(struct fib6_config *cfg,
3262                                    struct netlink_ext_ack *extack)
3263 {
3264         struct fib6_config r_cfg;
3265         struct rtnexthop *rtnh;
3266         int remaining;
3267         int attrlen;
3268         int err = 1, last_err = 0;
3269
3270         remaining = cfg->fc_mp_len;
3271         rtnh = (struct rtnexthop *)cfg->fc_mp;
3272
3273         /* Parse a Multipath Entry */
3274         while (rtnh_ok(rtnh, remaining)) {
3275                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3276                 if (rtnh->rtnh_ifindex)
3277                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3278
3279                 attrlen = rtnh_attrlen(rtnh);
3280                 if (attrlen > 0) {
3281                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3282
3283                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3284                         if (nla) {
3285                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3286                                 r_cfg.fc_flags |= RTF_GATEWAY;
3287                         }
3288                 }
3289                 err = ip6_route_del(&r_cfg, extack);
3290                 if (err)
3291                         last_err = err;
3292
3293                 rtnh = rtnh_next(rtnh, &remaining);
3294         }
3295
3296         return last_err;
3297 }
3298
3299 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3300                               struct netlink_ext_ack *extack)
3301 {
3302         struct fib6_config cfg;
3303         int err;
3304
3305         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3306         if (err < 0)
3307                 return err;
3308
3309         if (cfg.fc_mp)
3310                 return ip6_route_multipath_del(&cfg, extack);
3311         else {
3312                 cfg.fc_delete_all_nh = 1;
3313                 return ip6_route_del(&cfg, extack);
3314         }
3315 }
3316
3317 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3318                               struct netlink_ext_ack *extack)
3319 {
3320         struct fib6_config cfg;
3321         int err;
3322
3323         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3324         if (err < 0)
3325                 return err;
3326
3327         if (cfg.fc_mp)
3328                 return ip6_route_multipath_add(&cfg, extack);
3329         else
3330                 return ip6_route_add(&cfg, extack);
3331 }
3332
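/* Worst-case size of the netlink message rt6_fill_node() will build for
 * this route. Overestimating only wastes a little memory in nlmsg_new();
 * underestimating would trigger the -EMSGSIZE WARN_ON() in
 * inet6_rt_notify().
 */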
3333 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3334 {
3335         int nexthop_len = 0;
3336
3337         if (rt->rt6i_nsiblings) {
3338                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3339                             + NLA_ALIGN(sizeof(struct rtnexthop))
3340                             + nla_total_size(16) /* RTA_GATEWAY */
3341                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3342
3343                 nexthop_len *= rt->rt6i_nsiblings;
3344         }
3345
3346         return NLMSG_ALIGN(sizeof(struct rtmsg))
3347                + nla_total_size(16) /* RTA_SRC */
3348                + nla_total_size(16) /* RTA_DST */
3349                + nla_total_size(16) /* RTA_GATEWAY */
3350                + nla_total_size(16) /* RTA_PREFSRC */
3351                + nla_total_size(4) /* RTA_TABLE */
3352                + nla_total_size(4) /* RTA_IIF */
3353                + nla_total_size(4) /* RTA_OIF */
3354                + nla_total_size(4) /* RTA_PRIORITY */
3355                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3356                + nla_total_size(sizeof(struct rta_cacheinfo))
3357                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3358                + nla_total_size(1) /* RTA_PREF */
3359                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3360                + nexthop_len;
3361 }
3362
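/* Emit the attributes describing a single nexthop: RTNH_F_LINKDOWN /
 * RTNH_F_DEAD based on the device state, RTA_GATEWAY for gatewayed
 * routes, RTA_OIF unless the nexthop is encoded inside RTA_MULTIPATH
 * (skip_oif), and any lightweight tunnel encapsulation.
 */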
3363 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3364                             unsigned int *flags, bool skip_oif)
3365 {
3366         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3367                 *flags |= RTNH_F_LINKDOWN;
3368                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3369                         *flags |= RTNH_F_DEAD;
3370         }
3371
3372         if (rt->rt6i_flags & RTF_GATEWAY) {
3373                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3374                         goto nla_put_failure;
3375         }
3376
3377         /* RTA_OIF is not needed in multipath encoding: the rtnexthop struct already carries the ifindex */
3378         if (!skip_oif && rt->dst.dev &&
3379             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3380                 goto nla_put_failure;
3381
3382         if (rt->dst.lwtstate &&
3383             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3384                 goto nla_put_failure;
3385
3386         return 0;
3387
3388 nla_put_failure:
3389         return -EMSGSIZE;
3390 }
3391
3392 /* add multipath next hop */
3393 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3394 {
3395         struct rtnexthop *rtnh;
3396         unsigned int flags = 0;
3397
3398         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3399         if (!rtnh)
3400                 goto nla_put_failure;
3401
3402         rtnh->rtnh_hops = 0;
3403         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3404
3405         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3406                 goto nla_put_failure;
3407
3408         rtnh->rtnh_flags = flags;
3409
3410         /* length of rtnetlink header + attributes */
3411         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3412
3413         return 0;
3414
3415 nla_put_failure:
3416         return -EMSGSIZE;
3417 }
3418
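/* Fill one route message for @rt into @skb. dst, src and iif describe a
 * specific RTM_GETROUTE request when one is being answered; dumps and
 * notifications pass NULL/0 and the route's own prefixes are reported
 * instead.
 */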
3419 static int rt6_fill_node(struct net *net,
3420                          struct sk_buff *skb, struct rt6_info *rt,
3421                          struct in6_addr *dst, struct in6_addr *src,
3422                          int iif, int type, u32 portid, u32 seq,
3423                          unsigned int flags)
3424 {
3425         u32 metrics[RTAX_MAX];
3426         struct rtmsg *rtm;
3427         struct nlmsghdr *nlh;
3428         long expires;
3429         u32 table;
3430
3431         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3432         if (!nlh)
3433                 return -EMSGSIZE;
3434
3435         rtm = nlmsg_data(nlh);
3436         rtm->rtm_family = AF_INET6;
3437         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3438         rtm->rtm_src_len = rt->rt6i_src.plen;
3439         rtm->rtm_tos = 0;
3440         if (rt->rt6i_table)
3441                 table = rt->rt6i_table->tb6_id;
3442         else
3443                 table = RT6_TABLE_UNSPEC;
3444         rtm->rtm_table = table;
3445         if (nla_put_u32(skb, RTA_TABLE, table))
3446                 goto nla_put_failure;
3447         if (rt->rt6i_flags & RTF_REJECT) {
3448                 switch (rt->dst.error) {
3449                 case -EINVAL:
3450                         rtm->rtm_type = RTN_BLACKHOLE;
3451                         break;
3452                 case -EACCES:
3453                         rtm->rtm_type = RTN_PROHIBIT;
3454                         break;
3455                 case -EAGAIN:
3456                         rtm->rtm_type = RTN_THROW;
3457                         break;
3458                 default:
3459                         rtm->rtm_type = RTN_UNREACHABLE;
3460                         break;
3461                 }
3462         }
3463         else if (rt->rt6i_flags & RTF_LOCAL)
3464                 rtm->rtm_type = RTN_LOCAL;
3465         else if (rt->rt6i_flags & RTF_ANYCAST)
3466                 rtm->rtm_type = RTN_ANYCAST;
3467         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3468                 rtm->rtm_type = RTN_LOCAL;
3469         else
3470                 rtm->rtm_type = RTN_UNICAST;
3471         rtm->rtm_flags = 0;
3472         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3473         rtm->rtm_protocol = rt->rt6i_protocol;
3474         if (rt->rt6i_flags & RTF_DYNAMIC)
3475                 rtm->rtm_protocol = RTPROT_REDIRECT;
3476         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3477                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3478                         rtm->rtm_protocol = RTPROT_RA;
3479                 else
3480                         rtm->rtm_protocol = RTPROT_KERNEL;
3481         }
3482
3483         if (rt->rt6i_flags & RTF_CACHE)
3484                 rtm->rtm_flags |= RTM_F_CLONED;
3485
3486         if (dst) {
3487                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3488                         goto nla_put_failure;
3489                 rtm->rtm_dst_len = 128;
3490         } else if (rtm->rtm_dst_len)
3491                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3492                         goto nla_put_failure;
3493 #ifdef CONFIG_IPV6_SUBTREES
3494         if (src) {
3495                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3496                         goto nla_put_failure;
3497                 rtm->rtm_src_len = 128;
3498         } else if (rtm->rtm_src_len &&
3499                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3500                 goto nla_put_failure;
3501 #endif
3502         if (iif) {
3503 #ifdef CONFIG_IPV6_MROUTE
3504                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3505                         int err = ip6mr_get_route(net, skb, rtm, portid);
3506
3507                         if (err == 0)
3508                                 return 0;
3509                         if (err < 0)
3510                                 goto nla_put_failure;
3511                 } else
3512 #endif
3513                         if (nla_put_u32(skb, RTA_IIF, iif))
3514                                 goto nla_put_failure;
3515         } else if (dst) {
3516                 struct in6_addr saddr_buf;
3517                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3518                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3519                         goto nla_put_failure;
3520         }
3521
3522         if (rt->rt6i_prefsrc.plen) {
3523                 struct in6_addr saddr_buf;
3524                 saddr_buf = rt->rt6i_prefsrc.addr;
3525                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3526                         goto nla_put_failure;
3527         }
3528
3529         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3530         if (rt->rt6i_pmtu)
3531                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3532         if (rtnetlink_put_metrics(skb, metrics) < 0)
3533                 goto nla_put_failure;
3534
3535         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3536                 goto nla_put_failure;
3537
3538         /* For multipath routes, walk the siblings list and add
3539          * each as a nexthop within RTA_MULTIPATH.
3540          */
3541         if (rt->rt6i_nsiblings) {
3542                 struct rt6_info *sibling, *next_sibling;
3543                 struct nlattr *mp;
3544
3545                 mp = nla_nest_start(skb, RTA_MULTIPATH);
3546                 if (!mp)
3547                         goto nla_put_failure;
3548
3549                 if (rt6_add_nexthop(skb, rt) < 0)
3550                         goto nla_put_failure;
3551
3552                 list_for_each_entry_safe(sibling, next_sibling,
3553                                          &rt->rt6i_siblings, rt6i_siblings) {
3554                         if (rt6_add_nexthop(skb, sibling) < 0)
3555                                 goto nla_put_failure;
3556                 }
3557
3558                 nla_nest_end(skb, mp);
3559         } else {
3560                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3561                         goto nla_put_failure;
3562         }
3563
3564         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3565
3566         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3567                 goto nla_put_failure;
3568
3569         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3570                 goto nla_put_failure;
3571
3572
3573         nlmsg_end(skb, nlh);
3574         return 0;
3575
3576 nla_put_failure:
3577         nlmsg_cancel(skb, nlh);
3578         return -EMSGSIZE;
3579 }
3580
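/* Per-route callback for RTM_GETROUTE dumps, invoked from the fib6 tree
 * walker. Returns 1 to skip routes the dump is not interested in (e.g.
 * non-prefix routes when RTM_F_PREFIX was requested); otherwise the
 * rt6_fill_node() result is returned, where a negative value tells the
 * walker the current skb is full.
 */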
3581 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3582 {
3583         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3584         struct net *net = arg->net;
3585
3586         if (rt == net->ipv6.ip6_null_entry)
3587                 return 0;
3588
3589         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3590                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3591
3592                 /* user wants prefix routes only */
3593                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3594                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3595                         /* success since this is not a prefix route */
3596                         return 1;
3597                 }
3598         }
3599
3600         return rt6_fill_node(net,
3601                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3602                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3603                      NLM_F_MULTI);
3604 }
3605
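/* RTM_GETROUTE handler, i.e. what "ip -6 route get" talks to. A flowi6
 * is built from the request and resolved with an input-side lookup when
 * RTA_IIF is given, an output lookup otherwise, or a plain FIB lookup
 * when RTM_F_FIB_MATCH is set (e.g. "ip -6 route get fibmatch ...",
 * illustrative), and a single RTM_NEWROUTE reply is unicast back.
 */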
3606 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3607                               struct netlink_ext_ack *extack)
3608 {
3609         struct net *net = sock_net(in_skb->sk);
3610         struct nlattr *tb[RTA_MAX+1];
3611         int err, iif = 0, oif = 0;
3612         struct dst_entry *dst;
3613         struct rt6_info *rt;
3614         struct sk_buff *skb;
3615         struct rtmsg *rtm;
3616         struct flowi6 fl6;
3617         bool fibmatch;
3618
3619         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3620                           extack);
3621         if (err < 0)
3622                 goto errout;
3623
3624         err = -EINVAL;
3625         memset(&fl6, 0, sizeof(fl6));
3626         rtm = nlmsg_data(nlh);
3627         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3628         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3629
3630         if (tb[RTA_SRC]) {
3631                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3632                         goto errout;
3633
3634                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3635         }
3636
3637         if (tb[RTA_DST]) {
3638                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3639                         goto errout;
3640
3641                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3642         }
3643
3644         if (tb[RTA_IIF])
3645                 iif = nla_get_u32(tb[RTA_IIF]);
3646
3647         if (tb[RTA_OIF])
3648                 oif = nla_get_u32(tb[RTA_OIF]);
3649
3650         if (tb[RTA_MARK])
3651                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3652
3653         if (tb[RTA_UID])
3654                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3655                                            nla_get_u32(tb[RTA_UID]));
3656         else
3657                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3658
3659         if (iif) {
3660                 struct net_device *dev;
3661                 int flags = 0;
3662
3663                 dev = __dev_get_by_index(net, iif);
3664                 if (!dev) {
3665                         err = -ENODEV;
3666                         goto errout;
3667                 }
3668
3669                 fl6.flowi6_iif = iif;
3670
3671                 if (!ipv6_addr_any(&fl6.saddr))
3672                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3673
3674                 if (!fibmatch)
3675                         dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3676         } else {
3677                 fl6.flowi6_oif = oif;
3678
3679                 if (!fibmatch)
3680                         dst = ip6_route_output(net, NULL, &fl6);
3681         }
3682
3683         if (fibmatch)
3684                 dst = ip6_route_lookup(net, &fl6, 0);
3685
3686         rt = container_of(dst, struct rt6_info, dst);
3687         if (rt->dst.error) {
3688                 err = rt->dst.error;
3689                 ip6_rt_put(rt);
3690                 goto errout;
3691         }
3692
3693         if (rt == net->ipv6.ip6_null_entry) {
3694                 err = rt->dst.error;
3695                 ip6_rt_put(rt);
3696                 goto errout;
3697         }
3698
3699         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3700         if (!skb) {
3701                 ip6_rt_put(rt);
3702                 err = -ENOBUFS;
3703                 goto errout;
3704         }
3705
3706         skb_dst_set(skb, &rt->dst);
3707         if (fibmatch)
3708                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3709                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3710                                     nlh->nlmsg_seq, 0);
3711         else
3712                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3713                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3714                                     nlh->nlmsg_seq, 0);
3715         if (err < 0) {
3716                 kfree_skb(skb);
3717                 goto errout;
3718         }
3719
3720         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3721 errout:
3722         return err;
3723 }
3724
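/* Build a route message for @rt and multicast it to RTNLGRP_IPV6_ROUTE
 * listeners; on allocation or fill failure the error is recorded on the
 * multicast group via rtnl_set_sk_err().
 */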
3725 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3726                      unsigned int nlm_flags)
3727 {
3728         struct sk_buff *skb;
3729         struct net *net = info->nl_net;
3730         u32 seq;
3731         int err;
3732
3733         err = -ENOBUFS;
3734         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3735
3736         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3737         if (!skb)
3738                 goto errout;
3739
3740         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3741                                 event, info->portid, seq, nlm_flags);
3742         if (err < 0) {
3743                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3744                 WARN_ON(err == -EMSGSIZE);
3745                 kfree_skb(skb);
3746                 goto errout;
3747         }
3748         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3749                     info->nlh, gfp_any());
3750         return;
3751 errout:
3752         if (err < 0)
3753                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3754 }
3755
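/* Loopback device notifier: on NETDEV_REGISTER the per-netns null (and,
 * with multiple tables, prohibit/blackhole) template routes are bound to
 * the loopback device; on NETDEV_UNREGISTER their inet6_dev references
 * are dropped again. Other devices are ignored.
 */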
3756 static int ip6_route_dev_notify(struct notifier_block *this,
3757                                 unsigned long event, void *ptr)
3758 {
3759         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3760         struct net *net = dev_net(dev);
3761
3762         if (!(dev->flags & IFF_LOOPBACK))
3763                 return NOTIFY_OK;
3764
3765         if (event == NETDEV_REGISTER) {
3766                 net->ipv6.ip6_null_entry->dst.dev = dev;
3767                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3768 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3769                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3770                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3771                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3772                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3773 #endif
3774         } else if (event == NETDEV_UNREGISTER) {
3775                 in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
3776 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3777                 in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
3778                 in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3779 #endif
3780         }
3781
3782         return NOTIFY_OK;
3783 }
3784
3785 /*
3786  *      /proc
3787  */
3788
3789 #ifdef CONFIG_PROC_FS
3790
3791 static const struct file_operations ipv6_route_proc_fops = {
3792         .owner          = THIS_MODULE,
3793         .open           = ipv6_route_open,
3794         .read           = seq_read,
3795         .llseek         = seq_lseek,
3796         .release        = seq_release_net,
3797 };
3798
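/* /proc/net/rt6_stats: seven hex fields, in this order: fib nodes, route
 * nodes, rt6_info allocations, route entries, cached routes, current dst
 * entries and discarded routes.
 */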
3799 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3800 {
3801         struct net *net = (struct net *)seq->private;
3802         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3803                    net->ipv6.rt6_stats->fib_nodes,
3804                    net->ipv6.rt6_stats->fib_route_nodes,
3805                    net->ipv6.rt6_stats->fib_rt_alloc,
3806                    net->ipv6.rt6_stats->fib_rt_entries,
3807                    net->ipv6.rt6_stats->fib_rt_cache,
3808                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3809                    net->ipv6.rt6_stats->fib_discarded_routes);
3810
3811         return 0;
3812 }
3813
3814 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3815 {
3816         return single_open_net(inode, file, rt6_stats_seq_show);
3817 }
3818
3819 static const struct file_operations rt6_stats_seq_fops = {
3820         .owner   = THIS_MODULE,
3821         .open    = rt6_stats_seq_open,
3822         .read    = seq_read,
3823         .llseek  = seq_lseek,
3824         .release = single_release_net,
3825 };
3826 #endif  /* CONFIG_PROC_FS */
3827
3828 #ifdef CONFIG_SYSCTL
3829
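/* Handler for the write-only net.ipv6.route.flush sysctl: any write
 * kicks fib6_run_gc() to flush cached routes, e.g. (illustrative)
 *   sysctl -w net.ipv6.route.flush=1
 */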
3830 static
3831 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3832                               void __user *buffer, size_t *lenp, loff_t *ppos)
3833 {
3834         struct net *net;
3835         int delay;
3836         if (!write)
3837                 return -EINVAL;
3838
3839         net = (struct net *)ctl->extra1;
3840         delay = net->ipv6.sysctl.flush_delay;
3841         proc_dointvec(ctl, write, buffer, lenp, ppos);
3842         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3843         return 0;
3844 }
3845
3846 struct ctl_table ipv6_route_table_template[] = {
3847         {
3848                 .procname       =       "flush",
3849                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3850                 .maxlen         =       sizeof(int),
3851                 .mode           =       0200,
3852                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3853         },
3854         {
3855                 .procname       =       "gc_thresh",
3856                 .data           =       &ip6_dst_ops_template.gc_thresh,
3857                 .maxlen         =       sizeof(int),
3858                 .mode           =       0644,
3859                 .proc_handler   =       proc_dointvec,
3860         },
3861         {
3862                 .procname       =       "max_size",
3863                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3864                 .maxlen         =       sizeof(int),
3865                 .mode           =       0644,
3866                 .proc_handler   =       proc_dointvec,
3867         },
3868         {
3869                 .procname       =       "gc_min_interval",
3870                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3871                 .maxlen         =       sizeof(int),
3872                 .mode           =       0644,
3873                 .proc_handler   =       proc_dointvec_jiffies,
3874         },
3875         {
3876                 .procname       =       "gc_timeout",
3877                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3878                 .maxlen         =       sizeof(int),
3879                 .mode           =       0644,
3880                 .proc_handler   =       proc_dointvec_jiffies,
3881         },
3882         {
3883                 .procname       =       "gc_interval",
3884                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3885                 .maxlen         =       sizeof(int),
3886                 .mode           =       0644,
3887                 .proc_handler   =       proc_dointvec_jiffies,
3888         },
3889         {
3890                 .procname       =       "gc_elasticity",
3891                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3892                 .maxlen         =       sizeof(int),
3893                 .mode           =       0644,
3894                 .proc_handler   =       proc_dointvec,
3895         },
3896         {
3897                 .procname       =       "mtu_expires",
3898                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3899                 .maxlen         =       sizeof(int),
3900                 .mode           =       0644,
3901                 .proc_handler   =       proc_dointvec_jiffies,
3902         },
3903         {
3904                 .procname       =       "min_adv_mss",
3905                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3906                 .maxlen         =       sizeof(int),
3907                 .mode           =       0644,
3908                 .proc_handler   =       proc_dointvec,
3909         },
3910         {
3911                 .procname       =       "gc_min_interval_ms",
3912                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3913                 .maxlen         =       sizeof(int),
3914                 .mode           =       0644,
3915                 .proc_handler   =       proc_dointvec_ms_jiffies,
3916         },
3917         { }
3918 };
3919
3920 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3921 {
3922         struct ctl_table *table;
3923
3924         table = kmemdup(ipv6_route_table_template,
3925                         sizeof(ipv6_route_table_template),
3926                         GFP_KERNEL);
3927
3928         if (table) {
3929                 table[0].data = &net->ipv6.sysctl.flush_delay;
3930                 table[0].extra1 = net;
3931                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3932                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3933                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3934                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3935                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3936                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3937                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3938                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3939                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3940
3941                 /* Don't export sysctls to unprivileged users */
3942                 if (net->user_ns != &init_user_ns)
3943                         table[0].procname = NULL;
3944         }
3945
3946         return table;
3947 }
3948 #endif
3949
3950 static int __net_init ip6_route_net_init(struct net *net)
3951 {
3952         int ret = -ENOMEM;
3953
3954         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3955                sizeof(net->ipv6.ip6_dst_ops));
3956
3957         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3958                 goto out_ip6_dst_ops;
3959
3960         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3961                                            sizeof(*net->ipv6.ip6_null_entry),
3962                                            GFP_KERNEL);
3963         if (!net->ipv6.ip6_null_entry)
3964                 goto out_ip6_dst_entries;
3965         net->ipv6.ip6_null_entry->dst.path =
3966                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3967         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3968         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3969                          ip6_template_metrics, true);
3970
3971 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3972         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3973                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3974                                                GFP_KERNEL);
3975         if (!net->ipv6.ip6_prohibit_entry)
3976                 goto out_ip6_null_entry;
3977         net->ipv6.ip6_prohibit_entry->dst.path =
3978                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3979         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3980         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3981                          ip6_template_metrics, true);
3982
3983         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3984                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3985                                                GFP_KERNEL);
3986         if (!net->ipv6.ip6_blk_hole_entry)
3987                 goto out_ip6_prohibit_entry;
3988         net->ipv6.ip6_blk_hole_entry->dst.path =
3989                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3990         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3991         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3992                          ip6_template_metrics, true);
3993 #endif
3994
3995         net->ipv6.sysctl.flush_delay = 0;
3996         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3997         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3998         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3999         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4000         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4001         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4002         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4003
4004         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4005
4006         ret = 0;
4007 out:
4008         return ret;
4009
4010 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4011 out_ip6_prohibit_entry:
4012         kfree(net->ipv6.ip6_prohibit_entry);
4013 out_ip6_null_entry:
4014         kfree(net->ipv6.ip6_null_entry);
4015 #endif
4016 out_ip6_dst_entries:
4017         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4018 out_ip6_dst_ops:
4019         goto out;
4020 }
4021
4022 static void __net_exit ip6_route_net_exit(struct net *net)
4023 {
4024         kfree(net->ipv6.ip6_null_entry);
4025 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4026         kfree(net->ipv6.ip6_prohibit_entry);
4027         kfree(net->ipv6.ip6_blk_hole_entry);
4028 #endif
4029         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4030 }
4031
4032 static int __net_init ip6_route_net_init_late(struct net *net)
4033 {
4034 #ifdef CONFIG_PROC_FS
4035         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4036         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4037 #endif
4038         return 0;
4039 }
4040
4041 static void __net_exit ip6_route_net_exit_late(struct net *net)
4042 {
4043 #ifdef CONFIG_PROC_FS
4044         remove_proc_entry("ipv6_route", net->proc_net);
4045         remove_proc_entry("rt6_stats", net->proc_net);
4046 #endif
4047 }
4048
4049 static struct pernet_operations ip6_route_net_ops = {
4050         .init = ip6_route_net_init,
4051         .exit = ip6_route_net_exit,
4052 };
4053
4054 static int __net_init ipv6_inetpeer_init(struct net *net)
4055 {
4056         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4057
4058         if (!bp)
4059                 return -ENOMEM;
4060         inet_peer_base_init(bp);
4061         net->ipv6.peers = bp;
4062         return 0;
4063 }
4064
4065 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4066 {
4067         struct inet_peer_base *bp = net->ipv6.peers;
4068
4069         net->ipv6.peers = NULL;
4070         inetpeer_invalidate_tree(bp);
4071         kfree(bp);
4072 }
4073
4074 static struct pernet_operations ipv6_inetpeer_ops = {
4075         .init   =       ipv6_inetpeer_init,
4076         .exit   =       ipv6_inetpeer_exit,
4077 };
4078
4079 static struct pernet_operations ip6_route_net_late_ops = {
4080         .init = ip6_route_net_init_late,
4081         .exit = ip6_route_net_exit_late,
4082 };
4083
4084 static struct notifier_block ip6_route_dev_notifier = {
4085         .notifier_call = ip6_route_dev_notify,
4086         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4087 };
4088
4089 void __init ip6_route_init_special_entries(void)
4090 {
4091         /* The loopback device is registered before this code runs, so the
4092          * rt6_info templates never take a loopback reference via the
4093          * notifier; take the references manually for init_net. */
4094         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4095         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4096 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4097         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4098         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4099         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4100         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4101 #endif
4102 }
4103
4104 int __init ip6_route_init(void)
4105 {
4106         int ret;
4107         int cpu;
4108
4109         ret = -ENOMEM;
4110         ip6_dst_ops_template.kmem_cachep =
4111                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4112                                   SLAB_HWCACHE_ALIGN, NULL);
4113         if (!ip6_dst_ops_template.kmem_cachep)
4114                 goto out;
4115
4116         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4117         if (ret)
4118                 goto out_kmem_cache;
4119
4120         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4121         if (ret)
4122                 goto out_dst_entries;
4123
4124         ret = register_pernet_subsys(&ip6_route_net_ops);
4125         if (ret)
4126                 goto out_register_inetpeer;
4127
4128         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4129
4130         ret = fib6_init();
4131         if (ret)
4132                 goto out_register_subsys;
4133
4134         ret = xfrm6_init();
4135         if (ret)
4136                 goto out_fib6_init;
4137
4138         ret = fib6_rules_init();
4139         if (ret)
4140                 goto xfrm6_init;
4141
4142         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4143         if (ret)
4144                 goto fib6_rules_init;
4145
4146         ret = -ENOBUFS;
4147         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4148             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4149             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4150                 goto out_register_late_subsys;
4151
4152         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4153         if (ret)
4154                 goto out_register_late_subsys;
4155
4156         for_each_possible_cpu(cpu) {
4157                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4158
4159                 INIT_LIST_HEAD(&ul->head);
4160                 spin_lock_init(&ul->lock);
4161         }
4162
4163 out:
4164         return ret;
4165
4166 out_register_late_subsys:
4167         unregister_pernet_subsys(&ip6_route_net_late_ops);
4168 fib6_rules_init:
4169         fib6_rules_cleanup();
4170 xfrm6_init:
4171         xfrm6_fini();
4172 out_fib6_init:
4173         fib6_gc_cleanup();
4174 out_register_subsys:
4175         unregister_pernet_subsys(&ip6_route_net_ops);
4176 out_register_inetpeer:
4177         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4178 out_dst_entries:
4179         dst_entries_destroy(&ip6_dst_blackhole_ops);
4180 out_kmem_cache:
4181         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4182         goto out;
4183 }
4184
4185 void ip6_route_cleanup(void)
4186 {
4187         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4188         unregister_pernet_subsys(&ip6_route_net_late_ops);
4189         fib6_rules_cleanup();
4190         xfrm6_fini();
4191         fib6_gc_cleanup();
4192         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4193         unregister_pernet_subsys(&ip6_route_net_ops);
4194         dst_entries_destroy(&ip6_dst_blackhole_ops);
4195         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4196 }