net: ipv6: Plumb extack through route add functions
net/ipv6/route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119
120 struct uncached_list {
121         spinlock_t              lock;
122         struct list_head        head;
123 };
124
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126
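/* Routes that are not linked into the fib6 tree are tracked on a per-cpu
 * "uncached" list so that rt6_uncached_list_flush_dev() can re-point them at
 * the loopback device when their output device is unregistered.
 */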
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130
131         rt->dst.flags |= DST_NOCACHE;
132         rt->rt6i_uncached_list = ul;
133
134         spin_lock_bh(&ul->lock);
135         list_add_tail(&rt->rt6i_uncached, &ul->head);
136         spin_unlock_bh(&ul->lock);
137 }
138
139 static void rt6_uncached_list_del(struct rt6_info *rt)
140 {
141         if (!list_empty(&rt->rt6i_uncached)) {
142                 struct uncached_list *ul = rt->rt6i_uncached_list;
143
144                 spin_lock_bh(&ul->lock);
145                 list_del(&rt->rt6i_uncached);
146                 spin_unlock_bh(&ul->lock);
147         }
148 }
149
150 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
151 {
152         struct net_device *loopback_dev = net->loopback_dev;
153         int cpu;
154
155         if (dev == loopback_dev)
156                 return;
157
158         for_each_possible_cpu(cpu) {
159                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
160                 struct rt6_info *rt;
161
162                 spin_lock_bh(&ul->lock);
163                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
164                         struct inet6_dev *rt_idev = rt->rt6i_idev;
165                         struct net_device *rt_dev = rt->dst.dev;
166
167                         if (rt_idev->dev == dev) {
168                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
169                                 in6_dev_put(rt_idev);
170                         }
171
172                         if (rt_dev == dev) {
173                                 rt->dst.dev = loopback_dev;
174                                 dev_hold(rt->dst.dev);
175                                 dev_put(rt_dev);
176                         }
177                 }
178                 spin_unlock_bh(&ul->lock);
179         }
180 }
181
182 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
183 {
184         return dst_metrics_write_ptr(rt->dst.from);
185 }
186
187 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190
191         if (rt->rt6i_flags & RTF_PCPU)
192                 return rt6_pcpu_cow_metrics(rt);
193         else if (rt->rt6i_flags & RTF_CACHE)
194                 return NULL;
195         else
196                 return dst_cow_metrics_generic(dst, old);
197 }
198
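/* Pick the address used for neighbour resolution: the route's gateway when
 * one is set, otherwise the destination address from the packet (or the
 * caller-supplied daddr as a fallback).
 */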
199 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
200                                              struct sk_buff *skb,
201                                              const void *daddr)
202 {
203         struct in6_addr *p = &rt->rt6i_gateway;
204
205         if (!ipv6_addr_any(p))
206                 return (const void *) p;
207         else if (skb)
208                 return &ipv6_hdr(skb)->daddr;
209         return daddr;
210 }
211
212 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
213                                           struct sk_buff *skb,
214                                           const void *daddr)
215 {
216         struct rt6_info *rt = (struct rt6_info *) dst;
217         struct neighbour *n;
218
219         daddr = choose_neigh_daddr(rt, skb, daddr);
220         n = __ipv6_neigh_lookup(dst->dev, daddr);
221         if (n)
222                 return n;
223         return neigh_create(&nd_tbl, daddr, dst->dev);
224 }
225
226 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 {
228         struct net_device *dev = dst->dev;
229         struct rt6_info *rt = (struct rt6_info *)dst;
230
231         daddr = choose_neigh_daddr(rt, NULL, daddr);
232         if (!daddr)
233                 return;
234         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235                 return;
236         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237                 return;
238         __ipv6_confirm_neigh(dev, daddr);
239 }
240
241 static struct dst_ops ip6_dst_ops_template = {
242         .family                 =       AF_INET6,
243         .gc                     =       ip6_dst_gc,
244         .gc_thresh              =       1024,
245         .check                  =       ip6_dst_check,
246         .default_advmss         =       ip6_default_advmss,
247         .mtu                    =       ip6_mtu,
248         .cow_metrics            =       ipv6_cow_metrics,
249         .destroy                =       ip6_dst_destroy,
250         .ifdown                 =       ip6_dst_ifdown,
251         .negative_advice        =       ip6_negative_advice,
252         .link_failure           =       ip6_link_failure,
253         .update_pmtu            =       ip6_rt_update_pmtu,
254         .redirect               =       rt6_do_redirect,
255         .local_out              =       __ip6_local_out,
256         .neigh_lookup           =       ip6_neigh_lookup,
257         .confirm_neigh          =       ip6_confirm_neigh,
258 };
259
260 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 {
262         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263
264         return mtu ? : dst->dev->mtu;
265 }
266
267 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
268                                          struct sk_buff *skb, u32 mtu)
269 {
270 }
271
272 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
273                                       struct sk_buff *skb)
274 {
275 }
276
277 static struct dst_ops ip6_dst_blackhole_ops = {
278         .family                 =       AF_INET6,
279         .destroy                =       ip6_dst_destroy,
280         .check                  =       ip6_dst_check,
281         .mtu                    =       ip6_blackhole_mtu,
282         .default_advmss         =       ip6_default_advmss,
283         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
284         .redirect               =       ip6_rt_blackhole_redirect,
285         .cow_metrics            =       dst_cow_metrics_generic,
286         .neigh_lookup           =       ip6_neigh_lookup,
287 };
288
289 static const u32 ip6_template_metrics[RTAX_MAX] = {
290         [RTAX_HOPLIMIT - 1] = 0,
291 };
292
293 static const struct rt6_info ip6_null_entry_template = {
294         .dst = {
295                 .__refcnt       = ATOMIC_INIT(1),
296                 .__use          = 1,
297                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
298                 .error          = -ENETUNREACH,
299                 .input          = ip6_pkt_discard,
300                 .output         = ip6_pkt_discard_out,
301         },
302         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
303         .rt6i_protocol  = RTPROT_KERNEL,
304         .rt6i_metric    = ~(u32) 0,
305         .rt6i_ref       = ATOMIC_INIT(1),
306 };
307
308 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
309
310 static const struct rt6_info ip6_prohibit_entry_template = {
311         .dst = {
312                 .__refcnt       = ATOMIC_INIT(1),
313                 .__use          = 1,
314                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
315                 .error          = -EACCES,
316                 .input          = ip6_pkt_prohibit,
317                 .output         = ip6_pkt_prohibit_out,
318         },
319         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
320         .rt6i_protocol  = RTPROT_KERNEL,
321         .rt6i_metric    = ~(u32) 0,
322         .rt6i_ref       = ATOMIC_INIT(1),
323 };
324
325 static const struct rt6_info ip6_blk_hole_entry_template = {
326         .dst = {
327                 .__refcnt       = ATOMIC_INIT(1),
328                 .__use          = 1,
329                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
330                 .error          = -EINVAL,
331                 .input          = dst_discard,
332                 .output         = dst_discard_out,
333         },
334         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
335         .rt6i_protocol  = RTPROT_KERNEL,
336         .rt6i_metric    = ~(u32) 0,
337         .rt6i_ref       = ATOMIC_INIT(1),
338 };
339
340 #endif
341
342 static void rt6_info_init(struct rt6_info *rt)
343 {
344         struct dst_entry *dst = &rt->dst;
345
346         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
347         INIT_LIST_HEAD(&rt->rt6i_siblings);
348         INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
350
351 /* allocate dst with ip6_dst_ops */
352 static struct rt6_info *__ip6_dst_alloc(struct net *net,
353                                         struct net_device *dev,
354                                         int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         0, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt)
360                 rt6_info_init(rt);
361
362         return rt;
363 }
364
365 struct rt6_info *ip6_dst_alloc(struct net *net,
366                                struct net_device *dev,
367                                int flags)
368 {
369         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
370
371         if (rt) {
372                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
373                 if (rt->rt6i_pcpu) {
374                         int cpu;
375
376                         for_each_possible_cpu(cpu) {
377                                 struct rt6_info **p;
378
379                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
380                                 /* no one shares rt */
381                                 *p =  NULL;
382                         }
383                 } else {
384                         dst_destroy((struct dst_entry *)rt);
385                         return NULL;
386                 }
387         }
388
389         return rt;
390 }
391 EXPORT_SYMBOL(ip6_dst_alloc);
392
393 static void ip6_dst_destroy(struct dst_entry *dst)
394 {
395         struct rt6_info *rt = (struct rt6_info *)dst;
396         struct dst_entry *from = dst->from;
397         struct inet6_dev *idev;
398
399         dst_destroy_metrics_generic(dst);
400         free_percpu(rt->rt6i_pcpu);
401         rt6_uncached_list_del(rt);
402
403         idev = rt->rt6i_idev;
404         if (idev) {
405                 rt->rt6i_idev = NULL;
406                 in6_dev_put(idev);
407         }
408
409         dst->from = NULL;
410         dst_release(from);
411 }
412
413 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
414                            int how)
415 {
416         struct rt6_info *rt = (struct rt6_info *)dst;
417         struct inet6_dev *idev = rt->rt6i_idev;
418         struct net_device *loopback_dev =
419                 dev_net(dev)->loopback_dev;
420
421         if (dev != loopback_dev) {
422                 if (idev && idev->dev == dev) {
423                         struct inet6_dev *loopback_idev =
424                                 in6_dev_get(loopback_dev);
425                         if (loopback_idev) {
426                                 rt->rt6i_idev = loopback_idev;
427                                 in6_dev_put(idev);
428                         }
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
448         }
449         return false;
450 }
451
452 /* Multipath route selection:
453  *   Hash based function using packet header and flowlabel.
454  * Adapted from fib_info_hashfn()
455  */
456 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
457                                const struct flowi6 *fl6)
458 {
459         return get_hash_from_flowi6(fl6) % candidate_count;
460 }
461
462 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
463                                              struct flowi6 *fl6, int oif,
464                                              int strict)
465 {
466         struct rt6_info *sibling, *next_sibling;
467         int route_choosen;
468
469         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
470         /* Don't change the route if route_choosen == 0
471          * (the siblings list does not include ourself)
472          */
473         if (route_choosen)
474                 list_for_each_entry_safe(sibling, next_sibling,
475                                 &match->rt6i_siblings, rt6i_siblings) {
476                         route_choosen--;
477                         if (route_choosen == 0) {
478                                 if (rt6_score_route(sibling, oif, strict) < 0)
479                                         break;
480                                 match = sibling;
481                                 break;
482                         }
483                 }
484         return match;
485 }
486
487 /*
488  *      Route lookup. Any table->tb6_lock is implied.
489  */
490
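/* Walk the list of routes in this fib6 node and prefer the one whose device
 * matches @oif; with no @oif, pick a route whose device has @saddr assigned.
 */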
491 static inline struct rt6_info *rt6_device_match(struct net *net,
492                                                     struct rt6_info *rt,
493                                                     const struct in6_addr *saddr,
494                                                     int oif,
495                                                     int flags)
496 {
497         struct rt6_info *local = NULL;
498         struct rt6_info *sprt;
499
500         if (!oif && ipv6_addr_any(saddr))
501                 goto out;
502
503         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
504                 struct net_device *dev = sprt->dst.dev;
505
506                 if (oif) {
507                         if (dev->ifindex == oif)
508                                 return sprt;
509                         if (dev->flags & IFF_LOOPBACK) {
510                                 if (!sprt->rt6i_idev ||
511                                     sprt->rt6i_idev->dev->ifindex != oif) {
512                                         if (flags & RT6_LOOKUP_F_IFACE)
513                                                 continue;
514                                         if (local &&
515                                             local->rt6i_idev->dev->ifindex == oif)
516                                                 continue;
517                                 }
518                                 local = sprt;
519                         }
520                 } else {
521                         if (ipv6_chk_addr(net, saddr, dev,
522                                           flags & RT6_LOOKUP_F_IFACE))
523                                 return sprt;
524                 }
525         }
526
527         if (oif) {
528                 if (local)
529                         return local;
530
531                 if (flags & RT6_LOOKUP_F_IFACE)
532                         return net->ipv6.ip6_null_entry;
533         }
534 out:
535         return rt;
536 }
537
538 #ifdef CONFIG_IPV6_ROUTER_PREF
539 struct __rt6_probe_work {
540         struct work_struct work;
541         struct in6_addr target;
542         struct net_device *dev;
543 };
544
545 static void rt6_probe_deferred(struct work_struct *w)
546 {
547         struct in6_addr mcaddr;
548         struct __rt6_probe_work *work =
549                 container_of(w, struct __rt6_probe_work, work);
550
551         addrconf_addr_solict_mult(&work->target, &mcaddr);
552         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
553         dev_put(work->dev);
554         kfree(work);
555 }
556
557 static void rt6_probe(struct rt6_info *rt)
558 {
559         struct __rt6_probe_work *work;
560         struct neighbour *neigh;
561         /*
562          * Okay, this does not seem to be appropriate
563          * for now, however, we need to check if it
564          * is really so; aka Router Reachability Probing.
565          *
566          * Router Reachability Probe MUST be rate-limited
567          * to no more than one per minute.
568          */
569         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
570                 return;
571         rcu_read_lock_bh();
572         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
573         if (neigh) {
574                 if (neigh->nud_state & NUD_VALID)
575                         goto out;
576
577                 work = NULL;
578                 write_lock(&neigh->lock);
579                 if (!(neigh->nud_state & NUD_VALID) &&
580                     time_after(jiffies,
581                                neigh->updated +
582                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
583                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
584                         if (work)
585                                 __neigh_set_probe_once(neigh);
586                 }
587                 write_unlock(&neigh->lock);
588         } else {
589                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590         }
591
592         if (work) {
593                 INIT_WORK(&work->work, rt6_probe_deferred);
594                 work->target = rt->rt6i_gateway;
595                 dev_hold(rt->dst.dev);
596                 work->dev = rt->dst.dev;
597                 schedule_work(&work->work);
598         }
599
600 out:
601         rcu_read_unlock_bh();
602 }
603 #else
604 static inline void rt6_probe(struct rt6_info *rt)
605 {
606 }
607 #endif
608
609 /*
610  * Default Router Selection (RFC 2461 6.3.6)
611  */
612 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
613 {
614         struct net_device *dev = rt->dst.dev;
615         if (!oif || dev->ifindex == oif)
616                 return 2;
617         if ((dev->flags & IFF_LOOPBACK) &&
618             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
619                 return 1;
620         return 0;
621 }
622
623 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
624 {
625         struct neighbour *neigh;
626         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
627
628         if (rt->rt6i_flags & RTF_NONEXTHOP ||
629             !(rt->rt6i_flags & RTF_GATEWAY))
630                 return RT6_NUD_SUCCEED;
631
632         rcu_read_lock_bh();
633         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
634         if (neigh) {
635                 read_lock(&neigh->lock);
636                 if (neigh->nud_state & NUD_VALID)
637                         ret = RT6_NUD_SUCCEED;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639                 else if (!(neigh->nud_state & NUD_FAILED))
640                         ret = RT6_NUD_SUCCEED;
641                 else
642                         ret = RT6_NUD_FAIL_PROBE;
643 #endif
644                 read_unlock(&neigh->lock);
645         } else {
646                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
647                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
648         }
649         rcu_read_unlock_bh();
650
651         return ret;
652 }
653
654 static int rt6_score_route(struct rt6_info *rt, int oif,
655                            int strict)
656 {
657         int m;
658
659         m = rt6_check_dev(rt, oif);
660         if (!m && (strict & RT6_LOOKUP_F_IFACE))
661                 return RT6_NUD_FAIL_HARD;
662 #ifdef CONFIG_IPV6_ROUTER_PREF
663         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
664 #endif
665         if (strict & RT6_LOOKUP_F_REACHABLE) {
666                 int n = rt6_check_neigh(rt);
667                 if (n < 0)
668                         return n;
669         }
670         return m;
671 }
672
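/* Score @rt and, if it beats the best candidate seen so far, make it the new
 * match; *do_rr is set when the winning route asks for round-robin rotation.
 */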
673 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
674                                    int *mpri, struct rt6_info *match,
675                                    bool *do_rr)
676 {
677         int m;
678         bool match_do_rr = false;
679         struct inet6_dev *idev = rt->rt6i_idev;
680         struct net_device *dev = rt->dst.dev;
681
682         if (dev && !netif_carrier_ok(dev) &&
683             idev->cnf.ignore_routes_with_linkdown &&
684             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
685                 goto out;
686
687         if (rt6_check_expired(rt))
688                 goto out;
689
690         m = rt6_score_route(rt, oif, strict);
691         if (m == RT6_NUD_FAIL_DO_RR) {
692                 match_do_rr = true;
693                 m = 0; /* lowest valid score */
694         } else if (m == RT6_NUD_FAIL_HARD) {
695                 goto out;
696         }
697
698         if (strict & RT6_LOOKUP_F_REACHABLE)
699                 rt6_probe(rt);
700
701         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
702         if (m > *mpri) {
703                 *do_rr = match_do_rr;
704                 *mpri = m;
705                 match = rt;
706         }
707 out:
708         return match;
709 }
710
711 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
712                                      struct rt6_info *rr_head,
713                                      u32 metric, int oif, int strict,
714                                      bool *do_rr)
715 {
716         struct rt6_info *rt, *match, *cont;
717         int mpri = -1;
718
719         match = NULL;
720         cont = NULL;
721         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
722                 if (rt->rt6i_metric != metric) {
723                         cont = rt;
724                         break;
725                 }
726
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728         }
729
730         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
731                 if (rt->rt6i_metric != metric) {
732                         cont = rt;
733                         break;
734                 }
735
736                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
737         }
738
739         if (match || !cont)
740                 return match;
741
742         for (rt = cont; rt; rt = rt->dst.rt6_next)
743                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
744
745         return match;
746 }
747
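/* Default router selection: among the routes sharing rr_ptr's metric, return
 * the highest-scoring one and advance fn->rr_ptr when round-robin is needed.
 */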
748 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
749 {
750         struct rt6_info *match, *rt0;
751         struct net *net;
752         bool do_rr = false;
753
754         rt0 = fn->rr_ptr;
755         if (!rt0)
756                 fn->rr_ptr = rt0 = fn->leaf;
757
758         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
759                              &do_rr);
760
761         if (do_rr) {
762                 struct rt6_info *next = rt0->dst.rt6_next;
763
764                 /* no entries matched; do round-robin */
765                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
766                         next = fn->leaf;
767
768                 if (next != rt0)
769                         fn->rr_ptr = next;
770         }
771
772         net = dev_net(rt0->dst.dev);
773         return match ? match : net->ipv6.ip6_null_entry;
774 }
775
776 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
777 {
778         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
779 }
780
781 #ifdef CONFIG_IPV6_ROUTE_INFO
782 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
783                   const struct in6_addr *gwaddr)
784 {
785         struct net *net = dev_net(dev);
786         struct route_info *rinfo = (struct route_info *) opt;
787         struct in6_addr prefix_buf, *prefix;
788         unsigned int pref;
789         unsigned long lifetime;
790         struct rt6_info *rt;
791
792         if (len < sizeof(struct route_info)) {
793                 return -EINVAL;
794         }
795
796         /* Sanity check for prefix_len and length */
797         if (rinfo->length > 3) {
798                 return -EINVAL;
799         } else if (rinfo->prefix_len > 128) {
800                 return -EINVAL;
801         } else if (rinfo->prefix_len > 64) {
802                 if (rinfo->length < 2) {
803                         return -EINVAL;
804                 }
805         } else if (rinfo->prefix_len > 0) {
806                 if (rinfo->length < 1) {
807                         return -EINVAL;
808                 }
809         }
810
811         pref = rinfo->route_pref;
812         if (pref == ICMPV6_ROUTER_PREF_INVALID)
813                 return -EINVAL;
814
815         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
816
817         if (rinfo->length == 3)
818                 prefix = (struct in6_addr *)rinfo->prefix;
819         else {
820                 /* this function is safe */
821                 ipv6_addr_prefix(&prefix_buf,
822                                  (struct in6_addr *)rinfo->prefix,
823                                  rinfo->prefix_len);
824                 prefix = &prefix_buf;
825         }
826
827         if (rinfo->prefix_len == 0)
828                 rt = rt6_get_dflt_router(gwaddr, dev);
829         else
830                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
831                                         gwaddr, dev);
832
833         if (rt && !lifetime) {
834                 ip6_del_rt(rt);
835                 rt = NULL;
836         }
837
838         if (!rt && lifetime)
839                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
840                                         dev, pref);
841         else if (rt)
842                 rt->rt6i_flags = RTF_ROUTEINFO |
843                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
844
845         if (rt) {
846                 if (!addrconf_finite_timeout(lifetime))
847                         rt6_clean_expires(rt);
848                 else
849                         rt6_set_expires(rt, jiffies + HZ * lifetime);
850
851                 ip6_rt_put(rt);
852         }
853         return 0;
854 }
855 #endif
856
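/* Walk back up the fib6 tree (descending into source subtrees where present)
 * until a node carrying route info is found, or NULL once the root is hit.
 */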
857 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
858                                         struct in6_addr *saddr)
859 {
860         struct fib6_node *pn;
861         while (1) {
862                 if (fn->fn_flags & RTN_TL_ROOT)
863                         return NULL;
864                 pn = fn->parent;
865                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
866                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
867                 else
868                         fn = pn;
869                 if (fn->fn_flags & RTN_RTINFO)
870                         return fn;
871         }
872 }
873
874 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
875                                              struct fib6_table *table,
876                                              struct flowi6 *fl6, int flags)
877 {
878         struct fib6_node *fn;
879         struct rt6_info *rt;
880
881         read_lock_bh(&table->tb6_lock);
882         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
883 restart:
884         rt = fn->leaf;
885         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
886         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
887                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
888         if (rt == net->ipv6.ip6_null_entry) {
889                 fn = fib6_backtrack(fn, &fl6->saddr);
890                 if (fn)
891                         goto restart;
892         }
893         dst_use(&rt->dst, jiffies);
894         read_unlock_bh(&table->tb6_lock);
895
896         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
897
898         return rt;
899
900 }
901
902 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
903                                     int flags)
904 {
905         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
906 }
907 EXPORT_SYMBOL_GPL(ip6_route_lookup);
908
909 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
910                             const struct in6_addr *saddr, int oif, int strict)
911 {
912         struct flowi6 fl6 = {
913                 .flowi6_oif = oif,
914                 .daddr = *daddr,
915         };
916         struct dst_entry *dst;
917         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
918
919         if (saddr) {
920                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
921                 flags |= RT6_LOOKUP_F_HAS_SADDR;
922         }
923
924         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
925         if (dst->error == 0)
926                 return (struct rt6_info *) dst;
927
928         dst_release(dst);
929
930         return NULL;
931 }
932 EXPORT_SYMBOL(rt6_lookup);
933
934 /* ip6_ins_rt is called with FREE table->tb6_lock.
935    It takes a new route entry; if the addition fails for any reason, the
936    route is freed. In any case, if the caller does not hold a reference,
937    it may be destroyed.
938  */
939
940 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
941                         struct mx6_config *mxc,
942                         struct netlink_ext_ack *extack)
943 {
944         int err;
945         struct fib6_table *table;
946
947         table = rt->rt6i_table;
948         write_lock_bh(&table->tb6_lock);
949         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
950         write_unlock_bh(&table->tb6_lock);
951
952         return err;
953 }
954
955 int ip6_ins_rt(struct rt6_info *rt)
956 {
957         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
958         struct mx6_config mxc = { .mx = NULL, };
959
960         return __ip6_ins_rt(rt, &info, &mxc, NULL);
961 }
962
963 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
964                                            const struct in6_addr *daddr,
965                                            const struct in6_addr *saddr)
966 {
967         struct rt6_info *rt;
968
969         /*
970          *      Clone the route.
971          */
972
973         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
974                 ort = (struct rt6_info *)ort->dst.from;
975
976         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
977
978         if (!rt)
979                 return NULL;
980
981         ip6_rt_copy_init(rt, ort);
982         rt->rt6i_flags |= RTF_CACHE;
983         rt->rt6i_metric = 0;
984         rt->dst.flags |= DST_HOST;
985         rt->rt6i_dst.addr = *daddr;
986         rt->rt6i_dst.plen = 128;
987
988         if (!rt6_is_gw_or_nonexthop(ort)) {
989                 if (ort->rt6i_dst.plen != 128 &&
990                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
991                         rt->rt6i_flags |= RTF_ANYCAST;
992 #ifdef CONFIG_IPV6_SUBTREES
993                 if (rt->rt6i_src.plen && saddr) {
994                         rt->rt6i_src.addr = *saddr;
995                         rt->rt6i_src.plen = 128;
996                 }
997 #endif
998         }
999
1000         return rt;
1001 }
1002
1003 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1004 {
1005         struct rt6_info *pcpu_rt;
1006
1007         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1008                                   rt->dst.dev, rt->dst.flags);
1009
1010         if (!pcpu_rt)
1011                 return NULL;
1012         ip6_rt_copy_init(pcpu_rt, rt);
1013         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1014         pcpu_rt->rt6i_flags |= RTF_PCPU;
1015         return pcpu_rt;
1016 }
1017
1018 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1019 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1020 {
1021         struct rt6_info *pcpu_rt, **p;
1022
1023         p = this_cpu_ptr(rt->rt6i_pcpu);
1024         pcpu_rt = *p;
1025
1026         if (pcpu_rt) {
1027                 dst_hold(&pcpu_rt->dst);
1028                 rt6_dst_from_metrics_check(pcpu_rt);
1029         }
1030         return pcpu_rt;
1031 }
1032
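/* Allocate a per-cpu copy of @rt and publish it with cmpxchg().  If another
 * CPU installed one first, use that instead; if @rt has already been removed
 * from the fib6 tree, just return @rt and let the next dst_check() re-lookup.
 */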
1033 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1034 {
1035         struct fib6_table *table = rt->rt6i_table;
1036         struct rt6_info *pcpu_rt, *prev, **p;
1037
1038         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1039         if (!pcpu_rt) {
1040                 struct net *net = dev_net(rt->dst.dev);
1041
1042                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1043                 return net->ipv6.ip6_null_entry;
1044         }
1045
1046         read_lock_bh(&table->tb6_lock);
1047         if (rt->rt6i_pcpu) {
1048                 p = this_cpu_ptr(rt->rt6i_pcpu);
1049                 prev = cmpxchg(p, NULL, pcpu_rt);
1050                 if (prev) {
1051                         /* If someone did it before us, return prev instead */
1052                         dst_destroy(&pcpu_rt->dst);
1053                         pcpu_rt = prev;
1054                 }
1055         } else {
1056                 /* rt has been removed from the fib6 tree
1057                  * before we have a chance to acquire the read_lock.
1058                  * In this case, don't bother to create a pcpu rt
1059                  * since rt is going away anyway.  The next
1060                  * dst_check() will trigger a re-lookup.
1061                  */
1062                 dst_destroy(&pcpu_rt->dst);
1063                 pcpu_rt = rt;
1064         }
1065         dst_hold(&pcpu_rt->dst);
1066         rt6_dst_from_metrics_check(pcpu_rt);
1067         read_unlock_bh(&table->tb6_lock);
1068         return pcpu_rt;
1069 }
1070
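/* Core policy-routing lookup.  Depending on the route selected this returns
 * it directly (null entry or an RTF_CACHE entry), creates an uncached
 * RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH without a gateway), or hands back a
 * per-cpu copy of the tree route.
 */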
1071 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1072                                int oif, struct flowi6 *fl6, int flags)
1073 {
1074         struct fib6_node *fn, *saved_fn;
1075         struct rt6_info *rt;
1076         int strict = 0;
1077
1078         strict |= flags & RT6_LOOKUP_F_IFACE;
1079         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1080         if (net->ipv6.devconf_all->forwarding == 0)
1081                 strict |= RT6_LOOKUP_F_REACHABLE;
1082
1083         read_lock_bh(&table->tb6_lock);
1084
1085         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1086         saved_fn = fn;
1087
1088         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1089                 oif = 0;
1090
1091 redo_rt6_select:
1092         rt = rt6_select(fn, oif, strict);
1093         if (rt->rt6i_nsiblings)
1094                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1095         if (rt == net->ipv6.ip6_null_entry) {
1096                 fn = fib6_backtrack(fn, &fl6->saddr);
1097                 if (fn)
1098                         goto redo_rt6_select;
1099                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1100                         /* also consider unreachable route */
1101                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1102                         fn = saved_fn;
1103                         goto redo_rt6_select;
1104                 }
1105         }
1106
1107
1108         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1109                 dst_use(&rt->dst, jiffies);
1110                 read_unlock_bh(&table->tb6_lock);
1111
1112                 rt6_dst_from_metrics_check(rt);
1113
1114                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1115                 return rt;
1116         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1117                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1118                 /* Create a RTF_CACHE clone which will not be
1119                  * owned by the fib6 tree.  It is for the special case where
1120                  * the daddr in the skb during the neighbor look-up is different
1121                  * from the fl6->daddr used to look-up route here.
1122                  */
1123
1124                 struct rt6_info *uncached_rt;
1125
1126                 dst_use(&rt->dst, jiffies);
1127                 read_unlock_bh(&table->tb6_lock);
1128
1129                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1130                 dst_release(&rt->dst);
1131
1132                 if (uncached_rt)
1133                         rt6_uncached_list_add(uncached_rt);
1134                 else
1135                         uncached_rt = net->ipv6.ip6_null_entry;
1136
1137                 dst_hold(&uncached_rt->dst);
1138
1139                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1140                 return uncached_rt;
1141
1142         } else {
1143                 /* Get a percpu copy */
1144
1145                 struct rt6_info *pcpu_rt;
1146
1147                 rt->dst.lastuse = jiffies;
1148                 rt->dst.__use++;
1149                 pcpu_rt = rt6_get_pcpu_route(rt);
1150
1151                 if (pcpu_rt) {
1152                         read_unlock_bh(&table->tb6_lock);
1153                 } else {
1154                         /* We have to do the read_unlock first
1155                          * because rt6_make_pcpu_route() may trigger
1156                          * ip6_dst_gc() which will take the write_lock.
1157                          */
1158                         dst_hold(&rt->dst);
1159                         read_unlock_bh(&table->tb6_lock);
1160                         pcpu_rt = rt6_make_pcpu_route(rt);
1161                         dst_release(&rt->dst);
1162                 }
1163
1164                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1165                 return pcpu_rt;
1166
1167         }
1168 }
1169 EXPORT_SYMBOL_GPL(ip6_pol_route);
1170
1171 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1172                                             struct flowi6 *fl6, int flags)
1173 {
1174         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1175 }
1176
1177 struct dst_entry *ip6_route_input_lookup(struct net *net,
1178                                          struct net_device *dev,
1179                                          struct flowi6 *fl6, int flags)
1180 {
1181         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1182                 flags |= RT6_LOOKUP_F_IFACE;
1183
1184         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1185 }
1186 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1187
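/* Route an incoming skb: build a flow key from the IPv6 header (plus tunnel
 * metadata, if any) and attach the looked-up dst to the skb.
 */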
1188 void ip6_route_input(struct sk_buff *skb)
1189 {
1190         const struct ipv6hdr *iph = ipv6_hdr(skb);
1191         struct net *net = dev_net(skb->dev);
1192         int flags = RT6_LOOKUP_F_HAS_SADDR;
1193         struct ip_tunnel_info *tun_info;
1194         struct flowi6 fl6 = {
1195                 .flowi6_iif = skb->dev->ifindex,
1196                 .daddr = iph->daddr,
1197                 .saddr = iph->saddr,
1198                 .flowlabel = ip6_flowinfo(iph),
1199                 .flowi6_mark = skb->mark,
1200                 .flowi6_proto = iph->nexthdr,
1201         };
1202
1203         tun_info = skb_tunnel_info(skb);
1204         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1205                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1206         skb_dst_drop(skb);
1207         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1208 }
1209
1210 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1211                                              struct flowi6 *fl6, int flags)
1212 {
1213         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1214 }
1215
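/* Output route lookup: link-local/multicast destinations and device-bound
 * sockets force a strict interface match, and the socket's source-address
 * preferences are propagated as lookup flags.
 */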
1216 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1217                                          struct flowi6 *fl6, int flags)
1218 {
1219         bool any_src;
1220
1221         if (rt6_need_strict(&fl6->daddr)) {
1222                 struct dst_entry *dst;
1223
1224                 dst = l3mdev_link_scope_lookup(net, fl6);
1225                 if (dst)
1226                         return dst;
1227         }
1228
1229         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1230
1231         any_src = ipv6_addr_any(&fl6->saddr);
1232         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1233             (fl6->flowi6_oif && any_src))
1234                 flags |= RT6_LOOKUP_F_IFACE;
1235
1236         if (!any_src)
1237                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1238         else if (sk)
1239                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1240
1241         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1242 }
1243 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1244
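/* Build a standalone copy of @dst_orig whose input/output handlers simply
 * discard packets; @dst_orig itself is released.
 */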
1245 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1246 {
1247         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1248         struct dst_entry *new = NULL;
1249
1250         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1251         if (rt) {
1252                 rt6_info_init(rt);
1253
1254                 new = &rt->dst;
1255                 new->__use = 1;
1256                 new->input = dst_discard;
1257                 new->output = dst_discard_out;
1258
1259                 dst_copy_metrics(new, &ort->dst);
1260                 rt->rt6i_idev = ort->rt6i_idev;
1261                 if (rt->rt6i_idev)
1262                         in6_dev_hold(rt->rt6i_idev);
1263
1264                 rt->rt6i_gateway = ort->rt6i_gateway;
1265                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1266                 rt->rt6i_metric = 0;
1267
1268                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1269 #ifdef CONFIG_IPV6_SUBTREES
1270                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1271 #endif
1272
1273                 dst_free(new);
1274         }
1275
1276         dst_release(dst_orig);
1277         return new ? new : ERR_PTR(-ENOMEM);
1278 }
1279
1280 /*
1281  *      Destination cache support functions
1282  */
1283
1284 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1285 {
1286         if (rt->dst.from &&
1287             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1288                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1289 }
1290
1291 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1292 {
1293         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1294                 return NULL;
1295
1296         if (rt6_check_expired(rt))
1297                 return NULL;
1298
1299         return &rt->dst;
1300 }
1301
1302 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1303 {
1304         if (!__rt6_check_expired(rt) &&
1305             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1306             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1307                 return &rt->dst;
1308         else
1309                 return NULL;
1310 }
1311
1312 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1313 {
1314         struct rt6_info *rt;
1315
1316         rt = (struct rt6_info *) dst;
1317
1318         /* All IPV6 dsts are created with ->obsolete set to the value
1319          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1320          * into this function always.
1321          */
1322
1323         rt6_dst_from_metrics_check(rt);
1324
1325         if (rt->rt6i_flags & RTF_PCPU ||
1326             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1327                 return rt6_dst_from_check(rt, cookie);
1328         else
1329                 return rt6_check(rt, cookie);
1330 }
1331
1332 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1333 {
1334         struct rt6_info *rt = (struct rt6_info *) dst;
1335
1336         if (rt) {
1337                 if (rt->rt6i_flags & RTF_CACHE) {
1338                         if (rt6_check_expired(rt)) {
1339                                 ip6_del_rt(rt);
1340                                 dst = NULL;
1341                         }
1342                 } else {
1343                         dst_release(dst);
1344                         dst = NULL;
1345                 }
1346         }
1347         return dst;
1348 }
1349
1350 static void ip6_link_failure(struct sk_buff *skb)
1351 {
1352         struct rt6_info *rt;
1353
1354         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1355
1356         rt = (struct rt6_info *) skb_dst(skb);
1357         if (rt) {
1358                 if (rt->rt6i_flags & RTF_CACHE) {
1359                         dst_hold(&rt->dst);
1360                         ip6_del_rt(rt);
1361                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1362                         rt->rt6i_node->fn_sernum = -1;
1363                 }
1364         }
1365 }
1366
1367 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1368 {
1369         struct net *net = dev_net(rt->dst.dev);
1370
1371         rt->rt6i_flags |= RTF_MODIFIED;
1372         rt->rt6i_pmtu = mtu;
1373         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1374 }
1375
1376 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1377 {
1378         return !(rt->rt6i_flags & RTF_CACHE) &&
1379                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1380 }
1381
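/* Record a reduced path MTU.  Routes that cannot take a cached clone (e.g.
 * RTF_CACHE entries) are updated in place; otherwise a host RTF_CACHE clone
 * carrying the new MTU is inserted so the original route is left untouched.
 */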
1382 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1383                                  const struct ipv6hdr *iph, u32 mtu)
1384 {
1385         const struct in6_addr *daddr, *saddr;
1386         struct rt6_info *rt6 = (struct rt6_info *)dst;
1387
1388         if (rt6->rt6i_flags & RTF_LOCAL)
1389                 return;
1390
1391         if (dst_metric_locked(dst, RTAX_MTU))
1392                 return;
1393
1394         if (iph) {
1395                 daddr = &iph->daddr;
1396                 saddr = &iph->saddr;
1397         } else if (sk) {
1398                 daddr = &sk->sk_v6_daddr;
1399                 saddr = &inet6_sk(sk)->saddr;
1400         } else {
1401                 daddr = NULL;
1402                 saddr = NULL;
1403         }
1404         dst_confirm_neigh(dst, daddr);
1405         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1406         if (mtu >= dst_mtu(dst))
1407                 return;
1408
1409         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1410                 rt6_do_update_pmtu(rt6, mtu);
1411         } else if (daddr) {
1412                 struct rt6_info *nrt6;
1413
1414                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1415                 if (nrt6) {
1416                         rt6_do_update_pmtu(nrt6, mtu);
1417
1418                         /* ip6_ins_rt(nrt6) will bump the
1419                          * rt6->rt6i_node->fn_sernum
1420                          * which will fail the next rt6_check() and
1421                          * invalidate the sk->sk_dst_cache.
1422                          */
1423                         ip6_ins_rt(nrt6);
1424                 }
1425         }
1426 }
1427
1428 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1429                                struct sk_buff *skb, u32 mtu)
1430 {
1431         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1432 }
1433
1434 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1435                      int oif, u32 mark, kuid_t uid)
1436 {
1437         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1438         struct dst_entry *dst;
1439         struct flowi6 fl6;
1440
1441         memset(&fl6, 0, sizeof(fl6));
1442         fl6.flowi6_oif = oif;
1443         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1444         fl6.daddr = iph->daddr;
1445         fl6.saddr = iph->saddr;
1446         fl6.flowlabel = ip6_flowinfo(iph);
1447         fl6.flowi6_uid = uid;
1448
1449         dst = ip6_route_output(net, NULL, &fl6);
1450         if (!dst->error)
1451                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1452         dst_release(dst);
1453 }
1454 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1455
1456 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1457 {
1458         struct dst_entry *dst;
1459
1460         ip6_update_pmtu(skb, sock_net(sk), mtu,
1461                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1462
1463         dst = __sk_dst_get(sk);
1464         if (!dst || !dst->obsolete ||
1465             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1466                 return;
1467
1468         bh_lock_sock(sk);
1469         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1470                 ip6_datagram_dst_update(sk, false);
1471         bh_unlock_sock(sk);
1472 }
1473 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1474
1475 /* Handle redirects */
1476 struct ip6rd_flowi {
1477         struct flowi6 fl6;
1478         struct in6_addr gateway;
1479 };
1480
1481 static struct rt6_info *__ip6_route_redirect(struct net *net,
1482                                              struct fib6_table *table,
1483                                              struct flowi6 *fl6,
1484                                              int flags)
1485 {
1486         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1487         struct rt6_info *rt;
1488         struct fib6_node *fn;
1489
1490         /* Get the "current" route for this destination and
1491          * check if the redirect has come from appropriate router.
1492          *
1493          * RFC 4861 specifies that redirects should only be
1494          * accepted if they come from the nexthop to the target.
1495          * Due to the way the routes are chosen, this notion
1496          * is a bit fuzzy and one might need to check all possible
1497          * routes.
1498          */
1499
1500         read_lock_bh(&table->tb6_lock);
1501         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1502 restart:
1503         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1504                 if (rt6_check_expired(rt))
1505                         continue;
1506                 if (rt->dst.error)
1507                         break;
1508                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1509                         continue;
1510                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1511                         continue;
1512                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1513                         continue;
1514                 break;
1515         }
1516
1517         if (!rt)
1518                 rt = net->ipv6.ip6_null_entry;
1519         else if (rt->dst.error) {
1520                 rt = net->ipv6.ip6_null_entry;
1521                 goto out;
1522         }
1523
1524         if (rt == net->ipv6.ip6_null_entry) {
1525                 fn = fib6_backtrack(fn, &fl6->saddr);
1526                 if (fn)
1527                         goto restart;
1528         }
1529
1530 out:
1531         dst_hold(&rt->dst);
1532
1533         read_unlock_bh(&table->tb6_lock);
1534
1535         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1536         return rt;
1537 }
1538
1539 static struct dst_entry *ip6_route_redirect(struct net *net,
1540                                         const struct flowi6 *fl6,
1541                                         const struct in6_addr *gateway)
1542 {
1543         int flags = RT6_LOOKUP_F_HAS_SADDR;
1544         struct ip6rd_flowi rdfl;
1545
1546         rdfl.fl6 = *fl6;
1547         rdfl.gateway = *gateway;
1548
1549         return fib6_rule_lookup(net, &rdfl.fl6,
1550                                 flags, __ip6_route_redirect);
1551 }
1552
1553 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1554                   kuid_t uid)
1555 {
1556         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1557         struct dst_entry *dst;
1558         struct flowi6 fl6;
1559
1560         memset(&fl6, 0, sizeof(fl6));
1561         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1562         fl6.flowi6_oif = oif;
1563         fl6.flowi6_mark = mark;
1564         fl6.daddr = iph->daddr;
1565         fl6.saddr = iph->saddr;
1566         fl6.flowlabel = ip6_flowinfo(iph);
1567         fl6.flowi6_uid = uid;
1568
1569         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1570         rt6_do_redirect(dst, NULL, skb);
1571         dst_release(dst);
1572 }
1573 EXPORT_SYMBOL_GPL(ip6_redirect);
1574
1575 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1576                             u32 mark)
1577 {
1578         const struct ipv6hdr *iph = ipv6_hdr(skb);
1579         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1580         struct dst_entry *dst;
1581         struct flowi6 fl6;
1582
1583         memset(&fl6, 0, sizeof(fl6));
1584         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1585         fl6.flowi6_oif = oif;
1586         fl6.flowi6_mark = mark;
1587         fl6.daddr = msg->dest;
1588         fl6.saddr = iph->daddr;
1589         fl6.flowi6_uid = sock_net_uid(net, NULL);
1590
1591         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1592         rt6_do_redirect(dst, NULL, skb);
1593         dst_release(dst);
1594 }
1595
1596 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1597 {
1598         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1599                      sk->sk_uid);
1600 }
1601 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1602
1603 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1604 {
1605         struct net_device *dev = dst->dev;
1606         unsigned int mtu = dst_mtu(dst);
1607         struct net *net = dev_net(dev);
1608
1609         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1610
1611         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1612                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1613
1614         /*
1615          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
1616          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1617          * IPV6_MAXPLEN is also valid and means: "any MSS,
1618          * rely only on PMTU discovery".
1619          */
1620         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1621                 mtu = IPV6_MAXPLEN;
1622         return mtu;
1623 }
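
/* Worked example (illustrative): with a 1500-byte link MTU the advertised
 * MSS becomes 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 * = 1500 - 40 - 20 = 1440, unless ip6_rt_min_advmss is larger.  Only when
 * the result would exceed IPV6_MAXPLEN - sizeof(struct tcphdr) is it bumped
 * to IPV6_MAXPLEN, which tells TCP "any MSS, rely only on PMTU discovery".
 */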
1624
1625 static unsigned int ip6_mtu(const struct dst_entry *dst)
1626 {
1627         const struct rt6_info *rt = (const struct rt6_info *)dst;
1628         unsigned int mtu = rt->rt6i_pmtu;
1629         struct inet6_dev *idev;
1630
1631         if (mtu)
1632                 goto out;
1633
1634         mtu = dst_metric_raw(dst, RTAX_MTU);
1635         if (mtu)
1636                 goto out;
1637
1638         mtu = IPV6_MIN_MTU;
1639
1640         rcu_read_lock();
1641         idev = __in6_dev_get(dst->dev);
1642         if (idev)
1643                 mtu = idev->cnf.mtu6;
1644         rcu_read_unlock();
1645
1646 out:
1647         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1648
1649         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1650 }
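
/* The effective MTU for a route is chosen in this order: a cached per-route
 * PMTU (rt6i_pmtu), then an explicit RTAX_MTU metric, then the device's IPv6
 * MTU (idev->cnf.mtu6, with IPV6_MIN_MTU as the fallback when no inet6_dev
 * exists).  The result is clamped to IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */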
1651
1652 static struct dst_entry *icmp6_dst_gc_list;
1653 static DEFINE_SPINLOCK(icmp6_dst_lock);
1654
1655 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1656                                   struct flowi6 *fl6)
1657 {
1658         struct dst_entry *dst;
1659         struct rt6_info *rt;
1660         struct inet6_dev *idev = in6_dev_get(dev);
1661         struct net *net = dev_net(dev);
1662
1663         if (unlikely(!idev))
1664                 return ERR_PTR(-ENODEV);
1665
1666         rt = ip6_dst_alloc(net, dev, 0);
1667         if (unlikely(!rt)) {
1668                 in6_dev_put(idev);
1669                 dst = ERR_PTR(-ENOMEM);
1670                 goto out;
1671         }
1672
1673         rt->dst.flags |= DST_HOST;
1674         rt->dst.output  = ip6_output;
1675         atomic_set(&rt->dst.__refcnt, 1);
1676         rt->rt6i_gateway  = fl6->daddr;
1677         rt->rt6i_dst.addr = fl6->daddr;
1678         rt->rt6i_dst.plen = 128;
1679         rt->rt6i_idev     = idev;
1680         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1681
1682         spin_lock_bh(&icmp6_dst_lock);
1683         rt->dst.next = icmp6_dst_gc_list;
1684         icmp6_dst_gc_list = &rt->dst;
1685         spin_unlock_bh(&icmp6_dst_lock);
1686
1687         fib6_force_start_gc(net);
1688
1689         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1690
1691 out:
1692         return dst;
1693 }
1694
1695 int icmp6_dst_gc(void)
1696 {
1697         struct dst_entry *dst, **pprev;
1698         int more = 0;
1699
1700         spin_lock_bh(&icmp6_dst_lock);
1701         pprev = &icmp6_dst_gc_list;
1702
1703         while ((dst = *pprev) != NULL) {
1704                 if (!atomic_read(&dst->__refcnt)) {
1705                         *pprev = dst->next;
1706                         dst_free(dst);
1707                 } else {
1708                         pprev = &dst->next;
1709                         ++more;
1710                 }
1711         }
1712
1713         spin_unlock_bh(&icmp6_dst_lock);
1714
1715         return more;
1716 }
1717
1718 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1719                             void *arg)
1720 {
1721         struct dst_entry *dst, **pprev;
1722
1723         spin_lock_bh(&icmp6_dst_lock);
1724         pprev = &icmp6_dst_gc_list;
1725         while ((dst = *pprev) != NULL) {
1726                 struct rt6_info *rt = (struct rt6_info *) dst;
1727                 if (func(rt, arg)) {
1728                         *pprev = dst->next;
1729                         dst_free(dst);
1730                 } else {
1731                         pprev = &dst->next;
1732                 }
1733         }
1734         spin_unlock_bh(&icmp6_dst_lock);
1735 }
1736
1737 static int ip6_dst_gc(struct dst_ops *ops)
1738 {
1739         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1740         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1741         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1742         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1743         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1744         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1745         int entries;
1746
1747         entries = dst_entries_get_fast(ops);
1748         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1749             entries <= rt_max_size)
1750                 goto out;
1751
1752         net->ipv6.ip6_rt_gc_expire++;
1753         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1754         entries = dst_entries_get_slow(ops);
1755         if (entries < ops->gc_thresh)
1756                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1757 out:
1758         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1759         return entries > rt_max_size;
1760 }
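
/* Illustrative arithmetic (example values, not asserted defaults): with
 * ip6_rt_gc_elasticity = 9 and ip6_rt_gc_timeout = 60*HZ, each call that
 * actually triggers garbage collection bumps ip6_rt_gc_expire by one before
 * running fib6_run_gc(); if the slow entry count then falls below gc_thresh
 * it is reset to 60*HZ/2 = 30*HZ, and every call ends by decaying it by
 * ip6_rt_gc_expire >> 9 (roughly 0.2%).  The return value simply reports
 * whether the cache is still above ip6_rt_max_size.
 */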
1761
1762 static int ip6_convert_metrics(struct mx6_config *mxc,
1763                                const struct fib6_config *cfg)
1764 {
1765         bool ecn_ca = false;
1766         struct nlattr *nla;
1767         int remaining;
1768         u32 *mp;
1769
1770         if (!cfg->fc_mx)
1771                 return 0;
1772
1773         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1774         if (unlikely(!mp))
1775                 return -ENOMEM;
1776
1777         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1778                 int type = nla_type(nla);
1779                 u32 val;
1780
1781                 if (!type)
1782                         continue;
1783                 if (unlikely(type > RTAX_MAX))
1784                         goto err;
1785
1786                 if (type == RTAX_CC_ALGO) {
1787                         char tmp[TCP_CA_NAME_MAX];
1788
1789                         nla_strlcpy(tmp, nla, sizeof(tmp));
1790                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1791                         if (val == TCP_CA_UNSPEC)
1792                                 goto err;
1793                 } else {
1794                         val = nla_get_u32(nla);
1795                 }
1796                 if (type == RTAX_HOPLIMIT && val > 255)
1797                         val = 255;
1798                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1799                         goto err;
1800
1801                 mp[type - 1] = val;
1802                 __set_bit(type - 1, mxc->mx_valid);
1803         }
1804
1805         if (ecn_ca) {
1806                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1807                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1808         }
1809
1810         mxc->mx = mp;
1811         return 0;
1812  err:
1813         kfree(mp);
1814         return -EINVAL;
1815 }
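
/* Sketch of the resulting layout (hypothetical values): an RTAX_MTU of 1400
 * and an RTAX_HOPLIMIT of 300 among the cfg->fc_mx attributes would yield
 *
 *	mp[RTAX_MTU - 1]      = 1400;
 *	mp[RTAX_HOPLIMIT - 1] = 255;	(clamped to 255 above)
 *
 * with the matching bits set in mxc->mx_valid; the caller later frees
 * mxc.mx, as ip6_route_add() does below.
 */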
1816
1817 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1818                                             struct fib6_config *cfg,
1819                                             const struct in6_addr *gw_addr)
1820 {
1821         struct flowi6 fl6 = {
1822                 .flowi6_oif = cfg->fc_ifindex,
1823                 .daddr = *gw_addr,
1824                 .saddr = cfg->fc_prefsrc,
1825         };
1826         struct fib6_table *table;
1827         struct rt6_info *rt;
1828         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1829
1830         table = fib6_get_table(net, cfg->fc_table);
1831         if (!table)
1832                 return NULL;
1833
1834         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1835                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1836
1837         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1838
1839         /* if table lookup failed, fall back to full lookup */
1840         if (rt == net->ipv6.ip6_null_entry) {
1841                 ip6_rt_put(rt);
1842                 rt = NULL;
1843         }
1844
1845         return rt;
1846 }
1847
1848 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1849                                               struct netlink_ext_ack *extack)
1850 {
1851         struct net *net = cfg->fc_nlinfo.nl_net;
1852         struct rt6_info *rt = NULL;
1853         struct net_device *dev = NULL;
1854         struct inet6_dev *idev = NULL;
1855         struct fib6_table *table;
1856         int addr_type;
1857         int err = -EINVAL;
1858
1859         /* RTF_PCPU is an internal flag; it cannot be set by userspace */
1860         if (cfg->fc_flags & RTF_PCPU)
1861                 goto out;
1862
1863         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1864                 goto out;
1865 #ifndef CONFIG_IPV6_SUBTREES
1866         if (cfg->fc_src_len)
1867                 goto out;
1868 #endif
1869         if (cfg->fc_ifindex) {
1870                 err = -ENODEV;
1871                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1872                 if (!dev)
1873                         goto out;
1874                 idev = in6_dev_get(dev);
1875                 if (!idev)
1876                         goto out;
1877         }
1878
1879         if (cfg->fc_metric == 0)
1880                 cfg->fc_metric = IP6_RT_PRIO_USER;
1881
1882         err = -ENOBUFS;
1883         if (cfg->fc_nlinfo.nlh &&
1884             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1885                 table = fib6_get_table(net, cfg->fc_table);
1886                 if (!table) {
1887                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1888                         table = fib6_new_table(net, cfg->fc_table);
1889                 }
1890         } else {
1891                 table = fib6_new_table(net, cfg->fc_table);
1892         }
1893
1894         if (!table)
1895                 goto out;
1896
1897         rt = ip6_dst_alloc(net, NULL,
1898                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1899
1900         if (!rt) {
1901                 err = -ENOMEM;
1902                 goto out;
1903         }
1904
1905         if (cfg->fc_flags & RTF_EXPIRES)
1906                 rt6_set_expires(rt, jiffies +
1907                                 clock_t_to_jiffies(cfg->fc_expires));
1908         else
1909                 rt6_clean_expires(rt);
1910
1911         if (cfg->fc_protocol == RTPROT_UNSPEC)
1912                 cfg->fc_protocol = RTPROT_BOOT;
1913         rt->rt6i_protocol = cfg->fc_protocol;
1914
1915         addr_type = ipv6_addr_type(&cfg->fc_dst);
1916
1917         if (addr_type & IPV6_ADDR_MULTICAST)
1918                 rt->dst.input = ip6_mc_input;
1919         else if (cfg->fc_flags & RTF_LOCAL)
1920                 rt->dst.input = ip6_input;
1921         else
1922                 rt->dst.input = ip6_forward;
1923
1924         rt->dst.output = ip6_output;
1925
1926         if (cfg->fc_encap) {
1927                 struct lwtunnel_state *lwtstate;
1928
1929                 err = lwtunnel_build_state(cfg->fc_encap_type,
1930                                            cfg->fc_encap, AF_INET6, cfg,
1931                                            &lwtstate);
1932                 if (err)
1933                         goto out;
1934                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1935                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1936                         rt->dst.lwtstate->orig_output = rt->dst.output;
1937                         rt->dst.output = lwtunnel_output;
1938                 }
1939                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1940                         rt->dst.lwtstate->orig_input = rt->dst.input;
1941                         rt->dst.input = lwtunnel_input;
1942                 }
1943         }
1944
1945         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1946         rt->rt6i_dst.plen = cfg->fc_dst_len;
1947         if (rt->rt6i_dst.plen == 128)
1948                 rt->dst.flags |= DST_HOST;
1949
1950 #ifdef CONFIG_IPV6_SUBTREES
1951         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1952         rt->rt6i_src.plen = cfg->fc_src_len;
1953 #endif
1954
1955         rt->rt6i_metric = cfg->fc_metric;
1956
1957         /* We cannot add true routes via loopback here;
1958            they would result in kernel looping. Promote them to reject routes.
1959          */
1960         if ((cfg->fc_flags & RTF_REJECT) ||
1961             (dev && (dev->flags & IFF_LOOPBACK) &&
1962              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1963              !(cfg->fc_flags & RTF_LOCAL))) {
1964                 /* hold loopback dev/idev if we haven't done so. */
1965                 if (dev != net->loopback_dev) {
1966                         if (dev) {
1967                                 dev_put(dev);
1968                                 in6_dev_put(idev);
1969                         }
1970                         dev = net->loopback_dev;
1971                         dev_hold(dev);
1972                         idev = in6_dev_get(dev);
1973                         if (!idev) {
1974                                 err = -ENODEV;
1975                                 goto out;
1976                         }
1977                 }
1978                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1979                 switch (cfg->fc_type) {
1980                 case RTN_BLACKHOLE:
1981                         rt->dst.error = -EINVAL;
1982                         rt->dst.output = dst_discard_out;
1983                         rt->dst.input = dst_discard;
1984                         break;
1985                 case RTN_PROHIBIT:
1986                         rt->dst.error = -EACCES;
1987                         rt->dst.output = ip6_pkt_prohibit_out;
1988                         rt->dst.input = ip6_pkt_prohibit;
1989                         break;
1990                 case RTN_THROW:
1991                 case RTN_UNREACHABLE:
1992                 default:
1993                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1994                                         : (cfg->fc_type == RTN_UNREACHABLE)
1995                                         ? -EHOSTUNREACH : -ENETUNREACH;
1996                         rt->dst.output = ip6_pkt_discard_out;
1997                         rt->dst.input = ip6_pkt_discard;
1998                         break;
1999                 }
2000                 goto install_route;
2001         }
2002
2003         if (cfg->fc_flags & RTF_GATEWAY) {
2004                 const struct in6_addr *gw_addr;
2005                 int gwa_type;
2006
2007                 gw_addr = &cfg->fc_gateway;
2008                 gwa_type = ipv6_addr_type(gw_addr);
2009
2010                 /* If gw_addr is local we may fail to detect this while the
2011                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
2012                  * will return the already-added prefix route via the interface
2013                  * the prefix route was assigned to, which might be non-loopback.
2014                  */
2015                 err = -EINVAL;
2016                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2017                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2018                                             dev : NULL, 0, 0))
2019                         goto out;
2020
2021                 rt->rt6i_gateway = *gw_addr;
2022
2023                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2024                         struct rt6_info *grt = NULL;
2025
2026                         /* IPv6 strictly forbids using non-link-local
2027                            addresses as nexthop addresses.
2028                            Otherwise, the router will not be able to send redirects.
2029                            That is usually right, but in some (rare!) circumstances
2030                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2031                            some exceptions. --ANK
2032                            We allow IPv4-mapped nexthops to support RFC 4798-style
2033                            addressing.
2034                          */
2035                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2036                                           IPV6_ADDR_MAPPED)))
2037                                 goto out;
2038
2039                         if (cfg->fc_table) {
2040                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2041
2042                                 if (grt) {
2043                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2044                                             (dev && dev != grt->dst.dev)) {
2045                                                 ip6_rt_put(grt);
2046                                                 grt = NULL;
2047                                         }
2048                                 }
2049                         }
2050
2051                         if (!grt)
2052                                 grt = rt6_lookup(net, gw_addr, NULL,
2053                                                  cfg->fc_ifindex, 1);
2054
2055                         err = -EHOSTUNREACH;
2056                         if (!grt)
2057                                 goto out;
2058                         if (dev) {
2059                                 if (dev != grt->dst.dev) {
2060                                         ip6_rt_put(grt);
2061                                         goto out;
2062                                 }
2063                         } else {
2064                                 dev = grt->dst.dev;
2065                                 idev = grt->rt6i_idev;
2066                                 dev_hold(dev);
2067                                 in6_dev_hold(grt->rt6i_idev);
2068                         }
2069                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2070                                 err = 0;
2071                         ip6_rt_put(grt);
2072
2073                         if (err)
2074                                 goto out;
2075                 }
2076                 err = -EINVAL;
2077                 if (!dev || (dev->flags & IFF_LOOPBACK))
2078                         goto out;
2079         }
2080
2081         err = -ENODEV;
2082         if (!dev)
2083                 goto out;
2084
2085         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2086                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2087                         err = -EINVAL;
2088                         goto out;
2089                 }
2090                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2091                 rt->rt6i_prefsrc.plen = 128;
2092         } else
2093                 rt->rt6i_prefsrc.plen = 0;
2094
2095         rt->rt6i_flags = cfg->fc_flags;
2096
2097 install_route:
2098         rt->dst.dev = dev;
2099         rt->rt6i_idev = idev;
2100         rt->rt6i_table = table;
2101
2102         cfg->fc_nlinfo.nl_net = dev_net(dev);
2103
2104         return rt;
2105 out:
2106         if (dev)
2107                 dev_put(dev);
2108         if (idev)
2109                 in6_dev_put(idev);
2110         if (rt)
2111                 dst_free(&rt->dst);
2112
2113         return ERR_PTR(err);
2114 }
2115
2116 int ip6_route_add(struct fib6_config *cfg,
2117                   struct netlink_ext_ack *extack)
2118 {
2119         struct mx6_config mxc = { .mx = NULL, };
2120         struct rt6_info *rt;
2121         int err;
2122
2123         rt = ip6_route_info_create(cfg, extack);
2124         if (IS_ERR(rt)) {
2125                 err = PTR_ERR(rt);
2126                 rt = NULL;
2127                 goto out;
2128         }
2129
2130         err = ip6_convert_metrics(&mxc, cfg);
2131         if (err)
2132                 goto out;
2133
2134         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2135
2136         kfree(mxc.mx);
2137
2138         return err;
2139 out:
2140         if (rt)
2141                 dst_free(&rt->dst);
2142
2143         return err;
2144 }
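
/* Illustrative sketch (hypothetical helper, not part of this file): a
 * minimal in-kernel caller adding a blackhole route under RTNL, forwarding
 * a netlink_ext_ack so that failures inside ip6_route_add() can carry an
 * extended-ack message back to userspace.
 *
 *	static int example_add_blackhole(struct net *net,
 *					 const struct in6_addr *dst, int plen,
 *					 struct netlink_ext_ack *extack)
 *	{
 *		struct fib6_config cfg;
 *
 *		memset(&cfg, 0, sizeof(cfg));
 *		cfg.fc_table	= RT6_TABLE_MAIN;
 *		cfg.fc_metric	= IP6_RT_PRIO_USER;
 *		cfg.fc_dst	= *dst;
 *		cfg.fc_dst_len	= plen;
 *		cfg.fc_flags	= RTF_UP | RTF_REJECT;
 *		cfg.fc_type	= RTN_BLACKHOLE;
 *		cfg.fc_nlinfo.nl_net = net;
 *
 *		return ip6_route_add(&cfg, extack);
 *	}
 */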
2145
2146 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2147 {
2148         int err;
2149         struct fib6_table *table;
2150         struct net *net = dev_net(rt->dst.dev);
2151
2152         if (rt == net->ipv6.ip6_null_entry ||
2153             rt->dst.flags & DST_NOCACHE) {
2154                 err = -ENOENT;
2155                 goto out;
2156         }
2157
2158         table = rt->rt6i_table;
2159         write_lock_bh(&table->tb6_lock);
2160         err = fib6_del(rt, info);
2161         write_unlock_bh(&table->tb6_lock);
2162
2163 out:
2164         ip6_rt_put(rt);
2165         return err;
2166 }
2167
2168 int ip6_del_rt(struct rt6_info *rt)
2169 {
2170         struct nl_info info = {
2171                 .nl_net = dev_net(rt->dst.dev),
2172         };
2173         return __ip6_del_rt(rt, &info);
2174 }
2175
2176 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2177 {
2178         struct nl_info *info = &cfg->fc_nlinfo;
2179         struct net *net = info->nl_net;
2180         struct sk_buff *skb = NULL;
2181         struct fib6_table *table;
2182         int err = -ENOENT;
2183
2184         if (rt == net->ipv6.ip6_null_entry)
2185                 goto out_put;
2186         table = rt->rt6i_table;
2187         write_lock_bh(&table->tb6_lock);
2188
2189         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2190                 struct rt6_info *sibling, *next_sibling;
2191
2192                 /* prefer to send a single notification with all hops */
2193                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2194                 if (skb) {
2195                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2196
2197                         if (rt6_fill_node(net, skb, rt,
2198                                           NULL, NULL, 0, RTM_DELROUTE,
2199                                           info->portid, seq, 0) < 0) {
2200                                 kfree_skb(skb);
2201                                 skb = NULL;
2202                         } else
2203                                 info->skip_notify = 1;
2204                 }
2205
2206                 list_for_each_entry_safe(sibling, next_sibling,
2207                                          &rt->rt6i_siblings,
2208                                          rt6i_siblings) {
2209                         err = fib6_del(sibling, info);
2210                         if (err)
2211                                 goto out_unlock;
2212                 }
2213         }
2214
2215         err = fib6_del(rt, info);
2216 out_unlock:
2217         write_unlock_bh(&table->tb6_lock);
2218 out_put:
2219         ip6_rt_put(rt);
2220
2221         if (skb) {
2222                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2223                             info->nlh, gfp_any());
2224         }
2225         return err;
2226 }
2227
2228 static int ip6_route_del(struct fib6_config *cfg,
2229                          struct netlink_ext_ack *extack)
2230 {
2231         struct fib6_table *table;
2232         struct fib6_node *fn;
2233         struct rt6_info *rt;
2234         int err = -ESRCH;
2235
2236         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2237         if (!table)
2238                 return err;
2239
2240         read_lock_bh(&table->tb6_lock);
2241
2242         fn = fib6_locate(&table->tb6_root,
2243                          &cfg->fc_dst, cfg->fc_dst_len,
2244                          &cfg->fc_src, cfg->fc_src_len);
2245
2246         if (fn) {
2247                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2248                         if ((rt->rt6i_flags & RTF_CACHE) &&
2249                             !(cfg->fc_flags & RTF_CACHE))
2250                                 continue;
2251                         if (cfg->fc_ifindex &&
2252                             (!rt->dst.dev ||
2253                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2254                                 continue;
2255                         if (cfg->fc_flags & RTF_GATEWAY &&
2256                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2257                                 continue;
2258                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2259                                 continue;
2260                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2261                                 continue;
2262                         dst_hold(&rt->dst);
2263                         read_unlock_bh(&table->tb6_lock);
2264
2265                         /* if gateway was specified only delete the one hop */
2266                         if (cfg->fc_flags & RTF_GATEWAY)
2267                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2268
2269                         return __ip6_del_rt_siblings(rt, cfg);
2270                 }
2271         }
2272         read_unlock_bh(&table->tb6_lock);
2273
2274         return err;
2275 }
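
/* Note on matching: a delete request only removes an entry whose device
 * (fc_ifindex), gateway (when RTF_GATEWAY is set), metric and protocol agree
 * with the corresponding non-zero fields of the request, and cached
 * (RTF_CACHE) clones are skipped unless the request itself carries RTF_CACHE.
 * When a gateway was specified, only that single nexthop is deleted;
 * otherwise the route is removed via __ip6_del_rt_siblings() above, which
 * also removes its siblings when fc_delete_all_nh is set.
 */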
2276
2277 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2278 {
2279         struct netevent_redirect netevent;
2280         struct rt6_info *rt, *nrt = NULL;
2281         struct ndisc_options ndopts;
2282         struct inet6_dev *in6_dev;
2283         struct neighbour *neigh;
2284         struct rd_msg *msg;
2285         int optlen, on_link;
2286         u8 *lladdr;
2287
2288         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2289         optlen -= sizeof(*msg);
2290
2291         if (optlen < 0) {
2292                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2293                 return;
2294         }
2295
2296         msg = (struct rd_msg *)icmp6_hdr(skb);
2297
2298         if (ipv6_addr_is_multicast(&msg->dest)) {
2299                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2300                 return;
2301         }
2302
2303         on_link = 0;
2304         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2305                 on_link = 1;
2306         } else if (ipv6_addr_type(&msg->target) !=
2307                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2308                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2309                 return;
2310         }
2311
2312         in6_dev = __in6_dev_get(skb->dev);
2313         if (!in6_dev)
2314                 return;
2315         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2316                 return;
2317
2318         /* RFC2461 8.1:
2319          *      The IP source address of the Redirect MUST be the same as the current
2320          *      first-hop router for the specified ICMP Destination Address.
2321          */
2322
2323         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2324                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2325                 return;
2326         }
2327
2328         lladdr = NULL;
2329         if (ndopts.nd_opts_tgt_lladdr) {
2330                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2331                                              skb->dev);
2332                 if (!lladdr) {
2333                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2334                         return;
2335                 }
2336         }
2337
2338         rt = (struct rt6_info *) dst;
2339         if (rt->rt6i_flags & RTF_REJECT) {
2340                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2341                 return;
2342         }
2343
2344         /* Redirect received -> path was valid.
2345          * Redirects are sent only in response to data packets,
2346          * so this nexthop is apparently reachable. --ANK
2347          */
2348         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2349
2350         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2351         if (!neigh)
2352                 return;
2353
2354         /*
2355          *      We have finally decided to accept it.
2356          */
2357
2358         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2359                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2360                      NEIGH_UPDATE_F_OVERRIDE|
2361                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2362                                      NEIGH_UPDATE_F_ISROUTER)),
2363                      NDISC_REDIRECT, &ndopts);
2364
2365         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2366         if (!nrt)
2367                 goto out;
2368
2369         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2370         if (on_link)
2371                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2372
2373         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2374
2375         if (ip6_ins_rt(nrt))
2376                 goto out;
2377
2378         netevent.old = &rt->dst;
2379         netevent.new = &nrt->dst;
2380         netevent.daddr = &msg->dest;
2381         netevent.neigh = neigh;
2382         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2383
2384         if (rt->rt6i_flags & RTF_CACHE) {
2385                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2386                 ip6_del_rt(rt);
2387         }
2388
2389 out:
2390         neigh_release(neigh);
2391 }
2392
2393 /*
2394  *      Misc support functions
2395  */
2396
2397 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2398 {
2399         BUG_ON(from->dst.from);
2400
2401         rt->rt6i_flags &= ~RTF_EXPIRES;
2402         dst_hold(&from->dst);
2403         rt->dst.from = &from->dst;
2404         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2405 }
2406
2407 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2408 {
2409         rt->dst.input = ort->dst.input;
2410         rt->dst.output = ort->dst.output;
2411         rt->rt6i_dst = ort->rt6i_dst;
2412         rt->dst.error = ort->dst.error;
2413         rt->rt6i_idev = ort->rt6i_idev;
2414         if (rt->rt6i_idev)
2415                 in6_dev_hold(rt->rt6i_idev);
2416         rt->dst.lastuse = jiffies;
2417         rt->rt6i_gateway = ort->rt6i_gateway;
2418         rt->rt6i_flags = ort->rt6i_flags;
2419         rt6_set_from(rt, ort);
2420         rt->rt6i_metric = ort->rt6i_metric;
2421 #ifdef CONFIG_IPV6_SUBTREES
2422         rt->rt6i_src = ort->rt6i_src;
2423 #endif
2424         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2425         rt->rt6i_table = ort->rt6i_table;
2426         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2427 }
2428
2429 #ifdef CONFIG_IPV6_ROUTE_INFO
2430 static struct rt6_info *rt6_get_route_info(struct net *net,
2431                                            const struct in6_addr *prefix, int prefixlen,
2432                                            const struct in6_addr *gwaddr,
2433                                            struct net_device *dev)
2434 {
2435         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2436         int ifindex = dev->ifindex;
2437         struct fib6_node *fn;
2438         struct rt6_info *rt = NULL;
2439         struct fib6_table *table;
2440
2441         table = fib6_get_table(net, tb_id);
2442         if (!table)
2443                 return NULL;
2444
2445         read_lock_bh(&table->tb6_lock);
2446         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2447         if (!fn)
2448                 goto out;
2449
2450         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2451                 if (rt->dst.dev->ifindex != ifindex)
2452                         continue;
2453                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2454                         continue;
2455                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2456                         continue;
2457                 dst_hold(&rt->dst);
2458                 break;
2459         }
2460 out:
2461         read_unlock_bh(&table->tb6_lock);
2462         return rt;
2463 }
2464
2465 static struct rt6_info *rt6_add_route_info(struct net *net,
2466                                            const struct in6_addr *prefix, int prefixlen,
2467                                            const struct in6_addr *gwaddr,
2468                                            struct net_device *dev,
2469                                            unsigned int pref)
2470 {
2471         struct fib6_config cfg = {
2472                 .fc_metric      = IP6_RT_PRIO_USER,
2473                 .fc_ifindex     = dev->ifindex,
2474                 .fc_dst_len     = prefixlen,
2475                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2476                                   RTF_UP | RTF_PREF(pref),
2477                 .fc_nlinfo.portid = 0,
2478                 .fc_nlinfo.nlh = NULL,
2479                 .fc_nlinfo.nl_net = net,
2480         };
2481
2482         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2483         cfg.fc_dst = *prefix;
2484         cfg.fc_gateway = *gwaddr;
2485
2486         /* We should treat it as a default route if prefix length is 0. */
2487         if (!prefixlen)
2488                 cfg.fc_flags |= RTF_DEFAULT;
2489
2490         ip6_route_add(&cfg, NULL);
2491
2492         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2493 }
2494 #endif
2495
2496 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2497 {
2498         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2499         struct rt6_info *rt;
2500         struct fib6_table *table;
2501
2502         table = fib6_get_table(dev_net(dev), tb_id);
2503         if (!table)
2504                 return NULL;
2505
2506         read_lock_bh(&table->tb6_lock);
2507         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2508                 if (dev == rt->dst.dev &&
2509                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2510                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2511                         break;
2512         }
2513         if (rt)
2514                 dst_hold(&rt->dst);
2515         read_unlock_bh(&table->tb6_lock);
2516         return rt;
2517 }
2518
2519 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2520                                      struct net_device *dev,
2521                                      unsigned int pref)
2522 {
2523         struct fib6_config cfg = {
2524                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2525                 .fc_metric      = IP6_RT_PRIO_USER,
2526                 .fc_ifindex     = dev->ifindex,
2527                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2528                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2529                 .fc_nlinfo.portid = 0,
2530                 .fc_nlinfo.nlh = NULL,
2531                 .fc_nlinfo.nl_net = dev_net(dev),
2532         };
2533
2534         cfg.fc_gateway = *gwaddr;
2535
2536         if (!ip6_route_add(&cfg, NULL)) {
2537                 struct fib6_table *table;
2538
2539                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2540                 if (table)
2541                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2542         }
2543
2544         return rt6_get_dflt_router(gwaddr, dev);
2545 }
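
/* Illustrative sketch (hypothetical caller): router-advertisement handling
 * would typically look up an existing default route for the announcing
 * router and install one when it is missing, along the lines of
 *
 *	rt = rt6_get_dflt_router(&ra_saddr, dev);
 *	if (!rt)
 *		rt = rt6_add_dflt_router(&ra_saddr, dev, pref);
 *
 * where ra_saddr is the RA's link-local source address and pref the router
 * preference carried in the RA; both names are placeholders here.
 */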
2546
2547 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2548 {
2549         struct rt6_info *rt;
2550
2551 restart:
2552         read_lock_bh(&table->tb6_lock);
2553         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2554                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2555                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2556                         dst_hold(&rt->dst);
2557                         read_unlock_bh(&table->tb6_lock);
2558                         ip6_del_rt(rt);
2559                         goto restart;
2560                 }
2561         }
2562         read_unlock_bh(&table->tb6_lock);
2563
2564         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2565 }
2566
2567 void rt6_purge_dflt_routers(struct net *net)
2568 {
2569         struct fib6_table *table;
2570         struct hlist_head *head;
2571         unsigned int h;
2572
2573         rcu_read_lock();
2574
2575         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2576                 head = &net->ipv6.fib_table_hash[h];
2577                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2578                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2579                                 __rt6_purge_dflt_routers(table);
2580                 }
2581         }
2582
2583         rcu_read_unlock();
2584 }
2585
2586 static void rtmsg_to_fib6_config(struct net *net,
2587                                  struct in6_rtmsg *rtmsg,
2588                                  struct fib6_config *cfg)
2589 {
2590         memset(cfg, 0, sizeof(*cfg));
2591
2592         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2593                          : RT6_TABLE_MAIN;
2594         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2595         cfg->fc_metric = rtmsg->rtmsg_metric;
2596         cfg->fc_expires = rtmsg->rtmsg_info;
2597         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2598         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2599         cfg->fc_flags = rtmsg->rtmsg_flags;
2600
2601         cfg->fc_nlinfo.nl_net = net;
2602
2603         cfg->fc_dst = rtmsg->rtmsg_dst;
2604         cfg->fc_src = rtmsg->rtmsg_src;
2605         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2606 }
2607
2608 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2609 {
2610         struct fib6_config cfg;
2611         struct in6_rtmsg rtmsg;
2612         int err;
2613
2614         switch (cmd) {
2615         case SIOCADDRT:         /* Add a route */
2616         case SIOCDELRT:         /* Delete a route */
2617                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2618                         return -EPERM;
2619                 err = copy_from_user(&rtmsg, arg,
2620                                      sizeof(struct in6_rtmsg));
2621                 if (err)
2622                         return -EFAULT;
2623
2624                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2625
2626                 rtnl_lock();
2627                 switch (cmd) {
2628                 case SIOCADDRT:
2629                         err = ip6_route_add(&cfg, NULL);
2630                         break;
2631                 case SIOCDELRT:
2632                         err = ip6_route_del(&cfg, NULL);
2633                         break;
2634                 default:
2635                         err = -EINVAL;
2636                 }
2637                 rtnl_unlock();
2638
2639                 return err;
2640         }
2641
2642         return -EINVAL;
2643 }
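
/* Illustrative userspace sketch (assumptions: an AF_INET6 datagram socket is
 * a valid target for these ioctls and the caller has CAP_NET_ADMIN):
 *
 *	struct in6_rtmsg rtmsg = {
 *		.rtmsg_dst	= dst_prefix,	(struct in6_addr)
 *		.rtmsg_dst_len	= 64,
 *		.rtmsg_gateway	= gw_addr,	(struct in6_addr)
 *		.rtmsg_ifindex	= ifindex,
 *		.rtmsg_metric	= 1,
 *		.rtmsg_flags	= RTF_UP | RTF_GATEWAY,
 *	};
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *	ioctl(fd, SIOCADDRT, &rtmsg);	(SIOCDELRT deletes)
 *
 * rtmsg_to_fib6_config() above translates this into a fib6_config before
 * ip6_route_add()/ip6_route_del() run under the RTNL lock.
 */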
2644
2645 /*
2646  *      Drop the packet on the floor
2647  */
2648
2649 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2650 {
2651         int type;
2652         struct dst_entry *dst = skb_dst(skb);
2653         switch (ipstats_mib_noroutes) {
2654         case IPSTATS_MIB_INNOROUTES:
2655                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2656                 if (type == IPV6_ADDR_ANY) {
2657                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2658                                       IPSTATS_MIB_INADDRERRORS);
2659                         break;
2660                 }
2661                 /* FALLTHROUGH */
2662         case IPSTATS_MIB_OUTNOROUTES:
2663                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2664                               ipstats_mib_noroutes);
2665                 break;
2666         }
2667         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2668         kfree_skb(skb);
2669         return 0;
2670 }
2671
2672 static int ip6_pkt_discard(struct sk_buff *skb)
2673 {
2674         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2675 }
2676
2677 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2678 {
2679         skb->dev = skb_dst(skb)->dev;
2680         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2681 }
2682
2683 static int ip6_pkt_prohibit(struct sk_buff *skb)
2684 {
2685         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2686 }
2687
2688 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2689 {
2690         skb->dev = skb_dst(skb)->dev;
2691         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2692 }
2693
2694 /*
2695  *      Allocate a dst for local (unicast / anycast) address.
2696  */
2697
2698 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2699                                     const struct in6_addr *addr,
2700                                     bool anycast)
2701 {
2702         u32 tb_id;
2703         struct net *net = dev_net(idev->dev);
2704         struct net_device *dev = net->loopback_dev;
2705         struct rt6_info *rt;
2706
2707         /* Use the L3 master device as the loopback device for host routes
2708          * if the device is enslaved and the address is not link-local or multicast.
2709          */
2710         if (!rt6_need_strict(addr))
2711                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2712
2713         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2714         if (!rt)
2715                 return ERR_PTR(-ENOMEM);
2716
2717         in6_dev_hold(idev);
2718
2719         rt->dst.flags |= DST_HOST;
2720         rt->dst.input = ip6_input;
2721         rt->dst.output = ip6_output;
2722         rt->rt6i_idev = idev;
2723
2724         rt->rt6i_protocol = RTPROT_KERNEL;
2725         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2726         if (anycast)
2727                 rt->rt6i_flags |= RTF_ANYCAST;
2728         else
2729                 rt->rt6i_flags |= RTF_LOCAL;
2730
2731         rt->rt6i_gateway  = *addr;
2732         rt->rt6i_dst.addr = *addr;
2733         rt->rt6i_dst.plen = 128;
2734         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2735         rt->rt6i_table = fib6_get_table(net, tb_id);
2736         rt->dst.flags |= DST_NOCACHE;
2737
2738         atomic_set(&rt->dst.__refcnt, 1);
2739
2740         return rt;
2741 }
2742
2743 /* remove a deleted IP address from prefsrc entries */
2744 struct arg_dev_net_ip {
2745         struct net_device *dev;
2746         struct net *net;
2747         struct in6_addr *addr;
2748 };
2749
2750 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2751 {
2752         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2753         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2754         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2755
2756         if (((void *)rt->dst.dev == dev || !dev) &&
2757             rt != net->ipv6.ip6_null_entry &&
2758             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2759                 /* remove prefsrc entry */
2760                 rt->rt6i_prefsrc.plen = 0;
2761         }
2762         return 0;
2763 }
2764
2765 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2766 {
2767         struct net *net = dev_net(ifp->idev->dev);
2768         struct arg_dev_net_ip adni = {
2769                 .dev = ifp->idev->dev,
2770                 .net = net,
2771                 .addr = &ifp->addr,
2772         };
2773         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2774 }
2775
2776 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2777 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2778
2779 /* Remove routers and update dst entries when a gateway turns into a host. */
2780 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2781 {
2782         struct in6_addr *gateway = (struct in6_addr *)arg;
2783
2784         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2785              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2786              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2787                 return -1;
2788         }
2789         return 0;
2790 }
2791
2792 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2793 {
2794         fib6_clean_all(net, fib6_clean_tohost, gateway);
2795 }
2796
2797 struct arg_dev_net {
2798         struct net_device *dev;
2799         struct net *net;
2800 };
2801
2802 /* called with the write lock held for the table containing rt */
2803 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2804 {
2805         const struct arg_dev_net *adn = arg;
2806         const struct net_device *dev = adn->dev;
2807
2808         if ((rt->dst.dev == dev || !dev) &&
2809             rt != adn->net->ipv6.ip6_null_entry &&
2810             (rt->rt6i_nsiblings == 0 ||
2811              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2812                 return -1;
2813
2814         return 0;
2815 }
2816
2817 void rt6_ifdown(struct net *net, struct net_device *dev)
2818 {
2819         struct arg_dev_net adn = {
2820                 .dev = dev,
2821                 .net = net,
2822         };
2823
2824         fib6_clean_all(net, fib6_ifdown, &adn);
2825         icmp6_clean_all(fib6_ifdown, &adn);
2826         if (dev)
2827                 rt6_uncached_list_flush_dev(net, dev);
2828 }
2829
2830 struct rt6_mtu_change_arg {
2831         struct net_device *dev;
2832         unsigned int mtu;
2833 };
2834
2835 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2836 {
2837         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2838         struct inet6_dev *idev;
2839
2840         /* In IPv6, PMTU discovery is not optional,
2841            so an RTAX_MTU lock cannot disable it.
2842            We still use this lock to block changes
2843            caused by addrconf/ndisc.
2844         */
2845
2846         idev = __in6_dev_get(arg->dev);
2847         if (!idev)
2848                 return 0;
2849
2850         /* For an administrative MTU increase there is no way for
2851            PMTU discovery to learn about the increase, so the PMTU must be
2852            raised here. Since RFC 1981 does not cover administrative MTU
2853            increases, updating the PMTU on an increase is a MUST (e.g. jumbo frames).
2854          */
2855         /*
2856            If the new MTU is less than the route PMTU, the new MTU will be
2857            the lowest MTU in the path; update the route PMTU to reflect the
2858            decrease. If the new MTU is greater than the route PMTU, and the
2859            old MTU was the lowest MTU in the path, update the route PMTU
2860            to reflect the increase. In that case, if another node on the
2861            path still has the lowest MTU, a Packet Too Big message will
2862            trigger PMTU discovery again.
2863          */
2864         if (rt->dst.dev == arg->dev &&
2865             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2866             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2867                 if (rt->rt6i_flags & RTF_CACHE) {
2868                         /* For RTF_CACHE with rt6i_pmtu == 0
2869                          * (i.e. a redirected route),
2870                          * the metrics of its rt->dst.from have already
2871                          * been updated.
2872                          */
2873                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2874                                 rt->rt6i_pmtu = arg->mtu;
2875                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2876                            (dst_mtu(&rt->dst) < arg->mtu &&
2877                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2878                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2879                 }
2880         }
2881         return 0;
2882 }
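
/* Worked example (hypothetical numbers, assuming idev->cnf.mtu6 still holds
 * the old value while this walker runs): a non-cached route carries an
 * explicit RTAX_MTU of 1500, equal to idev->cnf.mtu6.  Raising the device
 * MTU to 9000 satisfies the second condition (dst_mtu == mtu6), so RTAX_MTU
 * becomes 9000; lowering it to 1280 satisfies the first (dst_mtu >= arg->mtu),
 * so RTAX_MTU drops to 1280.  RTF_CACHE clones only ever have rt6i_pmtu
 * reduced, never raised.
 */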
2883
2884 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2885 {
2886         struct rt6_mtu_change_arg arg = {
2887                 .dev = dev,
2888                 .mtu = mtu,
2889         };
2890
2891         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2892 }
2893
2894 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2895         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2896         [RTA_OIF]               = { .type = NLA_U32 },
2897         [RTA_IIF]               = { .type = NLA_U32 },
2898         [RTA_PRIORITY]          = { .type = NLA_U32 },
2899         [RTA_METRICS]           = { .type = NLA_NESTED },
2900         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2901         [RTA_PREF]              = { .type = NLA_U8 },
2902         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2903         [RTA_ENCAP]             = { .type = NLA_NESTED },
2904         [RTA_EXPIRES]           = { .type = NLA_U32 },
2905         [RTA_UID]               = { .type = NLA_U32 },
2906         [RTA_MARK]              = { .type = NLA_U32 },
2907 };
2908
2909 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2910                               struct fib6_config *cfg,
2911                               struct netlink_ext_ack *extack)
2912 {
2913         struct rtmsg *rtm;
2914         struct nlattr *tb[RTA_MAX+1];
2915         unsigned int pref;
2916         int err;
2917
2918         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2919                           NULL);
2920         if (err < 0)
2921                 goto errout;
2922
2923         err = -EINVAL;
2924         rtm = nlmsg_data(nlh);
2925         memset(cfg, 0, sizeof(*cfg));
2926
2927         cfg->fc_table = rtm->rtm_table;
2928         cfg->fc_dst_len = rtm->rtm_dst_len;
2929         cfg->fc_src_len = rtm->rtm_src_len;
2930         cfg->fc_flags = RTF_UP;
2931         cfg->fc_protocol = rtm->rtm_protocol;
2932         cfg->fc_type = rtm->rtm_type;
2933
2934         if (rtm->rtm_type == RTN_UNREACHABLE ||
2935             rtm->rtm_type == RTN_BLACKHOLE ||
2936             rtm->rtm_type == RTN_PROHIBIT ||
2937             rtm->rtm_type == RTN_THROW)
2938                 cfg->fc_flags |= RTF_REJECT;
2939
2940         if (rtm->rtm_type == RTN_LOCAL)
2941                 cfg->fc_flags |= RTF_LOCAL;
2942
2943         if (rtm->rtm_flags & RTM_F_CLONED)
2944                 cfg->fc_flags |= RTF_CACHE;
2945
2946         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2947         cfg->fc_nlinfo.nlh = nlh;
2948         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2949
2950         if (tb[RTA_GATEWAY]) {
2951                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2952                 cfg->fc_flags |= RTF_GATEWAY;
2953         }
2954
2955         if (tb[RTA_DST]) {
2956                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2957
2958                 if (nla_len(tb[RTA_DST]) < plen)
2959                         goto errout;
2960
2961                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2962         }
2963
2964         if (tb[RTA_SRC]) {
2965                 int plen = (rtm->rtm_src_len + 7) >> 3;
2966
2967                 if (nla_len(tb[RTA_SRC]) < plen)
2968                         goto errout;
2969
2970                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2971         }
2972
2973         if (tb[RTA_PREFSRC])
2974                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2975
2976         if (tb[RTA_OIF])
2977                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2978
2979         if (tb[RTA_PRIORITY])
2980                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2981
2982         if (tb[RTA_METRICS]) {
2983                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2984                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2985         }
2986
2987         if (tb[RTA_TABLE])
2988                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2989
2990         if (tb[RTA_MULTIPATH]) {
2991                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2992                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2993
2994                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2995                                                      cfg->fc_mp_len);
2996                 if (err < 0)
2997                         goto errout;
2998         }
2999
3000         if (tb[RTA_PREF]) {
3001                 pref = nla_get_u8(tb[RTA_PREF]);
3002                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3003                     pref != ICMPV6_ROUTER_PREF_HIGH)
3004                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3005                 cfg->fc_flags |= RTF_PREF(pref);
3006         }
3007
3008         if (tb[RTA_ENCAP])
3009                 cfg->fc_encap = tb[RTA_ENCAP];
3010
3011         if (tb[RTA_ENCAP_TYPE]) {
3012                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3013
3014                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
3015                 if (err < 0)
3016                         goto errout;
3017         }
3018
3019         if (tb[RTA_EXPIRES]) {
3020                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3021
3022                 if (addrconf_finite_timeout(timeout)) {
3023                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3024                         cfg->fc_flags |= RTF_EXPIRES;
3025                 }
3026         }
3027
3028         err = 0;
3029 errout:
3030         return err;
3031 }
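
/* Example mapping (illustrative): a request such as
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * arrives as an RTM_NEWROUTE message whose rtm_dst_len is 64 and whose
 * RTA_DST, RTA_GATEWAY, RTA_OIF and RTA_PRIORITY attributes fill in
 * cfg->fc_dst, cfg->fc_gateway (setting RTF_GATEWAY), cfg->fc_ifindex and
 * cfg->fc_metric respectively; the interface name is resolved to an ifindex
 * by userspace before the message is sent.
 */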
3032
3033 struct rt6_nh {
3034         struct rt6_info *rt6_info;
3035         struct fib6_config r_cfg;
3036         struct mx6_config mxc;
3037         struct list_head next;
3038 };
3039
3040 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3041 {
3042         struct rt6_nh *nh;
3043
3044         list_for_each_entry(nh, rt6_nh_list, next) {
3045                 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3046                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3047                         nh->r_cfg.fc_ifindex);
3048         }
3049 }
3050
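     /* Queue @rt on @rt6_nh_list for a later insert, converting the
      * per-route metrics from @r_cfg.  Returns -EEXIST if an equivalent
      * nexthop (same device, inet6_dev and gateway) is already on the list.
      */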
3051 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3052                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3053 {
3054         struct rt6_nh *nh;
3055         struct rt6_info *rtnh;
3056         int err = -EEXIST;
3057
3058         list_for_each_entry(nh, rt6_nh_list, next) {
3059                 /* check if rt6_info already exists */
3060                 rtnh = nh->rt6_info;
3061
3062                 if (rtnh->dst.dev == rt->dst.dev &&
3063                     rtnh->rt6i_idev == rt->rt6i_idev &&
3064                     ipv6_addr_equal(&rtnh->rt6i_gateway,
3065                                     &rt->rt6i_gateway))
3066                         return err;
3067         }
3068
3069         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3070         if (!nh)
3071                 return -ENOMEM;
3072         nh->rt6_info = rt;
3073         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3074         if (err) {
3075                 kfree(nh);
3076                 return err;
3077         }
3078         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3079         list_add_tail(&nh->next, rt6_nh_list);
3080
3081         return 0;
3082 }
3083
3084 static void ip6_route_mpath_notify(struct rt6_info *rt,
3085                                    struct rt6_info *rt_last,
3086                                    struct nl_info *info,
3087                                    __u16 nlflags)
3088 {
3089         /* If this is an APPEND route, rt points to the first route
3090          * inserted and rt_last to the last one. Userspace wants a
3091          * consistent dump of the route which starts at the first
3092          * nexthop. Since sibling routes are always added at the end
3093          * of the list, find the first sibling of the last route appended.
3094          */
3095         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3096                 rt = list_first_entry(&rt_last->rt6i_siblings,
3097                                       struct rt6_info,
3098                                       rt6i_siblings);
3099         }
3100
3101         if (rt)
3102                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3103 }
3104
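     /* Add a multipath route: build one rt6_info per nexthop listed in the
      * RTA_MULTIPATH attribute, insert them one by one while suppressing the
      * per-nexthop notifications, and send a single notification for the
      * whole route at the end.  If any insert fails, the nexthops that were
      * already added are deleted again.
      */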
3105 static int ip6_route_multipath_add(struct fib6_config *cfg,
3106                                    struct netlink_ext_ack *extack)
3107 {
3108         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3109         struct nl_info *info = &cfg->fc_nlinfo;
3110         struct fib6_config r_cfg;
3111         struct rtnexthop *rtnh;
3112         struct rt6_info *rt;
3113         struct rt6_nh *err_nh;
3114         struct rt6_nh *nh, *nh_safe;
3115         __u16 nlflags;
3116         int remaining;
3117         int attrlen;
3118         int err = 1;
3119         int nhn = 0;
3120         int replace = (cfg->fc_nlinfo.nlh &&
3121                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3122         LIST_HEAD(rt6_nh_list);
3123
3124         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3125         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3126                 nlflags |= NLM_F_APPEND;
3127
3128         remaining = cfg->fc_mp_len;
3129         rtnh = (struct rtnexthop *)cfg->fc_mp;
3130
3131         /* Parse the multipath attribute and build a list (rt6_nh_list)
3132          * with one rt6_info struct per nexthop
3133          */
3134         while (rtnh_ok(rtnh, remaining)) {
3135                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3136                 if (rtnh->rtnh_ifindex)
3137                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3138
3139                 attrlen = rtnh_attrlen(rtnh);
3140                 if (attrlen > 0) {
3141                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3142
3143                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3144                         if (nla) {
3145                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3146                                 r_cfg.fc_flags |= RTF_GATEWAY;
3147                         }
3148                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3149                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3150                         if (nla)
3151                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3152                 }
3153
3154                 rt = ip6_route_info_create(&r_cfg, extack);
3155                 if (IS_ERR(rt)) {
3156                         err = PTR_ERR(rt);
3157                         rt = NULL;
3158                         goto cleanup;
3159                 }
3160
3161                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3162                 if (err) {
3163                         dst_free(&rt->dst);
3164                         goto cleanup;
3165                 }
3166
3167                 rtnh = rtnh_next(rtnh, &remaining);
3168         }
3169
3170         /* For add and replace, send one notification with all nexthops.
3171          * Skip the per-route notification in fib6_add_rt2node and send
3172          * one with the full route when done.
3173          */
3174         info->skip_notify = 1;
3175
3176         err_nh = NULL;
3177         list_for_each_entry(nh, &rt6_nh_list, next) {
3178                 rt_last = nh->rt6_info;
3179                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3180                 /* save reference to first route for notification */
3181                 if (!rt_notif && !err)
3182                         rt_notif = nh->rt6_info;
3183
3184                 /* nh->rt6_info is used or freed at this point, reset to NULL */
3185                 nh->rt6_info = NULL;
3186                 if (err) {
3187                         if (replace && nhn)
3188                                 ip6_print_replace_route_err(&rt6_nh_list);
3189                         err_nh = nh;
3190                         goto add_errout;
3191                 }
3192
3193                 /* Because each route is added like a single route, we remove
3194                  * these flags after the first nexthop. If there is a collision,
3195                  * we have already failed to add the first nexthop:
3196                  * fib6_add_rt2node() has rejected it. When replacing, the old
3197                  * nexthops have already been replaced by the first new one,
3198                  * and the remaining ones should simply be added to it.
3199                  */
3200                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3201                                                      NLM_F_REPLACE);
3202                 nhn++;
3203         }
3204
3205         /* success ... tell user about new route */
3206         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3207         goto cleanup;
3208
3209 add_errout:
3210         /* send notification for routes that were added so that
3211          * the delete notifications sent by ip6_route_del are
3212          * coherent
3213          */
3214         if (rt_notif)
3215                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3216
3217         /* Delete routes that were already added */
3218         list_for_each_entry(nh, &rt6_nh_list, next) {
3219                 if (err_nh == nh)
3220                         break;
3221                 ip6_route_del(&nh->r_cfg, extack);
3222         }
3223
3224 cleanup:
3225         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3226                 if (nh->rt6_info)
3227                         dst_free(&nh->rt6_info->dst);
3228                 kfree(nh->mxc.mx);
3229                 list_del(&nh->next);
3230                 kfree(nh);
3231         }
3232
3233         return err;
3234 }
3235
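     /* Delete each nexthop listed in the RTA_MULTIPATH attribute; the last
      * error encountered, if any, is returned to the caller.
      */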
3236 static int ip6_route_multipath_del(struct fib6_config *cfg,
3237                                    struct netlink_ext_ack *extack)
3238 {
3239         struct fib6_config r_cfg;
3240         struct rtnexthop *rtnh;
3241         int remaining;
3242         int attrlen;
3243         int err = 1, last_err = 0;
3244
3245         remaining = cfg->fc_mp_len;
3246         rtnh = (struct rtnexthop *)cfg->fc_mp;
3247
3248         /* Parse the multipath attribute and delete each nexthop's route */
3249         while (rtnh_ok(rtnh, remaining)) {
3250                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3251                 if (rtnh->rtnh_ifindex)
3252                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3253
3254                 attrlen = rtnh_attrlen(rtnh);
3255                 if (attrlen > 0) {
3256                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3257
3258                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3259                         if (nla) {
3260                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3261                                 r_cfg.fc_flags |= RTF_GATEWAY;
3262                         }
3263                 }
3264                 err = ip6_route_del(&r_cfg, extack);
3265                 if (err)
3266                         last_err = err;
3267
3268                 rtnh = rtnh_next(rtnh, &remaining);
3269         }
3270
3271         return last_err;
3272 }
3273
3274 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3275                               struct netlink_ext_ack *extack)
3276 {
3277         struct fib6_config cfg;
3278         int err;
3279
3280         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3281         if (err < 0)
3282                 return err;
3283
3284         if (cfg.fc_mp) {
3285                 return ip6_route_multipath_del(&cfg, extack);
3286         } else {
3287                 cfg.fc_delete_all_nh = 1;
3288                 return ip6_route_del(&cfg, extack);
3289         }
3290 }
3291
3292 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3293                               struct netlink_ext_ack *extack)
3294 {
3295         struct fib6_config cfg;
3296         int err;
3297
3298         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3299         if (err < 0)
3300                 return err;
3301
3302         if (cfg.fc_mp)
3303                 return ip6_route_multipath_add(&cfg, extack);
3304         else
3305                 return ip6_route_add(&cfg, extack);
3306 }
3307
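     /* Worst-case netlink message size for a dump of @rt, including an
      * RTA_MULTIPATH attribute with one rtnexthop entry per sibling route.
      */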
3308 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3309 {
3310         int nexthop_len = 0;
3311
3312         if (rt->rt6i_nsiblings) {
3313                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3314                             + NLA_ALIGN(sizeof(struct rtnexthop))
3315                             + nla_total_size(16) /* RTA_GATEWAY */
3316                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3317
3318                 nexthop_len *= rt->rt6i_nsiblings;
3319         }
3320
3321         return NLMSG_ALIGN(sizeof(struct rtmsg))
3322                + nla_total_size(16) /* RTA_SRC */
3323                + nla_total_size(16) /* RTA_DST */
3324                + nla_total_size(16) /* RTA_GATEWAY */
3325                + nla_total_size(16) /* RTA_PREFSRC */
3326                + nla_total_size(4) /* RTA_TABLE */
3327                + nla_total_size(4) /* RTA_IIF */
3328                + nla_total_size(4) /* RTA_OIF */
3329                + nla_total_size(4) /* RTA_PRIORITY */
3330                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3331                + nla_total_size(sizeof(struct rta_cacheinfo))
3332                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3333                + nla_total_size(1) /* RTA_PREF */
3334                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3335                + nexthop_len;
3336 }
3337
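     /* Fill in the nexthop attributes for @rt: RTNH_F_* flags, RTA_GATEWAY,
      * RTA_OIF (unless @skip_oif is set, as it is for multipath encoding
      * where the ifindex lives in the rtnexthop header) and any lwtunnel
      * encap attributes.
      */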
3338 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3339                             unsigned int *flags, bool skip_oif)
3340 {
3341         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3342                 *flags |= RTNH_F_LINKDOWN;
3343                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3344                         *flags |= RTNH_F_DEAD;
3345         }
3346
3347         if (rt->rt6i_flags & RTF_GATEWAY) {
3348                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3349                         goto nla_put_failure;
3350         }
3351
3352         /* not needed for multipath encoding because it has a rtnexthop struct */
3353         if (!skip_oif && rt->dst.dev &&
3354             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3355                 goto nla_put_failure;
3356
3357         if (rt->dst.lwtstate &&
3358             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3359                 goto nla_put_failure;
3360
3361         return 0;
3362
3363 nla_put_failure:
3364         return -EMSGSIZE;
3365 }
3366
3367 /* add multipath next hop */
3368 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3369 {
3370         struct rtnexthop *rtnh;
3371         unsigned int flags = 0;
3372
3373         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3374         if (!rtnh)
3375                 goto nla_put_failure;
3376
3377         rtnh->rtnh_hops = 0;
3378         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3379
3380         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3381                 goto nla_put_failure;
3382
3383         rtnh->rtnh_flags = flags;
3384
3385         /* length of rtnetlink header + attributes */
3386         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3387
3388         return 0;
3389
3390 nla_put_failure:
3391         return -EMSGSIZE;
3392 }
3393
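     /* Append one route message (header plus attributes) describing @rt to
      * @skb.  On overflow the partially built message is cancelled and
      * -EMSGSIZE is returned.
      */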
3394 static int rt6_fill_node(struct net *net,
3395                          struct sk_buff *skb, struct rt6_info *rt,
3396                          struct in6_addr *dst, struct in6_addr *src,
3397                          int iif, int type, u32 portid, u32 seq,
3398                          unsigned int flags)
3399 {
3400         u32 metrics[RTAX_MAX];
3401         struct rtmsg *rtm;
3402         struct nlmsghdr *nlh;
3403         long expires;
3404         u32 table;
3405
3406         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3407         if (!nlh)
3408                 return -EMSGSIZE;
3409
3410         rtm = nlmsg_data(nlh);
3411         rtm->rtm_family = AF_INET6;
3412         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3413         rtm->rtm_src_len = rt->rt6i_src.plen;
3414         rtm->rtm_tos = 0;
3415         if (rt->rt6i_table)
3416                 table = rt->rt6i_table->tb6_id;
3417         else
3418                 table = RT6_TABLE_UNSPEC;
3419         rtm->rtm_table = table;
3420         if (nla_put_u32(skb, RTA_TABLE, table))
3421                 goto nla_put_failure;
3422         if (rt->rt6i_flags & RTF_REJECT) {
3423                 switch (rt->dst.error) {
3424                 case -EINVAL:
3425                         rtm->rtm_type = RTN_BLACKHOLE;
3426                         break;
3427                 case -EACCES:
3428                         rtm->rtm_type = RTN_PROHIBIT;
3429                         break;
3430                 case -EAGAIN:
3431                         rtm->rtm_type = RTN_THROW;
3432                         break;
3433                 default:
3434                         rtm->rtm_type = RTN_UNREACHABLE;
3435                         break;
3436                 }
3437         }
3438         else if (rt->rt6i_flags & RTF_LOCAL)
3439                 rtm->rtm_type = RTN_LOCAL;
3440         else if (rt->rt6i_flags & RTF_ANYCAST)
3441                 rtm->rtm_type = RTN_ANYCAST;
3442         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3443                 rtm->rtm_type = RTN_LOCAL;
3444         else
3445                 rtm->rtm_type = RTN_UNICAST;
3446         rtm->rtm_flags = 0;
3447         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3448         rtm->rtm_protocol = rt->rt6i_protocol;
3449         if (rt->rt6i_flags & RTF_DYNAMIC)
3450                 rtm->rtm_protocol = RTPROT_REDIRECT;
3451         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3452                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3453                         rtm->rtm_protocol = RTPROT_RA;
3454                 else
3455                         rtm->rtm_protocol = RTPROT_KERNEL;
3456         }
3457
3458         if (rt->rt6i_flags & RTF_CACHE)
3459                 rtm->rtm_flags |= RTM_F_CLONED;
3460
3461         if (dst) {
3462                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3463                         goto nla_put_failure;
3464                 rtm->rtm_dst_len = 128;
3465         } else if (rtm->rtm_dst_len)
3466                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3467                         goto nla_put_failure;
3468 #ifdef CONFIG_IPV6_SUBTREES
3469         if (src) {
3470                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3471                         goto nla_put_failure;
3472                 rtm->rtm_src_len = 128;
3473         } else if (rtm->rtm_src_len &&
3474                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3475                 goto nla_put_failure;
3476 #endif
3477         if (iif) {
3478 #ifdef CONFIG_IPV6_MROUTE
3479                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3480                         int err = ip6mr_get_route(net, skb, rtm, portid);
3481
3482                         if (err == 0)
3483                                 return 0;
3484                         if (err < 0)
3485                                 goto nla_put_failure;
3486                 } else
3487 #endif
3488                         if (nla_put_u32(skb, RTA_IIF, iif))
3489                                 goto nla_put_failure;
3490         } else if (dst) {
3491                 struct in6_addr saddr_buf;
3492                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3493                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3494                         goto nla_put_failure;
3495         }
3496
3497         if (rt->rt6i_prefsrc.plen) {
3498                 struct in6_addr saddr_buf;
3499                 saddr_buf = rt->rt6i_prefsrc.addr;
3500                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3501                         goto nla_put_failure;
3502         }
3503
3504         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3505         if (rt->rt6i_pmtu)
3506                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3507         if (rtnetlink_put_metrics(skb, metrics) < 0)
3508                 goto nla_put_failure;
3509
3510         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3511                 goto nla_put_failure;
3512
3513         /* For multipath routes, walk the siblings list and add
3514          * each as a nexthop within RTA_MULTIPATH.
3515          */
3516         if (rt->rt6i_nsiblings) {
3517                 struct rt6_info *sibling, *next_sibling;
3518                 struct nlattr *mp;
3519
3520                 mp = nla_nest_start(skb, RTA_MULTIPATH);
3521                 if (!mp)
3522                         goto nla_put_failure;
3523
3524                 if (rt6_add_nexthop(skb, rt) < 0)
3525                         goto nla_put_failure;
3526
3527                 list_for_each_entry_safe(sibling, next_sibling,
3528                                          &rt->rt6i_siblings, rt6i_siblings) {
3529                         if (rt6_add_nexthop(skb, sibling) < 0)
3530                                 goto nla_put_failure;
3531                 }
3532
3533                 nla_nest_end(skb, mp);
3534         } else {
3535                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3536                         goto nla_put_failure;
3537         }
3538
3539         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3540
3541         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3542                 goto nla_put_failure;
3543
3544         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3545                 goto nla_put_failure;
3546
3548         nlmsg_end(skb, nlh);
3549         return 0;
3550
3551 nla_put_failure:
3552         nlmsg_cancel(skb, nlh);
3553         return -EMSGSIZE;
3554 }
3555
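     /* Per-route callback used when dumping the FIB to userspace: skip the
      * null entry, honour the RTM_F_PREFIX filter and emit an RTM_NEWROUTE
      * message for everything else.
      */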
3556 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3557 {
3558         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3559         struct net *net = arg->net;
3560
3561         if (rt == net->ipv6.ip6_null_entry)
3562                 return 0;
3563
3564         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3565                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3566
3567                 /* user wants prefix routes only */
3568                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3569                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3570                         /* success since this is not a prefix route */
3571                         return 1;
3572                 }
3573         }
3574
3575         return rt6_fill_node(net,
3576                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3577                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3578                      NLM_F_MULTI);
3579 }
3580
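     /* RTM_GETROUTE handler: look up the route for the flow described by
      * the request and unicast a single RTM_NEWROUTE reply back to the
      * requesting socket.
      */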
3581 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3582                               struct netlink_ext_ack *extack)
3583 {
3584         struct net *net = sock_net(in_skb->sk);
3585         struct nlattr *tb[RTA_MAX+1];
3586         struct rt6_info *rt;
3587         struct sk_buff *skb;
3588         struct rtmsg *rtm;
3589         struct flowi6 fl6;
3590         int err, iif = 0, oif = 0;
3591
3592         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3593                           extack);
3594         if (err < 0)
3595                 goto errout;
3596
3597         err = -EINVAL;
3598         memset(&fl6, 0, sizeof(fl6));
3599         rtm = nlmsg_data(nlh);
3600         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3601
3602         if (tb[RTA_SRC]) {
3603                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3604                         goto errout;
3605
3606                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3607         }
3608
3609         if (tb[RTA_DST]) {
3610                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3611                         goto errout;
3612
3613                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3614         }
3615
3616         if (tb[RTA_IIF])
3617                 iif = nla_get_u32(tb[RTA_IIF]);
3618
3619         if (tb[RTA_OIF])
3620                 oif = nla_get_u32(tb[RTA_OIF]);
3621
3622         if (tb[RTA_MARK])
3623                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3624
3625         if (tb[RTA_UID])
3626                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3627                                            nla_get_u32(tb[RTA_UID]));
3628         else
3629                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3630
3631         if (iif) {
3632                 struct net_device *dev;
3633                 int flags = 0;
3634
3635                 dev = __dev_get_by_index(net, iif);
3636                 if (!dev) {
3637                         err = -ENODEV;
3638                         goto errout;
3639                 }
3640
3641                 fl6.flowi6_iif = iif;
3642
3643                 if (!ipv6_addr_any(&fl6.saddr))
3644                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3645
3646                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3647                                                                flags);
3648         } else {
3649                 fl6.flowi6_oif = oif;
3650
3651                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3652         }
3653
3654         if (rt == net->ipv6.ip6_null_entry) {
3655                 err = rt->dst.error;
3656                 ip6_rt_put(rt);
3657                 goto errout;
3658         }
3659
3660         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3661         if (!skb) {
3662                 ip6_rt_put(rt);
3663                 err = -ENOBUFS;
3664                 goto errout;
3665         }
3666
3667         skb_dst_set(skb, &rt->dst);
3668
3669         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3670                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3671                             nlh->nlmsg_seq, 0);
3672         if (err < 0) {
3673                 kfree_skb(skb);
3674                 goto errout;
3675         }
3676
3677         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3678 errout:
3679         return err;
3680 }
3681
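     /* Notify RTNLGRP_IPV6_ROUTE listeners about @rt with the given
      * RTM_NEWROUTE/RTM_DELROUTE @event.
      */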
3682 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3683                      unsigned int nlm_flags)
3684 {
3685         struct sk_buff *skb;
3686         struct net *net = info->nl_net;
3687         u32 seq;
3688         int err;
3689
3690         err = -ENOBUFS;
3691         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3692
3693         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3694         if (!skb)
3695                 goto errout;
3696
3697         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3698                                 event, info->portid, seq, nlm_flags);
3699         if (err < 0) {
3700                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3701                 WARN_ON(err == -EMSGSIZE);
3702                 kfree_skb(skb);
3703                 goto errout;
3704         }
3705         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3706                     info->nlh, gfp_any());
3707         return;
3708 errout:
3709         if (err < 0)
3710                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3711 }
3712
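     /* Netdevice notifier: the null (and, with multiple tables, prohibit and
      * blackhole) route entries are bound to the loopback device, so take
      * and release their inet6_dev references when loopback is registered
      * and unregistered.
      */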
3713 static int ip6_route_dev_notify(struct notifier_block *this,
3714                                 unsigned long event, void *ptr)
3715 {
3716         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3717         struct net *net = dev_net(dev);
3718
3719         if (!(dev->flags & IFF_LOOPBACK))
3720                 return NOTIFY_OK;
3721
3722         if (event == NETDEV_REGISTER) {
3723                 net->ipv6.ip6_null_entry->dst.dev = dev;
3724                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3725 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3726                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3727                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3728                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3729                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3730 #endif
3731         } else if (event == NETDEV_UNREGISTER) {
3732                 in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
3733 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3734                 in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
3735                 in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3736 #endif
3737         }
3738
3739         return NOTIFY_OK;
3740 }
3741
3742 /*
3743  *      /proc
3744  */
3745
3746 #ifdef CONFIG_PROC_FS
3747
3748 static const struct file_operations ipv6_route_proc_fops = {
3749         .owner          = THIS_MODULE,
3750         .open           = ipv6_route_open,
3751         .read           = seq_read,
3752         .llseek         = seq_lseek,
3753         .release        = seq_release_net,
3754 };
3755
3756 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3757 {
3758         struct net *net = (struct net *)seq->private;
3759         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3760                    net->ipv6.rt6_stats->fib_nodes,
3761                    net->ipv6.rt6_stats->fib_route_nodes,
3762                    net->ipv6.rt6_stats->fib_rt_alloc,
3763                    net->ipv6.rt6_stats->fib_rt_entries,
3764                    net->ipv6.rt6_stats->fib_rt_cache,
3765                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3766                    net->ipv6.rt6_stats->fib_discarded_routes);
3767
3768         return 0;
3769 }
3770
3771 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3772 {
3773         return single_open_net(inode, file, rt6_stats_seq_show);
3774 }
3775
3776 static const struct file_operations rt6_stats_seq_fops = {
3777         .owner   = THIS_MODULE,
3778         .open    = rt6_stats_seq_open,
3779         .read    = seq_read,
3780         .llseek  = seq_lseek,
3781         .release = single_release_net,
3782 };
3783 #endif  /* CONFIG_PROC_FS */
3784
3785 #ifdef CONFIG_SYSCTL
3786
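     /* Handler for the net.ipv6.route.flush sysctl: any write triggers a
      * garbage-collection pass over the routing table via fib6_run_gc().
      */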
3787 static
3788 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3789                               void __user *buffer, size_t *lenp, loff_t *ppos)
3790 {
3791         struct net *net;
3792         int delay;
3793         if (!write)
3794                 return -EINVAL;
3795
3796         net = (struct net *)ctl->extra1;
3797         delay = net->ipv6.sysctl.flush_delay;
3798         proc_dointvec(ctl, write, buffer, lenp, ppos);
3799         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3800         return 0;
3801 }
3802
3803 struct ctl_table ipv6_route_table_template[] = {
3804         {
3805                 .procname       =       "flush",
3806                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3807                 .maxlen         =       sizeof(int),
3808                 .mode           =       0200,
3809                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3810         },
3811         {
3812                 .procname       =       "gc_thresh",
3813                 .data           =       &ip6_dst_ops_template.gc_thresh,
3814                 .maxlen         =       sizeof(int),
3815                 .mode           =       0644,
3816                 .proc_handler   =       proc_dointvec,
3817         },
3818         {
3819                 .procname       =       "max_size",
3820                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3821                 .maxlen         =       sizeof(int),
3822                 .mode           =       0644,
3823                 .proc_handler   =       proc_dointvec,
3824         },
3825         {
3826                 .procname       =       "gc_min_interval",
3827                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3828                 .maxlen         =       sizeof(int),
3829                 .mode           =       0644,
3830                 .proc_handler   =       proc_dointvec_jiffies,
3831         },
3832         {
3833                 .procname       =       "gc_timeout",
3834                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3835                 .maxlen         =       sizeof(int),
3836                 .mode           =       0644,
3837                 .proc_handler   =       proc_dointvec_jiffies,
3838         },
3839         {
3840                 .procname       =       "gc_interval",
3841                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3842                 .maxlen         =       sizeof(int),
3843                 .mode           =       0644,
3844                 .proc_handler   =       proc_dointvec_jiffies,
3845         },
3846         {
3847                 .procname       =       "gc_elasticity",
3848                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3849                 .maxlen         =       sizeof(int),
3850                 .mode           =       0644,
3851                 .proc_handler   =       proc_dointvec,
3852         },
3853         {
3854                 .procname       =       "mtu_expires",
3855                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3856                 .maxlen         =       sizeof(int),
3857                 .mode           =       0644,
3858                 .proc_handler   =       proc_dointvec_jiffies,
3859         },
3860         {
3861                 .procname       =       "min_adv_mss",
3862                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3863                 .maxlen         =       sizeof(int),
3864                 .mode           =       0644,
3865                 .proc_handler   =       proc_dointvec,
3866         },
3867         {
3868                 .procname       =       "gc_min_interval_ms",
3869                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3870                 .maxlen         =       sizeof(int),
3871                 .mode           =       0644,
3872                 .proc_handler   =       proc_dointvec_ms_jiffies,
3873         },
3874         { }
3875 };
3876
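     /* Duplicate the sysctl table template for a namespace and point each
      * entry at that namespace's data.  The table is effectively hidden
      * from non-init user namespaces.
      */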
3877 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3878 {
3879         struct ctl_table *table;
3880
3881         table = kmemdup(ipv6_route_table_template,
3882                         sizeof(ipv6_route_table_template),
3883                         GFP_KERNEL);
3884
3885         if (table) {
3886                 table[0].data = &net->ipv6.sysctl.flush_delay;
3887                 table[0].extra1 = net;
3888                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3889                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3890                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3891                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3892                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3893                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3894                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3895                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3896                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3897
3898                 /* Don't export sysctls to unprivileged users */
3899                 if (net->user_ns != &init_user_ns)
3900                         table[0].procname = NULL;
3901         }
3902
3903         return table;
3904 }
3905 #endif
3906
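     /* Per-namespace initialisation: clone the dst_ops template and the
      * special null (plus, with multiple tables, prohibit and blackhole)
      * route entries, and set the default sysctl values for this netns.
      */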
3907 static int __net_init ip6_route_net_init(struct net *net)
3908 {
3909         int ret = -ENOMEM;
3910
3911         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3912                sizeof(net->ipv6.ip6_dst_ops));
3913
3914         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3915                 goto out_ip6_dst_ops;
3916
3917         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3918                                            sizeof(*net->ipv6.ip6_null_entry),
3919                                            GFP_KERNEL);
3920         if (!net->ipv6.ip6_null_entry)
3921                 goto out_ip6_dst_entries;
3922         net->ipv6.ip6_null_entry->dst.path =
3923                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3924         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3925         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3926                          ip6_template_metrics, true);
3927
3928 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3929         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3930                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3931                                                GFP_KERNEL);
3932         if (!net->ipv6.ip6_prohibit_entry)
3933                 goto out_ip6_null_entry;
3934         net->ipv6.ip6_prohibit_entry->dst.path =
3935                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3936         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3937         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3938                          ip6_template_metrics, true);
3939
3940         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3941                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3942                                                GFP_KERNEL);
3943         if (!net->ipv6.ip6_blk_hole_entry)
3944                 goto out_ip6_prohibit_entry;
3945         net->ipv6.ip6_blk_hole_entry->dst.path =
3946                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3947         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3948         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3949                          ip6_template_metrics, true);
3950 #endif
3951
3952         net->ipv6.sysctl.flush_delay = 0;
3953         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3954         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3955         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3956         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3957         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3958         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3959         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3960
3961         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3962
3963         ret = 0;
3964 out:
3965         return ret;
3966
3967 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3968 out_ip6_prohibit_entry:
3969         kfree(net->ipv6.ip6_prohibit_entry);
3970 out_ip6_null_entry:
3971         kfree(net->ipv6.ip6_null_entry);
3972 #endif
3973 out_ip6_dst_entries:
3974         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3975 out_ip6_dst_ops:
3976         goto out;
3977 }
3978
3979 static void __net_exit ip6_route_net_exit(struct net *net)
3980 {
3981         kfree(net->ipv6.ip6_null_entry);
3982 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3983         kfree(net->ipv6.ip6_prohibit_entry);
3984         kfree(net->ipv6.ip6_blk_hole_entry);
3985 #endif
3986         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3987 }
3988
3989 static int __net_init ip6_route_net_init_late(struct net *net)
3990 {
3991 #ifdef CONFIG_PROC_FS
3992         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3993         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3994 #endif
3995         return 0;
3996 }
3997
3998 static void __net_exit ip6_route_net_exit_late(struct net *net)
3999 {
4000 #ifdef CONFIG_PROC_FS
4001         remove_proc_entry("ipv6_route", net->proc_net);
4002         remove_proc_entry("rt6_stats", net->proc_net);
4003 #endif
4004 }
4005
4006 static struct pernet_operations ip6_route_net_ops = {
4007         .init = ip6_route_net_init,
4008         .exit = ip6_route_net_exit,
4009 };
4010
4011 static int __net_init ipv6_inetpeer_init(struct net *net)
4012 {
4013         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4014
4015         if (!bp)
4016                 return -ENOMEM;
4017         inet_peer_base_init(bp);
4018         net->ipv6.peers = bp;
4019         return 0;
4020 }
4021
4022 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4023 {
4024         struct inet_peer_base *bp = net->ipv6.peers;
4025
4026         net->ipv6.peers = NULL;
4027         inetpeer_invalidate_tree(bp);
4028         kfree(bp);
4029 }
4030
4031 static struct pernet_operations ipv6_inetpeer_ops = {
4032         .init   =       ipv6_inetpeer_init,
4033         .exit   =       ipv6_inetpeer_exit,
4034 };
4035
4036 static struct pernet_operations ip6_route_net_late_ops = {
4037         .init = ip6_route_net_init_late,
4038         .exit = ip6_route_net_exit_late,
4039 };
4040
4041 static struct notifier_block ip6_route_dev_notifier = {
4042         .notifier_call = ip6_route_dev_notify,
4043         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4044 };
4045
4046 void __init ip6_route_init_special_entries(void)
4047 {
4048         /* Registration of the loopback device happens before this code
4049          * runs, so the loopback reference in rt6_info will not have been
4050          * taken; take it manually for init_net. */
4051         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4052         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4053 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4054         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4055         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4056         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4057         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4058 #endif
4059 }
4060
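     /* Subsystem init: create the rt6_info slab cache, register the pernet
      * operations, fib6/xfrm6/fib6_rules, the RTM_*ROUTE rtnetlink handlers
      * and the netdevice notifier, initialise the per-cpu uncached route
      * lists, and unwind in reverse order on failure.
      */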
4061 int __init ip6_route_init(void)
4062 {
4063         int ret;
4064         int cpu;
4065
4066         ret = -ENOMEM;
4067         ip6_dst_ops_template.kmem_cachep =
4068                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4069                                   SLAB_HWCACHE_ALIGN, NULL);
4070         if (!ip6_dst_ops_template.kmem_cachep)
4071                 goto out;
4072
4073         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4074         if (ret)
4075                 goto out_kmem_cache;
4076
4077         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4078         if (ret)
4079                 goto out_dst_entries;
4080
4081         ret = register_pernet_subsys(&ip6_route_net_ops);
4082         if (ret)
4083                 goto out_register_inetpeer;
4084
4085         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4086
4087         ret = fib6_init();
4088         if (ret)
4089                 goto out_register_subsys;
4090
4091         ret = xfrm6_init();
4092         if (ret)
4093                 goto out_fib6_init;
4094
4095         ret = fib6_rules_init();
4096         if (ret)
4097                 goto xfrm6_init;
4098
4099         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4100         if (ret)
4101                 goto fib6_rules_init;
4102
4103         ret = -ENOBUFS;
4104         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4105             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4106             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4107                 goto out_register_late_subsys;
4108
4109         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4110         if (ret)
4111                 goto out_register_late_subsys;
4112
4113         for_each_possible_cpu(cpu) {
4114                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4115
4116                 INIT_LIST_HEAD(&ul->head);
4117                 spin_lock_init(&ul->lock);
4118         }
4119
4120 out:
4121         return ret;
4122
4123 out_register_late_subsys:
4124         unregister_pernet_subsys(&ip6_route_net_late_ops);
4125 fib6_rules_init:
4126         fib6_rules_cleanup();
4127 xfrm6_init:
4128         xfrm6_fini();
4129 out_fib6_init:
4130         fib6_gc_cleanup();
4131 out_register_subsys:
4132         unregister_pernet_subsys(&ip6_route_net_ops);
4133 out_register_inetpeer:
4134         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4135 out_dst_entries:
4136         dst_entries_destroy(&ip6_dst_blackhole_ops);
4137 out_kmem_cache:
4138         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4139         goto out;
4140 }
4141
4142 void ip6_route_cleanup(void)
4143 {
4144         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4145         unregister_pernet_subsys(&ip6_route_net_late_ops);
4146         fib6_rules_cleanup();
4147         xfrm6_fini();
4148         fib6_gc_cleanup();
4149         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4150         unregister_pernet_subsys(&ip6_route_net_ops);
4151         dst_entries_destroy(&ip6_dst_blackhole_ops);
4152         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4153 }