net/ipv6: Remove rt6i_prefsrc
[linux-2.6-block.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of the neighbour reachability check done while scoring a route.
 * Every failure is negative so that rt6_score_route() can hand it back
 * in place of a (non-negative) score.
 */
enum rt6_nud_state {
        RT6_NUD_FAIL_HARD       = -3,   /* find_match() drops the route */
        RT6_NUD_FAIL_PROBE      = -2,
        RT6_NUD_FAIL_DO_RR      = -1,   /* find_match() requests round-robin */
        RT6_NUD_SUCCEED         = 1,
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv(): look up / create the route for a
 * (prefix, gateway, device) triple.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
                                            const struct in6_addr *prefix,
                                            int prefixlen,
                                            const struct in6_addr *gwaddr,
                                            struct net_device *dev);
static struct fib6_info *rt6_add_route_info(struct net *net,
                                            const struct in6_addr *prefix,
                                            int prefixlen,
                                            const struct in6_addr *gwaddr,
                                            struct net_device *dev,
                                            unsigned int pref);
#endif
127
128 struct uncached_list {
129         spinlock_t              lock;
130         struct list_head        head;
131 };
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217                                               struct sk_buff *skb,
218                                               const void *daddr)
219 {
220         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       dst_cow_metrics_generic,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_dst_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
/* dst_ops->redirect for blackhole dsts: deliberately a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
                                      struct sock *sk,
                                      struct sk_buff *skb)
{
        /* redirects are ignored */
}
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_dst_neigh_lookup,
286 };
287
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
292 static const struct fib6_info fib6_null_entry_template = {
293         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
294         .fib6_protocol  = RTPROT_KERNEL,
295         .fib6_metric    = ~(u32)0,
296         .fib6_ref       = ATOMIC_INIT(1),
297         .fib6_type      = RTN_UNREACHABLE,
298         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
299 };
300
301 static const struct rt6_info ip6_null_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -ENETUNREACH,
307                 .input          = ip6_pkt_discard,
308                 .output         = ip6_pkt_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325 };
326
327 static const struct rt6_info ip6_blk_hole_entry_template = {
328         .dst = {
329                 .__refcnt       = ATOMIC_INIT(1),
330                 .__use          = 1,
331                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
332                 .error          = -EINVAL,
333                 .input          = dst_discard,
334                 .output         = dst_discard_out,
335         },
336         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
337 };
338
339 #endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351                                int flags)
352 {
353         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354                                         1, DST_OBSOLETE_FORCE_CHK, flags);
355
356         if (rt) {
357                 rt6_info_init(rt);
358                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359         }
360
361         return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct fib6_info *from;
369         struct inet6_dev *idev;
370
371         dst_destroy_metrics_generic(dst);
372         rt6_uncached_list_del(rt);
373
374         idev = rt->rt6i_idev;
375         if (idev) {
376                 rt->rt6i_idev = NULL;
377                 in6_dev_put(idev);
378         }
379
380         rcu_read_lock();
381         from = rcu_dereference(rt->from);
382         rcu_assign_pointer(rt->from, NULL);
383         fib6_info_release(from);
384         rcu_read_unlock();
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429                                         struct fib6_info *match,
430                                         struct flowi6 *fl6, int oif,
431                                         const struct sk_buff *skb,
432                                         int strict)
433 {
434         struct fib6_info *sibling, *next_sibling;
435
436         /* We might have already computed the hash for ICMPv6 errors. In such
437          * case it will always be non-zero. Otherwise now is the time to do it.
438          */
439         if (!fl6->mp_hash)
440                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
441
442         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
443                 return match;
444
445         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446                                  fib6_siblings) {
447                 int nh_upper_bound;
448
449                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450                 if (fl6->mp_hash > nh_upper_bound)
451                         continue;
452                 if (rt6_score_route(sibling, oif, strict) < 0)
453                         break;
454                 match = sibling;
455                 break;
456         }
457
458         return match;
459 }
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466                                                  struct fib6_info *rt,
467                                                     const struct in6_addr *saddr,
468                                                     int oif,
469                                                     int flags)
470 {
471         struct fib6_info *sprt;
472
473         if (!oif && ipv6_addr_any(saddr) &&
474             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
475                 return rt;
476
477         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
479
480                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
481                         continue;
482
483                 if (oif) {
484                         if (dev->ifindex == oif)
485                                 return sprt;
486                 } else {
487                         if (ipv6_chk_addr(net, saddr, dev,
488                                           flags & RT6_LOOKUP_F_IFACE))
489                                 return sprt;
490                 }
491         }
492
493         if (oif && flags & RT6_LOOKUP_F_IFACE)
494                 return net->ipv6.fib6_null_entry;
495
496         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
497 }
498
499 #ifdef CONFIG_IPV6_ROUTER_PREF
500 struct __rt6_probe_work {
501         struct work_struct work;
502         struct in6_addr target;
503         struct net_device *dev;
504 };
505
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508         struct in6_addr mcaddr;
509         struct __rt6_probe_work *work =
510                 container_of(w, struct __rt6_probe_work, work);
511
512         addrconf_addr_solict_mult(&work->target, &mcaddr);
513         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514         dev_put(work->dev);
515         kfree(work);
516 }
517
518 static void rt6_probe(struct fib6_info *rt)
519 {
520         struct __rt6_probe_work *work;
521         const struct in6_addr *nh_gw;
522         struct neighbour *neigh;
523         struct net_device *dev;
524
525         /*
526          * Okay, this does not seem to be appropriate
527          * for now, however, we need to check if it
528          * is really so; aka Router Reachability Probing.
529          *
530          * Router Reachability Probe MUST be rate-limited
531          * to no more than one per minute.
532          */
533         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
534                 return;
535
536         nh_gw = &rt->fib6_nh.nh_gw;
537         dev = rt->fib6_nh.nh_dev;
538         rcu_read_lock_bh();
539         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
540         if (neigh) {
541                 struct inet6_dev *idev;
542
543                 if (neigh->nud_state & NUD_VALID)
544                         goto out;
545
546                 idev = __in6_dev_get(dev);
547                 work = NULL;
548                 write_lock(&neigh->lock);
549                 if (!(neigh->nud_state & NUD_VALID) &&
550                     time_after(jiffies,
551                                neigh->updated + idev->cnf.rtr_probe_interval)) {
552                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
553                         if (work)
554                                 __neigh_set_probe_once(neigh);
555                 }
556                 write_unlock(&neigh->lock);
557         } else {
558                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
559         }
560
561         if (work) {
562                 INIT_WORK(&work->work, rt6_probe_deferred);
563                 work->target = *nh_gw;
564                 dev_hold(dev);
565                 work->dev = dev;
566                 schedule_work(&work->work);
567         }
568
569 out:
570         rcu_read_unlock_bh();
571 }
572 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
        /* no-op */
}
576 #endif
577
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583         const struct net_device *dev = rt->fib6_nh.nh_dev;
584
585         if (!oif || dev->ifindex == oif)
586                 return 2;
587         return 0;
588 }
589
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
591 {
592         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593         struct neighbour *neigh;
594
595         if (rt->fib6_flags & RTF_NONEXTHOP ||
596             !(rt->fib6_flags & RTF_GATEWAY))
597                 return RT6_NUD_SUCCEED;
598
599         rcu_read_lock_bh();
600         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601                                           &rt->fib6_nh.nh_gw);
602         if (neigh) {
603                 read_lock(&neigh->lock);
604                 if (neigh->nud_state & NUD_VALID)
605                         ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607                 else if (!(neigh->nud_state & NUD_FAILED))
608                         ret = RT6_NUD_SUCCEED;
609                 else
610                         ret = RT6_NUD_FAIL_PROBE;
611 #endif
612                 read_unlock(&neigh->lock);
613         } else {
614                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
616         }
617         rcu_read_unlock_bh();
618
619         return ret;
620 }
621
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
623 {
624         int m;
625
626         m = rt6_check_dev(rt, oif);
627         if (!m && (strict & RT6_LOOKUP_F_IFACE))
628                 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
631 #endif
632         if (strict & RT6_LOOKUP_F_REACHABLE) {
633                 int n = rt6_check_neigh(rt);
634                 if (n < 0)
635                         return n;
636         }
637         return m;
638 }
639
640 /* called with rc_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642 {
643         const struct net_device *dev = fib6_info_nh_dev(f6i);
644         bool rc = false;
645
646         if (dev) {
647                 const struct inet6_dev *idev = __in6_dev_get(dev);
648
649                 rc = !!idev->cnf.ignore_routes_with_linkdown;
650         }
651
652         return rc;
653 }
654
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656                                    int *mpri, struct fib6_info *match,
657                                    bool *do_rr)
658 {
659         int m;
660         bool match_do_rr = false;
661
662         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
663                 goto out;
664
665         if (fib6_ignore_linkdown(rt) &&
666             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
668                 goto out;
669
670         if (fib6_check_expired(rt))
671                 goto out;
672
673         m = rt6_score_route(rt, oif, strict);
674         if (m == RT6_NUD_FAIL_DO_RR) {
675                 match_do_rr = true;
676                 m = 0; /* lowest valid score */
677         } else if (m == RT6_NUD_FAIL_HARD) {
678                 goto out;
679         }
680
681         if (strict & RT6_LOOKUP_F_REACHABLE)
682                 rt6_probe(rt);
683
684         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
685         if (m > *mpri) {
686                 *do_rr = match_do_rr;
687                 *mpri = m;
688                 match = rt;
689         }
690 out:
691         return match;
692 }
693
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695                                      struct fib6_info *leaf,
696                                      struct fib6_info *rr_head,
697                                      u32 metric, int oif, int strict,
698                                      bool *do_rr)
699 {
700         struct fib6_info *rt, *match, *cont;
701         int mpri = -1;
702
703         match = NULL;
704         cont = NULL;
705         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706                 if (rt->fib6_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         for (rt = leaf; rt && rt != rr_head;
715              rt = rcu_dereference(rt->fib6_next)) {
716                 if (rt->fib6_metric != metric) {
717                         cont = rt;
718                         break;
719                 }
720
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722         }
723
724         if (match || !cont)
725                 return match;
726
727         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
729
730         return match;
731 }
732
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734                                    int oif, int strict)
735 {
736         struct fib6_info *leaf = rcu_dereference(fn->leaf);
737         struct fib6_info *match, *rt0;
738         bool do_rr = false;
739         int key_plen;
740
741         if (!leaf || leaf == net->ipv6.fib6_null_entry)
742                 return net->ipv6.fib6_null_entry;
743
744         rt0 = rcu_dereference(fn->rr_ptr);
745         if (!rt0)
746                 rt0 = leaf;
747
748         /* Double check to make sure fn is not an intermediate node
749          * and fn->leaf does not points to its child's leaf
750          * (This might happen if all routes under fn are deleted from
751          * the tree and fib6_repair_tree() is called on the node.)
752          */
753         key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755         if (rt0->fib6_src.plen)
756                 key_plen = rt0->fib6_src.plen;
757 #endif
758         if (fn->fn_bit != key_plen)
759                 return net->ipv6.fib6_null_entry;
760
761         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762                              &do_rr);
763
764         if (do_rr) {
765                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766
767                 /* no entries matched; do round-robin */
768                 if (!next || next->fib6_metric != rt0->fib6_metric)
769                         next = leaf;
770
771                 if (next != rt0) {
772                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
773                         /* make sure next is not being deleted from the tree */
774                         if (next->fib6_node)
775                                 rcu_assign_pointer(fn->rr_ptr, next);
776                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777                 }
778         }
779
780         return match ? match : net->ipv6.fib6_null_entry;
781 }
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790                   const struct in6_addr *gwaddr)
791 {
792         struct net *net = dev_net(dev);
793         struct route_info *rinfo = (struct route_info *) opt;
794         struct in6_addr prefix_buf, *prefix;
795         unsigned int pref;
796         unsigned long lifetime;
797         struct fib6_info *rt;
798
799         if (len < sizeof(struct route_info)) {
800                 return -EINVAL;
801         }
802
803         /* Sanity check for prefix_len and length */
804         if (rinfo->length > 3) {
805                 return -EINVAL;
806         } else if (rinfo->prefix_len > 128) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 64) {
809                 if (rinfo->length < 2) {
810                         return -EINVAL;
811                 }
812         } else if (rinfo->prefix_len > 0) {
813                 if (rinfo->length < 1) {
814                         return -EINVAL;
815                 }
816         }
817
818         pref = rinfo->route_pref;
819         if (pref == ICMPV6_ROUTER_PREF_INVALID)
820                 return -EINVAL;
821
822         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823
824         if (rinfo->length == 3)
825                 prefix = (struct in6_addr *)rinfo->prefix;
826         else {
827                 /* this function is safe */
828                 ipv6_addr_prefix(&prefix_buf,
829                                  (struct in6_addr *)rinfo->prefix,
830                                  rinfo->prefix_len);
831                 prefix = &prefix_buf;
832         }
833
834         if (rinfo->prefix_len == 0)
835                 rt = rt6_get_dflt_router(net, gwaddr, dev);
836         else
837                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838                                         gwaddr, dev);
839
840         if (rt && !lifetime) {
841                 ip6_del_rt(net, rt);
842                 rt = NULL;
843         }
844
845         if (!rt && lifetime)
846                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847                                         dev, pref);
848         else if (rt)
849                 rt->fib6_flags = RTF_ROUTEINFO |
850                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851
852         if (rt) {
853                 if (!addrconf_finite_timeout(lifetime))
854                         fib6_clean_expires(rt);
855                 else
856                         fib6_set_expires(rt, jiffies + HZ * lifetime);
857
858                 fib6_info_release(rt);
859         }
860         return 0;
861 }
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
891 static const int fib6_prop[RTN_MAX + 1] = {
892         [RTN_UNSPEC]    = 0,
893         [RTN_UNICAST]   = 0,
894         [RTN_LOCAL]     = 0,
895         [RTN_BROADCAST] = 0,
896         [RTN_ANYCAST]   = 0,
897         [RTN_MULTICAST] = 0,
898         [RTN_BLACKHOLE] = -EINVAL,
899         [RTN_UNREACHABLE] = -EHOSTUNREACH,
900         [RTN_PROHIBIT]  = -EACCES,
901         [RTN_THROW]     = -EAGAIN,
902         [RTN_NAT]       = -EINVAL,
903         [RTN_XRESOLVE]  = -EINVAL,
904 };
905
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908         return fib6_prop[fib6_type];
909 }
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         rt->dst.flags |= fib6_info_dst_flags(ort);
950
951         if (ort->fib6_flags & RTF_REJECT) {
952                 ip6_rt_init_dst_reject(rt, ort);
953                 return;
954         }
955
956         rt->dst.error = 0;
957         rt->dst.output = ip6_output;
958
959         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960                 rt->dst.input = ip6_input;
961         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962                 rt->dst.input = ip6_mc_input;
963         } else {
964                 rt->dst.input = ip6_forward;
965         }
966
967         if (ort->fib6_nh.nh_lwtstate) {
968                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969                 lwtunnel_set_redirect(&rt->dst);
970         }
971
972         rt->dst.lastuse = jiffies;
973 }
974
975 /* Caller must already hold reference to @from */
976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
977 {
978         rt->rt6i_flags &= ~RTF_EXPIRES;
979         rcu_assign_pointer(rt->from, from);
980         dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
981 }
982
983 /* Caller must already hold reference to @ort */
984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
985 {
986         struct net_device *dev = fib6_info_nh_dev(ort);
987
988         ip6_rt_init_dst(rt, ort);
989
990         rt->rt6i_dst = ort->fib6_dst;
991         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
992         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
993         rt->rt6i_flags = ort->fib6_flags;
994         rt6_set_from(rt, ort);
995 #ifdef CONFIG_IPV6_SUBTREES
996         rt->rt6i_src = ort->fib6_src;
997 #endif
998 }
999
1000 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1001                                         struct in6_addr *saddr)
1002 {
1003         struct fib6_node *pn, *sn;
1004         while (1) {
1005                 if (fn->fn_flags & RTN_TL_ROOT)
1006                         return NULL;
1007                 pn = rcu_dereference(fn->parent);
1008                 sn = FIB6_SUBTREE(pn);
1009                 if (sn && sn != fn)
1010                         fn = fib6_node_lookup(sn, NULL, saddr);
1011                 else
1012                         fn = pn;
1013                 if (fn->fn_flags & RTN_RTINFO)
1014                         return fn;
1015         }
1016 }
1017
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1019                           bool null_fallback)
1020 {
1021         struct rt6_info *rt = *prt;
1022
1023         if (dst_hold_safe(&rt->dst))
1024                 return true;
1025         if (null_fallback) {
1026                 rt = net->ipv6.ip6_null_entry;
1027                 dst_hold(&rt->dst);
1028         } else {
1029                 rt = NULL;
1030         }
1031         *prt = rt;
1032         return false;
1033 }
1034
1035 /* called with rcu_lock held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 {
1038         unsigned short flags = fib6_info_dst_flags(rt);
1039         struct net_device *dev = rt->fib6_nh.nh_dev;
1040         struct rt6_info *nrt;
1041
1042         if (!fib6_info_hold_safe(rt))
1043                 return NULL;
1044
1045         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1046         if (nrt)
1047                 ip6_rt_copy_init(nrt, rt);
1048         else
1049                 fib6_info_release(rt);
1050
1051         return nrt;
1052 }
1053
1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1055                                              struct fib6_table *table,
1056                                              struct flowi6 *fl6,
1057                                              const struct sk_buff *skb,
1058                                              int flags)
1059 {
1060         struct fib6_info *f6i;
1061         struct fib6_node *fn;
1062         struct rt6_info *rt;
1063
1064         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1065                 flags &= ~RT6_LOOKUP_F_IFACE;
1066
1067         rcu_read_lock();
1068         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1069 restart:
1070         f6i = rcu_dereference(fn->leaf);
1071         if (!f6i) {
1072                 f6i = net->ipv6.fib6_null_entry;
1073         } else {
1074                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1075                                       fl6->flowi6_oif, flags);
1076                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1077                         f6i = fib6_multipath_select(net, f6i, fl6,
1078                                                     fl6->flowi6_oif, skb,
1079                                                     flags);
1080         }
1081         if (f6i == net->ipv6.fib6_null_entry) {
1082                 fn = fib6_backtrack(fn, &fl6->saddr);
1083                 if (fn)
1084                         goto restart;
1085         }
1086
1087         trace_fib6_table_lookup(net, f6i, table, fl6);
1088
1089         /* Search through exception table */
1090         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1091         if (rt) {
1092                 if (ip6_hold_safe(net, &rt, true))
1093                         dst_use_noref(&rt->dst, jiffies);
1094         } else if (f6i == net->ipv6.fib6_null_entry) {
1095                 rt = net->ipv6.ip6_null_entry;
1096                 dst_hold(&rt->dst);
1097         } else {
1098                 rt = ip6_create_rt_rcu(f6i);
1099                 if (!rt) {
1100                         rt = net->ipv6.ip6_null_entry;
1101                         dst_hold(&rt->dst);
1102                 }
1103         }
1104
1105         rcu_read_unlock();
1106
1107         return rt;
1108 }
1109
1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1111                                    const struct sk_buff *skb, int flags)
1112 {
1113         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1114 }
1115 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1116
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118                             const struct in6_addr *saddr, int oif,
1119                             const struct sk_buff *skb, int strict)
1120 {
1121         struct flowi6 fl6 = {
1122                 .flowi6_oif = oif,
1123                 .daddr = *daddr,
1124         };
1125         struct dst_entry *dst;
1126         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1127
1128         if (saddr) {
1129                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1131         }
1132
1133         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134         if (dst->error == 0)
1135                 return (struct rt6_info *) dst;
1136
1137         dst_release(dst);
1138
1139         return NULL;
1140 }
1141 EXPORT_SYMBOL(rt6_lookup);
1142
1143 /* ip6_ins_rt is called with FREE table->tb6_lock.
1144  * It takes new route entry, the addition fails by any reason the
1145  * route is released.
1146  * Caller must hold dst before calling it.
1147  */
1148
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150                         struct netlink_ext_ack *extack)
1151 {
1152         int err;
1153         struct fib6_table *table;
1154
1155         table = rt->fib6_table;
1156         spin_lock_bh(&table->tb6_lock);
1157         err = fib6_add(&table->tb6_root, rt, info, extack);
1158         spin_unlock_bh(&table->tb6_lock);
1159
1160         return err;
1161 }
1162
1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1164 {
1165         struct nl_info info = { .nl_net = net, };
1166
1167         return __ip6_ins_rt(rt, &info, NULL);
1168 }
1169
1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1171                                            const struct in6_addr *daddr,
1172                                            const struct in6_addr *saddr)
1173 {
1174         struct net_device *dev;
1175         struct rt6_info *rt;
1176
1177         /*
1178          *      Clone the route.
1179          */
1180
1181         if (!fib6_info_hold_safe(ort))
1182                 return NULL;
1183
1184         dev = ip6_rt_get_dev_rcu(ort);
1185         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1186         if (!rt) {
1187                 fib6_info_release(ort);
1188                 return NULL;
1189         }
1190
1191         ip6_rt_copy_init(rt, ort);
1192         rt->rt6i_flags |= RTF_CACHE;
1193         rt->dst.flags |= DST_HOST;
1194         rt->rt6i_dst.addr = *daddr;
1195         rt->rt6i_dst.plen = 128;
1196
1197         if (!rt6_is_gw_or_nonexthop(ort)) {
1198                 if (ort->fib6_dst.plen != 128 &&
1199                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1200                         rt->rt6i_flags |= RTF_ANYCAST;
1201 #ifdef CONFIG_IPV6_SUBTREES
1202                 if (rt->rt6i_src.plen && saddr) {
1203                         rt->rt6i_src.addr = *saddr;
1204                         rt->rt6i_src.plen = 128;
1205                 }
1206 #endif
1207         }
1208
1209         return rt;
1210 }
1211
1212 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1213 {
1214         unsigned short flags = fib6_info_dst_flags(rt);
1215         struct net_device *dev;
1216         struct rt6_info *pcpu_rt;
1217
1218         if (!fib6_info_hold_safe(rt))
1219                 return NULL;
1220
1221         rcu_read_lock();
1222         dev = ip6_rt_get_dev_rcu(rt);
1223         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1224         rcu_read_unlock();
1225         if (!pcpu_rt) {
1226                 fib6_info_release(rt);
1227                 return NULL;
1228         }
1229         ip6_rt_copy_init(pcpu_rt, rt);
1230         pcpu_rt->rt6i_flags |= RTF_PCPU;
1231         return pcpu_rt;
1232 }
1233
1234 /* It should be called with rcu_read_lock() acquired */
1235 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1236 {
1237         struct rt6_info *pcpu_rt, **p;
1238
1239         p = this_cpu_ptr(rt->rt6i_pcpu);
1240         pcpu_rt = *p;
1241
1242         if (pcpu_rt)
1243                 ip6_hold_safe(NULL, &pcpu_rt, false);
1244
1245         return pcpu_rt;
1246 }
1247
1248 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1249                                             struct fib6_info *rt)
1250 {
1251         struct rt6_info *pcpu_rt, *prev, **p;
1252
1253         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1254         if (!pcpu_rt) {
1255                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1256                 return net->ipv6.ip6_null_entry;
1257         }
1258
1259         dst_hold(&pcpu_rt->dst);
1260         p = this_cpu_ptr(rt->rt6i_pcpu);
1261         prev = cmpxchg(p, NULL, pcpu_rt);
1262         BUG_ON(prev);
1263
1264         return pcpu_rt;
1265 }
1266
1267 /* exception hash table implementation
1268  */
1269 static DEFINE_SPINLOCK(rt6_exception_lock);
1270
1271 /* Remove rt6_ex from hash table and free the memory
1272  * Caller must hold rt6_exception_lock
1273  */
1274 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1275                                  struct rt6_exception *rt6_ex)
1276 {
1277         struct net *net;
1278
1279         if (!bucket || !rt6_ex)
1280                 return;
1281
1282         net = dev_net(rt6_ex->rt6i->dst.dev);
1283         hlist_del_rcu(&rt6_ex->hlist);
1284         dst_release(&rt6_ex->rt6i->dst);
1285         kfree_rcu(rt6_ex, rcu);
1286         WARN_ON_ONCE(!bucket->depth);
1287         bucket->depth--;
1288         net->ipv6.rt6_stats->fib_rt_cache--;
1289 }
1290
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292  * Caller must hold rt6_exception_lock
1293  */
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1295 {
1296         struct rt6_exception *rt6_ex, *oldest = NULL;
1297
1298         if (!bucket)
1299                 return;
1300
1301         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1303                         oldest = rt6_ex;
1304         }
1305         rt6_remove_exception(bucket, oldest);
1306 }
1307
1308 static u32 rt6_exception_hash(const struct in6_addr *dst,
1309                               const struct in6_addr *src)
1310 {
1311         static u32 seed __read_mostly;
1312         u32 val;
1313
1314         net_get_random_once(&seed, sizeof(seed));
1315         val = jhash(dst, sizeof(*dst), seed);
1316
1317 #ifdef CONFIG_IPV6_SUBTREES
1318         if (src)
1319                 val = jhash(src, sizeof(*src), val);
1320 #endif
1321         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1322 }
1323
1324 /* Helper function to find the cached rt in the hash table
1325  * and update bucket pointer to point to the bucket for this
1326  * (daddr, saddr) pair
1327  * Caller must hold rt6_exception_lock
1328  */
1329 static struct rt6_exception *
1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1331                               const struct in6_addr *daddr,
1332                               const struct in6_addr *saddr)
1333 {
1334         struct rt6_exception *rt6_ex;
1335         u32 hval;
1336
1337         if (!(*bucket) || !daddr)
1338                 return NULL;
1339
1340         hval = rt6_exception_hash(daddr, saddr);
1341         *bucket += hval;
1342
1343         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1344                 struct rt6_info *rt6 = rt6_ex->rt6i;
1345                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1346
1347 #ifdef CONFIG_IPV6_SUBTREES
1348                 if (matched && saddr)
1349                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1350 #endif
1351                 if (matched)
1352                         return rt6_ex;
1353         }
1354         return NULL;
1355 }
1356
1357 /* Helper function to find the cached rt in the hash table
1358  * and update bucket pointer to point to the bucket for this
1359  * (daddr, saddr) pair
1360  * Caller must hold rcu_read_lock()
1361  */
1362 static struct rt6_exception *
1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1364                          const struct in6_addr *daddr,
1365                          const struct in6_addr *saddr)
1366 {
1367         struct rt6_exception *rt6_ex;
1368         u32 hval;
1369
1370         WARN_ON_ONCE(!rcu_read_lock_held());
1371
1372         if (!(*bucket) || !daddr)
1373                 return NULL;
1374
1375         hval = rt6_exception_hash(daddr, saddr);
1376         *bucket += hval;
1377
1378         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1379                 struct rt6_info *rt6 = rt6_ex->rt6i;
1380                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1381
1382 #ifdef CONFIG_IPV6_SUBTREES
1383                 if (matched && saddr)
1384                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1385 #endif
1386                 if (matched)
1387                         return rt6_ex;
1388         }
1389         return NULL;
1390 }
1391
1392 static unsigned int fib6_mtu(const struct fib6_info *rt)
1393 {
1394         unsigned int mtu;
1395
1396         if (rt->fib6_pmtu) {
1397                 mtu = rt->fib6_pmtu;
1398         } else {
1399                 struct net_device *dev = fib6_info_nh_dev(rt);
1400                 struct inet6_dev *idev;
1401
1402                 rcu_read_lock();
1403                 idev = __in6_dev_get(dev);
1404                 mtu = idev->cnf.mtu6;
1405                 rcu_read_unlock();
1406         }
1407
1408         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1409
1410         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1411 }
1412
1413 static int rt6_insert_exception(struct rt6_info *nrt,
1414                                 struct fib6_info *ort)
1415 {
1416         struct net *net = dev_net(nrt->dst.dev);
1417         struct rt6_exception_bucket *bucket;
1418         struct in6_addr *src_key = NULL;
1419         struct rt6_exception *rt6_ex;
1420         int err = 0;
1421
1422         spin_lock_bh(&rt6_exception_lock);
1423
1424         if (ort->exception_bucket_flushed) {
1425                 err = -EINVAL;
1426                 goto out;
1427         }
1428
1429         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1430                                         lockdep_is_held(&rt6_exception_lock));
1431         if (!bucket) {
1432                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1433                                  GFP_ATOMIC);
1434                 if (!bucket) {
1435                         err = -ENOMEM;
1436                         goto out;
1437                 }
1438                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1439         }
1440
1441 #ifdef CONFIG_IPV6_SUBTREES
1442         /* rt6i_src.plen != 0 indicates ort is in subtree
1443          * and exception table is indexed by a hash of
1444          * both rt6i_dst and rt6i_src.
1445          * Otherwise, the exception table is indexed by
1446          * a hash of only rt6i_dst.
1447          */
1448         if (ort->fib6_src.plen)
1449                 src_key = &nrt->rt6i_src.addr;
1450 #endif
1451         /* rt6_mtu_change() might lower mtu on ort.
1452          * Only insert this exception route if its mtu
1453          * is less than ort's mtu value.
1454          */
1455         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1456                 err = -EINVAL;
1457                 goto out;
1458         }
1459
1460         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1461                                                src_key);
1462         if (rt6_ex)
1463                 rt6_remove_exception(bucket, rt6_ex);
1464
1465         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1466         if (!rt6_ex) {
1467                 err = -ENOMEM;
1468                 goto out;
1469         }
1470         rt6_ex->rt6i = nrt;
1471         rt6_ex->stamp = jiffies;
1472         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1473         bucket->depth++;
1474         net->ipv6.rt6_stats->fib_rt_cache++;
1475
1476         if (bucket->depth > FIB6_MAX_DEPTH)
1477                 rt6_exception_remove_oldest(bucket);
1478
1479 out:
1480         spin_unlock_bh(&rt6_exception_lock);
1481
1482         /* Update fn->fn_sernum to invalidate all cached dst */
1483         if (!err) {
1484                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1485                 fib6_update_sernum(net, ort);
1486                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1487                 fib6_force_start_gc(net);
1488         }
1489
1490         return err;
1491 }
1492
1493 void rt6_flush_exceptions(struct fib6_info *rt)
1494 {
1495         struct rt6_exception_bucket *bucket;
1496         struct rt6_exception *rt6_ex;
1497         struct hlist_node *tmp;
1498         int i;
1499
1500         spin_lock_bh(&rt6_exception_lock);
1501         /* Prevent rt6_insert_exception() to recreate the bucket list */
1502         rt->exception_bucket_flushed = 1;
1503
1504         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505                                     lockdep_is_held(&rt6_exception_lock));
1506         if (!bucket)
1507                 goto out;
1508
1509         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1510                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1511                         rt6_remove_exception(bucket, rt6_ex);
1512                 WARN_ON_ONCE(bucket->depth);
1513                 bucket++;
1514         }
1515
1516 out:
1517         spin_unlock_bh(&rt6_exception_lock);
1518 }
1519
1520 /* Find cached rt in the hash table inside passed in rt
1521  * Caller has to hold rcu_read_lock()
1522  */
1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1524                                            struct in6_addr *daddr,
1525                                            struct in6_addr *saddr)
1526 {
1527         struct rt6_exception_bucket *bucket;
1528         struct in6_addr *src_key = NULL;
1529         struct rt6_exception *rt6_ex;
1530         struct rt6_info *res = NULL;
1531
1532         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1533
1534 #ifdef CONFIG_IPV6_SUBTREES
1535         /* rt6i_src.plen != 0 indicates rt is in subtree
1536          * and exception table is indexed by a hash of
1537          * both rt6i_dst and rt6i_src.
1538          * Otherwise, the exception table is indexed by
1539          * a hash of only rt6i_dst.
1540          */
1541         if (rt->fib6_src.plen)
1542                 src_key = saddr;
1543 #endif
1544         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1545
1546         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1547                 res = rt6_ex->rt6i;
1548
1549         return res;
1550 }
1551
1552 /* Remove the passed in cached rt from the hash table that contains it */
1553 static int rt6_remove_exception_rt(struct rt6_info *rt)
1554 {
1555         struct rt6_exception_bucket *bucket;
1556         struct in6_addr *src_key = NULL;
1557         struct rt6_exception *rt6_ex;
1558         struct fib6_info *from;
1559         int err;
1560
1561         from = rcu_dereference(rt->from);
1562         if (!from ||
1563             !(rt->rt6i_flags & RTF_CACHE))
1564                 return -EINVAL;
1565
1566         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1567                 return -ENOENT;
1568
1569         spin_lock_bh(&rt6_exception_lock);
1570         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1571                                     lockdep_is_held(&rt6_exception_lock));
1572 #ifdef CONFIG_IPV6_SUBTREES
1573         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1574          * and exception table is indexed by a hash of
1575          * both rt6i_dst and rt6i_src.
1576          * Otherwise, the exception table is indexed by
1577          * a hash of only rt6i_dst.
1578          */
1579         if (from->fib6_src.plen)
1580                 src_key = &rt->rt6i_src.addr;
1581 #endif
1582         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1583                                                &rt->rt6i_dst.addr,
1584                                                src_key);
1585         if (rt6_ex) {
1586                 rt6_remove_exception(bucket, rt6_ex);
1587                 err = 0;
1588         } else {
1589                 err = -ENOENT;
1590         }
1591
1592         spin_unlock_bh(&rt6_exception_lock);
1593         return err;
1594 }
1595
1596 /* Find rt6_ex which contains the passed in rt cache and
1597  * refresh its stamp
1598  */
1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1600 {
1601         struct rt6_exception_bucket *bucket;
1602         struct fib6_info *from = rt->from;
1603         struct in6_addr *src_key = NULL;
1604         struct rt6_exception *rt6_ex;
1605
1606         if (!from ||
1607             !(rt->rt6i_flags & RTF_CACHE))
1608                 return;
1609
1610         rcu_read_lock();
1611         bucket = rcu_dereference(from->rt6i_exception_bucket);
1612
1613 #ifdef CONFIG_IPV6_SUBTREES
1614         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1615          * and exception table is indexed by a hash of
1616          * both rt6i_dst and rt6i_src.
1617          * Otherwise, the exception table is indexed by
1618          * a hash of only rt6i_dst.
1619          */
1620         if (from->fib6_src.plen)
1621                 src_key = &rt->rt6i_src.addr;
1622 #endif
1623         rt6_ex = __rt6_find_exception_rcu(&bucket,
1624                                           &rt->rt6i_dst.addr,
1625                                           src_key);
1626         if (rt6_ex)
1627                 rt6_ex->stamp = jiffies;
1628
1629         rcu_read_unlock();
1630 }
1631
1632 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1633                                          struct rt6_info *rt, int mtu)
1634 {
1635         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1636          * lowest MTU in the path: always allow updating the route PMTU to
1637          * reflect PMTU decreases.
1638          *
1639          * If the new MTU is higher, and the route PMTU is equal to the local
1640          * MTU, this means the old MTU is the lowest in the path, so allow
1641          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1642          * handle this.
1643          */
1644
1645         if (dst_mtu(&rt->dst) >= mtu)
1646                 return true;
1647
1648         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1649                 return true;
1650
1651         return false;
1652 }
1653
1654 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1655                                        struct fib6_info *rt, int mtu)
1656 {
1657         struct rt6_exception_bucket *bucket;
1658         struct rt6_exception *rt6_ex;
1659         int i;
1660
1661         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1662                                         lockdep_is_held(&rt6_exception_lock));
1663
1664         if (!bucket)
1665                 return;
1666
1667         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1668                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1669                         struct rt6_info *entry = rt6_ex->rt6i;
1670
1671                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1672                          * route), the metrics of its rt->from have already
1673                          * been updated.
1674                          */
1675                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1676                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1677                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1678                 }
1679                 bucket++;
1680         }
1681 }
1682
1683 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1684
1685 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1686                                         struct in6_addr *gateway)
1687 {
1688         struct rt6_exception_bucket *bucket;
1689         struct rt6_exception *rt6_ex;
1690         struct hlist_node *tmp;
1691         int i;
1692
1693         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1694                 return;
1695
1696         spin_lock_bh(&rt6_exception_lock);
1697         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1698                                      lockdep_is_held(&rt6_exception_lock));
1699
1700         if (bucket) {
1701                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1702                         hlist_for_each_entry_safe(rt6_ex, tmp,
1703                                                   &bucket->chain, hlist) {
1704                                 struct rt6_info *entry = rt6_ex->rt6i;
1705
1706                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1707                                     RTF_CACHE_GATEWAY &&
1708                                     ipv6_addr_equal(gateway,
1709                                                     &entry->rt6i_gateway)) {
1710                                         rt6_remove_exception(bucket, rt6_ex);
1711                                 }
1712                         }
1713                         bucket++;
1714                 }
1715         }
1716
1717         spin_unlock_bh(&rt6_exception_lock);
1718 }
1719
1720 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1721                                       struct rt6_exception *rt6_ex,
1722                                       struct fib6_gc_args *gc_args,
1723                                       unsigned long now)
1724 {
1725         struct rt6_info *rt = rt6_ex->rt6i;
1726
1727         /* we are pruning and obsoleting aged-out and non gateway exceptions
1728          * even if others have still references to them, so that on next
1729          * dst_check() such references can be dropped.
1730          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1731          * expired, independently from their aging, as per RFC 8201 section 4
1732          */
1733         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1734                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1735                         RT6_TRACE("aging clone %p\n", rt);
1736                         rt6_remove_exception(bucket, rt6_ex);
1737                         return;
1738                 }
1739         } else if (time_after(jiffies, rt->dst.expires)) {
1740                 RT6_TRACE("purging expired route %p\n", rt);
1741                 rt6_remove_exception(bucket, rt6_ex);
1742                 return;
1743         }
1744
1745         if (rt->rt6i_flags & RTF_GATEWAY) {
1746                 struct neighbour *neigh;
1747                 __u8 neigh_flags = 0;
1748
1749                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1750                 if (neigh)
1751                         neigh_flags = neigh->flags;
1752
1753                 if (!(neigh_flags & NTF_ROUTER)) {
1754                         RT6_TRACE("purging route %p via non-router but gateway\n",
1755                                   rt);
1756                         rt6_remove_exception(bucket, rt6_ex);
1757                         return;
1758                 }
1759         }
1760
1761         gc_args->more++;
1762 }
1763
1764 void rt6_age_exceptions(struct fib6_info *rt,
1765                         struct fib6_gc_args *gc_args,
1766                         unsigned long now)
1767 {
1768         struct rt6_exception_bucket *bucket;
1769         struct rt6_exception *rt6_ex;
1770         struct hlist_node *tmp;
1771         int i;
1772
1773         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1774                 return;
1775
1776         rcu_read_lock_bh();
1777         spin_lock(&rt6_exception_lock);
1778         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1779                                     lockdep_is_held(&rt6_exception_lock));
1780
1781         if (bucket) {
1782                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1783                         hlist_for_each_entry_safe(rt6_ex, tmp,
1784                                                   &bucket->chain, hlist) {
1785                                 rt6_age_examine_exception(bucket, rt6_ex,
1786                                                           gc_args, now);
1787                         }
1788                         bucket++;
1789                 }
1790         }
1791         spin_unlock(&rt6_exception_lock);
1792         rcu_read_unlock_bh();
1793 }
1794
1795 /* must be called with rcu lock held */
1796 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1797                                     int oif, struct flowi6 *fl6, int strict)
1798 {
1799         struct fib6_node *fn, *saved_fn;
1800         struct fib6_info *f6i;
1801
1802         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1803         saved_fn = fn;
1804
1805         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1806                 oif = 0;
1807
1808 redo_rt6_select:
1809         f6i = rt6_select(net, fn, oif, strict);
1810         if (f6i == net->ipv6.fib6_null_entry) {
1811                 fn = fib6_backtrack(fn, &fl6->saddr);
1812                 if (fn)
1813                         goto redo_rt6_select;
1814                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1815                         /* also consider unreachable route */
1816                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1817                         fn = saved_fn;
1818                         goto redo_rt6_select;
1819                 }
1820         }
1821
1822         trace_fib6_table_lookup(net, f6i, table, fl6);
1823
1824         return f6i;
1825 }
1826
1827 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1828                                int oif, struct flowi6 *fl6,
1829                                const struct sk_buff *skb, int flags)
1830 {
1831         struct fib6_info *f6i;
1832         struct rt6_info *rt;
1833         int strict = 0;
1834
1835         strict |= flags & RT6_LOOKUP_F_IFACE;
1836         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1837         if (net->ipv6.devconf_all->forwarding == 0)
1838                 strict |= RT6_LOOKUP_F_REACHABLE;
1839
1840         rcu_read_lock();
1841
1842         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1843         if (f6i->fib6_nsiblings)
1844                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1845
1846         if (f6i == net->ipv6.fib6_null_entry) {
1847                 rt = net->ipv6.ip6_null_entry;
1848                 rcu_read_unlock();
1849                 dst_hold(&rt->dst);
1850                 return rt;
1851         }
1852
1853         /*Search through exception table */
1854         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1855         if (rt) {
1856                 if (ip6_hold_safe(net, &rt, true))
1857                         dst_use_noref(&rt->dst, jiffies);
1858
1859                 rcu_read_unlock();
1860                 return rt;
1861         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1862                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1863                 /* Create a RTF_CACHE clone which will not be
1864                  * owned by the fib6 tree.  It is for the special case where
1865                  * the daddr in the skb during the neighbor look-up is different
1866                  * from the fl6->daddr used to look-up route here.
1867                  */
1868                 struct rt6_info *uncached_rt;
1869
1870                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1871
1872                 rcu_read_unlock();
1873
1874                 if (uncached_rt) {
1875                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1876                          * No need for another dst_hold()
1877                          */
1878                         rt6_uncached_list_add(uncached_rt);
1879                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1880                 } else {
1881                         uncached_rt = net->ipv6.ip6_null_entry;
1882                         dst_hold(&uncached_rt->dst);
1883                 }
1884
1885                 return uncached_rt;
1886         } else {
1887                 /* Get a percpu copy */
1888
1889                 struct rt6_info *pcpu_rt;
1890
1891                 local_bh_disable();
1892                 pcpu_rt = rt6_get_pcpu_route(f6i);
1893
1894                 if (!pcpu_rt)
1895                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1896
1897                 local_bh_enable();
1898                 rcu_read_unlock();
1899
1900                 return pcpu_rt;
1901         }
1902 }
1903 EXPORT_SYMBOL_GPL(ip6_pol_route);
1904
1905 static struct rt6_info *ip6_pol_route_input(struct net *net,
1906                                             struct fib6_table *table,
1907                                             struct flowi6 *fl6,
1908                                             const struct sk_buff *skb,
1909                                             int flags)
1910 {
1911         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1912 }
1913
1914 struct dst_entry *ip6_route_input_lookup(struct net *net,
1915                                          struct net_device *dev,
1916                                          struct flowi6 *fl6,
1917                                          const struct sk_buff *skb,
1918                                          int flags)
1919 {
1920         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1921                 flags |= RT6_LOOKUP_F_IFACE;
1922
1923         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1924 }
1925 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1926
1927 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1928                                   struct flow_keys *keys,
1929                                   struct flow_keys *flkeys)
1930 {
1931         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1932         const struct ipv6hdr *key_iph = outer_iph;
1933         struct flow_keys *_flkeys = flkeys;
1934         const struct ipv6hdr *inner_iph;
1935         const struct icmp6hdr *icmph;
1936         struct ipv6hdr _inner_iph;
1937         struct icmp6hdr _icmph;
1938
1939         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1940                 goto out;
1941
1942         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1943                                    sizeof(_icmph), &_icmph);
1944         if (!icmph)
1945                 goto out;
1946
1947         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1948             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1949             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1950             icmph->icmp6_type != ICMPV6_PARAMPROB)
1951                 goto out;
1952
1953         inner_iph = skb_header_pointer(skb,
1954                                        skb_transport_offset(skb) + sizeof(*icmph),
1955                                        sizeof(_inner_iph), &_inner_iph);
1956         if (!inner_iph)
1957                 goto out;
1958
1959         key_iph = inner_iph;
1960         _flkeys = NULL;
1961 out:
1962         if (_flkeys) {
1963                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1964                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1965                 keys->tags.flow_label = _flkeys->tags.flow_label;
1966                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1967         } else {
1968                 keys->addrs.v6addrs.src = key_iph->saddr;
1969                 keys->addrs.v6addrs.dst = key_iph->daddr;
1970                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1971                 keys->basic.ip_proto = key_iph->nexthdr;
1972         }
1973 }
1974
1975 /* if skb is set it will be used and fl6 can be NULL */
1976 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1977                        const struct sk_buff *skb, struct flow_keys *flkeys)
1978 {
1979         struct flow_keys hash_keys;
1980         u32 mhash;
1981
1982         switch (ip6_multipath_hash_policy(net)) {
1983         case 0:
1984                 memset(&hash_keys, 0, sizeof(hash_keys));
1985                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1986                 if (skb) {
1987                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1988                 } else {
1989                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1990                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1991                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1992                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1993                 }
1994                 break;
1995         case 1:
1996                 if (skb) {
1997                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1998                         struct flow_keys keys;
1999
2000                         /* short-circuit if we already have L4 hash present */
2001                         if (skb->l4_hash)
2002                                 return skb_get_hash_raw(skb) >> 1;
2003
2004                         memset(&hash_keys, 0, sizeof(hash_keys));
2005
2006                         if (!flkeys) {
2007                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2008                                 flkeys = &keys;
2009                         }
2010                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2011                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2012                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2013                         hash_keys.ports.src = flkeys->ports.src;
2014                         hash_keys.ports.dst = flkeys->ports.dst;
2015                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2016                 } else {
2017                         memset(&hash_keys, 0, sizeof(hash_keys));
2018                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2019                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2020                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021                         hash_keys.ports.src = fl6->fl6_sport;
2022                         hash_keys.ports.dst = fl6->fl6_dport;
2023                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2024                 }
2025                 break;
2026         }
2027         mhash = flow_hash_from_keys(&hash_keys);
2028
2029         return mhash >> 1;
2030 }
2031
2032 void ip6_route_input(struct sk_buff *skb)
2033 {
2034         const struct ipv6hdr *iph = ipv6_hdr(skb);
2035         struct net *net = dev_net(skb->dev);
2036         int flags = RT6_LOOKUP_F_HAS_SADDR;
2037         struct ip_tunnel_info *tun_info;
2038         struct flowi6 fl6 = {
2039                 .flowi6_iif = skb->dev->ifindex,
2040                 .daddr = iph->daddr,
2041                 .saddr = iph->saddr,
2042                 .flowlabel = ip6_flowinfo(iph),
2043                 .flowi6_mark = skb->mark,
2044                 .flowi6_proto = iph->nexthdr,
2045         };
2046         struct flow_keys *flkeys = NULL, _flkeys;
2047
2048         tun_info = skb_tunnel_info(skb);
2049         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2050                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2051
2052         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2053                 flkeys = &_flkeys;
2054
2055         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2056                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2057         skb_dst_drop(skb);
2058         skb_dst_set(skb,
2059                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2060 }
2061
2062 static struct rt6_info *ip6_pol_route_output(struct net *net,
2063                                              struct fib6_table *table,
2064                                              struct flowi6 *fl6,
2065                                              const struct sk_buff *skb,
2066                                              int flags)
2067 {
2068         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2069 }
2070
2071 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2072                                          struct flowi6 *fl6, int flags)
2073 {
2074         bool any_src;
2075
2076         if (rt6_need_strict(&fl6->daddr)) {
2077                 struct dst_entry *dst;
2078
2079                 dst = l3mdev_link_scope_lookup(net, fl6);
2080                 if (dst)
2081                         return dst;
2082         }
2083
2084         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2085
2086         any_src = ipv6_addr_any(&fl6->saddr);
2087         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2088             (fl6->flowi6_oif && any_src))
2089                 flags |= RT6_LOOKUP_F_IFACE;
2090
2091         if (!any_src)
2092                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2093         else if (sk)
2094                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2095
2096         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2097 }
2098 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2099
2100 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2101 {
2102         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2103         struct net_device *loopback_dev = net->loopback_dev;
2104         struct dst_entry *new = NULL;
2105
2106         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2107                        DST_OBSOLETE_DEAD, 0);
2108         if (rt) {
2109                 rt6_info_init(rt);
2110                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2111
2112                 new = &rt->dst;
2113                 new->__use = 1;
2114                 new->input = dst_discard;
2115                 new->output = dst_discard_out;
2116
2117                 dst_copy_metrics(new, &ort->dst);
2118
2119                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2120                 rt->rt6i_gateway = ort->rt6i_gateway;
2121                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2122
2123                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2124 #ifdef CONFIG_IPV6_SUBTREES
2125                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2126 #endif
2127         }
2128
2129         dst_release(dst_orig);
2130         return new ? new : ERR_PTR(-ENOMEM);
2131 }
2132
2133 /*
2134  *      Destination cache support functions
2135  */
2136
2137 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2138 {
2139         u32 rt_cookie = 0;
2140
2141         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2142                 return false;
2143
2144         if (fib6_check_expired(f6i))
2145                 return false;
2146
2147         return true;
2148 }
2149
2150 static struct dst_entry *rt6_check(struct rt6_info *rt,
2151                                    struct fib6_info *from,
2152                                    u32 cookie)
2153 {
2154         u32 rt_cookie = 0;
2155
2156         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2157             rt_cookie != cookie)
2158                 return NULL;
2159
2160         if (rt6_check_expired(rt))
2161                 return NULL;
2162
2163         return &rt->dst;
2164 }
2165
2166 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2167                                             struct fib6_info *from,
2168                                             u32 cookie)
2169 {
2170         if (!__rt6_check_expired(rt) &&
2171             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2172             fib6_check(from, cookie))
2173                 return &rt->dst;
2174         else
2175                 return NULL;
2176 }
2177
2178 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2179 {
2180         struct dst_entry *dst_ret;
2181         struct fib6_info *from;
2182         struct rt6_info *rt;
2183
2184         rt = container_of(dst, struct rt6_info, dst);
2185
2186         rcu_read_lock();
2187
2188         /* All IPV6 dsts are created with ->obsolete set to the value
2189          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2190          * into this function always.
2191          */
2192
2193         from = rcu_dereference(rt->from);
2194
2195         if (from && (rt->rt6i_flags & RTF_PCPU ||
2196             unlikely(!list_empty(&rt->rt6i_uncached))))
2197                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2198         else
2199                 dst_ret = rt6_check(rt, from, cookie);
2200
2201         rcu_read_unlock();
2202
2203         return dst_ret;
2204 }
2205
2206 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2207 {
2208         struct rt6_info *rt = (struct rt6_info *) dst;
2209
2210         if (rt) {
2211                 if (rt->rt6i_flags & RTF_CACHE) {
2212                         rcu_read_lock();
2213                         if (rt6_check_expired(rt)) {
2214                                 rt6_remove_exception_rt(rt);
2215                                 dst = NULL;
2216                         }
2217                         rcu_read_unlock();
2218                 } else {
2219                         dst_release(dst);
2220                         dst = NULL;
2221                 }
2222         }
2223         return dst;
2224 }
2225
2226 static void ip6_link_failure(struct sk_buff *skb)
2227 {
2228         struct rt6_info *rt;
2229
2230         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2231
2232         rt = (struct rt6_info *) skb_dst(skb);
2233         if (rt) {
2234                 rcu_read_lock();
2235                 if (rt->rt6i_flags & RTF_CACHE) {
2236                         if (dst_hold_safe(&rt->dst))
2237                                 rt6_remove_exception_rt(rt);
2238                 } else {
2239                         struct fib6_info *from;
2240                         struct fib6_node *fn;
2241
2242                         from = rcu_dereference(rt->from);
2243                         if (from) {
2244                                 fn = rcu_dereference(from->fib6_node);
2245                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2246                                         fn->fn_sernum = -1;
2247                         }
2248                 }
2249                 rcu_read_unlock();
2250         }
2251 }
2252
2253 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2254 {
2255         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2256                 struct fib6_info *from;
2257
2258                 rcu_read_lock();
2259                 from = rcu_dereference(rt0->from);
2260                 if (from)
2261                         rt0->dst.expires = from->expires;
2262                 rcu_read_unlock();
2263         }
2264
2265         dst_set_expires(&rt0->dst, timeout);
2266         rt0->rt6i_flags |= RTF_EXPIRES;
2267 }
2268
2269 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2270 {
2271         struct net *net = dev_net(rt->dst.dev);
2272
2273         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2274         rt->rt6i_flags |= RTF_MODIFIED;
2275         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2276 }
2277
2278 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2279 {
2280         bool from_set;
2281
2282         rcu_read_lock();
2283         from_set = !!rcu_dereference(rt->from);
2284         rcu_read_unlock();
2285
2286         return !(rt->rt6i_flags & RTF_CACHE) &&
2287                 (rt->rt6i_flags & RTF_PCPU || from_set);
2288 }
2289
2290 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2291                                  const struct ipv6hdr *iph, u32 mtu)
2292 {
2293         const struct in6_addr *daddr, *saddr;
2294         struct rt6_info *rt6 = (struct rt6_info *)dst;
2295
2296         if (dst_metric_locked(dst, RTAX_MTU))
2297                 return;
2298
2299         if (iph) {
2300                 daddr = &iph->daddr;
2301                 saddr = &iph->saddr;
2302         } else if (sk) {
2303                 daddr = &sk->sk_v6_daddr;
2304                 saddr = &inet6_sk(sk)->saddr;
2305         } else {
2306                 daddr = NULL;
2307                 saddr = NULL;
2308         }
2309         dst_confirm_neigh(dst, daddr);
2310         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2311         if (mtu >= dst_mtu(dst))
2312                 return;
2313
2314         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2315                 rt6_do_update_pmtu(rt6, mtu);
2316                 /* update rt6_ex->stamp for cache */
2317                 if (rt6->rt6i_flags & RTF_CACHE)
2318                         rt6_update_exception_stamp_rt(rt6);
2319         } else if (daddr) {
2320                 struct fib6_info *from;
2321                 struct rt6_info *nrt6;
2322
2323                 rcu_read_lock();
2324                 from = rcu_dereference(rt6->from);
2325                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2326                 if (nrt6) {
2327                         rt6_do_update_pmtu(nrt6, mtu);
2328                         if (rt6_insert_exception(nrt6, from))
2329                                 dst_release_immediate(&nrt6->dst);
2330                 }
2331                 rcu_read_unlock();
2332         }
2333 }
2334
2335 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2336                                struct sk_buff *skb, u32 mtu)
2337 {
2338         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2339 }
2340
2341 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2342                      int oif, u32 mark, kuid_t uid)
2343 {
2344         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2345         struct dst_entry *dst;
2346         struct flowi6 fl6;
2347
2348         memset(&fl6, 0, sizeof(fl6));
2349         fl6.flowi6_oif = oif;
2350         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2351         fl6.daddr = iph->daddr;
2352         fl6.saddr = iph->saddr;
2353         fl6.flowlabel = ip6_flowinfo(iph);
2354         fl6.flowi6_uid = uid;
2355
2356         dst = ip6_route_output(net, NULL, &fl6);
2357         if (!dst->error)
2358                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2359         dst_release(dst);
2360 }
2361 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2362
2363 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2364 {
2365         struct dst_entry *dst;
2366
2367         ip6_update_pmtu(skb, sock_net(sk), mtu,
2368                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2369
2370         dst = __sk_dst_get(sk);
2371         if (!dst || !dst->obsolete ||
2372             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2373                 return;
2374
2375         bh_lock_sock(sk);
2376         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2377                 ip6_datagram_dst_update(sk, false);
2378         bh_unlock_sock(sk);
2379 }
2380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2381
2382 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2383                            const struct flowi6 *fl6)
2384 {
2385 #ifdef CONFIG_IPV6_SUBTREES
2386         struct ipv6_pinfo *np = inet6_sk(sk);
2387 #endif
2388
2389         ip6_dst_store(sk, dst,
2390                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2391                       &sk->sk_v6_daddr : NULL,
2392 #ifdef CONFIG_IPV6_SUBTREES
2393                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2394                       &np->saddr :
2395 #endif
2396                       NULL);
2397 }
2398
/* Handle redirects */

/* Flow key used for redirect lookups: a flowi6 followed by the gateway
 * address the redirect is checked against.  fl6 has to stay the first
 * member, because __ip6_route_redirect() casts the struct flowi6
 * pointer it is given back to a struct ip6rd_flowi pointer.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;
};
2404
2405 static struct rt6_info *__ip6_route_redirect(struct net *net,
2406                                              struct fib6_table *table,
2407                                              struct flowi6 *fl6,
2408                                              const struct sk_buff *skb,
2409                                              int flags)
2410 {
2411         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2412         struct rt6_info *ret = NULL, *rt_cache;
2413         struct fib6_info *rt;
2414         struct fib6_node *fn;
2415
2416         /* Get the "current" route for this destination and
2417          * check if the redirect has come from appropriate router.
2418          *
2419          * RFC 4861 specifies that redirects should only be
2420          * accepted if they come from the nexthop to the target.
2421          * Due to the way the routes are chosen, this notion
2422          * is a bit fuzzy and one might need to check all possible
2423          * routes.
2424          */
2425
2426         rcu_read_lock();
2427         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2428 restart:
2429         for_each_fib6_node_rt_rcu(fn) {
2430                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2431                         continue;
2432                 if (fib6_check_expired(rt))
2433                         continue;
2434                 if (rt->fib6_flags & RTF_REJECT)
2435                         break;
2436                 if (!(rt->fib6_flags & RTF_GATEWAY))
2437                         continue;
2438                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2439                         continue;
2440                 /* rt_cache's gateway might be different from its 'parent'
2441                  * in the case of an ip redirect.
2442                  * So we keep searching in the exception table if the gateway
2443                  * is different.
2444                  */
2445                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2446                         rt_cache = rt6_find_cached_rt(rt,
2447                                                       &fl6->daddr,
2448                                                       &fl6->saddr);
2449                         if (rt_cache &&
2450                             ipv6_addr_equal(&rdfl->gateway,
2451                                             &rt_cache->rt6i_gateway)) {
2452                                 ret = rt_cache;
2453                                 break;
2454                         }
2455                         continue;
2456                 }
2457                 break;
2458         }
2459
2460         if (!rt)
2461                 rt = net->ipv6.fib6_null_entry;
2462         else if (rt->fib6_flags & RTF_REJECT) {
2463                 ret = net->ipv6.ip6_null_entry;
2464                 goto out;
2465         }
2466
2467         if (rt == net->ipv6.fib6_null_entry) {
2468                 fn = fib6_backtrack(fn, &fl6->saddr);
2469                 if (fn)
2470                         goto restart;
2471         }
2472
2473 out:
2474         if (ret)
2475                 ip6_hold_safe(net, &ret, true);
2476         else
2477                 ret = ip6_create_rt_rcu(rt);
2478
2479         rcu_read_unlock();
2480
2481         trace_fib6_table_lookup(net, rt, table, fl6);
2482         return ret;
2483 };
2484
2485 static struct dst_entry *ip6_route_redirect(struct net *net,
2486                                             const struct flowi6 *fl6,
2487                                             const struct sk_buff *skb,
2488                                             const struct in6_addr *gateway)
2489 {
2490         int flags = RT6_LOOKUP_F_HAS_SADDR;
2491         struct ip6rd_flowi rdfl;
2492
2493         rdfl.fl6 = *fl6;
2494         rdfl.gateway = *gateway;
2495
2496         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2497                                 flags, __ip6_route_redirect);
2498 }
2499
2500 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2501                   kuid_t uid)
2502 {
2503         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2504         struct dst_entry *dst;
2505         struct flowi6 fl6;
2506
2507         memset(&fl6, 0, sizeof(fl6));
2508         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2509         fl6.flowi6_oif = oif;
2510         fl6.flowi6_mark = mark;
2511         fl6.daddr = iph->daddr;
2512         fl6.saddr = iph->saddr;
2513         fl6.flowlabel = ip6_flowinfo(iph);
2514         fl6.flowi6_uid = uid;
2515
2516         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2517         rt6_do_redirect(dst, NULL, skb);
2518         dst_release(dst);
2519 }
2520 EXPORT_SYMBOL_GPL(ip6_redirect);
2521
2522 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2523                             u32 mark)
2524 {
2525         const struct ipv6hdr *iph = ipv6_hdr(skb);
2526         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2527         struct dst_entry *dst;
2528         struct flowi6 fl6;
2529
2530         memset(&fl6, 0, sizeof(fl6));
2531         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2532         fl6.flowi6_oif = oif;
2533         fl6.flowi6_mark = mark;
2534         fl6.daddr = msg->dest;
2535         fl6.saddr = iph->daddr;
2536         fl6.flowi6_uid = sock_net_uid(net, NULL);
2537
2538         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2539         rt6_do_redirect(dst, NULL, skb);
2540         dst_release(dst);
2541 }
2542
2543 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2544 {
2545         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2546                      sk->sk_uid);
2547 }
2548 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2549
2550 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2551 {
2552         struct net_device *dev = dst->dev;
2553         unsigned int mtu = dst_mtu(dst);
2554         struct net *net = dev_net(dev);
2555
2556         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2557
2558         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2559                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2560
2561         /*
2562          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2563          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2564          * IPV6_MAXPLEN is also valid and means: "any MSS,
2565          * rely only on pmtu discovery"
2566          */
2567         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2568                 mtu = IPV6_MAXPLEN;
2569         return mtu;
2570 }
2571
2572 static unsigned int ip6_mtu(const struct dst_entry *dst)
2573 {
2574         struct inet6_dev *idev;
2575         unsigned int mtu;
2576
2577         mtu = dst_metric_raw(dst, RTAX_MTU);
2578         if (mtu)
2579                 goto out;
2580
2581         mtu = IPV6_MIN_MTU;
2582
2583         rcu_read_lock();
2584         idev = __in6_dev_get(dst->dev);
2585         if (idev)
2586                 mtu = idev->cnf.mtu6;
2587         rcu_read_unlock();
2588
2589 out:
2590         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2591
2592         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2593 }
2594
2595 /* MTU selection:
2596  * 1. mtu on route is locked - use it
2597  * 2. mtu from nexthop exception
2598  * 3. mtu from egress device
2599  *
2600  * based on ip6_dst_mtu_forward and exception logic of
2601  * rt6_find_cached_rt; called with rcu_read_lock
2602  */
2603 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2604                       struct in6_addr *saddr)
2605 {
2606         struct rt6_exception_bucket *bucket;
2607         struct rt6_exception *rt6_ex;
2608         struct in6_addr *src_key;
2609         struct inet6_dev *idev;
2610         u32 mtu = 0;
2611
2612         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2613                 mtu = f6i->fib6_pmtu;
2614                 if (mtu)
2615                         goto out;
2616         }
2617
2618         src_key = NULL;
2619 #ifdef CONFIG_IPV6_SUBTREES
2620         if (f6i->fib6_src.plen)
2621                 src_key = saddr;
2622 #endif
2623
2624         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2625         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2626         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2627                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2628
2629         if (likely(!mtu)) {
2630                 struct net_device *dev = fib6_info_nh_dev(f6i);
2631
2632                 mtu = IPV6_MIN_MTU;
2633                 idev = __in6_dev_get(dev);
2634                 if (idev && idev->cnf.mtu6 > mtu)
2635                         mtu = idev->cnf.mtu6;
2636         }
2637
2638         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2639 out:
2640         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2641 }
2642
2643 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2644                                   struct flowi6 *fl6)
2645 {
2646         struct dst_entry *dst;
2647         struct rt6_info *rt;
2648         struct inet6_dev *idev = in6_dev_get(dev);
2649         struct net *net = dev_net(dev);
2650
2651         if (unlikely(!idev))
2652                 return ERR_PTR(-ENODEV);
2653
2654         rt = ip6_dst_alloc(net, dev, 0);
2655         if (unlikely(!rt)) {
2656                 in6_dev_put(idev);
2657                 dst = ERR_PTR(-ENOMEM);
2658                 goto out;
2659         }
2660
2661         rt->dst.flags |= DST_HOST;
2662         rt->dst.input = ip6_input;
2663         rt->dst.output  = ip6_output;
2664         rt->rt6i_gateway  = fl6->daddr;
2665         rt->rt6i_dst.addr = fl6->daddr;
2666         rt->rt6i_dst.plen = 128;
2667         rt->rt6i_idev     = idev;
2668         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2669
2670         /* Add this dst into uncached_list so that rt6_disable_ip() can
2671          * do proper release of the net_device
2672          */
2673         rt6_uncached_list_add(rt);
2674         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2675
2676         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2677
2678 out:
2679         return dst;
2680 }
2681
2682 static int ip6_dst_gc(struct dst_ops *ops)
2683 {
2684         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2685         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2686         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2687         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2688         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2689         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2690         int entries;
2691
2692         entries = dst_entries_get_fast(ops);
2693         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2694             entries <= rt_max_size)
2695                 goto out;
2696
2697         net->ipv6.ip6_rt_gc_expire++;
2698         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2699         entries = dst_entries_get_slow(ops);
2700         if (entries < ops->gc_thresh)
2701                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2702 out:
2703         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2704         return entries > rt_max_size;
2705 }
2706
2707 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2708                                struct fib6_config *cfg)
2709 {
2710         struct dst_metrics *p;
2711
2712         if (!cfg->fc_mx)
2713                 return 0;
2714
2715         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2716         if (unlikely(!p))
2717                 return -ENOMEM;
2718
2719         refcount_set(&p->refcnt, 1);
2720         rt->fib6_metrics = p;
2721
2722         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2723 }
2724
2725 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2726                                             struct fib6_config *cfg,
2727                                             const struct in6_addr *gw_addr,
2728                                             u32 tbid, int flags)
2729 {
2730         struct flowi6 fl6 = {
2731                 .flowi6_oif = cfg->fc_ifindex,
2732                 .daddr = *gw_addr,
2733                 .saddr = cfg->fc_prefsrc,
2734         };
2735         struct fib6_table *table;
2736         struct rt6_info *rt;
2737
2738         table = fib6_get_table(net, tbid);
2739         if (!table)
2740                 return NULL;
2741
2742         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2743                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2744
2745         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2746         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2747
2748         /* if table lookup failed, fall back to full lookup */
2749         if (rt == net->ipv6.ip6_null_entry) {
2750                 ip6_rt_put(rt);
2751                 rt = NULL;
2752         }
2753
2754         return rt;
2755 }
2756
2757 static int ip6_route_check_nh_onlink(struct net *net,
2758                                      struct fib6_config *cfg,
2759                                      const struct net_device *dev,
2760                                      struct netlink_ext_ack *extack)
2761 {
2762         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2763         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2764         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2765         struct rt6_info *grt;
2766         int err;
2767
2768         err = 0;
2769         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2770         if (grt) {
2771                 if (!grt->dst.error &&
2772                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2773                         NL_SET_ERR_MSG(extack,
2774                                        "Nexthop has invalid gateway or device mismatch");
2775                         err = -EINVAL;
2776                 }
2777
2778                 ip6_rt_put(grt);
2779         }
2780
2781         return err;
2782 }
2783
2784 static int ip6_route_check_nh(struct net *net,
2785                               struct fib6_config *cfg,
2786                               struct net_device **_dev,
2787                               struct inet6_dev **idev)
2788 {
2789         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2790         struct net_device *dev = _dev ? *_dev : NULL;
2791         struct rt6_info *grt = NULL;
2792         int err = -EHOSTUNREACH;
2793
2794         if (cfg->fc_table) {
2795                 int flags = RT6_LOOKUP_F_IFACE;
2796
2797                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2798                                           cfg->fc_table, flags);
2799                 if (grt) {
2800                         if (grt->rt6i_flags & RTF_GATEWAY ||
2801                             (dev && dev != grt->dst.dev)) {
2802                                 ip6_rt_put(grt);
2803                                 grt = NULL;
2804                         }
2805                 }
2806         }
2807
2808         if (!grt)
2809                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2810
2811         if (!grt)
2812                 goto out;
2813
2814         if (dev) {
2815                 if (dev != grt->dst.dev) {
2816                         ip6_rt_put(grt);
2817                         goto out;
2818                 }
2819         } else {
2820                 *_dev = dev = grt->dst.dev;
2821                 *idev = grt->rt6i_idev;
2822                 dev_hold(dev);
2823                 in6_dev_hold(grt->rt6i_idev);
2824         }
2825
2826         if (!(grt->rt6i_flags & RTF_GATEWAY))
2827                 err = 0;
2828
2829         ip6_rt_put(grt);
2830
2831 out:
2832         return err;
2833 }
2834
2835 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2836                            struct net_device **_dev, struct inet6_dev **idev,
2837                            struct netlink_ext_ack *extack)
2838 {
2839         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2840         int gwa_type = ipv6_addr_type(gw_addr);
2841         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2842         const struct net_device *dev = *_dev;
2843         bool need_addr_check = !dev;
2844         int err = -EINVAL;
2845
2846         /* if gw_addr is local we will fail to detect this in case
2847          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2848          * will return already-added prefix route via interface that
2849          * prefix route was assigned to, which might be non-loopback.
2850          */
2851         if (dev &&
2852             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2853                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2854                 goto out;
2855         }
2856
2857         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2858                 /* IPv6 strictly inhibits using not link-local
2859                  * addresses as nexthop address.
2860                  * Otherwise, router will not able to send redirects.
2861                  * It is very good, but in some (rare!) circumstances
2862                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2863                  * some exceptions. --ANK
2864                  * We allow IPv4-mapped nexthops to support RFC4798-type
2865                  * addressing
2866                  */
2867                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2868                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2869                         goto out;
2870                 }
2871
2872                 if (cfg->fc_flags & RTNH_F_ONLINK)
2873                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2874                 else
2875                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2876
2877                 if (err)
2878                         goto out;
2879         }
2880
2881         /* reload in case device was changed */
2882         dev = *_dev;
2883
2884         err = -EINVAL;
2885         if (!dev) {
2886                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2887                 goto out;
2888         } else if (dev->flags & IFF_LOOPBACK) {
2889                 NL_SET_ERR_MSG(extack,
2890                                "Egress device can not be loopback device for this route");
2891                 goto out;
2892         }
2893
2894         /* if we did not check gw_addr above, do so now that the
2895          * egress device has been resolved.
2896          */
2897         if (need_addr_check &&
2898             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2899                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2900                 goto out;
2901         }
2902
2903         err = 0;
2904 out:
2905         return err;
2906 }
2907
/*
 * ip6_route_info_create - build a fib6_info from a route configuration.
 * @cfg:       route configuration; fc_metric and fc_protocol are rewritten
 *             in place with their defaults when they are left unset
 * @gfp_flags: allocation flags passed to fib6_info_alloc()
 * @extack:    netlink extended ack used to report validation failures
 *
 * The entry is only created here, not inserted (ip6_route_add() does
 * that through __ip6_ins_rt()).
 *
 * Returns the new fib6_info, or ERR_PTR(-errno) on failure. On success
 * the net_device reference taken here is kept and stored in
 * rt->fib6_nh.nh_dev, while the inet6_dev reference is dropped. On
 * failure both references and the partially built entry are released.
 */
2908 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2909                                               gfp_t gfp_flags,
2910                                               struct netlink_ext_ack *extack)
2911 {
2912         struct net *net = cfg->fc_nlinfo.nl_net;
2913         struct fib6_info *rt = NULL;
2914         struct net_device *dev = NULL;
2915         struct inet6_dev *idev = NULL;
2916         struct fib6_table *table;
2917         int addr_type;
2918         int err = -EINVAL;
2919
2920         /* RTF_PCPU is an internal flag; can not be set by userspace */
2921         if (cfg->fc_flags & RTF_PCPU) {
2922                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2923                 goto out;
2924         }
2925
2926         /* RTF_CACHE is an internal flag; can not be set by userspace */
2927         if (cfg->fc_flags & RTF_CACHE) {
2928                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2929                 goto out;
2930         }
2931
2932         if (cfg->fc_type > RTN_MAX) {
2933                 NL_SET_ERR_MSG(extack, "Invalid route type");
2934                 goto out;
2935         }
2936
2937         if (cfg->fc_dst_len > 128) {
2938                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2939                 goto out;
2940         }
2941         if (cfg->fc_src_len > 128) {
2942                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2943                 goto out;
2944         }
2945 #ifndef CONFIG_IPV6_SUBTREES
2946         if (cfg->fc_src_len) {
2947                 NL_SET_ERR_MSG(extack,
2948                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2949                 goto out;
2950         }
2951 #endif
             /* Take references on the requested egress device and its
              * inet6_dev; the error path at "out" drops whichever is held.
              */
2952         if (cfg->fc_ifindex) {
2953                 err = -ENODEV;
2954                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2955                 if (!dev)
2956                         goto out;
2957                 idev = in6_dev_get(dev);
2958                 if (!idev)
2959                         goto out;
2960         }
2961
2962         if (cfg->fc_metric == 0)
2963                 cfg->fc_metric = IP6_RT_PRIO_USER;
2964
2965         if (cfg->fc_flags & RTNH_F_ONLINK) {
2966                 if (!dev) {
2967                         NL_SET_ERR_MSG(extack,
2968                                        "Nexthop device required for onlink");
2969                         err = -ENODEV;
2970                         goto out;
2971                 }
2972
2973                 if (!(dev->flags & IFF_UP)) {
2974                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2975                         err = -ENETDOWN;
2976                         goto out;
2977                 }
2978         }
2979
             /* A netlink request without NLM_F_CREATE first looks for an
              * existing table; a missing table is still created, but with
              * a warning. All other requests create the table on demand.
              */
2980         err = -ENOBUFS;
2981         if (cfg->fc_nlinfo.nlh &&
2982             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2983                 table = fib6_get_table(net, cfg->fc_table);
2984                 if (!table) {
2985                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2986                         table = fib6_new_table(net, cfg->fc_table);
2987                 }
2988         } else {
2989                 table = fib6_new_table(net, cfg->fc_table);
2990         }
2991
2992         if (!table)
2993                 goto out;
2994
2995         err = -ENOMEM;
2996         rt = fib6_info_alloc(gfp_flags);
2997         if (!rt)
2998                 goto out;
2999
3000         if (cfg->fc_flags & RTF_ADDRCONF)
3001                 rt->dst_nocount = true;
3002
3003         err = ip6_convert_metrics(net, rt, cfg);
3004         if (err < 0)
3005                 goto out;
3006
3007         if (cfg->fc_flags & RTF_EXPIRES)
3008                 fib6_set_expires(rt, jiffies +
3009                                 clock_t_to_jiffies(cfg->fc_expires));
3010         else
3011                 fib6_clean_expires(rt);
3012
3013         if (cfg->fc_protocol == RTPROT_UNSPEC)
3014                 cfg->fc_protocol = RTPROT_BOOT;
3015         rt->fib6_protocol = cfg->fc_protocol;
3016
3017         addr_type = ipv6_addr_type(&cfg->fc_dst);
3018
3019         if (cfg->fc_encap) {
3020                 struct lwtunnel_state *lwtstate;
3021
3022                 err = lwtunnel_build_state(cfg->fc_encap_type,
3023                                            cfg->fc_encap, AF_INET6, cfg,
3024                                            &lwtstate, extack);
3025                 if (err)
3026                         goto out;
3027                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3028         }
3029
3030         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3031         rt->fib6_dst.plen = cfg->fc_dst_len;
3032         if (rt->fib6_dst.plen == 128)
3033                 rt->dst_host = true;
3034
3035 #ifdef CONFIG_IPV6_SUBTREES
3036         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3037         rt->fib6_src.plen = cfg->fc_src_len;
3038 #endif
3039
3040         rt->fib6_metric = cfg->fc_metric;
3041         rt->fib6_nh.nh_weight = 1;
3042
3043         rt->fib6_type = cfg->fc_type;
3044
3045         /* We cannot add true routes via loopback here,
3046            they would result in kernel looping; promote them to reject routes
3047          */
3048         if ((cfg->fc_flags & RTF_REJECT) ||
3049             (dev && (dev->flags & IFF_LOOPBACK) &&
3050              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3051              !(cfg->fc_flags & RTF_LOCAL))) {
3052                 /* hold loopback dev/idev if we haven't done so. */
3053                 if (dev != net->loopback_dev) {
3054                         if (dev) {
3055                                 dev_put(dev);
3056                                 in6_dev_put(idev);
3057                         }
3058                         dev = net->loopback_dev;
3059                         dev_hold(dev);
3060                         idev = in6_dev_get(dev);
3061                         if (!idev) {
3062                                 err = -ENODEV;
3063                                 goto out;
3064                         }
3065                 }
                     /* Reject routes skip the gateway, device-state and
                      * preferred-source checks below.
                      */
3066                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3067                 goto install_route;
3068         }
3069
3070         if (cfg->fc_flags & RTF_GATEWAY) {
3071                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3072                 if (err)
3073                         goto out;
3074
3075                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3076         }
3077
             /* dev may have been filled in by ip6_validate_gw(); a route
              * that still has no egress device is rejected.
              */
3078         err = -ENODEV;
3079         if (!dev)
3080                 goto out;
3081
3082         if (idev->cnf.disable_ipv6) {
3083                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3084                 err = -EACCES;
3085                 goto out;
3086         }
3087
3088         if (!(dev->flags & IFF_UP)) {
3089                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3090                 err = -ENETDOWN;
3091                 goto out;
3092         }
3093
3094         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3095                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3096                         NL_SET_ERR_MSG(extack, "Invalid source address");
3097                         err = -EINVAL;
3098                         goto out;
3099                 }
3100                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3101                 rt->fib6_prefsrc.plen = 128;
3102         } else
3103                 rt->fib6_prefsrc.plen = 0;
3104
3105         rt->fib6_flags = cfg->fc_flags;
3106
     /* Common tail for regular and reject routes: the dev reference is
      * kept in the nexthop, the idev reference is dropped.
      */
3107 install_route:
3108         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3109             !netif_carrier_ok(dev))
3110                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3111         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3112         rt->fib6_nh.nh_dev = dev;
3113         rt->fib6_table = table;
3114
3115         if (idev)
3116                 in6_dev_put(idev);
3117
3118         return rt;
     /* Error exit: rt may still be NULL when this is reached. */
3119 out:
3120         if (dev)
3121                 dev_put(dev);
3122         if (idev)
3123                 in6_dev_put(idev);
3124
3125         fib6_info_release(rt);
3126         return ERR_PTR(err);
3127 }
3128
3129 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3130                   struct netlink_ext_ack *extack)
3131 {
3132         struct fib6_info *rt;
3133         int err;
3134
3135         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3136         if (IS_ERR(rt))
3137                 return PTR_ERR(rt);
3138
3139         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3140         fib6_info_release(rt);
3141
3142         return err;
3143 }
3144
3145 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3146 {
3147         struct net *net = info->nl_net;
3148         struct fib6_table *table;
3149         int err;
3150
3151         if (rt == net->ipv6.fib6_null_entry) {
3152                 err = -ENOENT;
3153                 goto out;
3154         }
3155
3156         table = rt->fib6_table;
3157         spin_lock_bh(&table->tb6_lock);
3158         err = fib6_del(rt, info);
3159         spin_unlock_bh(&table->tb6_lock);
3160
3161 out:
3162         fib6_info_release(rt);
3163         return err;
3164 }
3165
3166 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3167 {
3168         struct nl_info info = { .nl_net = net };
3169
3170         return __ip6_del_rt(rt, &info);
3171 }
3172
3173 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3174 {
3175         struct nl_info *info = &cfg->fc_nlinfo;
3176         struct net *net = info->nl_net;
3177         struct sk_buff *skb = NULL;
3178         struct fib6_table *table;
3179         int err = -ENOENT;
3180
3181         if (rt == net->ipv6.fib6_null_entry)
3182                 goto out_put;
3183         table = rt->fib6_table;
3184         spin_lock_bh(&table->tb6_lock);
3185
3186         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3187                 struct fib6_info *sibling, *next_sibling;
3188
3189                 /* prefer to send a single notification with all hops */
3190                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3191                 if (skb) {
3192                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3193
3194                         if (rt6_fill_node(net, skb, rt, NULL,
3195                                           NULL, NULL, 0, RTM_DELROUTE,
3196                                           info->portid, seq, 0) < 0) {
3197                                 kfree_skb(skb);
3198                                 skb = NULL;
3199                         } else
3200                                 info->skip_notify = 1;
3201                 }
3202
3203                 list_for_each_entry_safe(sibling, next_sibling,
3204                                          &rt->fib6_siblings,
3205                                          fib6_siblings) {
3206                         err = fib6_del(sibling, info);
3207                         if (err)
3208                                 goto out_unlock;
3209                 }
3210         }
3211
3212         err = fib6_del(rt, info);
3213 out_unlock:
3214         spin_unlock_bh(&table->tb6_lock);
3215 out_put:
3216         fib6_info_release(rt);
3217
3218         if (skb) {
3219                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3220                             info->nlh, gfp_any());
3221         }
3222         return err;
3223 }
3224
3225 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3226 {
3227         int rc = -ESRCH;
3228
3229         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3230                 goto out;
3231
3232         if (cfg->fc_flags & RTF_GATEWAY &&
3233             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3234                 goto out;
3235         if (dst_hold_safe(&rt->dst))
3236                 rc = rt6_remove_exception_rt(rt);
3237 out:
3238         return rc;
3239 }
3240
/*
 * ip6_route_del - delete the route described by @cfg.
 * @cfg:    delete request: table, destination/source prefix and the
 *          optional ifindex, gateway, metric and protocol filters
 * @extack: netlink extended ack for error reporting
 *
 * With RTF_CACHE set in cfg->fc_flags only cached exception routes are
 * candidates. Otherwise the first FIB entry that passes all filters is
 * deleted, together with its siblings unless a gateway was given.
 *
 * Returns -ESRCH when the table or a matching route does not exist,
 * otherwise the result of the helper that performed the deletion.
 */
3241 static int ip6_route_del(struct fib6_config *cfg,
3242                          struct netlink_ext_ack *extack)
3243 {
3244         struct rt6_info *rt_cache;
3245         struct fib6_table *table;
3246         struct fib6_info *rt;
3247         struct fib6_node *fn;
3248         int err = -ESRCH;
3249
3250         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3251         if (!table) {
3252                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3253                 return err;
3254         }
3255
3256         rcu_read_lock();
3257
3258         fn = fib6_locate(&table->tb6_root,
3259                          &cfg->fc_dst, cfg->fc_dst_len,
3260                          &cfg->fc_src, cfg->fc_src_len,
3261                          !(cfg->fc_flags & RTF_CACHE));
3262
3263         if (fn) {
                     /* NOTE(review): rt is never assigned explicitly in this
                      * function; the iteration macro presumably uses it as
                      * the cursor - confirm against the macro definition.
                      */
3264                 for_each_fib6_node_rt_rcu(fn) {
3265                         if (cfg->fc_flags & RTF_CACHE) {
3266                                 int rc;
3267
3268                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3269                                                               &cfg->fc_src);
3270                                 if (rt_cache) {
3271                                         rc = ip6_del_cached_rt(rt_cache, cfg);
                                             /* -ESRCH means "no match": keep
                                              * scanning. Anything else is final.
                                              */
3272                                         if (rc != -ESRCH) {
3273                                                 rcu_read_unlock();
3274                                                 return rc;
3275                                         }
3276                                 }
3277                                 continue;
3278                         }
                             /* Filters set in cfg must all match this entry. */
3279                         if (cfg->fc_ifindex &&
3280                             (!rt->fib6_nh.nh_dev ||
3281                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3282                                 continue;
3283                         if (cfg->fc_flags & RTF_GATEWAY &&
3284                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3285                                 continue;
3286                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3287                                 continue;
3288                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3289                                 continue;
                             /* Take a reference before leaving the RCU section;
                              * the delete helpers below release it.
                              */
3290                         if (!fib6_info_hold_safe(rt))
3291                                 continue;
3292                         rcu_read_unlock();
3293
3294                         /* if gateway was specified only delete the one hop */
3295                         if (cfg->fc_flags & RTF_GATEWAY)
3296                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3297
3298                         return __ip6_del_rt_siblings(rt, cfg);
3299                 }
3300         }
3301         rcu_read_unlock();
3302
3303         return err;
3304 }
3305
3306 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3307 {
3308         struct netevent_redirect netevent;
3309         struct rt6_info *rt, *nrt = NULL;
3310         struct ndisc_options ndopts;
3311         struct inet6_dev *in6_dev;
3312         struct neighbour *neigh;
3313         struct fib6_info *from;
3314         struct rd_msg *msg;
3315         int optlen, on_link;
3316         u8 *lladdr;
3317
3318         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3319         optlen -= sizeof(*msg);
3320
3321         if (optlen < 0) {
3322                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3323                 return;
3324         }
3325
3326         msg = (struct rd_msg *)icmp6_hdr(skb);
3327
3328         if (ipv6_addr_is_multicast(&msg->dest)) {
3329                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3330                 return;
3331         }
3332
3333         on_link = 0;
3334         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3335                 on_link = 1;
3336         } else if (ipv6_addr_type(&msg->target) !=
3337                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3338                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3339                 return;
3340         }
3341
3342         in6_dev = __in6_dev_get(skb->dev);
3343         if (!in6_dev)
3344                 return;
3345         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3346                 return;
3347
3348         /* RFC2461 8.1:
3349          *      The IP source address of the Redirect MUST be the same as the current
3350          *      first-hop router for the specified ICMP Destination Address.
3351          */
3352
3353         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3354                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3355                 return;
3356         }
3357
3358         lladdr = NULL;
3359         if (ndopts.nd_opts_tgt_lladdr) {
3360                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3361                                              skb->dev);
3362                 if (!lladdr) {
3363                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3364                         return;
3365                 }
3366         }
3367
3368         rt = (struct rt6_info *) dst;
3369         if (rt->rt6i_flags & RTF_REJECT) {
3370                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3371                 return;
3372         }
3373
3374         /* Redirect received -> path was valid.
3375          * Look, redirects are sent only in response to data packets,
3376          * so that this nexthop apparently is reachable. --ANK
3377          */
3378         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3379
3380         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3381         if (!neigh)
3382                 return;
3383
3384         /*
3385          *      We have finally decided to accept it.
3386          */
3387
3388         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3389                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3390                      NEIGH_UPDATE_F_OVERRIDE|
3391                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3392                                      NEIGH_UPDATE_F_ISROUTER)),
3393                      NDISC_REDIRECT, &ndopts);
3394
3395         rcu_read_lock();
3396         from = rcu_dereference(rt->from);
3397         /* This fib6_info_hold() is safe here because we hold reference to rt
3398          * and rt already holds reference to fib6_info.
3399          */
3400         fib6_info_hold(from);
3401         rcu_read_unlock();
3402
3403         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3404         if (!nrt)
3405                 goto out;
3406
3407         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3408         if (on_link)
3409                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3410
3411         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3412
3413         /* No need to remove rt from the exception table if rt is
3414          * a cached route because rt6_insert_exception() will
3415          * takes care of it
3416          */
3417         if (rt6_insert_exception(nrt, from)) {
3418                 dst_release_immediate(&nrt->dst);
3419                 goto out;
3420         }
3421
3422         netevent.old = &rt->dst;
3423         netevent.new = &nrt->dst;
3424         netevent.daddr = &msg->dest;
3425         netevent.neigh = neigh;
3426         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3427
3428 out:
3429         fib6_info_release(from);
3430         neigh_release(neigh);
3431 }
3432
3433 #ifdef CONFIG_IPV6_ROUTE_INFO
3434 static struct fib6_info *rt6_get_route_info(struct net *net,
3435                                            const struct in6_addr *prefix, int prefixlen,
3436                                            const struct in6_addr *gwaddr,
3437                                            struct net_device *dev)
3438 {
3439         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3440         int ifindex = dev->ifindex;
3441         struct fib6_node *fn;
3442         struct fib6_info *rt = NULL;
3443         struct fib6_table *table;
3444
3445         table = fib6_get_table(net, tb_id);
3446         if (!table)
3447                 return NULL;
3448
3449         rcu_read_lock();
3450         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3451         if (!fn)
3452                 goto out;
3453
3454         for_each_fib6_node_rt_rcu(fn) {
3455                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3456                         continue;
3457                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3458                         continue;
3459                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3460                         continue;
3461                 if (!fib6_info_hold_safe(rt))
3462                         continue;
3463                 break;
3464         }
3465 out:
3466         rcu_read_unlock();
3467         return rt;
3468 }
3469
3470 static struct fib6_info *rt6_add_route_info(struct net *net,
3471                                            const struct in6_addr *prefix, int prefixlen,
3472                                            const struct in6_addr *gwaddr,
3473                                            struct net_device *dev,
3474                                            unsigned int pref)
3475 {
3476         struct fib6_config cfg = {
3477                 .fc_metric      = IP6_RT_PRIO_USER,
3478                 .fc_ifindex     = dev->ifindex,
3479                 .fc_dst_len     = prefixlen,
3480                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3481                                   RTF_UP | RTF_PREF(pref),
3482                 .fc_protocol = RTPROT_RA,
3483                 .fc_type = RTN_UNICAST,
3484                 .fc_nlinfo.portid = 0,
3485                 .fc_nlinfo.nlh = NULL,
3486                 .fc_nlinfo.nl_net = net,
3487         };
3488
3489         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3490         cfg.fc_dst = *prefix;
3491         cfg.fc_gateway = *gwaddr;
3492
3493         /* We should treat it as a default route if prefix length is 0. */
3494         if (!prefixlen)
3495                 cfg.fc_flags |= RTF_DEFAULT;
3496
3497         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3498
3499         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3500 }
3501 #endif
3502
3503 struct fib6_info *rt6_get_dflt_router(struct net *net,
3504                                      const struct in6_addr *addr,
3505                                      struct net_device *dev)
3506 {
3507         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3508         struct fib6_info *rt;
3509         struct fib6_table *table;
3510
3511         table = fib6_get_table(net, tb_id);
3512         if (!table)
3513                 return NULL;
3514
3515         rcu_read_lock();
3516         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3517                 if (dev == rt->fib6_nh.nh_dev &&
3518                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3519                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3520                         break;
3521         }
3522         if (rt && !fib6_info_hold_safe(rt))
3523                 rt = NULL;
3524         rcu_read_unlock();
3525         return rt;
3526 }
3527
3528 struct fib6_info *rt6_add_dflt_router(struct net *net,
3529                                      const struct in6_addr *gwaddr,
3530                                      struct net_device *dev,
3531                                      unsigned int pref)
3532 {
3533         struct fib6_config cfg = {
3534                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3535                 .fc_metric      = IP6_RT_PRIO_USER,
3536                 .fc_ifindex     = dev->ifindex,
3537                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3538                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3539                 .fc_protocol = RTPROT_RA,
3540                 .fc_type = RTN_UNICAST,
3541                 .fc_nlinfo.portid = 0,
3542                 .fc_nlinfo.nlh = NULL,
3543                 .fc_nlinfo.nl_net = net,
3544         };
3545
3546         cfg.fc_gateway = *gwaddr;
3547
3548         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3549                 struct fib6_table *table;
3550
3551                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3552                 if (table)
3553                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3554         }
3555
3556         return rt6_get_dflt_router(net, gwaddr, dev);
3557 }
3558
3559 static void __rt6_purge_dflt_routers(struct net *net,
3560                                      struct fib6_table *table)
3561 {
3562         struct fib6_info *rt;
3563
3564 restart:
3565         rcu_read_lock();
3566         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3567                 struct net_device *dev = fib6_info_nh_dev(rt);
3568                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3569
3570                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3571                     (!idev || idev->cnf.accept_ra != 2) &&
3572                     fib6_info_hold_safe(rt)) {
3573                         rcu_read_unlock();
3574                         ip6_del_rt(net, rt);
3575                         goto restart;
3576                 }
3577         }
3578         rcu_read_unlock();
3579
3580         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3581 }
3582
3583 void rt6_purge_dflt_routers(struct net *net)
3584 {
3585         struct fib6_table *table;
3586         struct hlist_head *head;
3587         unsigned int h;
3588
3589         rcu_read_lock();
3590
3591         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3592                 head = &net->ipv6.fib_table_hash[h];
3593                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3594                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3595                                 __rt6_purge_dflt_routers(net, table);
3596                 }
3597         }
3598
3599         rcu_read_unlock();
3600 }
3601
3602 static void rtmsg_to_fib6_config(struct net *net,
3603                                  struct in6_rtmsg *rtmsg,
3604                                  struct fib6_config *cfg)
3605 {
3606         memset(cfg, 0, sizeof(*cfg));
3607
3608         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3609                          : RT6_TABLE_MAIN;
3610         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3611         cfg->fc_metric = rtmsg->rtmsg_metric;
3612         cfg->fc_expires = rtmsg->rtmsg_info;
3613         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3614         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3615         cfg->fc_flags = rtmsg->rtmsg_flags;
3616         cfg->fc_type = rtmsg->rtmsg_type;
3617
3618         cfg->fc_nlinfo.nl_net = net;
3619
3620         cfg->fc_dst = rtmsg->rtmsg_dst;
3621         cfg->fc_src = rtmsg->rtmsg_src;
3622         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3623 }
3624
3625 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3626 {
3627         struct fib6_config cfg;
3628         struct in6_rtmsg rtmsg;
3629         int err;
3630
3631         switch (cmd) {
3632         case SIOCADDRT:         /* Add a route */
3633         case SIOCDELRT:         /* Delete a route */
3634                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3635                         return -EPERM;
3636                 err = copy_from_user(&rtmsg, arg,
3637                                      sizeof(struct in6_rtmsg));
3638                 if (err)
3639                         return -EFAULT;
3640
3641                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3642
3643                 rtnl_lock();
3644                 switch (cmd) {
3645                 case SIOCADDRT:
3646                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3647                         break;
3648                 case SIOCDELRT:
3649                         err = ip6_route_del(&cfg, NULL);
3650                         break;
3651                 default:
3652                         err = -EINVAL;
3653                 }
3654                 rtnl_unlock();
3655
3656                 return err;
3657         }
3658
3659         return -EINVAL;
3660 }
3661
3662 /*
3663  *      Drop the packet on the floor
3664  */
3665
3666 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3667 {
3668         int type;
3669         struct dst_entry *dst = skb_dst(skb);
3670         switch (ipstats_mib_noroutes) {
3671         case IPSTATS_MIB_INNOROUTES:
3672                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3673                 if (type == IPV6_ADDR_ANY) {
3674                         IP6_INC_STATS(dev_net(dst->dev),
3675                                       __in6_dev_get_safely(skb->dev),
3676                                       IPSTATS_MIB_INADDRERRORS);
3677                         break;
3678                 }
3679                 /* FALLTHROUGH */
3680         case IPSTATS_MIB_OUTNOROUTES:
3681                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3682                               ipstats_mib_noroutes);
3683                 break;
3684         }
3685         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3686         kfree_skb(skb);
3687         return 0;
3688 }
3689
3690 static int ip6_pkt_discard(struct sk_buff *skb)
3691 {
3692         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3693 }
3694
3695 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3696 {
3697         skb->dev = skb_dst(skb)->dev;
3698         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3699 }
3700
3701 static int ip6_pkt_prohibit(struct sk_buff *skb)
3702 {
3703         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3704 }
3705
3706 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3707 {
3708         skb->dev = skb_dst(skb)->dev;
3709         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3710 }
3711
3712 /*
3713  *      Allocate a dst for local (unicast / anycast) address.
3714  */
3715
3716 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3717                                      struct inet6_dev *idev,
3718                                      const struct in6_addr *addr,
3719                                      bool anycast, gfp_t gfp_flags)
3720 {
3721         u32 tb_id;
3722         struct net_device *dev = idev->dev;
3723         struct fib6_info *f6i;
3724
3725         f6i = fib6_info_alloc(gfp_flags);
3726         if (!f6i)
3727                 return ERR_PTR(-ENOMEM);
3728
3729         f6i->dst_nocount = true;
3730         f6i->dst_host = true;
3731         f6i->fib6_protocol = RTPROT_KERNEL;
3732         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3733         if (anycast) {
3734                 f6i->fib6_type = RTN_ANYCAST;
3735                 f6i->fib6_flags |= RTF_ANYCAST;
3736         } else {
3737                 f6i->fib6_type = RTN_LOCAL;
3738                 f6i->fib6_flags |= RTF_LOCAL;
3739         }
3740
3741         f6i->fib6_nh.nh_gw = *addr;
3742         dev_hold(dev);
3743         f6i->fib6_nh.nh_dev = dev;
3744         f6i->fib6_dst.addr = *addr;
3745         f6i->fib6_dst.plen = 128;
3746         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3747         f6i->fib6_table = fib6_get_table(net, tb_id);
3748
3749         return f6i;
3750 }
3751
/* remove deleted ip from prefsrc entries
 *
 * Walker argument handed to fib6_remove_prefsrc() via fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace, used to skip its fib6_null_entry */
	struct in6_addr *addr;		/* address compared against fib6_prefsrc.addr */
};
3758
3759 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3760 {
3761         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3762         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3763         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3764
3765         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3766             rt != net->ipv6.fib6_null_entry &&
3767             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3768                 spin_lock_bh(&rt6_exception_lock);
3769                 /* remove prefsrc entry */
3770                 rt->fib6_prefsrc.plen = 0;
3771                 spin_unlock_bh(&rt6_exception_lock);
3772         }
3773         return 0;
3774 }
3775
3776 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3777 {
3778         struct net *net = dev_net(ifp->idev->dev);
3779         struct arg_dev_net_ip adni = {
3780                 .dev = ifp->idev->dev,
3781                 .net = net,
3782                 .addr = &ifp->addr,
3783         };
3784         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3785 }
3786
/* Flag set that fib6_clean_tohost() requires in full before removing a route. */
#define RTF_RA_ROUTER		(RTF_GATEWAY | RTF_DEFAULT | RTF_ADDRCONF)
3788
3789 /* Remove routers and update dst entries when gateway turn into host. */
3790 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3791 {
3792         struct in6_addr *gateway = (struct in6_addr *)arg;
3793
3794         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3795             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3796                 return -1;
3797         }
3798
3799         /* Further clean up cached routes in exception table.
3800          * This is needed because cached route may have a different
3801          * gateway than its 'parent' in the case of an ip redirect.
3802          */
3803         rt6_exceptions_clean_tohost(rt, gateway);
3804
3805         return 0;
3806 }
3807
3808 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3809 {
3810         fib6_clean_all(net, fib6_clean_tohost, gateway);
3811 }
3812
/* Walker argument shared by fib6_ifup() and fib6_ifdown(). */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event code */
	};
};
3820
3821 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3822 {
3823         struct fib6_info *iter;
3824         struct fib6_node *fn;
3825
3826         fn = rcu_dereference_protected(rt->fib6_node,
3827                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3828         iter = rcu_dereference_protected(fn->leaf,
3829                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3830         while (iter) {
3831                 if (iter->fib6_metric == rt->fib6_metric &&
3832                     rt6_qualify_for_ecmp(iter))
3833                         return iter;
3834                 iter = rcu_dereference_protected(iter->fib6_next,
3835                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3836         }
3837
3838         return NULL;
3839 }
3840
3841 static bool rt6_is_dead(const struct fib6_info *rt)
3842 {
3843         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3844             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3845              fib6_ignore_linkdown(rt)))
3846                 return true;
3847
3848         return false;
3849 }
3850
3851 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3852 {
3853         struct fib6_info *iter;
3854         int total = 0;
3855
3856         if (!rt6_is_dead(rt))
3857                 total += rt->fib6_nh.nh_weight;
3858
3859         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3860                 if (!rt6_is_dead(iter))
3861                         total += iter->fib6_nh.nh_weight;
3862         }
3863
3864         return total;
3865 }
3866
3867 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3868 {
3869         int upper_bound = -1;
3870
3871         if (!rt6_is_dead(rt)) {
3872                 *weight += rt->fib6_nh.nh_weight;
3873                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3874                                                     total) - 1;
3875         }
3876         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3877 }
3878
3879 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3880 {
3881         struct fib6_info *iter;
3882         int weight = 0;
3883
3884         rt6_upper_bound_set(rt, &weight, total);
3885
3886         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3887                 rt6_upper_bound_set(iter, &weight, total);
3888 }
3889
3890 void rt6_multipath_rebalance(struct fib6_info *rt)
3891 {
3892         struct fib6_info *first;
3893         int total;
3894
3895         /* In case the entire multipath route was marked for flushing,
3896          * then there is no need to rebalance upon the removal of every
3897          * sibling route.
3898          */
3899         if (!rt->fib6_nsiblings || rt->should_flush)
3900                 return;
3901
3902         /* During lookup routes are evaluated in order, so we need to
3903          * make sure upper bounds are assigned from the first sibling
3904          * onwards.
3905          */
3906         first = rt6_multipath_first_sibling(rt);
3907         if (WARN_ON_ONCE(!first))
3908                 return;
3909
3910         total = rt6_multipath_total_weight(first);
3911         rt6_multipath_upper_bound_set(first, total);
3912 }
3913
3914 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3915 {
3916         const struct arg_netdev_event *arg = p_arg;
3917         struct net *net = dev_net(arg->dev);
3918
3919         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3920                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3921                 fib6_update_sernum_upto_root(net, rt);
3922                 rt6_multipath_rebalance(rt);
3923         }
3924
3925         return 0;
3926 }
3927
3928 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3929 {
3930         struct arg_netdev_event arg = {
3931                 .dev = dev,
3932                 {
3933                         .nh_flags = nh_flags,
3934                 },
3935         };
3936
3937         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3938                 arg.nh_flags |= RTNH_F_LINKDOWN;
3939
3940         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3941 }
3942
3943 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3944                                    const struct net_device *dev)
3945 {
3946         struct fib6_info *iter;
3947
3948         if (rt->fib6_nh.nh_dev == dev)
3949                 return true;
3950         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3951                 if (iter->fib6_nh.nh_dev == dev)
3952                         return true;
3953
3954         return false;
3955 }
3956
3957 static void rt6_multipath_flush(struct fib6_info *rt)
3958 {
3959         struct fib6_info *iter;
3960
3961         rt->should_flush = 1;
3962         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3963                 iter->should_flush = 1;
3964 }
3965
3966 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3967                                              const struct net_device *down_dev)
3968 {
3969         struct fib6_info *iter;
3970         unsigned int dead = 0;
3971
3972         if (rt->fib6_nh.nh_dev == down_dev ||
3973             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3974                 dead++;
3975         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976                 if (iter->fib6_nh.nh_dev == down_dev ||
3977                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3978                         dead++;
3979
3980         return dead;
3981 }
3982
3983 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3984                                        const struct net_device *dev,
3985                                        unsigned int nh_flags)
3986 {
3987         struct fib6_info *iter;
3988
3989         if (rt->fib6_nh.nh_dev == dev)
3990                 rt->fib6_nh.nh_flags |= nh_flags;
3991         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3992                 if (iter->fib6_nh.nh_dev == dev)
3993                         iter->fib6_nh.nh_flags |= nh_flags;
3994 }
3995
3996 /* called with write lock held for table with rt */
3997 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3998 {
3999         const struct arg_netdev_event *arg = p_arg;
4000         const struct net_device *dev = arg->dev;
4001         struct net *net = dev_net(dev);
4002
4003         if (rt == net->ipv6.fib6_null_entry)
4004                 return 0;
4005
4006         switch (arg->event) {
4007         case NETDEV_UNREGISTER:
4008                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4009         case NETDEV_DOWN:
4010                 if (rt->should_flush)
4011                         return -1;
4012                 if (!rt->fib6_nsiblings)
4013                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4014                 if (rt6_multipath_uses_dev(rt, dev)) {
4015                         unsigned int count;
4016
4017                         count = rt6_multipath_dead_count(rt, dev);
4018                         if (rt->fib6_nsiblings + 1 == count) {
4019                                 rt6_multipath_flush(rt);
4020                                 return -1;
4021                         }
4022                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4023                                                    RTNH_F_LINKDOWN);
4024                         fib6_update_sernum(net, rt);
4025                         rt6_multipath_rebalance(rt);
4026                 }
4027                 return -2;
4028         case NETDEV_CHANGE:
4029                 if (rt->fib6_nh.nh_dev != dev ||
4030                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4031                         break;
4032                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4033                 rt6_multipath_rebalance(rt);
4034                 break;
4035         }
4036
4037         return 0;
4038 }
4039
4040 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4041 {
4042         struct arg_netdev_event arg = {
4043                 .dev = dev,
4044                 {
4045                         .event = event,
4046                 },
4047         };
4048
4049         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4050 }
4051
4052 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4053 {
4054         rt6_sync_down_dev(dev, event);
4055         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4056         neigh_ifdown(&nd_tbl, dev);
4057 }
4058
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4063
4064 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4065 {
4066         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4067         struct inet6_dev *idev;
4068
4069         /* In IPv6 pmtu discovery is not optional,
4070            so that RTAX_MTU lock cannot disable it.
4071            We still use this lock to block changes
4072            caused by addrconf/ndisc.
4073         */
4074
4075         idev = __in6_dev_get(arg->dev);
4076         if (!idev)
4077                 return 0;
4078
4079         /* For administrative MTU increase, there is no way to discover
4080            IPv6 PMTU increase, so PMTU increase should be updated here.
4081            Since RFC 1981 doesn't include administrative MTU increase
4082            update PMTU increase is a MUST. (i.e. jumbo frame)
4083          */
4084         if (rt->fib6_nh.nh_dev == arg->dev &&
4085             !fib6_metric_locked(rt, RTAX_MTU)) {
4086                 u32 mtu = rt->fib6_pmtu;
4087
4088                 if (mtu >= arg->mtu ||
4089                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4090                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4091
4092                 spin_lock_bh(&rt6_exception_lock);
4093                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4094                 spin_unlock_bh(&rt6_exception_lock);
4095         }
4096         return 0;
4097 }
4098
4099 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4100 {
4101         struct rt6_mtu_change_arg arg = {
4102                 .dev = dev,
4103                 .mtu = mtu,
4104         };
4105
4106         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4107 }
4108
4109 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4110         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4111         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4112         [RTA_OIF]               = { .type = NLA_U32 },
4113         [RTA_IIF]               = { .type = NLA_U32 },
4114         [RTA_PRIORITY]          = { .type = NLA_U32 },
4115         [RTA_METRICS]           = { .type = NLA_NESTED },
4116         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4117         [RTA_PREF]              = { .type = NLA_U8 },
4118         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4119         [RTA_ENCAP]             = { .type = NLA_NESTED },
4120         [RTA_EXPIRES]           = { .type = NLA_U32 },
4121         [RTA_UID]               = { .type = NLA_U32 },
4122         [RTA_MARK]              = { .type = NLA_U32 },
4123         [RTA_TABLE]             = { .type = NLA_U32 },
4124         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4125         [RTA_SPORT]             = { .type = NLA_U16 },
4126         [RTA_DPORT]             = { .type = NLA_U16 },
4127 };
4128
4129 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4130                               struct fib6_config *cfg,
4131                               struct netlink_ext_ack *extack)
4132 {
4133         struct rtmsg *rtm;
4134         struct nlattr *tb[RTA_MAX+1];
4135         unsigned int pref;
4136         int err;
4137
4138         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4139                           NULL);
4140         if (err < 0)
4141                 goto errout;
4142
4143         err = -EINVAL;
4144         rtm = nlmsg_data(nlh);
4145         memset(cfg, 0, sizeof(*cfg));
4146
4147         cfg->fc_table = rtm->rtm_table;
4148         cfg->fc_dst_len = rtm->rtm_dst_len;
4149         cfg->fc_src_len = rtm->rtm_src_len;
4150         cfg->fc_flags = RTF_UP;
4151         cfg->fc_protocol = rtm->rtm_protocol;
4152         cfg->fc_type = rtm->rtm_type;
4153
4154         if (rtm->rtm_type == RTN_UNREACHABLE ||
4155             rtm->rtm_type == RTN_BLACKHOLE ||
4156             rtm->rtm_type == RTN_PROHIBIT ||
4157             rtm->rtm_type == RTN_THROW)
4158                 cfg->fc_flags |= RTF_REJECT;
4159
4160         if (rtm->rtm_type == RTN_LOCAL)
4161                 cfg->fc_flags |= RTF_LOCAL;
4162
4163         if (rtm->rtm_flags & RTM_F_CLONED)
4164                 cfg->fc_flags |= RTF_CACHE;
4165
4166         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4167
4168         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4169         cfg->fc_nlinfo.nlh = nlh;
4170         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4171
4172         if (tb[RTA_GATEWAY]) {
4173                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4174                 cfg->fc_flags |= RTF_GATEWAY;
4175         }
4176
4177         if (tb[RTA_DST]) {
4178                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4179
4180                 if (nla_len(tb[RTA_DST]) < plen)
4181                         goto errout;
4182
4183                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4184         }
4185
4186         if (tb[RTA_SRC]) {
4187                 int plen = (rtm->rtm_src_len + 7) >> 3;
4188
4189                 if (nla_len(tb[RTA_SRC]) < plen)
4190                         goto errout;
4191
4192                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4193         }
4194
4195         if (tb[RTA_PREFSRC])
4196                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4197
4198         if (tb[RTA_OIF])
4199                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4200
4201         if (tb[RTA_PRIORITY])
4202                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4203
4204         if (tb[RTA_METRICS]) {
4205                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4206                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4207         }
4208
4209         if (tb[RTA_TABLE])
4210                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4211
4212         if (tb[RTA_MULTIPATH]) {
4213                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4214                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4215
4216                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4217                                                      cfg->fc_mp_len, extack);
4218                 if (err < 0)
4219                         goto errout;
4220         }
4221
4222         if (tb[RTA_PREF]) {
4223                 pref = nla_get_u8(tb[RTA_PREF]);
4224                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4225                     pref != ICMPV6_ROUTER_PREF_HIGH)
4226                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4227                 cfg->fc_flags |= RTF_PREF(pref);
4228         }
4229
4230         if (tb[RTA_ENCAP])
4231                 cfg->fc_encap = tb[RTA_ENCAP];
4232
4233         if (tb[RTA_ENCAP_TYPE]) {
4234                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4235
4236                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4237                 if (err < 0)
4238                         goto errout;
4239         }
4240
4241         if (tb[RTA_EXPIRES]) {
4242                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4243
4244                 if (addrconf_finite_timeout(timeout)) {
4245                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4246                         cfg->fc_flags |= RTF_EXPIRES;
4247                 }
4248         }
4249
4250         err = 0;
4251 errout:
4252         return err;
4253 }
4254
4255 struct rt6_nh {
4256         struct fib6_info *fib6_info;
4257         struct fib6_config r_cfg;
4258         struct list_head next;
4259 };
4260
4261 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4262 {
4263         struct rt6_nh *nh;
4264
4265         list_for_each_entry(nh, rt6_nh_list, next) {
4266                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4267                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4268                         nh->r_cfg.fc_ifindex);
4269         }
4270 }
4271
4272 static int ip6_route_info_append(struct net *net,
4273                                  struct list_head *rt6_nh_list,
4274                                  struct fib6_info *rt,
4275                                  struct fib6_config *r_cfg)
4276 {
4277         struct rt6_nh *nh;
4278         int err = -EEXIST;
4279
4280         list_for_each_entry(nh, rt6_nh_list, next) {
4281                 /* check if fib6_info already exists */
4282                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4283                         return err;
4284         }
4285
4286         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4287         if (!nh)
4288                 return -ENOMEM;
4289         nh->fib6_info = rt;
4290         err = ip6_convert_metrics(net, rt, r_cfg);
4291         if (err) {
4292                 kfree(nh);
4293                 return err;
4294         }
4295         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4296         list_add_tail(&nh->next, rt6_nh_list);
4297
4298         return 0;
4299 }
4300
4301 static void ip6_route_mpath_notify(struct fib6_info *rt,
4302                                    struct fib6_info *rt_last,
4303                                    struct nl_info *info,
4304                                    __u16 nlflags)
4305 {
4306         /* if this is an APPEND route, then rt points to the first route
4307          * inserted and rt_last points to last route inserted. Userspace
4308          * wants a consistent dump of the route which starts at the first
4309          * nexthop. Since sibling routes are always added at the end of
4310          * the list, find the first sibling of the last route appended
4311          */
4312         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4313                 rt = list_first_entry(&rt_last->fib6_siblings,
4314                                       struct fib6_info,
4315                                       fib6_siblings);
4316         }
4317
4318         if (rt)
4319                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4320 }
4321
4322 static int ip6_route_multipath_add(struct fib6_config *cfg,
4323                                    struct netlink_ext_ack *extack)
4324 {
4325         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4326         struct nl_info *info = &cfg->fc_nlinfo;
4327         struct fib6_config r_cfg;
4328         struct rtnexthop *rtnh;
4329         struct fib6_info *rt;
4330         struct rt6_nh *err_nh;
4331         struct rt6_nh *nh, *nh_safe;
4332         __u16 nlflags;
4333         int remaining;
4334         int attrlen;
4335         int err = 1;
4336         int nhn = 0;
4337         int replace = (cfg->fc_nlinfo.nlh &&
4338                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4339         LIST_HEAD(rt6_nh_list);
4340
4341         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4342         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4343                 nlflags |= NLM_F_APPEND;
4344
4345         remaining = cfg->fc_mp_len;
4346         rtnh = (struct rtnexthop *)cfg->fc_mp;
4347
4348         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4349          * fib6_info structs per nexthop
4350          */
4351         while (rtnh_ok(rtnh, remaining)) {
4352                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4353                 if (rtnh->rtnh_ifindex)
4354                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4355
4356                 attrlen = rtnh_attrlen(rtnh);
4357                 if (attrlen > 0) {
4358                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4359
4360                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4361                         if (nla) {
4362                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4363                                 r_cfg.fc_flags |= RTF_GATEWAY;
4364                         }
4365                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4366                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4367                         if (nla)
4368                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4369                 }
4370
4371                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4372                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4373                 if (IS_ERR(rt)) {
4374                         err = PTR_ERR(rt);
4375                         rt = NULL;
4376                         goto cleanup;
4377                 }
4378                 if (!rt6_qualify_for_ecmp(rt)) {
4379                         err = -EINVAL;
4380                         NL_SET_ERR_MSG(extack,
4381                                        "Device only routes can not be added for IPv6 using the multipath API.");
4382                         fib6_info_release(rt);
4383                         goto cleanup;
4384                 }
4385
4386                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4387
4388                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4389                                             rt, &r_cfg);
4390                 if (err) {
4391                         fib6_info_release(rt);
4392                         goto cleanup;
4393                 }
4394
4395                 rtnh = rtnh_next(rtnh, &remaining);
4396         }
4397
4398         /* for add and replace send one notification with all nexthops.
4399          * Skip the notification in fib6_add_rt2node and send one with
4400          * the full route when done
4401          */
4402         info->skip_notify = 1;
4403
4404         err_nh = NULL;
4405         list_for_each_entry(nh, &rt6_nh_list, next) {
4406                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4407                 fib6_info_release(nh->fib6_info);
4408
4409                 if (!err) {
4410                         /* save reference to last route successfully inserted */
4411                         rt_last = nh->fib6_info;
4412
4413                         /* save reference to first route for notification */
4414                         if (!rt_notif)
4415                                 rt_notif = nh->fib6_info;
4416                 }
4417
4418                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4419                 nh->fib6_info = NULL;
4420                 if (err) {
4421                         if (replace && nhn)
4422                                 ip6_print_replace_route_err(&rt6_nh_list);
4423                         err_nh = nh;
4424                         goto add_errout;
4425                 }
4426
4427                 /* Because each route is added like a single route we remove
4428                  * these flags after the first nexthop: if there is a collision,
4429                  * we have already failed to add the first nexthop:
4430                  * fib6_add_rt2node() has rejected it; when replacing, old
4431                  * nexthops have been replaced by first new, the rest should
4432                  * be added to it.
4433                  */
4434                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4435                                                      NLM_F_REPLACE);
4436                 nhn++;
4437         }
4438
4439         /* success ... tell user about new route */
4440         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4441         goto cleanup;
4442
4443 add_errout:
4444         /* send notification for routes that were added so that
4445          * the delete notifications sent by ip6_route_del are
4446          * coherent
4447          */
4448         if (rt_notif)
4449                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4450
4451         /* Delete routes that were already added */
4452         list_for_each_entry(nh, &rt6_nh_list, next) {
4453                 if (err_nh == nh)
4454                         break;
4455                 ip6_route_del(&nh->r_cfg, extack);
4456         }
4457
4458 cleanup:
4459         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4460                 if (nh->fib6_info)
4461                         fib6_info_release(nh->fib6_info);
4462                 list_del(&nh->next);
4463                 kfree(nh);
4464         }
4465
4466         return err;
4467 }
4468
4469 static int ip6_route_multipath_del(struct fib6_config *cfg,
4470                                    struct netlink_ext_ack *extack)
4471 {
4472         struct fib6_config r_cfg;
4473         struct rtnexthop *rtnh;
4474         int remaining;
4475         int attrlen;
4476         int err = 1, last_err = 0;
4477
4478         remaining = cfg->fc_mp_len;
4479         rtnh = (struct rtnexthop *)cfg->fc_mp;
4480
4481         /* Parse a Multipath Entry */
4482         while (rtnh_ok(rtnh, remaining)) {
4483                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4484                 if (rtnh->rtnh_ifindex)
4485                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4486
4487                 attrlen = rtnh_attrlen(rtnh);
4488                 if (attrlen > 0) {
4489                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4490
4491                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4492                         if (nla) {
4493                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4494                                 r_cfg.fc_flags |= RTF_GATEWAY;
4495                         }
4496                 }
4497                 err = ip6_route_del(&r_cfg, extack);
4498                 if (err)
4499                         last_err = err;
4500
4501                 rtnh = rtnh_next(rtnh, &remaining);
4502         }
4503
4504         return last_err;
4505 }
4506
4507 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4508                               struct netlink_ext_ack *extack)
4509 {
4510         struct fib6_config cfg;
4511         int err;
4512
4513         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4514         if (err < 0)
4515                 return err;
4516
4517         if (cfg.fc_mp)
4518                 return ip6_route_multipath_del(&cfg, extack);
4519         else {
4520                 cfg.fc_delete_all_nh = 1;
4521                 return ip6_route_del(&cfg, extack);
4522         }
4523 }
4524
4525 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4526                               struct netlink_ext_ack *extack)
4527 {
4528         struct fib6_config cfg;
4529         int err;
4530
4531         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4532         if (err < 0)
4533                 return err;
4534
4535         if (cfg.fc_mp)
4536                 return ip6_route_multipath_add(&cfg, extack);
4537         else
4538                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4539 }
4540
4541 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4542 {
4543         int nexthop_len = 0;
4544
4545         if (rt->fib6_nsiblings) {
4546                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4547                             + NLA_ALIGN(sizeof(struct rtnexthop))
4548                             + nla_total_size(16) /* RTA_GATEWAY */
4549                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4550
4551                 nexthop_len *= rt->fib6_nsiblings;
4552         }
4553
4554         return NLMSG_ALIGN(sizeof(struct rtmsg))
4555                + nla_total_size(16) /* RTA_SRC */
4556                + nla_total_size(16) /* RTA_DST */
4557                + nla_total_size(16) /* RTA_GATEWAY */
4558                + nla_total_size(16) /* RTA_PREFSRC */
4559                + nla_total_size(4) /* RTA_TABLE */
4560                + nla_total_size(4) /* RTA_IIF */
4561                + nla_total_size(4) /* RTA_OIF */
4562                + nla_total_size(4) /* RTA_PRIORITY */
4563                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4564                + nla_total_size(sizeof(struct rta_cacheinfo))
4565                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4566                + nla_total_size(1) /* RTA_PREF */
4567                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4568                + nexthop_len;
4569 }
4570
4571 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4572                             unsigned int *flags, bool skip_oif)
4573 {
4574         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4575                 *flags |= RTNH_F_DEAD;
4576
4577         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4578                 *flags |= RTNH_F_LINKDOWN;
4579
4580                 rcu_read_lock();
4581                 if (fib6_ignore_linkdown(rt))
4582                         *flags |= RTNH_F_DEAD;
4583                 rcu_read_unlock();
4584         }
4585
4586         if (rt->fib6_flags & RTF_GATEWAY) {
4587                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4588                         goto nla_put_failure;
4589         }
4590
4591         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4592         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4593                 *flags |= RTNH_F_OFFLOAD;
4594
4595         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4596         if (!skip_oif && rt->fib6_nh.nh_dev &&
4597             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4598                 goto nla_put_failure;
4599
4600         if (rt->fib6_nh.nh_lwtstate &&
4601             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4602                 goto nla_put_failure;
4603
4604         return 0;
4605
4606 nla_put_failure:
4607         return -EMSGSIZE;
4608 }
4609
4610 /* add multipath next hop */
4611 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4612 {
4613         const struct net_device *dev = rt->fib6_nh.nh_dev;
4614         struct rtnexthop *rtnh;
4615         unsigned int flags = 0;
4616
4617         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4618         if (!rtnh)
4619                 goto nla_put_failure;
4620
4621         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4622         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4623
4624         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4625                 goto nla_put_failure;
4626
4627         rtnh->rtnh_flags = flags;
4628
4629         /* length of rtnetlink header + attributes */
4630         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4631
4632         return 0;
4633
4634 nla_put_failure:
4635         return -EMSGSIZE;
4636 }
4637
4638 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4639                          struct fib6_info *rt, struct dst_entry *dst,
4640                          struct in6_addr *dest, struct in6_addr *src,
4641                          int iif, int type, u32 portid, u32 seq,
4642                          unsigned int flags)
4643 {
4644         struct rtmsg *rtm;
4645         struct nlmsghdr *nlh;
4646         long expires = 0;
4647         u32 *pmetrics;
4648         u32 table;
4649
4650         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4651         if (!nlh)
4652                 return -EMSGSIZE;
4653
4654         rtm = nlmsg_data(nlh);
4655         rtm->rtm_family = AF_INET6;
4656         rtm->rtm_dst_len = rt->fib6_dst.plen;
4657         rtm->rtm_src_len = rt->fib6_src.plen;
4658         rtm->rtm_tos = 0;
4659         if (rt->fib6_table)
4660                 table = rt->fib6_table->tb6_id;
4661         else
4662                 table = RT6_TABLE_UNSPEC;
4663         rtm->rtm_table = table;
4664         if (nla_put_u32(skb, RTA_TABLE, table))
4665                 goto nla_put_failure;
4666
4667         rtm->rtm_type = rt->fib6_type;
4668         rtm->rtm_flags = 0;
4669         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4670         rtm->rtm_protocol = rt->fib6_protocol;
4671
4672         if (rt->fib6_flags & RTF_CACHE)
4673                 rtm->rtm_flags |= RTM_F_CLONED;
4674
4675         if (dest) {
4676                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4677                         goto nla_put_failure;
4678                 rtm->rtm_dst_len = 128;
4679         } else if (rtm->rtm_dst_len)
4680                 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4681                         goto nla_put_failure;
4682 #ifdef CONFIG_IPV6_SUBTREES
4683         if (src) {
4684                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4685                         goto nla_put_failure;
4686                 rtm->rtm_src_len = 128;
4687         } else if (rtm->rtm_src_len &&
4688                    nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4689                 goto nla_put_failure;
4690 #endif
4691         if (iif) {
4692 #ifdef CONFIG_IPV6_MROUTE
4693                 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4694                         int err = ip6mr_get_route(net, skb, rtm, portid);
4695
4696                         if (err == 0)
4697                                 return 0;
4698                         if (err < 0)
4699                                 goto nla_put_failure;
4700                 } else
4701 #endif
4702                         if (nla_put_u32(skb, RTA_IIF, iif))
4703                                 goto nla_put_failure;
4704         } else if (dest) {
4705                 struct in6_addr saddr_buf;
4706                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4707                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4708                         goto nla_put_failure;
4709         }
4710
4711         if (rt->fib6_prefsrc.plen) {
4712                 struct in6_addr saddr_buf;
4713                 saddr_buf = rt->fib6_prefsrc.addr;
4714                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4715                         goto nla_put_failure;
4716         }
4717
4718         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4719         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4720                 goto nla_put_failure;
4721
4722         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4723                 goto nla_put_failure;
4724
4725         /* For multipath routes, walk the siblings list and add
4726          * each as a nexthop within RTA_MULTIPATH.
4727          */
4728         if (rt->fib6_nsiblings) {
4729                 struct fib6_info *sibling, *next_sibling;
4730                 struct nlattr *mp;
4731
4732                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4733                 if (!mp)
4734                         goto nla_put_failure;
4735
4736                 if (rt6_add_nexthop(skb, rt) < 0)
4737                         goto nla_put_failure;
4738
4739                 list_for_each_entry_safe(sibling, next_sibling,
4740                                          &rt->fib6_siblings, fib6_siblings) {
4741                         if (rt6_add_nexthop(skb, sibling) < 0)
4742                                 goto nla_put_failure;
4743                 }
4744
4745                 nla_nest_end(skb, mp);
4746         } else {
4747                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4748                         goto nla_put_failure;
4749         }
4750
4751         if (rt->fib6_flags & RTF_EXPIRES) {
4752                 expires = dst ? dst->expires : rt->expires;
4753                 expires -= jiffies;
4754         }
4755
4756         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4757                 goto nla_put_failure;
4758
4759         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4760                 goto nla_put_failure;
4761
4762
4763         nlmsg_end(skb, nlh);
4764         return 0;
4765
4766 nla_put_failure:
4767         nlmsg_cancel(skb, nlh);
4768         return -EMSGSIZE;
4769 }
4770
4771 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4772 {
4773         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4774         struct net *net = arg->net;
4775
4776         if (rt == net->ipv6.fib6_null_entry)
4777                 return 0;
4778
4779         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4780                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4781
4782                 /* user wants prefix routes only */
4783                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4784                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4785                         /* success since this is not a prefix route */
4786                         return 1;
4787                 }
4788         }
4789
4790         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4791                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4792                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4793 }
4794
4795 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4796                               struct netlink_ext_ack *extack)
4797 {
4798         struct net *net = sock_net(in_skb->sk);
4799         struct nlattr *tb[RTA_MAX+1];
4800         int err, iif = 0, oif = 0;
4801         struct fib6_info *from;
4802         struct dst_entry *dst;
4803         struct rt6_info *rt;
4804         struct sk_buff *skb;
4805         struct rtmsg *rtm;
4806         struct flowi6 fl6;
4807         bool fibmatch;
4808
4809         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4810                           extack);
4811         if (err < 0)
4812                 goto errout;
4813
4814         err = -EINVAL;
4815         memset(&fl6, 0, sizeof(fl6));
4816         rtm = nlmsg_data(nlh);
4817         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4818         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4819
4820         if (tb[RTA_SRC]) {
4821                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4822                         goto errout;
4823
4824                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4825         }
4826
4827         if (tb[RTA_DST]) {
4828                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4829                         goto errout;
4830
4831                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4832         }
4833
4834         if (tb[RTA_IIF])
4835                 iif = nla_get_u32(tb[RTA_IIF]);
4836
4837         if (tb[RTA_OIF])
4838                 oif = nla_get_u32(tb[RTA_OIF]);
4839
4840         if (tb[RTA_MARK])
4841                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4842
4843         if (tb[RTA_UID])
4844                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4845                                            nla_get_u32(tb[RTA_UID]));
4846         else
4847                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4848
4849         if (tb[RTA_SPORT])
4850                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4851
4852         if (tb[RTA_DPORT])
4853                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4854
4855         if (tb[RTA_IP_PROTO]) {
4856                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4857                                                   &fl6.flowi6_proto, extack);
4858                 if (err)
4859                         goto errout;
4860         }
4861
4862         if (iif) {
4863                 struct net_device *dev;
4864                 int flags = 0;
4865
4866                 rcu_read_lock();
4867
4868                 dev = dev_get_by_index_rcu(net, iif);
4869                 if (!dev) {
4870                         rcu_read_unlock();
4871                         err = -ENODEV;
4872                         goto errout;
4873                 }
4874
4875                 fl6.flowi6_iif = iif;
4876
4877                 if (!ipv6_addr_any(&fl6.saddr))
4878                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4879
4880                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4881
4882                 rcu_read_unlock();
4883         } else {
4884                 fl6.flowi6_oif = oif;
4885
4886                 dst = ip6_route_output(net, NULL, &fl6);
4887         }
4888
4889
4890         rt = container_of(dst, struct rt6_info, dst);
4891         if (rt->dst.error) {
4892                 err = rt->dst.error;
4893                 ip6_rt_put(rt);
4894                 goto errout;
4895         }
4896
4897         if (rt == net->ipv6.ip6_null_entry) {
4898                 err = rt->dst.error;
4899                 ip6_rt_put(rt);
4900                 goto errout;
4901         }
4902
4903         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4904         if (!skb) {
4905                 ip6_rt_put(rt);
4906                 err = -ENOBUFS;
4907                 goto errout;
4908         }
4909
4910         skb_dst_set(skb, &rt->dst);
4911
4912         rcu_read_lock();
4913         from = rcu_dereference(rt->from);
4914
4915         if (fibmatch)
4916                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4917                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4918                                     nlh->nlmsg_seq, 0);
4919         else
4920                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4921                                     &fl6.saddr, iif, RTM_NEWROUTE,
4922                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4923                                     0);
4924         rcu_read_unlock();
4925
4926         if (err < 0) {
4927                 kfree_skb(skb);
4928                 goto errout;
4929         }
4930
4931         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4932 errout:
4933         return err;
4934 }
4935
4936 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4937                      unsigned int nlm_flags)
4938 {
4939         struct sk_buff *skb;
4940         struct net *net = info->nl_net;
4941         u32 seq;
4942         int err;
4943
4944         err = -ENOBUFS;
4945         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4946
4947         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4948         if (!skb)
4949                 goto errout;
4950
4951         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4952                             event, info->portid, seq, nlm_flags);
4953         if (err < 0) {
4954                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4955                 WARN_ON(err == -EMSGSIZE);
4956                 kfree_skb(skb);
4957                 goto errout;
4958         }
4959         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4960                     info->nlh, gfp_any());
4961         return;
4962 errout:
4963         if (err < 0)
4964                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4965 }
4966
4967 static int ip6_route_dev_notify(struct notifier_block *this,
4968                                 unsigned long event, void *ptr)
4969 {
4970         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4971         struct net *net = dev_net(dev);
4972
4973         if (!(dev->flags & IFF_LOOPBACK))
4974                 return NOTIFY_OK;
4975
4976         if (event == NETDEV_REGISTER) {
4977                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4978                 net->ipv6.ip6_null_entry->dst.dev = dev;
4979                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4981                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4982                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4983                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4984                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4985 #endif
4986          } else if (event == NETDEV_UNREGISTER &&
4987                     dev->reg_state != NETREG_UNREGISTERED) {
4988                 /* NETDEV_UNREGISTER could be fired for multiple times by
4989                  * netdev_wait_allrefs(). Make sure we only call this once.
4990                  */
4991                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4992 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4993                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4994                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4995 #endif
4996         }
4997
4998         return NOTIFY_OK;
4999 }
5000
5001 /*
5002  *      /proc
5003  */
5004
5005 #ifdef CONFIG_PROC_FS
5006 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5007 {
5008         struct net *net = (struct net *)seq->private;
5009         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5010                    net->ipv6.rt6_stats->fib_nodes,
5011                    net->ipv6.rt6_stats->fib_route_nodes,
5012                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5013                    net->ipv6.rt6_stats->fib_rt_entries,
5014                    net->ipv6.rt6_stats->fib_rt_cache,
5015                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5016                    net->ipv6.rt6_stats->fib_discarded_routes);
5017
5018         return 0;
5019 }
5020 #endif  /* CONFIG_PROC_FS */
5021
5022 #ifdef CONFIG_SYSCTL
5023
5024 static
5025 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5026                               void __user *buffer, size_t *lenp, loff_t *ppos)
5027 {
5028         struct net *net;
5029         int delay;
5030         if (!write)
5031                 return -EINVAL;
5032
5033         net = (struct net *)ctl->extra1;
5034         delay = net->ipv6.sysctl.flush_delay;
5035         proc_dointvec(ctl, write, buffer, lenp, ppos);
5036         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5037         return 0;
5038 }
5039
5040 struct ctl_table ipv6_route_table_template[] = {
5041         {
5042                 .procname       =       "flush",
5043                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5044                 .maxlen         =       sizeof(int),
5045                 .mode           =       0200,
5046                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5047         },
5048         {
5049                 .procname       =       "gc_thresh",
5050                 .data           =       &ip6_dst_ops_template.gc_thresh,
5051                 .maxlen         =       sizeof(int),
5052                 .mode           =       0644,
5053                 .proc_handler   =       proc_dointvec,
5054         },
5055         {
5056                 .procname       =       "max_size",
5057                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5058                 .maxlen         =       sizeof(int),
5059                 .mode           =       0644,
5060                 .proc_handler   =       proc_dointvec,
5061         },
5062         {
5063                 .procname       =       "gc_min_interval",
5064                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5065                 .maxlen         =       sizeof(int),
5066                 .mode           =       0644,
5067                 .proc_handler   =       proc_dointvec_jiffies,
5068         },
5069         {
5070                 .procname       =       "gc_timeout",
5071                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5072                 .maxlen         =       sizeof(int),
5073                 .mode           =       0644,
5074                 .proc_handler   =       proc_dointvec_jiffies,
5075         },
5076         {
5077                 .procname       =       "gc_interval",
5078                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5079                 .maxlen         =       sizeof(int),
5080                 .mode           =       0644,
5081                 .proc_handler   =       proc_dointvec_jiffies,
5082         },
5083         {
5084                 .procname       =       "gc_elasticity",
5085                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5086                 .maxlen         =       sizeof(int),
5087                 .mode           =       0644,
5088                 .proc_handler   =       proc_dointvec,
5089         },
5090         {
5091                 .procname       =       "mtu_expires",
5092                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5093                 .maxlen         =       sizeof(int),
5094                 .mode           =       0644,
5095                 .proc_handler   =       proc_dointvec_jiffies,
5096         },
5097         {
5098                 .procname       =       "min_adv_mss",
5099                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5100                 .maxlen         =       sizeof(int),
5101                 .mode           =       0644,
5102                 .proc_handler   =       proc_dointvec,
5103         },
5104         {
5105                 .procname       =       "gc_min_interval_ms",
5106                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5107                 .maxlen         =       sizeof(int),
5108                 .mode           =       0644,
5109                 .proc_handler   =       proc_dointvec_ms_jiffies,
5110         },
5111         { }
5112 };
5113
5114 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5115 {
5116         struct ctl_table *table;
5117
5118         table = kmemdup(ipv6_route_table_template,
5119                         sizeof(ipv6_route_table_template),
5120                         GFP_KERNEL);
5121
5122         if (table) {
5123                 table[0].data = &net->ipv6.sysctl.flush_delay;
5124                 table[0].extra1 = net;
5125                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5126                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5127                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5128                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5129                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5130                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5131                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5132                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5133                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5134
5135                 /* Don't export sysctls to unprivileged users */
5136                 if (net->user_ns != &init_user_ns)
5137                         table[0].procname = NULL;
5138         }
5139
5140         return table;
5141 }
5142 #endif
5143
5144 static int __net_init ip6_route_net_init(struct net *net)
5145 {
5146         int ret = -ENOMEM;
5147
5148         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5149                sizeof(net->ipv6.ip6_dst_ops));
5150
5151         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5152                 goto out_ip6_dst_ops;
5153
5154         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5155                                             sizeof(*net->ipv6.fib6_null_entry),
5156                                             GFP_KERNEL);
5157         if (!net->ipv6.fib6_null_entry)
5158                 goto out_ip6_dst_entries;
5159
5160         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5161                                            sizeof(*net->ipv6.ip6_null_entry),
5162                                            GFP_KERNEL);
5163         if (!net->ipv6.ip6_null_entry)
5164                 goto out_fib6_null_entry;
5165         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5166         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5167                          ip6_template_metrics, true);
5168
5169 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5170         net->ipv6.fib6_has_custom_rules = false;
5171         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5172                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5173                                                GFP_KERNEL);
5174         if (!net->ipv6.ip6_prohibit_entry)
5175                 goto out_ip6_null_entry;
5176         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5177         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5178                          ip6_template_metrics, true);
5179
5180         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5181                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5182                                                GFP_KERNEL);
5183         if (!net->ipv6.ip6_blk_hole_entry)
5184                 goto out_ip6_prohibit_entry;
5185         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5186         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5187                          ip6_template_metrics, true);
5188 #endif
5189
5190         net->ipv6.sysctl.flush_delay = 0;
5191         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5192         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5193         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5194         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5195         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5196         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5197         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5198
5199         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5200
5201         ret = 0;
5202 out:
5203         return ret;
5204
5205 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5206 out_ip6_prohibit_entry:
5207         kfree(net->ipv6.ip6_prohibit_entry);
5208 out_ip6_null_entry:
5209         kfree(net->ipv6.ip6_null_entry);
5210 #endif
5211 out_fib6_null_entry:
5212         kfree(net->ipv6.fib6_null_entry);
5213 out_ip6_dst_entries:
5214         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5215 out_ip6_dst_ops:
5216         goto out;
5217 }
5218
5219 static void __net_exit ip6_route_net_exit(struct net *net)
5220 {
5221         kfree(net->ipv6.fib6_null_entry);
5222         kfree(net->ipv6.ip6_null_entry);
5223 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5224         kfree(net->ipv6.ip6_prohibit_entry);
5225         kfree(net->ipv6.ip6_blk_hole_entry);
5226 #endif
5227         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5228 }
5229
5230 static int __net_init ip6_route_net_init_late(struct net *net)
5231 {
5232 #ifdef CONFIG_PROC_FS
5233         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5234                         sizeof(struct ipv6_route_iter));
5235         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5236                         rt6_stats_seq_show, NULL);
5237 #endif
5238         return 0;
5239 }
5240
5241 static void __net_exit ip6_route_net_exit_late(struct net *net)
5242 {
5243 #ifdef CONFIG_PROC_FS
5244         remove_proc_entry("ipv6_route", net->proc_net);
5245         remove_proc_entry("rt6_stats", net->proc_net);
5246 #endif
5247 }
5248
5249 static struct pernet_operations ip6_route_net_ops = {
5250         .init = ip6_route_net_init,
5251         .exit = ip6_route_net_exit,
5252 };
5253
5254 static int __net_init ipv6_inetpeer_init(struct net *net)
5255 {
5256         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5257
5258         if (!bp)
5259                 return -ENOMEM;
5260         inet_peer_base_init(bp);
5261         net->ipv6.peers = bp;
5262         return 0;
5263 }
5264
5265 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5266 {
5267         struct inet_peer_base *bp = net->ipv6.peers;
5268
5269         net->ipv6.peers = NULL;
5270         inetpeer_invalidate_tree(bp);
5271         kfree(bp);
5272 }
5273
5274 static struct pernet_operations ipv6_inetpeer_ops = {
5275         .init   =       ipv6_inetpeer_init,
5276         .exit   =       ipv6_inetpeer_exit,
5277 };
5278
5279 static struct pernet_operations ip6_route_net_late_ops = {
5280         .init = ip6_route_net_init_late,
5281         .exit = ip6_route_net_exit_late,
5282 };
5283
5284 static struct notifier_block ip6_route_dev_notifier = {
5285         .notifier_call = ip6_route_dev_notify,
5286         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5287 };
5288
5289 void __init ip6_route_init_special_entries(void)
5290 {
5291         /* Registering of the loopback is done before this portion of code,
5292          * the loopback reference in rt6_info will not be taken, do it
5293          * manually for init_net */
5294         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5295         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5296         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5297   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5298         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5299         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5300         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5301         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5302   #endif
5303 }
5304
5305 int __init ip6_route_init(void)
5306 {
5307         int ret;
5308         int cpu;
5309
5310         ret = -ENOMEM;
5311         ip6_dst_ops_template.kmem_cachep =
5312                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5313                                   SLAB_HWCACHE_ALIGN, NULL);
5314         if (!ip6_dst_ops_template.kmem_cachep)
5315                 goto out;
5316
5317         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5318         if (ret)
5319                 goto out_kmem_cache;
5320
5321         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5322         if (ret)
5323                 goto out_dst_entries;
5324
5325         ret = register_pernet_subsys(&ip6_route_net_ops);
5326         if (ret)
5327                 goto out_register_inetpeer;
5328
5329         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5330
5331         ret = fib6_init();
5332         if (ret)
5333                 goto out_register_subsys;
5334
5335         ret = xfrm6_init();
5336         if (ret)
5337                 goto out_fib6_init;
5338
5339         ret = fib6_rules_init();
5340         if (ret)
5341                 goto xfrm6_init;
5342
5343         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5344         if (ret)
5345                 goto fib6_rules_init;
5346
5347         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5348                                    inet6_rtm_newroute, NULL, 0);
5349         if (ret < 0)
5350                 goto out_register_late_subsys;
5351
5352         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5353                                    inet6_rtm_delroute, NULL, 0);
5354         if (ret < 0)
5355                 goto out_register_late_subsys;
5356
5357         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5358                                    inet6_rtm_getroute, NULL,
5359                                    RTNL_FLAG_DOIT_UNLOCKED);
5360         if (ret < 0)
5361                 goto out_register_late_subsys;
5362
5363         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5364         if (ret)
5365                 goto out_register_late_subsys;
5366
5367         for_each_possible_cpu(cpu) {
5368                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5369
5370                 INIT_LIST_HEAD(&ul->head);
5371                 spin_lock_init(&ul->lock);
5372         }
5373
5374 out:
5375         return ret;
5376
5377 out_register_late_subsys:
5378         rtnl_unregister_all(PF_INET6);
5379         unregister_pernet_subsys(&ip6_route_net_late_ops);
5380 fib6_rules_init:
5381         fib6_rules_cleanup();
5382 xfrm6_init:
5383         xfrm6_fini();
5384 out_fib6_init:
5385         fib6_gc_cleanup();
5386 out_register_subsys:
5387         unregister_pernet_subsys(&ip6_route_net_ops);
5388 out_register_inetpeer:
5389         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5390 out_dst_entries:
5391         dst_entries_destroy(&ip6_dst_blackhole_ops);
5392 out_kmem_cache:
5393         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5394         goto out;
5395 }
5396
5397 void ip6_route_cleanup(void)
5398 {
5399         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5400         unregister_pernet_subsys(&ip6_route_net_late_ops);
5401         fib6_rules_cleanup();
5402         xfrm6_fini();
5403         fib6_gc_cleanup();
5404         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5405         unregister_pernet_subsys(&ip6_route_net_ops);
5406         dst_entries_destroy(&ip6_dst_blackhole_ops);
5407         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5408 }