/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>
#include <linux/sysctl.h>
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
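/* How the lookup code below uses these scores (see rt6_check_neigh() and
 * find_match()): RT6_NUD_SUCCEED means the nexthop neighbour is (probably)
 * reachable; RT6_NUD_FAIL_PROBE means reachability is unconfirmed and a
 * probe may be sent; RT6_NUD_FAIL_DO_RR demotes the route to the lowest
 * valid score so router round-robin can move on; RT6_NUD_FAIL_HARD makes
 * the route unusable for this lookup.
 */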
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(struct dst_ops *ops);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int ip6_pkt_prohibit(struct sk_buff *skb);
99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void ip6_link_failure(struct sk_buff *skb);
101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 struct sk_buff *skb, u32 mtu);
103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 struct fib6_info *rt, struct dst_entry *dst,
109 struct in6_addr *dest, struct in6_addr *src,
110 int iif, int type, u32 portid, u32 seq,
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 struct in6_addr *daddr,
114 struct in6_addr *saddr);
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 const struct in6_addr *prefix, int prefixlen,
119 const struct in6_addr *gwaddr,
120 struct net_device *dev,
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 const struct in6_addr *prefix, int prefixlen,
124 const struct in6_addr *gwaddr,
125 struct net_device *dev);
128 struct uncached_list {
130 struct list_head head;
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 void rt6_uncached_list_add(struct rt6_info *rt)
137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 rt->rt6i_uncached_list = ul;
141 spin_lock_bh(&ul->lock);
142 list_add_tail(&rt->rt6i_uncached, &ul->head);
143 spin_unlock_bh(&ul->lock);
146 void rt6_uncached_list_del(struct rt6_info *rt)
148 if (!list_empty(&rt->rt6i_uncached)) {
149 struct uncached_list *ul = rt->rt6i_uncached_list;
150 struct net *net = dev_net(rt->dst.dev);
152 spin_lock_bh(&ul->lock);
153 list_del(&rt->rt6i_uncached);
154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 spin_unlock_bh(&ul->lock);
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 struct net_device *loopback_dev = net->loopback_dev;
164 if (dev == loopback_dev)
167 for_each_possible_cpu(cpu) {
168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
171 spin_lock_bh(&ul->lock);
172 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 struct inet6_dev *rt_idev = rt->rt6i_idev;
174 struct net_device *rt_dev = rt->dst.dev;
176 if (rt_idev->dev == dev) {
177 rt->rt6i_idev = in6_dev_get(loopback_dev);
178 in6_dev_put(rt_idev);
182 rt->dst.dev = loopback_dev;
183 dev_hold(rt->dst.dev);
187 spin_unlock_bh(&ul->lock);
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
195 if (!ipv6_addr_any(p))
196 return (const void *) p;
198 return &ipv6_hdr(skb)->daddr;
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 struct net_device *dev,
209 daddr = choose_neigh_daddr(gw, skb, daddr);
210 n = __ipv6_neigh_lookup(dev, daddr);
213 return neigh_create(&nd_tbl, daddr, dev);
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
222 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 struct net_device *dev = dst->dev;
228 struct rt6_info *rt = (struct rt6_info *)dst;
230 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237 __ipv6_confirm_neigh(dev, daddr);
240 static struct dst_ops ip6_dst_ops_template = {
244 .check = ip6_dst_check,
245 .default_advmss = ip6_default_advmss,
247 .cow_metrics = dst_cow_metrics_generic,
248 .destroy = ip6_dst_destroy,
249 .ifdown = ip6_dst_ifdown,
250 .negative_advice = ip6_negative_advice,
251 .link_failure = ip6_link_failure,
252 .update_pmtu = ip6_rt_update_pmtu,
253 .redirect = rt6_do_redirect,
254 .local_out = __ip6_local_out,
255 .neigh_lookup = ip6_dst_neigh_lookup,
256 .confirm_neigh = ip6_confirm_neigh,
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263 return mtu ? : dst->dev->mtu;
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267 struct sk_buff *skb, u32 mtu)
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
276 static struct dst_ops ip6_dst_blackhole_ops = {
278 .destroy = ip6_dst_destroy,
279 .check = ip6_dst_check,
280 .mtu = ip6_blackhole_mtu,
281 .default_advmss = ip6_default_advmss,
282 .update_pmtu = ip6_rt_blackhole_update_pmtu,
283 .redirect = ip6_rt_blackhole_redirect,
284 .cow_metrics = dst_cow_metrics_generic,
285 .neigh_lookup = ip6_dst_neigh_lookup,
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289 [RTAX_HOPLIMIT - 1] = 0,
292 static const struct fib6_info fib6_null_entry_template = {
293 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
294 .fib6_protocol = RTPROT_KERNEL,
295 .fib6_metric = ~(u32)0,
296 .fib6_ref = ATOMIC_INIT(1),
297 .fib6_type = RTN_UNREACHABLE,
298 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
301 static const struct rt6_info ip6_null_entry_template = {
303 .__refcnt = ATOMIC_INIT(1),
305 .obsolete = DST_OBSOLETE_FORCE_CHK,
306 .error = -ENETUNREACH,
307 .input = ip6_pkt_discard,
308 .output = ip6_pkt_discard_out,
310 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
315 static const struct rt6_info ip6_prohibit_entry_template = {
317 .__refcnt = ATOMIC_INIT(1),
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
327 static const struct rt6_info ip6_blk_hole_entry_template = {
329 .__refcnt = ATOMIC_INIT(1),
331 .obsolete = DST_OBSOLETE_FORCE_CHK,
333 .input = dst_discard,
334 .output = dst_discard_out,
336 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
341 static void rt6_info_init(struct rt6_info *rt)
343 struct dst_entry *dst = &rt->dst;
345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346 INIT_LIST_HEAD(&rt->rt6i_uncached);
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354 1, DST_OBSOLETE_FORCE_CHK, flags);
358 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
363 EXPORT_SYMBOL(ip6_dst_alloc);
365 static void ip6_dst_destroy(struct dst_entry *dst)
367 struct rt6_info *rt = (struct rt6_info *)dst;
368 struct fib6_info *from;
369 struct inet6_dev *idev;
371 dst_destroy_metrics_generic(dst);
372 rt6_uncached_list_del(rt);
374 idev = rt->rt6i_idev;
376 rt->rt6i_idev = NULL;
381 from = rcu_dereference(rt->from);
382 rcu_assign_pointer(rt->from, NULL);
383 fib6_info_release(from);
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390 struct rt6_info *rt = (struct rt6_info *)dst;
391 struct inet6_dev *idev = rt->rt6i_idev;
392 struct net_device *loopback_dev =
393 dev_net(dev)->loopback_dev;
395 if (idev && idev->dev != loopback_dev) {
396 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
398 rt->rt6i_idev = loopback_idev;
404 static bool __rt6_check_expired(const struct rt6_info *rt)
406 if (rt->rt6i_flags & RTF_EXPIRES)
407 return time_after(jiffies, rt->dst.expires);
412 static bool rt6_check_expired(const struct rt6_info *rt)
414 struct fib6_info *from;
416 from = rcu_dereference(rt->from);
418 if (rt->rt6i_flags & RTF_EXPIRES) {
419 if (time_after(jiffies, rt->dst.expires))
422 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423 fib6_check_expired(from);
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429 struct fib6_info *match,
430 struct flowi6 *fl6, int oif,
431 const struct sk_buff *skb,
434 struct fib6_info *sibling, *next_sibling;
	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
440 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
445 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
449 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450 if (fl6->mp_hash > nh_upper_bound)
452 if (rt6_score_route(sibling, oif, strict) < 0)
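/* fib6_multipath_select() implements hash-threshold multipath: the flow
 * hash (fl6->mp_hash) is compared against each sibling's precomputed
 * nh_upper_bound, which partitions the hash space in proportion to the
 * nexthop weights, and the first reachable sibling whose bound covers the
 * hash is used.  A rough sketch of the selection, ignoring the
 * reachability check:
 *
 *	hash = rt6_multipath_hash(net, fl6, skb, NULL);
 *	for each nexthop in [match] followed by its siblings:
 *		if (hash <= nexthop->nh_upper_bound)
 *			use that nexthop;
 */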
/*
 *	Route lookup. rcu_read_lock() should be held.
 */
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466 struct fib6_info *rt,
467 const struct in6_addr *saddr,
471 struct fib6_info *sprt;
473 if (!oif && ipv6_addr_any(saddr) &&
474 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
477 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478 const struct net_device *dev = sprt->fib6_nh.nh_dev;
480 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
484 if (dev->ifindex == oif)
487 if (ipv6_chk_addr(net, saddr, dev,
488 flags & RT6_LOOKUP_F_IFACE))
493 if (oif && flags & RT6_LOOKUP_F_IFACE)
494 return net->ipv6.fib6_null_entry;
496 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
499 #ifdef CONFIG_IPV6_ROUTER_PREF
500 struct __rt6_probe_work {
501 struct work_struct work;
502 struct in6_addr target;
503 struct net_device *dev;
506 static void rt6_probe_deferred(struct work_struct *w)
508 struct in6_addr mcaddr;
509 struct __rt6_probe_work *work =
510 container_of(w, struct __rt6_probe_work, work);
512 addrconf_addr_solict_mult(&work->target, &mcaddr);
513 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
518 static void rt6_probe(struct fib6_info *rt)
520 struct __rt6_probe_work *work;
521 const struct in6_addr *nh_gw;
522 struct neighbour *neigh;
523 struct net_device *dev;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
533 if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
536 nh_gw = &rt->fib6_nh.nh_gw;
537 dev = rt->fib6_nh.nh_dev;
539 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
541 struct inet6_dev *idev;
543 if (neigh->nud_state & NUD_VALID)
546 idev = __in6_dev_get(dev);
548 write_lock(&neigh->lock);
549 if (!(neigh->nud_state & NUD_VALID) &&
551 neigh->updated + idev->cnf.rtr_probe_interval)) {
552 work = kmalloc(sizeof(*work), GFP_ATOMIC);
554 __neigh_set_probe_once(neigh);
556 write_unlock(&neigh->lock);
558 work = kmalloc(sizeof(*work), GFP_ATOMIC);
562 INIT_WORK(&work->work, rt6_probe_deferred);
563 work->target = *nh_gw;
566 schedule_work(&work->work);
570 rcu_read_unlock_bh();
573 static inline void rt6_probe(struct fib6_info *rt)
/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
583 const struct net_device *dev = rt->fib6_nh.nh_dev;
585 if (!oif || dev->ifindex == oif)
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
592 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593 struct neighbour *neigh;
595 if (rt->fib6_flags & RTF_NONEXTHOP ||
596 !(rt->fib6_flags & RTF_GATEWAY))
597 return RT6_NUD_SUCCEED;
600 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
603 read_lock(&neigh->lock);
604 if (neigh->nud_state & NUD_VALID)
605 ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607 else if (!(neigh->nud_state & NUD_FAILED))
608 ret = RT6_NUD_SUCCEED;
610 ret = RT6_NUD_FAIL_PROBE;
612 read_unlock(&neigh->lock);
614 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
617 rcu_read_unlock_bh();
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
626 m = rt6_check_dev(rt, oif);
627 if (!m && (strict & RT6_LOOKUP_F_IFACE))
628 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
632 if (strict & RT6_LOOKUP_F_REACHABLE) {
633 int n = rt6_check_neigh(rt);
/* called with rcu_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
643 const struct net_device *dev = fib6_info_nh_dev(f6i);
647 const struct inet6_dev *idev = __in6_dev_get(dev);
649 rc = !!idev->cnf.ignore_routes_with_linkdown;
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656 int *mpri, struct fib6_info *match,
660 bool match_do_rr = false;
662 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
665 if (fib6_ignore_linkdown(rt) &&
666 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
670 if (fib6_check_expired(rt))
673 m = rt6_score_route(rt, oif, strict);
674 if (m == RT6_NUD_FAIL_DO_RR) {
676 m = 0; /* lowest valid score */
677 } else if (m == RT6_NUD_FAIL_HARD) {
681 if (strict & RT6_LOOKUP_F_REACHABLE)
684 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
686 *do_rr = match_do_rr;
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695 struct fib6_info *leaf,
696 struct fib6_info *rr_head,
697 u32 metric, int oif, int strict,
700 struct fib6_info *rt, *match, *cont;
705 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706 if (rt->fib6_metric != metric) {
711 match = find_match(rt, oif, strict, &mpri, match, do_rr);
714 for (rt = leaf; rt && rt != rr_head;
715 rt = rcu_dereference(rt->fib6_next)) {
716 if (rt->fib6_metric != metric) {
721 match = find_match(rt, oif, strict, &mpri, match, do_rr);
727 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728 match = find_match(rt, oif, strict, &mpri, match, do_rr);
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
736 struct fib6_info *leaf = rcu_dereference(fn->leaf);
737 struct fib6_info *match, *rt0;
741 if (!leaf || leaf == net->ipv6.fib6_null_entry)
742 return net->ipv6.fib6_null_entry;
744 rt0 = rcu_dereference(fn->rr_ptr);
	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
753 key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755 if (rt0->fib6_src.plen)
756 key_plen = rt0->fib6_src.plen;
758 if (fn->fn_bit != key_plen)
759 return net->ipv6.fib6_null_entry;
761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
765 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
767 /* no entries matched; do round-robin */
768 if (!next || next->fib6_metric != rt0->fib6_metric)
772 spin_lock_bh(&leaf->fib6_table->tb6_lock);
773 /* make sure next is not being deleted from the tree */
775 rcu_assign_pointer(fn->rr_ptr, next);
776 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
780 return match ? match : net->ipv6.fib6_null_entry;
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
785 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790 const struct in6_addr *gwaddr)
792 struct net *net = dev_net(dev);
793 struct route_info *rinfo = (struct route_info *) opt;
794 struct in6_addr prefix_buf, *prefix;
796 unsigned long lifetime;
797 struct fib6_info *rt;
799 if (len < sizeof(struct route_info)) {
803 /* Sanity check for prefix_len and length */
804 if (rinfo->length > 3) {
806 } else if (rinfo->prefix_len > 128) {
808 } else if (rinfo->prefix_len > 64) {
809 if (rinfo->length < 2) {
812 } else if (rinfo->prefix_len > 0) {
813 if (rinfo->length < 1) {
818 pref = rinfo->route_pref;
819 if (pref == ICMPV6_ROUTER_PREF_INVALID)
822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
824 if (rinfo->length == 3)
825 prefix = (struct in6_addr *)rinfo->prefix;
827 /* this function is safe */
828 ipv6_addr_prefix(&prefix_buf,
829 (struct in6_addr *)rinfo->prefix,
831 prefix = &prefix_buf;
834 if (rinfo->prefix_len == 0)
835 rt = rt6_get_dflt_router(net, gwaddr, dev);
837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
840 if (rt && !lifetime) {
846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
849 rt->fib6_flags = RTF_ROUTEINFO |
850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
853 if (!addrconf_finite_timeout(lifetime))
854 fib6_clean_expires(rt);
856 fib6_set_expires(rt, jiffies + HZ * lifetime);
858 fib6_info_release(rt);
/*
 *	Misc support functions
 */
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
871 struct net_device *dev = rt->fib6_nh.nh_dev;
873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * as the default
		 */
878 if (netif_is_l3_slave(dev) &&
879 !rt6_need_strict(&rt->fib6_dst.addr))
880 dev = l3mdev_master_dev_rcu(dev);
881 else if (!netif_is_l3_master(dev))
882 dev = dev_net(dev)->loopback_dev;
883 /* last case is netif_is_l3_master(dev) is true in which
884 * case we want dev returned to be dev
891 static const int fib6_prop[RTN_MAX + 1] = {
898 [RTN_BLACKHOLE] = -EINVAL,
899 [RTN_UNREACHABLE] = -EHOSTUNREACH,
900 [RTN_PROHIBIT] = -EACCES,
901 [RTN_THROW] = -EAGAIN,
903 [RTN_XRESOLVE] = -EINVAL,
906 static int ip6_rt_type_to_error(u8 fib6_type)
908 return fib6_prop[fib6_type];
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
913 unsigned short flags = 0;
916 flags |= DST_NOCOUNT;
917 if (rt->dst_nopolicy)
918 flags |= DST_NOPOLICY;
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
927 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
929 switch (ort->fib6_type) {
931 rt->dst.output = dst_discard_out;
932 rt->dst.input = dst_discard;
935 rt->dst.output = ip6_pkt_prohibit_out;
936 rt->dst.input = ip6_pkt_prohibit;
939 case RTN_UNREACHABLE:
941 rt->dst.output = ip6_pkt_discard_out;
942 rt->dst.input = ip6_pkt_discard;
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
949 rt->dst.flags |= fib6_info_dst_flags(ort);
951 if (ort->fib6_flags & RTF_REJECT) {
952 ip6_rt_init_dst_reject(rt, ort);
957 rt->dst.output = ip6_output;
959 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960 rt->dst.input = ip6_input;
961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962 rt->dst.input = ip6_mc_input;
964 rt->dst.input = ip6_forward;
967 if (ort->fib6_nh.nh_lwtstate) {
968 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969 lwtunnel_set_redirect(&rt->dst);
972 rt->dst.lastuse = jiffies;
975 /* Caller must already hold reference to @from */
976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
978 rt->rt6i_flags &= ~RTF_EXPIRES;
979 rcu_assign_pointer(rt->from, from);
980 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
983 /* Caller must already hold reference to @ort */
984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
986 struct net_device *dev = fib6_info_nh_dev(ort);
988 ip6_rt_init_dst(rt, ort);
990 rt->rt6i_dst = ort->fib6_dst;
991 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
992 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
993 rt->rt6i_flags = ort->fib6_flags;
994 rt6_set_from(rt, ort);
995 #ifdef CONFIG_IPV6_SUBTREES
996 rt->rt6i_src = ort->fib6_src;
1000 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1001 struct in6_addr *saddr)
1003 struct fib6_node *pn, *sn;
1005 if (fn->fn_flags & RTN_TL_ROOT)
1007 pn = rcu_dereference(fn->parent);
1008 sn = FIB6_SUBTREE(pn);
1010 fn = fib6_node_lookup(sn, NULL, saddr);
1013 if (fn->fn_flags & RTN_RTINFO)
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1021 struct rt6_info *rt = *prt;
1023 if (dst_hold_safe(&rt->dst))
1025 if (null_fallback) {
1026 rt = net->ipv6.ip6_null_entry;
1035 /* called with rcu_lock held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1038 unsigned short flags = fib6_info_dst_flags(rt);
1039 struct net_device *dev = rt->fib6_nh.nh_dev;
1040 struct rt6_info *nrt;
1042 if (!fib6_info_hold_safe(rt))
1045 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1047 ip6_rt_copy_init(nrt, rt);
1049 fib6_info_release(rt);
1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1055 struct fib6_table *table,
1057 const struct sk_buff *skb,
1060 struct fib6_info *f6i;
1061 struct fib6_node *fn;
1062 struct rt6_info *rt;
1064 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1065 flags &= ~RT6_LOOKUP_F_IFACE;
1068 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1070 f6i = rcu_dereference(fn->leaf);
1072 f6i = net->ipv6.fib6_null_entry;
1074 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1075 fl6->flowi6_oif, flags);
1076 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1077 f6i = fib6_multipath_select(net, f6i, fl6,
1078 fl6->flowi6_oif, skb,
1081 if (f6i == net->ipv6.fib6_null_entry) {
1082 fn = fib6_backtrack(fn, &fl6->saddr);
1087 trace_fib6_table_lookup(net, f6i, table, fl6);
1089 /* Search through exception table */
1090 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1092 if (ip6_hold_safe(net, &rt, true))
1093 dst_use_noref(&rt->dst, jiffies);
1094 } else if (f6i == net->ipv6.fib6_null_entry) {
1095 rt = net->ipv6.ip6_null_entry;
1098 rt = ip6_create_rt_rcu(f6i);
1100 rt = net->ipv6.ip6_null_entry;
1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1111 const struct sk_buff *skb, int flags)
1113 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1115 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118 const struct in6_addr *saddr, int oif,
1119 const struct sk_buff *skb, int strict)
1121 struct flowi6 fl6 = {
1125 struct dst_entry *dst;
1126 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1129 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130 flags |= RT6_LOOKUP_F_HAS_SADDR;
1133 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134 if (dst->error == 0)
1135 return (struct rt6_info *) dst;
1141 EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150 struct netlink_ext_ack *extack)
1153 struct fib6_table *table;
1155 table = rt->fib6_table;
1156 spin_lock_bh(&table->tb6_lock);
1157 err = fib6_add(&table->tb6_root, rt, info, extack);
1158 spin_unlock_bh(&table->tb6_lock);
1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1165 struct nl_info info = { .nl_net = net, };
1167 return __ip6_ins_rt(rt, &info, NULL);
1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1171 const struct in6_addr *daddr,
1172 const struct in6_addr *saddr)
1174 struct net_device *dev;
1175 struct rt6_info *rt;
1181 if (!fib6_info_hold_safe(ort))
1184 dev = ip6_rt_get_dev_rcu(ort);
1185 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1187 fib6_info_release(ort);
1191 ip6_rt_copy_init(rt, ort);
1192 rt->rt6i_flags |= RTF_CACHE;
1193 rt->dst.flags |= DST_HOST;
1194 rt->rt6i_dst.addr = *daddr;
1195 rt->rt6i_dst.plen = 128;
1197 if (!rt6_is_gw_or_nonexthop(ort)) {
1198 if (ort->fib6_dst.plen != 128 &&
1199 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1200 rt->rt6i_flags |= RTF_ANYCAST;
1201 #ifdef CONFIG_IPV6_SUBTREES
1202 if (rt->rt6i_src.plen && saddr) {
1203 rt->rt6i_src.addr = *saddr;
1204 rt->rt6i_src.plen = 128;
1212 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1214 unsigned short flags = fib6_info_dst_flags(rt);
1215 struct net_device *dev;
1216 struct rt6_info *pcpu_rt;
1218 if (!fib6_info_hold_safe(rt))
1222 dev = ip6_rt_get_dev_rcu(rt);
1223 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1226 fib6_info_release(rt);
1229 ip6_rt_copy_init(pcpu_rt, rt);
1230 pcpu_rt->rt6i_flags |= RTF_PCPU;
1234 /* It should be called with rcu_read_lock() acquired */
1235 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1237 struct rt6_info *pcpu_rt, **p;
1239 p = this_cpu_ptr(rt->rt6i_pcpu);
1243 ip6_hold_safe(NULL, &pcpu_rt, false);
1248 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1249 struct fib6_info *rt)
1251 struct rt6_info *pcpu_rt, *prev, **p;
1253 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1255 dst_hold(&net->ipv6.ip6_null_entry->dst);
1256 return net->ipv6.ip6_null_entry;
1259 dst_hold(&pcpu_rt->dst);
1260 p = this_cpu_ptr(rt->rt6i_pcpu);
1261 prev = cmpxchg(p, NULL, pcpu_rt);
/* exception hash table implementation */
1269 static DEFINE_SPINLOCK(rt6_exception_lock);
1271 /* Remove rt6_ex from hash table and free the memory
1272 * Caller must hold rt6_exception_lock
1274 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1275 struct rt6_exception *rt6_ex)
1279 if (!bucket || !rt6_ex)
1282 net = dev_net(rt6_ex->rt6i->dst.dev);
1283 hlist_del_rcu(&rt6_ex->hlist);
1284 dst_release(&rt6_ex->rt6i->dst);
1285 kfree_rcu(rt6_ex, rcu);
1286 WARN_ON_ONCE(!bucket->depth);
1288 net->ipv6.rt6_stats->fib_rt_cache--;
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292 * Caller must hold rt6_exception_lock
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1296 struct rt6_exception *rt6_ex, *oldest = NULL;
1301 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1305 rt6_remove_exception(bucket, oldest);
1308 static u32 rt6_exception_hash(const struct in6_addr *dst,
1309 const struct in6_addr *src)
1311 static u32 seed __read_mostly;
1314 net_get_random_once(&seed, sizeof(seed));
1315 val = jhash(dst, sizeof(*dst), seed);
1317 #ifdef CONFIG_IPV6_SUBTREES
1319 val = jhash(src, sizeof(*src), val);
1321 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
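/* The exception (RTF_CACHE) entries hang off their parent fib6_info in a
 * small per-route hash table: rt6_exception_hash() keys on the destination
 * address (plus the source address when CONFIG_IPV6_SUBTREES is enabled),
 * mixed with a boot-time random seed, and folds the result into
 * 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT buckets.  rt6_insert_exception()
 * below bounds each bucket chain by evicting the oldest entry once the
 * depth exceeds FIB6_MAX_DEPTH.
 */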
1324 /* Helper function to find the cached rt in the hash table
1325 * and update bucket pointer to point to the bucket for this
1326 * (daddr, saddr) pair
1327 * Caller must hold rt6_exception_lock
1329 static struct rt6_exception *
1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1331 const struct in6_addr *daddr,
1332 const struct in6_addr *saddr)
1334 struct rt6_exception *rt6_ex;
1337 if (!(*bucket) || !daddr)
1340 hval = rt6_exception_hash(daddr, saddr);
1343 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1344 struct rt6_info *rt6 = rt6_ex->rt6i;
1345 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1347 #ifdef CONFIG_IPV6_SUBTREES
1348 if (matched && saddr)
1349 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1357 /* Helper function to find the cached rt in the hash table
1358 * and update bucket pointer to point to the bucket for this
1359 * (daddr, saddr) pair
1360 * Caller must hold rcu_read_lock()
1362 static struct rt6_exception *
1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1364 const struct in6_addr *daddr,
1365 const struct in6_addr *saddr)
1367 struct rt6_exception *rt6_ex;
1370 WARN_ON_ONCE(!rcu_read_lock_held());
1372 if (!(*bucket) || !daddr)
1375 hval = rt6_exception_hash(daddr, saddr);
1378 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1379 struct rt6_info *rt6 = rt6_ex->rt6i;
1380 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1382 #ifdef CONFIG_IPV6_SUBTREES
1383 if (matched && saddr)
1384 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1392 static unsigned int fib6_mtu(const struct fib6_info *rt)
1396 if (rt->fib6_pmtu) {
1397 mtu = rt->fib6_pmtu;
1399 struct net_device *dev = fib6_info_nh_dev(rt);
1400 struct inet6_dev *idev;
1403 idev = __in6_dev_get(dev);
1404 mtu = idev->cnf.mtu6;
1408 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1410 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
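/* fib6_mtu() reports the parent route's effective MTU: the route's own
 * fib6_pmtu when set, otherwise the nexthop device's mtu6, clamped to
 * IP6_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 * rt6_insert_exception() below uses it as a gate: a cached exception is
 * only inserted when its MTU is strictly lower than this value, since an
 * exception that does not shrink the PMTU adds nothing over the parent
 * route (see the rt6_mtu_change() note there).
 */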
1413 static int rt6_insert_exception(struct rt6_info *nrt,
1414 struct fib6_info *ort)
1416 struct net *net = dev_net(nrt->dst.dev);
1417 struct rt6_exception_bucket *bucket;
1418 struct in6_addr *src_key = NULL;
1419 struct rt6_exception *rt6_ex;
1422 spin_lock_bh(&rt6_exception_lock);
1424 if (ort->exception_bucket_flushed) {
1429 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1430 lockdep_is_held(&rt6_exception_lock));
1432 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1438 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1441 #ifdef CONFIG_IPV6_SUBTREES
1442 /* rt6i_src.plen != 0 indicates ort is in subtree
1443 * and exception table is indexed by a hash of
1444 * both rt6i_dst and rt6i_src.
1445 * Otherwise, the exception table is indexed by
1446 * a hash of only rt6i_dst.
1448 if (ort->fib6_src.plen)
1449 src_key = &nrt->rt6i_src.addr;
1451 /* rt6_mtu_change() might lower mtu on ort.
1452 * Only insert this exception route if its mtu
1453 * is less than ort's mtu value.
1455 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1460 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1463 rt6_remove_exception(bucket, rt6_ex);
1465 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1471 rt6_ex->stamp = jiffies;
1472 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1474 net->ipv6.rt6_stats->fib_rt_cache++;
1476 if (bucket->depth > FIB6_MAX_DEPTH)
1477 rt6_exception_remove_oldest(bucket);
1480 spin_unlock_bh(&rt6_exception_lock);
1482 /* Update fn->fn_sernum to invalidate all cached dst */
1484 spin_lock_bh(&ort->fib6_table->tb6_lock);
1485 fib6_update_sernum(net, ort);
1486 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1487 fib6_force_start_gc(net);
1493 void rt6_flush_exceptions(struct fib6_info *rt)
1495 struct rt6_exception_bucket *bucket;
1496 struct rt6_exception *rt6_ex;
1497 struct hlist_node *tmp;
1500 spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
1502 rt->exception_bucket_flushed = 1;
1504 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505 lockdep_is_held(&rt6_exception_lock));
1509 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1510 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1511 rt6_remove_exception(bucket, rt6_ex);
1512 WARN_ON_ONCE(bucket->depth);
1517 spin_unlock_bh(&rt6_exception_lock);
1520 /* Find cached rt in the hash table inside passed in rt
1521 * Caller has to hold rcu_read_lock()
1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1524 struct in6_addr *daddr,
1525 struct in6_addr *saddr)
1527 struct rt6_exception_bucket *bucket;
1528 struct in6_addr *src_key = NULL;
1529 struct rt6_exception *rt6_ex;
1530 struct rt6_info *res = NULL;
1532 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1534 #ifdef CONFIG_IPV6_SUBTREES
1535 /* rt6i_src.plen != 0 indicates rt is in subtree
1536 * and exception table is indexed by a hash of
1537 * both rt6i_dst and rt6i_src.
1538 * Otherwise, the exception table is indexed by
1539 * a hash of only rt6i_dst.
1541 if (rt->fib6_src.plen)
1544 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1546 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1552 /* Remove the passed in cached rt from the hash table that contains it */
1553 static int rt6_remove_exception_rt(struct rt6_info *rt)
1555 struct rt6_exception_bucket *bucket;
1556 struct in6_addr *src_key = NULL;
1557 struct rt6_exception *rt6_ex;
1558 struct fib6_info *from;
1561 from = rcu_dereference(rt->from);
1563 !(rt->rt6i_flags & RTF_CACHE))
1566 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1569 spin_lock_bh(&rt6_exception_lock);
1570 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1571 lockdep_is_held(&rt6_exception_lock));
1572 #ifdef CONFIG_IPV6_SUBTREES
1573 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1574 * and exception table is indexed by a hash of
1575 * both rt6i_dst and rt6i_src.
1576 * Otherwise, the exception table is indexed by
1577 * a hash of only rt6i_dst.
1579 if (from->fib6_src.plen)
1580 src_key = &rt->rt6i_src.addr;
1582 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1586 rt6_remove_exception(bucket, rt6_ex);
1592 spin_unlock_bh(&rt6_exception_lock);
1596 /* Find rt6_ex which contains the passed in rt cache and
1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1601 struct rt6_exception_bucket *bucket;
1602 struct fib6_info *from = rt->from;
1603 struct in6_addr *src_key = NULL;
1604 struct rt6_exception *rt6_ex;
1607 !(rt->rt6i_flags & RTF_CACHE))
1611 bucket = rcu_dereference(from->rt6i_exception_bucket);
1613 #ifdef CONFIG_IPV6_SUBTREES
1614 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1615 * and exception table is indexed by a hash of
1616 * both rt6i_dst and rt6i_src.
1617 * Otherwise, the exception table is indexed by
1618 * a hash of only rt6i_dst.
1620 if (from->fib6_src.plen)
1621 src_key = &rt->rt6i_src.addr;
1623 rt6_ex = __rt6_find_exception_rcu(&bucket,
1627 rt6_ex->stamp = jiffies;
1632 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1633 struct rt6_info *rt, int mtu)
1635 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1636 * lowest MTU in the path: always allow updating the route PMTU to
1637 * reflect PMTU decreases.
1639 * If the new MTU is higher, and the route PMTU is equal to the local
1640 * MTU, this means the old MTU is the lowest in the path, so allow
1641 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1645 if (dst_mtu(&rt->dst) >= mtu)
1648 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
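/* Example of the rule above: if a cached route carries PMTU 1400 and the
 * link MTU drops to 1280, dst_mtu() >= mtu holds and the decrease is
 * applied.  If the link MTU is later raised from 1400 to 1500 and the
 * cached PMTU still equals the old local MTU of 1400, the increase is
 * also applied, letting PMTU discovery rediscover any smaller MTU further
 * along the path.
 */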
1654 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1655 struct fib6_info *rt, int mtu)
1657 struct rt6_exception_bucket *bucket;
1658 struct rt6_exception *rt6_ex;
1661 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1662 lockdep_is_held(&rt6_exception_lock));
1667 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1668 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1669 struct rt6_info *entry = rt6_ex->rt6i;
			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
1675 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1676 rt6_mtu_change_route_allowed(idev, entry, mtu))
1677 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1683 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1685 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1686 struct in6_addr *gateway)
1688 struct rt6_exception_bucket *bucket;
1689 struct rt6_exception *rt6_ex;
1690 struct hlist_node *tmp;
1693 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1696 spin_lock_bh(&rt6_exception_lock);
1697 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1698 lockdep_is_held(&rt6_exception_lock));
1701 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1702 hlist_for_each_entry_safe(rt6_ex, tmp,
1703 &bucket->chain, hlist) {
1704 struct rt6_info *entry = rt6_ex->rt6i;
1706 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1707 RTF_CACHE_GATEWAY &&
1708 ipv6_addr_equal(gateway,
1709 &entry->rt6i_gateway)) {
1710 rt6_remove_exception(bucket, rt6_ex);
1717 spin_unlock_bh(&rt6_exception_lock);
1720 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1721 struct rt6_exception *rt6_ex,
1722 struct fib6_gc_args *gc_args,
1725 struct rt6_info *rt = rt6_ex->rt6i;
	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
1733 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1734 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1735 RT6_TRACE("aging clone %p\n", rt);
1736 rt6_remove_exception(bucket, rt6_ex);
1739 } else if (time_after(jiffies, rt->dst.expires)) {
1740 RT6_TRACE("purging expired route %p\n", rt);
1741 rt6_remove_exception(bucket, rt6_ex);
1745 if (rt->rt6i_flags & RTF_GATEWAY) {
1746 struct neighbour *neigh;
1747 __u8 neigh_flags = 0;
1749 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1751 neigh_flags = neigh->flags;
1753 if (!(neigh_flags & NTF_ROUTER)) {
1754 RT6_TRACE("purging route %p via non-router but gateway\n",
1756 rt6_remove_exception(bucket, rt6_ex);
1764 void rt6_age_exceptions(struct fib6_info *rt,
1765 struct fib6_gc_args *gc_args,
1768 struct rt6_exception_bucket *bucket;
1769 struct rt6_exception *rt6_ex;
1770 struct hlist_node *tmp;
1773 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1777 spin_lock(&rt6_exception_lock);
1778 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1779 lockdep_is_held(&rt6_exception_lock));
1782 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1783 hlist_for_each_entry_safe(rt6_ex, tmp,
1784 &bucket->chain, hlist) {
1785 rt6_age_examine_exception(bucket, rt6_ex,
1791 spin_unlock(&rt6_exception_lock);
1792 rcu_read_unlock_bh();
1795 /* must be called with rcu lock held */
1796 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1797 int oif, struct flowi6 *fl6, int strict)
1799 struct fib6_node *fn, *saved_fn;
1800 struct fib6_info *f6i;
1802 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1805 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1809 f6i = rt6_select(net, fn, oif, strict);
1810 if (f6i == net->ipv6.fib6_null_entry) {
1811 fn = fib6_backtrack(fn, &fl6->saddr);
1813 goto redo_rt6_select;
1814 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1815 /* also consider unreachable route */
1816 strict &= ~RT6_LOOKUP_F_REACHABLE;
1818 goto redo_rt6_select;
1822 trace_fib6_table_lookup(net, f6i, table, fl6);
1827 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1828 int oif, struct flowi6 *fl6,
1829 const struct sk_buff *skb, int flags)
1831 struct fib6_info *f6i;
1832 struct rt6_info *rt;
1835 strict |= flags & RT6_LOOKUP_F_IFACE;
1836 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1837 if (net->ipv6.devconf_all->forwarding == 0)
1838 strict |= RT6_LOOKUP_F_REACHABLE;
1842 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1843 if (f6i->fib6_nsiblings)
1844 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1846 if (f6i == net->ipv6.fib6_null_entry) {
1847 rt = net->ipv6.ip6_null_entry;
	/* Search through exception table */
1854 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1856 if (ip6_hold_safe(net, &rt, true))
1857 dst_use_noref(&rt->dst, jiffies);
1861 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1862 !(f6i->fib6_flags & RTF_GATEWAY))) {
1863 /* Create a RTF_CACHE clone which will not be
1864 * owned by the fib6 tree. It is for the special case where
1865 * the daddr in the skb during the neighbor look-up is different
1866 * from the fl6->daddr used to look-up route here.
1868 struct rt6_info *uncached_rt;
1870 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1875 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1876 * No need for another dst_hold()
1878 rt6_uncached_list_add(uncached_rt);
1879 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1881 uncached_rt = net->ipv6.ip6_null_entry;
1882 dst_hold(&uncached_rt->dst);
1887 /* Get a percpu copy */
1889 struct rt6_info *pcpu_rt;
1892 pcpu_rt = rt6_get_pcpu_route(f6i);
1895 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1903 EXPORT_SYMBOL_GPL(ip6_pol_route);
1905 static struct rt6_info *ip6_pol_route_input(struct net *net,
1906 struct fib6_table *table,
1908 const struct sk_buff *skb,
1911 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1914 struct dst_entry *ip6_route_input_lookup(struct net *net,
1915 struct net_device *dev,
1917 const struct sk_buff *skb,
1920 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1921 flags |= RT6_LOOKUP_F_IFACE;
1923 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1925 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1927 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1928 struct flow_keys *keys,
1929 struct flow_keys *flkeys)
1931 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1932 const struct ipv6hdr *key_iph = outer_iph;
1933 struct flow_keys *_flkeys = flkeys;
1934 const struct ipv6hdr *inner_iph;
1935 const struct icmp6hdr *icmph;
1936 struct ipv6hdr _inner_iph;
1937 struct icmp6hdr _icmph;
1939 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1942 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1943 sizeof(_icmph), &_icmph);
1947 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1948 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1949 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1950 icmph->icmp6_type != ICMPV6_PARAMPROB)
1953 inner_iph = skb_header_pointer(skb,
1954 skb_transport_offset(skb) + sizeof(*icmph),
1955 sizeof(_inner_iph), &_inner_iph);
1959 key_iph = inner_iph;
1963 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1964 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1965 keys->tags.flow_label = _flkeys->tags.flow_label;
1966 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1968 keys->addrs.v6addrs.src = key_iph->saddr;
1969 keys->addrs.v6addrs.dst = key_iph->daddr;
1970 keys->tags.flow_label = ip6_flowlabel(key_iph);
1971 keys->basic.ip_proto = key_iph->nexthdr;
1975 /* if skb is set it will be used and fl6 can be NULL */
1976 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1977 const struct sk_buff *skb, struct flow_keys *flkeys)
1979 struct flow_keys hash_keys;
1982 switch (ip6_multipath_hash_policy(net)) {
1984 memset(&hash_keys, 0, sizeof(hash_keys));
1985 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1987 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1989 hash_keys.addrs.v6addrs.src = fl6->saddr;
1990 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1991 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1992 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1997 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1998 struct flow_keys keys;
2000 /* short-circuit if we already have L4 hash present */
2002 return skb_get_hash_raw(skb) >> 1;
2004 memset(&hash_keys, 0, sizeof(hash_keys));
2007 skb_flow_dissect_flow_keys(skb, &keys, flag);
2010 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2011 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2012 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2013 hash_keys.ports.src = flkeys->ports.src;
2014 hash_keys.ports.dst = flkeys->ports.dst;
2015 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2017 memset(&hash_keys, 0, sizeof(hash_keys));
2018 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2019 hash_keys.addrs.v6addrs.src = fl6->saddr;
2020 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021 hash_keys.ports.src = fl6->fl6_sport;
2022 hash_keys.ports.dst = fl6->fl6_dport;
2023 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2027 mhash = flow_hash_from_keys(&hash_keys);
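/* Summary of the policies handled above (net.ipv6.fib_multipath_hash_policy):
 * policy 0 hashes the L3 fields - source, destination, flow label and
 * protocol - taken from the inner packet for ICMPv6 errors so that errors
 * follow the same path as the flow that triggered them; policy 1 hashes
 * the L4 five-tuple, reusing a pre-computed skb hash when one is present.
 * The result of flow_hash_from_keys() is shifted right by one bit before
 * being returned, keeping it within the 31-bit space used for the
 * per-nexthop upper bounds compared in fib6_multipath_select().
 */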
2032 void ip6_route_input(struct sk_buff *skb)
2034 const struct ipv6hdr *iph = ipv6_hdr(skb);
2035 struct net *net = dev_net(skb->dev);
2036 int flags = RT6_LOOKUP_F_HAS_SADDR;
2037 struct ip_tunnel_info *tun_info;
2038 struct flowi6 fl6 = {
2039 .flowi6_iif = skb->dev->ifindex,
2040 .daddr = iph->daddr,
2041 .saddr = iph->saddr,
2042 .flowlabel = ip6_flowinfo(iph),
2043 .flowi6_mark = skb->mark,
2044 .flowi6_proto = iph->nexthdr,
2046 struct flow_keys *flkeys = NULL, _flkeys;
2048 tun_info = skb_tunnel_info(skb);
2049 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2050 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2052 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2055 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2056 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2059 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2062 static struct rt6_info *ip6_pol_route_output(struct net *net,
2063 struct fib6_table *table,
2065 const struct sk_buff *skb,
2068 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2071 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2072 struct flowi6 *fl6, int flags)
2076 if (rt6_need_strict(&fl6->daddr)) {
2077 struct dst_entry *dst;
2079 dst = l3mdev_link_scope_lookup(net, fl6);
2084 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2086 any_src = ipv6_addr_any(&fl6->saddr);
2087 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2088 (fl6->flowi6_oif && any_src))
2089 flags |= RT6_LOOKUP_F_IFACE;
2092 flags |= RT6_LOOKUP_F_HAS_SADDR;
2094 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2096 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2098 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2100 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2102 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2103 struct net_device *loopback_dev = net->loopback_dev;
2104 struct dst_entry *new = NULL;
2106 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2107 DST_OBSOLETE_DEAD, 0);
2110 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2114 new->input = dst_discard;
2115 new->output = dst_discard_out;
2117 dst_copy_metrics(new, &ort->dst);
2119 rt->rt6i_idev = in6_dev_get(loopback_dev);
2120 rt->rt6i_gateway = ort->rt6i_gateway;
2121 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2123 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2124 #ifdef CONFIG_IPV6_SUBTREES
2125 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2129 dst_release(dst_orig);
2130 return new ? new : ERR_PTR(-ENOMEM);
/*
 *	Destination cache support functions
 */
2137 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2141 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2144 if (fib6_check_expired(f6i))
2150 static struct dst_entry *rt6_check(struct rt6_info *rt,
2151 struct fib6_info *from,
2156 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2157 rt_cookie != cookie)
2160 if (rt6_check_expired(rt))
2166 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2167 struct fib6_info *from,
2170 if (!__rt6_check_expired(rt) &&
2171 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2172 fib6_check(from, cookie))
2178 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2180 struct dst_entry *dst_ret;
2181 struct fib6_info *from;
2182 struct rt6_info *rt;
2184 rt = container_of(dst, struct rt6_info, dst);
2188 /* All IPV6 dsts are created with ->obsolete set to the value
2189 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2190 * into this function always.
2193 from = rcu_dereference(rt->from);
2195 if (from && (rt->rt6i_flags & RTF_PCPU ||
2196 unlikely(!list_empty(&rt->rt6i_uncached))))
2197 dst_ret = rt6_dst_from_check(rt, from, cookie);
2199 dst_ret = rt6_check(rt, from, cookie);
2206 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2208 struct rt6_info *rt = (struct rt6_info *) dst;
2211 if (rt->rt6i_flags & RTF_CACHE) {
2213 if (rt6_check_expired(rt)) {
2214 rt6_remove_exception_rt(rt);
2226 static void ip6_link_failure(struct sk_buff *skb)
2228 struct rt6_info *rt;
2230 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2232 rt = (struct rt6_info *) skb_dst(skb);
2235 if (rt->rt6i_flags & RTF_CACHE) {
2236 if (dst_hold_safe(&rt->dst))
2237 rt6_remove_exception_rt(rt);
2239 struct fib6_info *from;
2240 struct fib6_node *fn;
2242 from = rcu_dereference(rt->from);
2244 fn = rcu_dereference(from->fib6_node);
2245 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2253 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2255 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2256 struct fib6_info *from;
2259 from = rcu_dereference(rt0->from);
2261 rt0->dst.expires = from->expires;
2265 dst_set_expires(&rt0->dst, timeout);
2266 rt0->rt6i_flags |= RTF_EXPIRES;
2269 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2271 struct net *net = dev_net(rt->dst.dev);
2273 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2274 rt->rt6i_flags |= RTF_MODIFIED;
2275 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2278 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2283 from_set = !!rcu_dereference(rt->from);
2286 return !(rt->rt6i_flags & RTF_CACHE) &&
2287 (rt->rt6i_flags & RTF_PCPU || from_set);
2290 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2291 const struct ipv6hdr *iph, u32 mtu)
2293 const struct in6_addr *daddr, *saddr;
2294 struct rt6_info *rt6 = (struct rt6_info *)dst;
2296 if (dst_metric_locked(dst, RTAX_MTU))
2300 daddr = &iph->daddr;
2301 saddr = &iph->saddr;
2303 daddr = &sk->sk_v6_daddr;
2304 saddr = &inet6_sk(sk)->saddr;
2309 dst_confirm_neigh(dst, daddr);
2310 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2311 if (mtu >= dst_mtu(dst))
2314 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2315 rt6_do_update_pmtu(rt6, mtu);
2316 /* update rt6_ex->stamp for cache */
2317 if (rt6->rt6i_flags & RTF_CACHE)
2318 rt6_update_exception_stamp_rt(rt6);
2320 struct fib6_info *from;
2321 struct rt6_info *nrt6;
2324 from = rcu_dereference(rt6->from);
2325 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2327 rt6_do_update_pmtu(nrt6, mtu);
2328 if (rt6_insert_exception(nrt6, from))
2329 dst_release_immediate(&nrt6->dst);
2335 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2336 struct sk_buff *skb, u32 mtu)
2338 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2341 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2342 int oif, u32 mark, kuid_t uid)
2344 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2345 struct dst_entry *dst;
2348 memset(&fl6, 0, sizeof(fl6));
2349 fl6.flowi6_oif = oif;
2350 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2351 fl6.daddr = iph->daddr;
2352 fl6.saddr = iph->saddr;
2353 fl6.flowlabel = ip6_flowinfo(iph);
2354 fl6.flowi6_uid = uid;
2356 dst = ip6_route_output(net, NULL, &fl6);
2358 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2361 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2363 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2365 struct dst_entry *dst;
2367 ip6_update_pmtu(skb, sock_net(sk), mtu,
2368 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2370 dst = __sk_dst_get(sk);
2371 if (!dst || !dst->obsolete ||
2372 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2376 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2377 ip6_datagram_dst_update(sk, false);
2380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2382 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2383 const struct flowi6 *fl6)
2385 #ifdef CONFIG_IPV6_SUBTREES
2386 struct ipv6_pinfo *np = inet6_sk(sk);
2389 ip6_dst_store(sk, dst,
2390 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2391 &sk->sk_v6_daddr : NULL,
2392 #ifdef CONFIG_IPV6_SUBTREES
2393 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2399 /* Handle redirects */
2400 struct ip6rd_flowi {
2402 struct in6_addr gateway;
2405 static struct rt6_info *__ip6_route_redirect(struct net *net,
2406 struct fib6_table *table,
2408 const struct sk_buff *skb,
2411 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2412 struct rt6_info *ret = NULL, *rt_cache;
2413 struct fib6_info *rt;
2414 struct fib6_node *fn;
	/* Get the "current" route for this destination and
	 * check if the redirect has come from an appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */
2429 for_each_fib6_node_rt_rcu(fn) {
2430 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2432 if (fib6_check_expired(rt))
2434 if (rt->fib6_flags & RTF_REJECT)
2436 if (!(rt->fib6_flags & RTF_GATEWAY))
2438 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2440 /* rt_cache's gateway might be different from its 'parent'
2441 * in the case of an ip redirect.
2442 * So we keep searching in the exception table if the gateway
2445 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2446 rt_cache = rt6_find_cached_rt(rt,
2450 ipv6_addr_equal(&rdfl->gateway,
2451 &rt_cache->rt6i_gateway)) {
2461 rt = net->ipv6.fib6_null_entry;
2462 else if (rt->fib6_flags & RTF_REJECT) {
2463 ret = net->ipv6.ip6_null_entry;
2467 if (rt == net->ipv6.fib6_null_entry) {
2468 fn = fib6_backtrack(fn, &fl6->saddr);
2475 ip6_hold_safe(net, &ret, true);
2477 ret = ip6_create_rt_rcu(rt);
2481 trace_fib6_table_lookup(net, rt, table, fl6);
2485 static struct dst_entry *ip6_route_redirect(struct net *net,
2486 const struct flowi6 *fl6,
2487 const struct sk_buff *skb,
2488 const struct in6_addr *gateway)
2490 int flags = RT6_LOOKUP_F_HAS_SADDR;
2491 struct ip6rd_flowi rdfl;
2494 rdfl.gateway = *gateway;
2496 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2497 flags, __ip6_route_redirect);
2500 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2503 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2504 struct dst_entry *dst;
2507 memset(&fl6, 0, sizeof(fl6));
2508 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2509 fl6.flowi6_oif = oif;
2510 fl6.flowi6_mark = mark;
2511 fl6.daddr = iph->daddr;
2512 fl6.saddr = iph->saddr;
2513 fl6.flowlabel = ip6_flowinfo(iph);
2514 fl6.flowi6_uid = uid;
2516 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2517 rt6_do_redirect(dst, NULL, skb);
2520 EXPORT_SYMBOL_GPL(ip6_redirect);
2522 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2525 const struct ipv6hdr *iph = ipv6_hdr(skb);
2526 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2527 struct dst_entry *dst;
2530 memset(&fl6, 0, sizeof(fl6));
2531 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2532 fl6.flowi6_oif = oif;
2533 fl6.flowi6_mark = mark;
2534 fl6.daddr = msg->dest;
2535 fl6.saddr = iph->daddr;
2536 fl6.flowi6_uid = sock_net_uid(net, NULL);
2538 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2539 rt6_do_redirect(dst, NULL, skb);
2543 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2545 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2548 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2550 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2552 struct net_device *dev = dst->dev;
2553 unsigned int mtu = dst_mtu(dst);
2554 struct net *net = dev_net(dev);
2556 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2558 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2559 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
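/* Illustrative example (not from the source): with a 1500 byte link MTU the
 * advertised MSS is 1500 - 40 (IPv6 header) - 20 (TCP header) = 1440, unless
 * the ip6_rt_min_advmss floor is higher.
 */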
2562 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2563 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2564 * IPV6_MAXPLEN is also valid and means: "any MSS,
2565 * rely only on pmtu discovery"
2567 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2572 static unsigned int ip6_mtu(const struct dst_entry *dst)
2574 struct inet6_dev *idev;
2577 mtu = dst_metric_raw(dst, RTAX_MTU);
2584 idev = __in6_dev_get(dst->dev);
2586 mtu = idev->cnf.mtu6;
2590 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2592 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2596 * 1. mtu on route is locked - use it
2597 * 2. mtu from nexthop exception
2598 * 3. mtu from egress device
2600 * based on ip6_dst_mtu_forward and exception logic of
2601 * rt6_find_cached_rt; called with rcu_read_lock
2603 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2604 struct in6_addr *saddr)
2606 struct rt6_exception_bucket *bucket;
2607 struct rt6_exception *rt6_ex;
2608 struct in6_addr *src_key;
2609 struct inet6_dev *idev;
2612 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2613 mtu = f6i->fib6_pmtu;
2619 #ifdef CONFIG_IPV6_SUBTREES
2620 if (f6i->fib6_src.plen)
2624 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2625 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2626 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2627 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2630 struct net_device *dev = fib6_info_nh_dev(f6i);
2633 idev = __in6_dev_get(dev);
2634 if (idev && idev->cnf.mtu6 > mtu)
2635 mtu = idev->cnf.mtu6;
2638 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2640 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2643 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2646 struct dst_entry *dst;
2647 struct rt6_info *rt;
2648 struct inet6_dev *idev = in6_dev_get(dev);
2649 struct net *net = dev_net(dev);
2651 if (unlikely(!idev))
2652 return ERR_PTR(-ENODEV);
2654 rt = ip6_dst_alloc(net, dev, 0);
2655 if (unlikely(!rt)) {
2657 dst = ERR_PTR(-ENOMEM);
2661 rt->dst.flags |= DST_HOST;
2662 rt->dst.input = ip6_input;
2663 rt->dst.output = ip6_output;
2664 rt->rt6i_gateway = fl6->daddr;
2665 rt->rt6i_dst.addr = fl6->daddr;
2666 rt->rt6i_dst.plen = 128;
2667 rt->rt6i_idev = idev;
2668 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2670 /* Add this dst into uncached_list so that rt6_disable_ip() can
2671 * do proper release of the net_device
2673 rt6_uncached_list_add(rt);
2674 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2676 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2682 static int ip6_dst_gc(struct dst_ops *ops)
2684 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2685 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2686 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2687 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2688 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2689 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2692 entries = dst_entries_get_fast(ops);
2693 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2694 entries <= rt_max_size)
2697 net->ipv6.ip6_rt_gc_expire++;
2698 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2699 entries = dst_entries_get_slow(ops);
2700 if (entries < ops->gc_thresh)
2701 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2703 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2704 return entries > rt_max_size;
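/* GC policy sketch: a run is skipped while the previous run is within
 * gc_min_interval and the entry count stays at or below max_size.  Each
 * forced run bumps ip6_rt_gc_expire (making fib6_run_gc() more aggressive);
 * the value is reset to half of gc_timeout once the entry count drops below
 * gc_thresh, and it decays by expire >> gc_elasticity on every call.
 */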
2707 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2708 struct fib6_config *cfg)
2710 struct dst_metrics *p;
2715 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2719 refcount_set(&p->refcnt, 1);
2720 rt->fib6_metrics = p;
2722 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2725 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2726 struct fib6_config *cfg,
2727 const struct in6_addr *gw_addr,
2728 u32 tbid, int flags)
2730 struct flowi6 fl6 = {
2731 .flowi6_oif = cfg->fc_ifindex,
2733 .saddr = cfg->fc_prefsrc,
2735 struct fib6_table *table;
2736 struct rt6_info *rt;
2738 table = fib6_get_table(net, tbid);
2742 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2743 flags |= RT6_LOOKUP_F_HAS_SADDR;
2745 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2746 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2748 /* if table lookup failed, fall back to full lookup */
2749 if (rt == net->ipv6.ip6_null_entry) {
2757 static int ip6_route_check_nh_onlink(struct net *net,
2758 struct fib6_config *cfg,
2759 const struct net_device *dev,
2760 struct netlink_ext_ack *extack)
2762 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2763 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2764 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2765 struct rt6_info *grt;
2769 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2771 if (!grt->dst.error &&
2772 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2773 NL_SET_ERR_MSG(extack,
2774 "Nexthop has invalid gateway or device mismatch");
2784 static int ip6_route_check_nh(struct net *net,
2785 struct fib6_config *cfg,
2786 struct net_device **_dev,
2787 struct inet6_dev **idev)
2789 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2790 struct net_device *dev = _dev ? *_dev : NULL;
2791 struct rt6_info *grt = NULL;
2792 int err = -EHOSTUNREACH;
2794 if (cfg->fc_table) {
2795 int flags = RT6_LOOKUP_F_IFACE;
2797 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2798 cfg->fc_table, flags);
2800 if (grt->rt6i_flags & RTF_GATEWAY ||
2801 (dev && dev != grt->dst.dev)) {
2809 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2815 if (dev != grt->dst.dev) {
2820 *_dev = dev = grt->dst.dev;
2821 *idev = grt->rt6i_idev;
2823 in6_dev_hold(grt->rt6i_idev);
2826 if (!(grt->rt6i_flags & RTF_GATEWAY))
2835 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2836 struct net_device **_dev, struct inet6_dev **idev,
2837 struct netlink_ext_ack *extack)
2839 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2840 int gwa_type = ipv6_addr_type(gw_addr);
2841 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2842 const struct net_device *dev = *_dev;
2843 bool need_addr_check = !dev;
2846 /* if gw_addr is local we will fail to detect this in case the
2847 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2848 * will return the already-added prefix route via the interface that
2849 * the prefix route was assigned to, which might be non-loopback.
2852 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2853 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2857 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2858 /* IPv6 strictly inhibits using non-link-local
2859 * addresses as nexthop addresses.
2860 * Otherwise, a router will not be able to send redirects.
2861 * It is very good, but in some (rare!) circumstances
2862 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2863 * some exceptions. --ANK
2864 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2867 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2868 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2872 if (cfg->fc_flags & RTNH_F_ONLINK)
2873 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2875 err = ip6_route_check_nh(net, cfg, _dev, idev);
2881 /* reload in case device was changed */
2886 NL_SET_ERR_MSG(extack, "Egress device not specified");
2888 } else if (dev->flags & IFF_LOOPBACK) {
2889 NL_SET_ERR_MSG(extack,
2890 "Egress device can not be loopback device for this route");
2894 /* if we did not check gw_addr above, do so now that the
2895 * egress device has been resolved.
2897 if (need_addr_check &&
2898 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2899 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2908 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2910 struct netlink_ext_ack *extack)
2912 struct net *net = cfg->fc_nlinfo.nl_net;
2913 struct fib6_info *rt = NULL;
2914 struct net_device *dev = NULL;
2915 struct inet6_dev *idev = NULL;
2916 struct fib6_table *table;
2920 /* RTF_PCPU is an internal flag; can not be set by userspace */
2921 if (cfg->fc_flags & RTF_PCPU) {
2922 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2926 /* RTF_CACHE is an internal flag; can not be set by userspace */
2927 if (cfg->fc_flags & RTF_CACHE) {
2928 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2932 if (cfg->fc_type > RTN_MAX) {
2933 NL_SET_ERR_MSG(extack, "Invalid route type");
2937 if (cfg->fc_dst_len > 128) {
2938 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2941 if (cfg->fc_src_len > 128) {
2942 NL_SET_ERR_MSG(extack, "Invalid source address length");
2945 #ifndef CONFIG_IPV6_SUBTREES
2946 if (cfg->fc_src_len) {
2947 NL_SET_ERR_MSG(extack,
2948 "Specifying source address requires IPV6_SUBTREES to be enabled");
2952 if (cfg->fc_ifindex) {
2954 dev = dev_get_by_index(net, cfg->fc_ifindex);
2957 idev = in6_dev_get(dev);
2962 if (cfg->fc_metric == 0)
2963 cfg->fc_metric = IP6_RT_PRIO_USER;
2965 if (cfg->fc_flags & RTNH_F_ONLINK) {
2967 NL_SET_ERR_MSG(extack,
2968 "Nexthop device required for onlink");
2973 if (!(dev->flags & IFF_UP)) {
2974 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2981 if (cfg->fc_nlinfo.nlh &&
2982 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2983 table = fib6_get_table(net, cfg->fc_table);
2985 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2986 table = fib6_new_table(net, cfg->fc_table);
2989 table = fib6_new_table(net, cfg->fc_table);
2996 rt = fib6_info_alloc(gfp_flags);
3000 if (cfg->fc_flags & RTF_ADDRCONF)
3001 rt->dst_nocount = true;
3003 err = ip6_convert_metrics(net, rt, cfg);
3007 if (cfg->fc_flags & RTF_EXPIRES)
3008 fib6_set_expires(rt, jiffies +
3009 clock_t_to_jiffies(cfg->fc_expires));
3011 fib6_clean_expires(rt);
3013 if (cfg->fc_protocol == RTPROT_UNSPEC)
3014 cfg->fc_protocol = RTPROT_BOOT;
3015 rt->fib6_protocol = cfg->fc_protocol;
3017 addr_type = ipv6_addr_type(&cfg->fc_dst);
3019 if (cfg->fc_encap) {
3020 struct lwtunnel_state *lwtstate;
3022 err = lwtunnel_build_state(cfg->fc_encap_type,
3023 cfg->fc_encap, AF_INET6, cfg,
3027 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3030 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3031 rt->fib6_dst.plen = cfg->fc_dst_len;
3032 if (rt->fib6_dst.plen == 128)
3033 rt->dst_host = true;
3035 #ifdef CONFIG_IPV6_SUBTREES
3036 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3037 rt->fib6_src.plen = cfg->fc_src_len;
3040 rt->fib6_metric = cfg->fc_metric;
3041 rt->fib6_nh.nh_weight = 1;
3043 rt->fib6_type = cfg->fc_type;
3045 /* We cannot add true routes via loopback here;
3046 they would result in kernel looping. Promote them to reject routes
3048 if ((cfg->fc_flags & RTF_REJECT) ||
3049 (dev && (dev->flags & IFF_LOOPBACK) &&
3050 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3051 !(cfg->fc_flags & RTF_LOCAL))) {
3052 /* hold loopback dev/idev if we haven't done so. */
3053 if (dev != net->loopback_dev) {
3058 dev = net->loopback_dev;
3060 idev = in6_dev_get(dev);
3066 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3070 if (cfg->fc_flags & RTF_GATEWAY) {
3071 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3075 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3082 if (idev->cnf.disable_ipv6) {
3083 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3088 if (!(dev->flags & IFF_UP)) {
3089 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3094 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3095 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3096 NL_SET_ERR_MSG(extack, "Invalid source address");
3100 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3101 rt->fib6_prefsrc.plen = 128;
3103 rt->fib6_prefsrc.plen = 0;
3105 rt->fib6_flags = cfg->fc_flags;
3108 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3109 !netif_carrier_ok(dev))
3110 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3111 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3112 rt->fib6_nh.nh_dev = dev;
3113 rt->fib6_table = table;
3125 fib6_info_release(rt);
3126 return ERR_PTR(err);
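/* ip6_route_add() below is a thin wrapper: build the fib6_info with
 * ip6_route_info_create(), insert it with __ip6_ins_rt(), then drop the
 * creation reference (the FIB holds its own reference once inserted).
 */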
3129 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3130 struct netlink_ext_ack *extack)
3132 struct fib6_info *rt;
3135 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3139 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3140 fib6_info_release(rt);
3145 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3147 struct net *net = info->nl_net;
3148 struct fib6_table *table;
3151 if (rt == net->ipv6.fib6_null_entry) {
3156 table = rt->fib6_table;
3157 spin_lock_bh(&table->tb6_lock);
3158 err = fib6_del(rt, info);
3159 spin_unlock_bh(&table->tb6_lock);
3162 fib6_info_release(rt);
3166 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3168 struct nl_info info = { .nl_net = net };
3170 return __ip6_del_rt(rt, &info);
3173 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3175 struct nl_info *info = &cfg->fc_nlinfo;
3176 struct net *net = info->nl_net;
3177 struct sk_buff *skb = NULL;
3178 struct fib6_table *table;
3181 if (rt == net->ipv6.fib6_null_entry)
3183 table = rt->fib6_table;
3184 spin_lock_bh(&table->tb6_lock);
3186 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3187 struct fib6_info *sibling, *next_sibling;
3189 /* prefer to send a single notification with all hops */
3190 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3192 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3194 if (rt6_fill_node(net, skb, rt, NULL,
3195 NULL, NULL, 0, RTM_DELROUTE,
3196 info->portid, seq, 0) < 0) {
3200 info->skip_notify = 1;
3203 list_for_each_entry_safe(sibling, next_sibling,
3206 err = fib6_del(sibling, info);
3212 err = fib6_del(rt, info);
3214 spin_unlock_bh(&table->tb6_lock);
3216 fib6_info_release(rt);
3219 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3220 info->nlh, gfp_any());
3225 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3229 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3232 if (cfg->fc_flags & RTF_GATEWAY &&
3233 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3235 if (dst_hold_safe(&rt->dst))
3236 rc = rt6_remove_exception_rt(rt);
3241 static int ip6_route_del(struct fib6_config *cfg,
3242 struct netlink_ext_ack *extack)
3244 struct rt6_info *rt_cache;
3245 struct fib6_table *table;
3246 struct fib6_info *rt;
3247 struct fib6_node *fn;
3250 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3252 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3258 fn = fib6_locate(&table->tb6_root,
3259 &cfg->fc_dst, cfg->fc_dst_len,
3260 &cfg->fc_src, cfg->fc_src_len,
3261 !(cfg->fc_flags & RTF_CACHE));
3264 for_each_fib6_node_rt_rcu(fn) {
3265 if (cfg->fc_flags & RTF_CACHE) {
3268 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3271 rc = ip6_del_cached_rt(rt_cache, cfg);
3279 if (cfg->fc_ifindex &&
3280 (!rt->fib6_nh.nh_dev ||
3281 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3283 if (cfg->fc_flags & RTF_GATEWAY &&
3284 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3286 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3288 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3290 if (!fib6_info_hold_safe(rt))
3294 /* if gateway was specified only delete the one hop */
3295 if (cfg->fc_flags & RTF_GATEWAY)
3296 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3298 return __ip6_del_rt_siblings(rt, cfg);
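/* Deletion summary: RTF_CACHE requests are matched against the exception
 * table via rt6_find_cached_rt()/ip6_del_cached_rt(); otherwise the node's
 * routes are filtered by ifindex, gateway, metric and protocol.  With a
 * gateway given only that single nexthop is removed; otherwise all sibling
 * nexthops go through __ip6_del_rt_siblings().
 */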
3306 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3308 struct netevent_redirect netevent;
3309 struct rt6_info *rt, *nrt = NULL;
3310 struct ndisc_options ndopts;
3311 struct inet6_dev *in6_dev;
3312 struct neighbour *neigh;
3313 struct fib6_info *from;
3315 int optlen, on_link;
3318 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3319 optlen -= sizeof(*msg);
3322 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3326 msg = (struct rd_msg *)icmp6_hdr(skb);
3328 if (ipv6_addr_is_multicast(&msg->dest)) {
3329 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3334 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3336 } else if (ipv6_addr_type(&msg->target) !=
3337 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3338 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3342 in6_dev = __in6_dev_get(skb->dev);
3345 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3349 * The IP source address of the Redirect MUST be the same as the current
3350 * first-hop router for the specified ICMP Destination Address.
3353 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3354 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3359 if (ndopts.nd_opts_tgt_lladdr) {
3360 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3363 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3368 rt = (struct rt6_info *) dst;
3369 if (rt->rt6i_flags & RTF_REJECT) {
3370 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3374 /* Redirect received -> path was valid.
3375 * Look, redirects are sent only in response to data packets,
3376 * so this nexthop is apparently reachable. --ANK
3378 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3380 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3385 * We have finally decided to accept it.
3388 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3389 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3390 NEIGH_UPDATE_F_OVERRIDE|
3391 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3392 NEIGH_UPDATE_F_ISROUTER)),
3393 NDISC_REDIRECT, &ndopts);
3396 from = rcu_dereference(rt->from);
3397 /* This fib6_info_hold() is safe here because we hold reference to rt
3398 * and rt already holds reference to fib6_info.
3400 fib6_info_hold(from);
3403 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3407 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3409 nrt->rt6i_flags &= ~RTF_GATEWAY;
3411 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3413 /* No need to remove rt from the exception table if rt is
3414 * a cached route because rt6_insert_exception() will take care of it.
3417 if (rt6_insert_exception(nrt, from)) {
3418 dst_release_immediate(&nrt->dst);
3422 netevent.old = &rt->dst;
3423 netevent.new = &nrt->dst;
3424 netevent.daddr = &msg->dest;
3425 netevent.neigh = neigh;
3426 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3429 fib6_info_release(from);
3430 neigh_release(neigh);
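/* At this point the redirect has been accepted: the neighbour entry was
 * updated via ndisc_update(), a RTF_CACHE clone pointing at the new gateway
 * was created with ip6_rt_cache_alloc() and inserted into the exception
 * table, and NETEVENT_REDIRECT notified any netevent listeners.
 */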
3433 #ifdef CONFIG_IPV6_ROUTE_INFO
3434 static struct fib6_info *rt6_get_route_info(struct net *net,
3435 const struct in6_addr *prefix, int prefixlen,
3436 const struct in6_addr *gwaddr,
3437 struct net_device *dev)
3439 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3440 int ifindex = dev->ifindex;
3441 struct fib6_node *fn;
3442 struct fib6_info *rt = NULL;
3443 struct fib6_table *table;
3445 table = fib6_get_table(net, tb_id);
3450 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3454 for_each_fib6_node_rt_rcu(fn) {
3455 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3457 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3459 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3461 if (!fib6_info_hold_safe(rt))
3470 static struct fib6_info *rt6_add_route_info(struct net *net,
3471 const struct in6_addr *prefix, int prefixlen,
3472 const struct in6_addr *gwaddr,
3473 struct net_device *dev,
3476 struct fib6_config cfg = {
3477 .fc_metric = IP6_RT_PRIO_USER,
3478 .fc_ifindex = dev->ifindex,
3479 .fc_dst_len = prefixlen,
3480 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3481 RTF_UP | RTF_PREF(pref),
3482 .fc_protocol = RTPROT_RA,
3483 .fc_type = RTN_UNICAST,
3484 .fc_nlinfo.portid = 0,
3485 .fc_nlinfo.nlh = NULL,
3486 .fc_nlinfo.nl_net = net,
3489 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3490 cfg.fc_dst = *prefix;
3491 cfg.fc_gateway = *gwaddr;
3493 /* We should treat it as a default route if prefix length is 0. */
3495 cfg.fc_flags |= RTF_DEFAULT;
3497 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3499 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3503 struct fib6_info *rt6_get_dflt_router(struct net *net,
3504 const struct in6_addr *addr,
3505 struct net_device *dev)
3507 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3508 struct fib6_info *rt;
3509 struct fib6_table *table;
3511 table = fib6_get_table(net, tb_id);
3516 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3517 if (dev == rt->fib6_nh.nh_dev &&
3518 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3519 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3522 if (rt && !fib6_info_hold_safe(rt))
3528 struct fib6_info *rt6_add_dflt_router(struct net *net,
3529 const struct in6_addr *gwaddr,
3530 struct net_device *dev,
3533 struct fib6_config cfg = {
3534 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3535 .fc_metric = IP6_RT_PRIO_USER,
3536 .fc_ifindex = dev->ifindex,
3537 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3538 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3539 .fc_protocol = RTPROT_RA,
3540 .fc_type = RTN_UNICAST,
3541 .fc_nlinfo.portid = 0,
3542 .fc_nlinfo.nlh = NULL,
3543 .fc_nlinfo.nl_net = net,
3546 cfg.fc_gateway = *gwaddr;
3548 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3549 struct fib6_table *table;
3551 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3553 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3556 return rt6_get_dflt_router(net, gwaddr, dev);
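/* Default routers learned from Router Advertisements are installed with
 * RTF_ADDRCONF | RTF_DEFAULT | RTF_EXPIRES; the owning table is flagged
 * RT6_TABLE_HAS_DFLT_ROUTER so rt6_purge_dflt_routers() only has to scan
 * tables that actually contain such routes.
 */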
3559 static void __rt6_purge_dflt_routers(struct net *net,
3560 struct fib6_table *table)
3562 struct fib6_info *rt;
3566 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3567 struct net_device *dev = fib6_info_nh_dev(rt);
3568 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3570 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3571 (!idev || idev->cnf.accept_ra != 2) &&
3572 fib6_info_hold_safe(rt)) {
3574 ip6_del_rt(net, rt);
3580 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3583 void rt6_purge_dflt_routers(struct net *net)
3585 struct fib6_table *table;
3586 struct hlist_head *head;
3591 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3592 head = &net->ipv6.fib_table_hash[h];
3593 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3594 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3595 __rt6_purge_dflt_routers(net, table);
3602 static void rtmsg_to_fib6_config(struct net *net,
3603 struct in6_rtmsg *rtmsg,
3604 struct fib6_config *cfg)
3606 memset(cfg, 0, sizeof(*cfg));
3608 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3610 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3611 cfg->fc_metric = rtmsg->rtmsg_metric;
3612 cfg->fc_expires = rtmsg->rtmsg_info;
3613 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3614 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3615 cfg->fc_flags = rtmsg->rtmsg_flags;
3616 cfg->fc_type = rtmsg->rtmsg_type;
3618 cfg->fc_nlinfo.nl_net = net;
3620 cfg->fc_dst = rtmsg->rtmsg_dst;
3621 cfg->fc_src = rtmsg->rtmsg_src;
3622 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3625 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3627 struct fib6_config cfg;
3628 struct in6_rtmsg rtmsg;
3632 case SIOCADDRT: /* Add a route */
3633 case SIOCDELRT: /* Delete a route */
3634 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3636 err = copy_from_user(&rtmsg, arg,
3637 sizeof(struct in6_rtmsg));
3641 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3646 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3649 err = ip6_route_del(&cfg, NULL);
3663 * Drop the packet on the floor
3666 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3669 struct dst_entry *dst = skb_dst(skb);
3670 switch (ipstats_mib_noroutes) {
3671 case IPSTATS_MIB_INNOROUTES:
3672 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3673 if (type == IPV6_ADDR_ANY) {
3674 IP6_INC_STATS(dev_net(dst->dev),
3675 __in6_dev_get_safely(skb->dev),
3676 IPSTATS_MIB_INADDRERRORS);
3680 case IPSTATS_MIB_OUTNOROUTES:
3681 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3682 ipstats_mib_noroutes);
3685 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3690 static int ip6_pkt_discard(struct sk_buff *skb)
3692 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3695 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3697 skb->dev = skb_dst(skb)->dev;
3698 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3701 static int ip6_pkt_prohibit(struct sk_buff *skb)
3703 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3706 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3708 skb->dev = skb_dst(skb)->dev;
3709 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
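/* These four handlers are wired to the null/prohibit dst entries: they bump
 * the relevant no-route MIB counter and answer with an ICMPv6 destination
 * unreachable of the matching code (NOROUTE or ADM_PROHIBITED).
 */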
3713 * Allocate a dst for local (unicast / anycast) address.
3716 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3717 struct inet6_dev *idev,
3718 const struct in6_addr *addr,
3719 bool anycast, gfp_t gfp_flags)
3722 struct net_device *dev = idev->dev;
3723 struct fib6_info *f6i;
3725 f6i = fib6_info_alloc(gfp_flags);
3727 return ERR_PTR(-ENOMEM);
3729 f6i->dst_nocount = true;
3730 f6i->dst_host = true;
3731 f6i->fib6_protocol = RTPROT_KERNEL;
3732 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3734 f6i->fib6_type = RTN_ANYCAST;
3735 f6i->fib6_flags |= RTF_ANYCAST;
3737 f6i->fib6_type = RTN_LOCAL;
3738 f6i->fib6_flags |= RTF_LOCAL;
3741 f6i->fib6_nh.nh_gw = *addr;
3743 f6i->fib6_nh.nh_dev = dev;
3744 f6i->fib6_dst.addr = *addr;
3745 f6i->fib6_dst.plen = 128;
3746 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3747 f6i->fib6_table = fib6_get_table(net, tb_id);
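/* addrconf_f6i_alloc() above creates the /128 host route for a local or
 * anycast address (RTN_LOCAL/RTF_LOCAL vs RTN_ANYCAST/RTF_ANYCAST), placed
 * in the local table, or the l3mdev table if the device is enslaved.
 */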
3752 /* remove deleted ip from prefsrc entries */
3753 struct arg_dev_net_ip {
3754 struct net_device *dev;
3756 struct in6_addr *addr;
3759 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3761 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3762 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3763 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3765 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3766 rt != net->ipv6.fib6_null_entry &&
3767 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3768 spin_lock_bh(&rt6_exception_lock);
3769 /* remove prefsrc entry */
3770 rt->fib6_prefsrc.plen = 0;
3771 spin_unlock_bh(&rt6_exception_lock);
3776 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3778 struct net *net = dev_net(ifp->idev->dev);
3779 struct arg_dev_net_ip adni = {
3780 .dev = ifp->idev->dev,
3784 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3787 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3789 /* Remove routers and update dst entries when a gateway turns into a host. */
3790 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3792 struct in6_addr *gateway = (struct in6_addr *)arg;
3794 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3795 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3799 /* Further clean up cached routes in the exception table.
3800 * This is needed because a cached route may have a different
3801 * gateway than its 'parent' in the case of an ip redirect.
3803 rt6_exceptions_clean_tohost(rt, gateway);
3808 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3810 fib6_clean_all(net, fib6_clean_tohost, gateway);
3813 struct arg_netdev_event {
3814 const struct net_device *dev;
3816 unsigned int nh_flags;
3817 unsigned long event;
3821 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3823 struct fib6_info *iter;
3824 struct fib6_node *fn;
3826 fn = rcu_dereference_protected(rt->fib6_node,
3827 lockdep_is_held(&rt->fib6_table->tb6_lock));
3828 iter = rcu_dereference_protected(fn->leaf,
3829 lockdep_is_held(&rt->fib6_table->tb6_lock));
3831 if (iter->fib6_metric == rt->fib6_metric &&
3832 rt6_qualify_for_ecmp(iter))
3834 iter = rcu_dereference_protected(iter->fib6_next,
3835 lockdep_is_held(&rt->fib6_table->tb6_lock));
3841 static bool rt6_is_dead(const struct fib6_info *rt)
3843 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3844 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3845 fib6_ignore_linkdown(rt)))
3851 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3853 struct fib6_info *iter;
3856 if (!rt6_is_dead(rt))
3857 total += rt->fib6_nh.nh_weight;
3859 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3860 if (!rt6_is_dead(iter))
3861 total += iter->fib6_nh.nh_weight;
3867 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3869 int upper_bound = -1;
3871 if (!rt6_is_dead(rt)) {
3872 *weight += rt->fib6_nh.nh_weight;
3873 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3876 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
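/* In effect each live nexthop owns a share of the 31-bit flow-hash space
 * proportional to its weight (e.g. weights 1 and 3 split it roughly
 * 25% / 75%; the split is illustrative, not taken from the source), while
 * dead nexthops keep an upper bound of -1 and are never selected.
 */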
3879 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3881 struct fib6_info *iter;
3884 rt6_upper_bound_set(rt, &weight, total);
3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3887 rt6_upper_bound_set(iter, &weight, total);
3890 void rt6_multipath_rebalance(struct fib6_info *rt)
3892 struct fib6_info *first;
3895 /* If the entire multipath route was marked for flushing,
3896 * there is no need to rebalance upon the removal of every sibling route.
3899 if (!rt->fib6_nsiblings || rt->should_flush)
3902 /* During lookup routes are evaluated in order, so we need to
3903 * make sure upper bounds are assigned from the first sibling onwards.
3906 first = rt6_multipath_first_sibling(rt);
3907 if (WARN_ON_ONCE(!first))
3910 total = rt6_multipath_total_weight(first);
3911 rt6_multipath_upper_bound_set(first, total);
3914 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3916 const struct arg_netdev_event *arg = p_arg;
3917 struct net *net = dev_net(arg->dev);
3919 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3920 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3921 fib6_update_sernum_upto_root(net, rt);
3922 rt6_multipath_rebalance(rt);
3928 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3930 struct arg_netdev_event arg = {
3933 .nh_flags = nh_flags,
3937 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3938 arg.nh_flags |= RTNH_F_LINKDOWN;
3940 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3943 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3944 const struct net_device *dev)
3946 struct fib6_info *iter;
3948 if (rt->fib6_nh.nh_dev == dev)
3950 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3951 if (iter->fib6_nh.nh_dev == dev)
3957 static void rt6_multipath_flush(struct fib6_info *rt)
3959 struct fib6_info *iter;
3961 rt->should_flush = 1;
3962 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3963 iter->should_flush = 1;
3966 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3967 const struct net_device *down_dev)
3969 struct fib6_info *iter;
3970 unsigned int dead = 0;
3972 if (rt->fib6_nh.nh_dev == down_dev ||
3973 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976 if (iter->fib6_nh.nh_dev == down_dev ||
3977 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3983 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3984 const struct net_device *dev,
3985 unsigned int nh_flags)
3987 struct fib6_info *iter;
3989 if (rt->fib6_nh.nh_dev == dev)
3990 rt->fib6_nh.nh_flags |= nh_flags;
3991 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3992 if (iter->fib6_nh.nh_dev == dev)
3993 iter->fib6_nh.nh_flags |= nh_flags;
3996 /* called with write lock held for table with rt */
3997 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3999 const struct arg_netdev_event *arg = p_arg;
4000 const struct net_device *dev = arg->dev;
4001 struct net *net = dev_net(dev);
4003 if (rt == net->ipv6.fib6_null_entry)
4006 switch (arg->event) {
4007 case NETDEV_UNREGISTER:
4008 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4010 if (rt->should_flush)
4012 if (!rt->fib6_nsiblings)
4013 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4014 if (rt6_multipath_uses_dev(rt, dev)) {
4017 count = rt6_multipath_dead_count(rt, dev);
4018 if (rt->fib6_nsiblings + 1 == count) {
4019 rt6_multipath_flush(rt);
4022 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4024 fib6_update_sernum(net, rt);
4025 rt6_multipath_rebalance(rt);
4029 if (rt->fib6_nh.nh_dev != dev ||
4030 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4032 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4033 rt6_multipath_rebalance(rt);
4040 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4042 struct arg_netdev_event arg = {
4049 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4052 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4054 rt6_sync_down_dev(dev, event);
4055 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4056 neigh_ifdown(&nd_tbl, dev);
4059 struct rt6_mtu_change_arg {
4060 struct net_device *dev;
4064 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4066 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4067 struct inet6_dev *idev;
4069 /* In IPv6, PMTU discovery is not optional,
4070 so the RTAX_MTU lock cannot disable it.
4071 We still use this lock to block changes
4072 caused by addrconf/ndisc.
4075 idev = __in6_dev_get(arg->dev);
4079 /* For an administrative MTU increase, there is no way to discover
4080 the IPv6 PMTU increase, so the PMTU should be updated here.
4081 Since RFC 1981 doesn't cover administrative MTU increases,
4082 updating the PMTU on increase is a MUST (e.g. jumbo frames).
4084 if (rt->fib6_nh.nh_dev == arg->dev &&
4085 !fib6_metric_locked(rt, RTAX_MTU)) {
4086 u32 mtu = rt->fib6_pmtu;
4088 if (mtu >= arg->mtu ||
4089 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4090 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4092 spin_lock_bh(&rt6_exception_lock);
4093 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4094 spin_unlock_bh(&rt6_exception_lock);
4099 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4101 struct rt6_mtu_change_arg arg = {
4106 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4109 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4110 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4111 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4112 [RTA_OIF] = { .type = NLA_U32 },
4113 [RTA_IIF] = { .type = NLA_U32 },
4114 [RTA_PRIORITY] = { .type = NLA_U32 },
4115 [RTA_METRICS] = { .type = NLA_NESTED },
4116 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4117 [RTA_PREF] = { .type = NLA_U8 },
4118 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4119 [RTA_ENCAP] = { .type = NLA_NESTED },
4120 [RTA_EXPIRES] = { .type = NLA_U32 },
4121 [RTA_UID] = { .type = NLA_U32 },
4122 [RTA_MARK] = { .type = NLA_U32 },
4123 [RTA_TABLE] = { .type = NLA_U32 },
4124 [RTA_IP_PROTO] = { .type = NLA_U8 },
4125 [RTA_SPORT] = { .type = NLA_U16 },
4126 [RTA_DPORT] = { .type = NLA_U16 },
4129 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4130 struct fib6_config *cfg,
4131 struct netlink_ext_ack *extack)
4134 struct nlattr *tb[RTA_MAX+1];
4138 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4144 rtm = nlmsg_data(nlh);
4145 memset(cfg, 0, sizeof(*cfg));
4147 cfg->fc_table = rtm->rtm_table;
4148 cfg->fc_dst_len = rtm->rtm_dst_len;
4149 cfg->fc_src_len = rtm->rtm_src_len;
4150 cfg->fc_flags = RTF_UP;
4151 cfg->fc_protocol = rtm->rtm_protocol;
4152 cfg->fc_type = rtm->rtm_type;
4154 if (rtm->rtm_type == RTN_UNREACHABLE ||
4155 rtm->rtm_type == RTN_BLACKHOLE ||
4156 rtm->rtm_type == RTN_PROHIBIT ||
4157 rtm->rtm_type == RTN_THROW)
4158 cfg->fc_flags |= RTF_REJECT;
4160 if (rtm->rtm_type == RTN_LOCAL)
4161 cfg->fc_flags |= RTF_LOCAL;
4163 if (rtm->rtm_flags & RTM_F_CLONED)
4164 cfg->fc_flags |= RTF_CACHE;
4166 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4168 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4169 cfg->fc_nlinfo.nlh = nlh;
4170 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4172 if (tb[RTA_GATEWAY]) {
4173 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4174 cfg->fc_flags |= RTF_GATEWAY;
4178 int plen = (rtm->rtm_dst_len + 7) >> 3;
4180 if (nla_len(tb[RTA_DST]) < plen)
4183 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4187 int plen = (rtm->rtm_src_len + 7) >> 3;
4189 if (nla_len(tb[RTA_SRC]) < plen)
4192 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4195 if (tb[RTA_PREFSRC])
4196 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4199 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4201 if (tb[RTA_PRIORITY])
4202 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4204 if (tb[RTA_METRICS]) {
4205 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4206 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4210 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4212 if (tb[RTA_MULTIPATH]) {
4213 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4214 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4216 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4217 cfg->fc_mp_len, extack);
4223 pref = nla_get_u8(tb[RTA_PREF]);
4224 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4225 pref != ICMPV6_ROUTER_PREF_HIGH)
4226 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4227 cfg->fc_flags |= RTF_PREF(pref);
4231 cfg->fc_encap = tb[RTA_ENCAP];
4233 if (tb[RTA_ENCAP_TYPE]) {
4234 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4236 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4241 if (tb[RTA_EXPIRES]) {
4242 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4244 if (addrconf_finite_timeout(timeout)) {
4245 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4246 cfg->fc_flags |= RTF_EXPIRES;
4256 struct fib6_info *fib6_info;
4257 struct fib6_config r_cfg;
4258 struct list_head next;
4261 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4265 list_for_each_entry(nh, rt6_nh_list, next) {
4266 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4267 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4268 nh->r_cfg.fc_ifindex);
4272 static int ip6_route_info_append(struct net *net,
4273 struct list_head *rt6_nh_list,
4274 struct fib6_info *rt,
4275 struct fib6_config *r_cfg)
4280 list_for_each_entry(nh, rt6_nh_list, next) {
4281 /* check if fib6_info already exists */
4282 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4286 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4290 err = ip6_convert_metrics(net, rt, r_cfg);
4295 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4296 list_add_tail(&nh->next, rt6_nh_list);
4301 static void ip6_route_mpath_notify(struct fib6_info *rt,
4302 struct fib6_info *rt_last,
4303 struct nl_info *info,
4306 /* if this is an APPEND route, then rt points to the first route
4307 * inserted and rt_last points to the last route inserted. Userspace
4308 * wants a consistent dump of the route which starts at the first
4309 * nexthop. Since sibling routes are always added at the end of
4310 * the list, find the first sibling of the last route appended
4312 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4313 rt = list_first_entry(&rt_last->fib6_siblings,
4319 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4322 static int ip6_route_multipath_add(struct fib6_config *cfg,
4323 struct netlink_ext_ack *extack)
4325 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4326 struct nl_info *info = &cfg->fc_nlinfo;
4327 struct fib6_config r_cfg;
4328 struct rtnexthop *rtnh;
4329 struct fib6_info *rt;
4330 struct rt6_nh *err_nh;
4331 struct rt6_nh *nh, *nh_safe;
4337 int replace = (cfg->fc_nlinfo.nlh &&
4338 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4339 LIST_HEAD(rt6_nh_list);
4341 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4342 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4343 nlflags |= NLM_F_APPEND;
4345 remaining = cfg->fc_mp_len;
4346 rtnh = (struct rtnexthop *)cfg->fc_mp;
4348 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4349 * fib6_info structs per nexthop
4351 while (rtnh_ok(rtnh, remaining)) {
4352 memcpy(&r_cfg, cfg, sizeof(*cfg));
4353 if (rtnh->rtnh_ifindex)
4354 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4356 attrlen = rtnh_attrlen(rtnh);
4358 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4360 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4362 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4363 r_cfg.fc_flags |= RTF_GATEWAY;
4365 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4366 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4368 r_cfg.fc_encap_type = nla_get_u16(nla);
4371 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4372 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4378 if (!rt6_qualify_for_ecmp(rt)) {
4380 NL_SET_ERR_MSG(extack,
4381 "Device only routes can not be added for IPv6 using the multipath API.");
4382 fib6_info_release(rt);
4386 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4388 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4391 fib6_info_release(rt);
4395 rtnh = rtnh_next(rtnh, &remaining);
4398 /* for add and replace send one notification with all nexthops.
4399 * Skip the notification in fib6_add_rt2node and send one with
4400 * the full route when done
4402 info->skip_notify = 1;
4405 list_for_each_entry(nh, &rt6_nh_list, next) {
4406 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4407 fib6_info_release(nh->fib6_info);
4410 /* save reference to last route successfully inserted */
4411 rt_last = nh->fib6_info;
4413 /* save reference to first route for notification */
4415 rt_notif = nh->fib6_info;
4418 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4419 nh->fib6_info = NULL;
4422 ip6_print_replace_route_err(&rt6_nh_list);
4427 /* Because each route is added like a single route we remove
4428 * these flags after the first nexthop: if there is a collision,
4429 * we have already failed to add the first nexthop:
4430 * fib6_add_rt2node() has rejected it; when replacing, old
4431 * nexthops have been replaced by the first new one, and the rest should be added to it.
4434 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4439 /* success ... tell user about new route */
4440 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4444 /* send notification for routes that were added so that
4445 * the delete notifications sent by ip6_route_del are
4449 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4451 /* Delete routes that were already added */
4452 list_for_each_entry(nh, &rt6_nh_list, next) {
4455 ip6_route_del(&nh->r_cfg, extack);
4459 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4461 fib6_info_release(nh->fib6_info);
4462 list_del(&nh->next);
4469 static int ip6_route_multipath_del(struct fib6_config *cfg,
4470 struct netlink_ext_ack *extack)
4472 struct fib6_config r_cfg;
4473 struct rtnexthop *rtnh;
4476 int err = 1, last_err = 0;
4478 remaining = cfg->fc_mp_len;
4479 rtnh = (struct rtnexthop *)cfg->fc_mp;
4481 /* Parse a Multipath Entry */
4482 while (rtnh_ok(rtnh, remaining)) {
4483 memcpy(&r_cfg, cfg, sizeof(*cfg));
4484 if (rtnh->rtnh_ifindex)
4485 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4487 attrlen = rtnh_attrlen(rtnh);
4489 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4491 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4493 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4494 r_cfg.fc_flags |= RTF_GATEWAY;
4497 err = ip6_route_del(&r_cfg, extack);
4501 rtnh = rtnh_next(rtnh, &remaining);
4507 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4508 struct netlink_ext_ack *extack)
4510 struct fib6_config cfg;
4513 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4518 return ip6_route_multipath_del(&cfg, extack);
4520 cfg.fc_delete_all_nh = 1;
4521 return ip6_route_del(&cfg, extack);
4525 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4526 struct netlink_ext_ack *extack)
4528 struct fib6_config cfg;
4531 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4536 return ip6_route_multipath_add(&cfg, extack);
4538 return ip6_route_add(&cfg, GFP_KERNEL, extack);
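/* rt6_nlmsg_size() below pre-computes a worst-case skb size for the route
 * notification: one fixed part per route plus one RTA_MULTIPATH nexthop
 * chunk per sibling.  inet6_rt_notify() relies on this being large enough
 * and treats -EMSGSIZE from rt6_fill_node() as a bug.
 */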
4541 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4543 int nexthop_len = 0;
4545 if (rt->fib6_nsiblings) {
4546 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4547 + NLA_ALIGN(sizeof(struct rtnexthop))
4548 + nla_total_size(16) /* RTA_GATEWAY */
4549 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4551 nexthop_len *= rt->fib6_nsiblings;
4554 return NLMSG_ALIGN(sizeof(struct rtmsg))
4555 + nla_total_size(16) /* RTA_SRC */
4556 + nla_total_size(16) /* RTA_DST */
4557 + nla_total_size(16) /* RTA_GATEWAY */
4558 + nla_total_size(16) /* RTA_PREFSRC */
4559 + nla_total_size(4) /* RTA_TABLE */
4560 + nla_total_size(4) /* RTA_IIF */
4561 + nla_total_size(4) /* RTA_OIF */
4562 + nla_total_size(4) /* RTA_PRIORITY */
4563 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4564 + nla_total_size(sizeof(struct rta_cacheinfo))
4565 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4566 + nla_total_size(1) /* RTA_PREF */
4567 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4571 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4572 unsigned int *flags, bool skip_oif)
4574 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4575 *flags |= RTNH_F_DEAD;
4577 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4578 *flags |= RTNH_F_LINKDOWN;
4581 if (fib6_ignore_linkdown(rt))
4582 *flags |= RTNH_F_DEAD;
4586 if (rt->fib6_flags & RTF_GATEWAY) {
4587 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4588 goto nla_put_failure;
4591 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4592 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4593 *flags |= RTNH_F_OFFLOAD;
4595 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4596 if (!skip_oif && rt->fib6_nh.nh_dev &&
4597 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4598 goto nla_put_failure;
4600 if (rt->fib6_nh.nh_lwtstate &&
4601 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4602 goto nla_put_failure;
4610 /* add multipath next hop */
4611 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4613 const struct net_device *dev = rt->fib6_nh.nh_dev;
4614 struct rtnexthop *rtnh;
4615 unsigned int flags = 0;
4617 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4619 goto nla_put_failure;
4621 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4622 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4624 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4625 goto nla_put_failure;
4627 rtnh->rtnh_flags = flags;
4629 /* length of rtnetlink header + attributes */
4630 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
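/* Each sibling of a multipath route is encoded as a struct rtnexthop
 * (rtnh_hops = weight - 1, rtnh_ifindex) followed by per-nexthop attributes
 * such as RTA_GATEWAY; rt6_fill_node() nests these under RTA_MULTIPATH.
 */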
4638 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4639 struct fib6_info *rt, struct dst_entry *dst,
4640 struct in6_addr *dest, struct in6_addr *src,
4641 int iif, int type, u32 portid, u32 seq,
4645 struct nlmsghdr *nlh;
4650 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4654 rtm = nlmsg_data(nlh);
4655 rtm->rtm_family = AF_INET6;
4656 rtm->rtm_dst_len = rt->fib6_dst.plen;
4657 rtm->rtm_src_len = rt->fib6_src.plen;
4660 table = rt->fib6_table->tb6_id;
4662 table = RT6_TABLE_UNSPEC;
4663 rtm->rtm_table = table;
4664 if (nla_put_u32(skb, RTA_TABLE, table))
4665 goto nla_put_failure;
4667 rtm->rtm_type = rt->fib6_type;
4669 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4670 rtm->rtm_protocol = rt->fib6_protocol;
4672 if (rt->fib6_flags & RTF_CACHE)
4673 rtm->rtm_flags |= RTM_F_CLONED;
4676 if (nla_put_in6_addr(skb, RTA_DST, dest))
4677 goto nla_put_failure;
4678 rtm->rtm_dst_len = 128;
4679 } else if (rtm->rtm_dst_len)
4680 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4681 goto nla_put_failure;
4682 #ifdef CONFIG_IPV6_SUBTREES
4684 if (nla_put_in6_addr(skb, RTA_SRC, src))
4685 goto nla_put_failure;
4686 rtm->rtm_src_len = 128;
4687 } else if (rtm->rtm_src_len &&
4688 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4689 goto nla_put_failure;
4692 #ifdef CONFIG_IPV6_MROUTE
4693 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4694 int err = ip6mr_get_route(net, skb, rtm, portid);
4699 goto nla_put_failure;
4702 if (nla_put_u32(skb, RTA_IIF, iif))
4703 goto nla_put_failure;
4705 struct in6_addr saddr_buf;
4706 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4707 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4708 goto nla_put_failure;
4711 if (rt->fib6_prefsrc.plen) {
4712 struct in6_addr saddr_buf;
4713 saddr_buf = rt->fib6_prefsrc.addr;
4714 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4715 goto nla_put_failure;
4718 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4719 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4720 goto nla_put_failure;
4722 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4723 goto nla_put_failure;
4725 /* For multipath routes, walk the siblings list and add
4726 * each as a nexthop within RTA_MULTIPATH.
4728 if (rt->fib6_nsiblings) {
4729 struct fib6_info *sibling, *next_sibling;
4732 mp = nla_nest_start(skb, RTA_MULTIPATH);
4734 goto nla_put_failure;
4736 if (rt6_add_nexthop(skb, rt) < 0)
4737 goto nla_put_failure;
4739 list_for_each_entry_safe(sibling, next_sibling,
4740 &rt->fib6_siblings, fib6_siblings) {
4741 if (rt6_add_nexthop(skb, sibling) < 0)
4742 goto nla_put_failure;
4745 nla_nest_end(skb, mp);
4747 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4748 goto nla_put_failure;
4751 if (rt->fib6_flags & RTF_EXPIRES) {
4752 expires = dst ? dst->expires : rt->expires;
4756 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4757 goto nla_put_failure;
4759 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4760 goto nla_put_failure;
4763 nlmsg_end(skb, nlh);
4767 nlmsg_cancel(skb, nlh);
4771 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4773 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4774 struct net *net = arg->net;
4776 if (rt == net->ipv6.fib6_null_entry)
4779 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4780 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4782 /* user wants prefix routes only */
4783 if (rtm->rtm_flags & RTM_F_PREFIX &&
4784 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4785 /* success since this is not a prefix route */
4790 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4791 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4792 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4795 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4796 struct netlink_ext_ack *extack)
4798 struct net *net = sock_net(in_skb->sk);
4799 struct nlattr *tb[RTA_MAX+1];
4800 int err, iif = 0, oif = 0;
4801 struct fib6_info *from;
4802 struct dst_entry *dst;
4803 struct rt6_info *rt;
4804 struct sk_buff *skb;
4809 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4815 memset(&fl6, 0, sizeof(fl6));
4816 rtm = nlmsg_data(nlh);
4817 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4818 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4821 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4824 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4828 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4831 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4835 iif = nla_get_u32(tb[RTA_IIF]);
4838 oif = nla_get_u32(tb[RTA_OIF]);
4841 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4844 fl6.flowi6_uid = make_kuid(current_user_ns(),
4845 nla_get_u32(tb[RTA_UID]));
4847 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4850 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4853 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4855 if (tb[RTA_IP_PROTO]) {
4856 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4857 &fl6.flowi6_proto, extack);
4863 struct net_device *dev;
4868 dev = dev_get_by_index_rcu(net, iif);
4875 fl6.flowi6_iif = iif;
4877 if (!ipv6_addr_any(&fl6.saddr))
4878 flags |= RT6_LOOKUP_F_HAS_SADDR;
4880 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4884 fl6.flowi6_oif = oif;
4886 dst = ip6_route_output(net, NULL, &fl6);
4890 rt = container_of(dst, struct rt6_info, dst);
4891 if (rt->dst.error) {
4892 err = rt->dst.error;
4897 if (rt == net->ipv6.ip6_null_entry) {
4898 err = rt->dst.error;
4903 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4910 skb_dst_set(skb, &rt->dst);
4913 from = rcu_dereference(rt->from);
4916 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4917 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4920 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4921 &fl6.saddr, iif, RTM_NEWROUTE,
4922 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4931 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4936 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4937 unsigned int nlm_flags)
4939 struct sk_buff *skb;
4940 struct net *net = info->nl_net;
4945 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4947 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4951 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4952 event, info->portid, seq, nlm_flags);
4954 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4955 WARN_ON(err == -EMSGSIZE);
4959 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4960 info->nlh, gfp_any());
4964 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4967 static int ip6_route_dev_notify(struct notifier_block *this,
4968 unsigned long event, void *ptr)
4970 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4971 struct net *net = dev_net(dev);
4973 if (!(dev->flags & IFF_LOOPBACK))
4976 if (event == NETDEV_REGISTER) {
4977 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4978 net->ipv6.ip6_null_entry->dst.dev = dev;
4979 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4981 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4982 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4983 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4984 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4986 } else if (event == NETDEV_UNREGISTER &&
4987 dev->reg_state != NETREG_UNREGISTERED) {
4988 /* NETDEV_UNREGISTER could be fired multiple times by
4989 * netdev_wait_allrefs(). Make sure we only call this once.
4991 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4992 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4993 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4994 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5005 #ifdef CONFIG_PROC_FS
5006 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5008 struct net *net = (struct net *)seq->private;
5009 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5010 net->ipv6.rt6_stats->fib_nodes,
5011 net->ipv6.rt6_stats->fib_route_nodes,
5012 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5013 net->ipv6.rt6_stats->fib_rt_entries,
5014 net->ipv6.rt6_stats->fib_rt_cache,
5015 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5016 net->ipv6.rt6_stats->fib_discarded_routes);
5020 #endif /* CONFIG_PROC_FS */
5022 #ifdef CONFIG_SYSCTL
5025 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5026 void __user *buffer, size_t *lenp, loff_t *ppos)
5033 net = (struct net *)ctl->extra1;
5034 delay = net->ipv6.sysctl.flush_delay;
5035 proc_dointvec(ctl, write, buffer, lenp, ppos);
5036 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5040 struct ctl_table ipv6_route_table_template[] = {
5042 .procname = "flush",
5043 .data = &init_net.ipv6.sysctl.flush_delay,
5044 .maxlen = sizeof(int),
5046 .proc_handler = ipv6_sysctl_rtcache_flush
5049 .procname = "gc_thresh",
5050 .data = &ip6_dst_ops_template.gc_thresh,
5051 .maxlen = sizeof(int),
5053 .proc_handler = proc_dointvec,
5056 .procname = "max_size",
5057 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
5058 .maxlen = sizeof(int),
5060 .proc_handler = proc_dointvec,
5063 .procname = "gc_min_interval",
5064 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5065 .maxlen = sizeof(int),
5067 .proc_handler = proc_dointvec_jiffies,
5070 .procname = "gc_timeout",
5071 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5072 .maxlen = sizeof(int),
5074 .proc_handler = proc_dointvec_jiffies,
5077 .procname = "gc_interval",
5078 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5079 .maxlen = sizeof(int),
5081 .proc_handler = proc_dointvec_jiffies,
5084 .procname = "gc_elasticity",
5085 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5086 .maxlen = sizeof(int),
5088 .proc_handler = proc_dointvec,
5091 .procname = "mtu_expires",
5092 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5093 .maxlen = sizeof(int),
5095 .proc_handler = proc_dointvec_jiffies,
5098 .procname = "min_adv_mss",
5099 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5100 .maxlen = sizeof(int),
5102 .proc_handler = proc_dointvec,
5105 .procname = "gc_min_interval_ms",
5106 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5107 .maxlen = sizeof(int),
5109 .proc_handler = proc_dointvec_ms_jiffies,
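/*
 * Illustrative sketch (not part of route.c): each template slot above is a
 * complete struct ctl_table initializer; the per-entry braces and .mode
 * fields are simply not shown in this excerpt.  Fully written out, an entry
 * looks roughly like the following (the 0644 mode is an assumption for
 * illustration only):
 */
#if 0
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
#endif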
5114 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5116 struct ctl_table *table;
5118 table = kmemdup(ipv6_route_table_template,
5119 sizeof(ipv6_route_table_template),
5123 table[0].data = &net->ipv6.sysctl.flush_delay;
5124 table[0].extra1 = net;
5125 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5126 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5127 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5128 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5129 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5130 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5131 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5132 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5133 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5135 /* Don't export sysctls to unprivileged users */
5136 if (net->user_ns != &init_user_ns)
5137 table[0].procname = NULL;
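/*
 * A NULL procname acts as the array terminator when the table is
 * registered, so clearing the first entry's procname effectively hides the
 * whole table from namespaces owned by a non-initial user namespace.
 */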
5144 static int __net_init ip6_route_net_init(struct net *net)
5148 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5149 sizeof(net->ipv6.ip6_dst_ops));
5151 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5152 goto out_ip6_dst_ops;
5154 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5155 sizeof(*net->ipv6.fib6_null_entry),
5157 if (!net->ipv6.fib6_null_entry)
5158 goto out_ip6_dst_entries;
5160 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5161 sizeof(*net->ipv6.ip6_null_entry),
5163 if (!net->ipv6.ip6_null_entry)
5164 goto out_fib6_null_entry;
5165 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5166 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5167 ip6_template_metrics, true);
5169 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5170 net->ipv6.fib6_has_custom_rules = false;
5171 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5172 sizeof(*net->ipv6.ip6_prohibit_entry),
5174 if (!net->ipv6.ip6_prohibit_entry)
5175 goto out_ip6_null_entry;
5176 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5177 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5178 ip6_template_metrics, true);
5180 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5181 sizeof(*net->ipv6.ip6_blk_hole_entry),
5183 if (!net->ipv6.ip6_blk_hole_entry)
5184 goto out_ip6_prohibit_entry;
5185 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5186 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5187 ip6_template_metrics, true);
5190 net->ipv6.sysctl.flush_delay = 0;
5191 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5192 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5193 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5194 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5195 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5196 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5197 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5199 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5205 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5206 out_ip6_prohibit_entry:
5207 kfree(net->ipv6.ip6_prohibit_entry);
5209 kfree(net->ipv6.ip6_null_entry);
5211 out_fib6_null_entry:
5212 kfree(net->ipv6.fib6_null_entry);
5213 out_ip6_dst_entries:
5214 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
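/*
 * The labels above implement the usual goto-unwind error path: everything
 * allocated before the failure point is freed in reverse order before
 * ip6_route_net_init() returns an error.
 */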
5219 static void __net_exit ip6_route_net_exit(struct net *net)
5221 kfree(net->ipv6.fib6_null_entry);
5222 kfree(net->ipv6.ip6_null_entry);
5223 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5224 kfree(net->ipv6.ip6_prohibit_entry);
5225 kfree(net->ipv6.ip6_blk_hole_entry);
5227 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5230 static int __net_init ip6_route_net_init_late(struct net *net)
5232 #ifdef CONFIG_PROC_FS
5233 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5234 sizeof(struct ipv6_route_iter));
5235 proc_create_net_single("rt6_stats", 0444, net->proc_net,
5236 rt6_stats_seq_show, NULL);
5241 static void __net_exit ip6_route_net_exit_late(struct net *net)
5243 #ifdef CONFIG_PROC_FS
5244 remove_proc_entry("ipv6_route", net->proc_net);
5245 remove_proc_entry("rt6_stats", net->proc_net);
5249 static struct pernet_operations ip6_route_net_ops = {
5250 .init = ip6_route_net_init,
5251 .exit = ip6_route_net_exit,
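/*
 * Illustrative sketch (not part of route.c): a pernet_operations pair such
 * as ip6_route_net_ops above is how per-network-namespace state is set up
 * and torn down; register_pernet_subsys() runs .init for every existing and
 * future netns and .exit when a netns goes away.  The names below are
 * hypothetical and the block is compiled out.
 */
#if 0
static int __net_init example_net_init(struct net *net)
{
	/* allocate per-namespace state and hang it off 'net' here */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* release whatever example_net_init() allocated */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};
/* registered with register_pernet_subsys(&example_net_ops) */
#endif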
5254 static int __net_init ipv6_inetpeer_init(struct net *net)
5256 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5260 inet_peer_base_init(bp);
5261 net->ipv6.peers = bp;
5265 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5267 struct inet_peer_base *bp = net->ipv6.peers;
5269 net->ipv6.peers = NULL;
5270 inetpeer_invalidate_tree(bp);
5274 static struct pernet_operations ipv6_inetpeer_ops = {
5275 .init = ipv6_inetpeer_init,
5276 .exit = ipv6_inetpeer_exit,
5279 static struct pernet_operations ip6_route_net_late_ops = {
5280 .init = ip6_route_net_init_late,
5281 .exit = ip6_route_net_exit_late,
5284 static struct notifier_block ip6_route_dev_notifier = {
5285 .notifier_call = ip6_route_dev_notify,
5286 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
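/*
 * Notifier callbacks run in descending priority order, so the
 * ADDRCONF_NOTIFY_PRIORITY - 10 here is meant to let addrconf handle a
 * device event before this routing notifier sees it.
 */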
5289 void __init ip6_route_init_special_entries(void)
5291 /* The loopback device is registered before this code runs, so the
5292 * loopback reference in rt6_info is not taken automatically; take it
5293 * manually for init_net */
5294 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5295 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5296 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5297 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5298 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5299 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5300 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5301 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5305 int __init ip6_route_init(void)
5311 ip6_dst_ops_template.kmem_cachep =
5312 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5313 SLAB_HWCACHE_ALIGN, NULL);
5314 if (!ip6_dst_ops_template.kmem_cachep)
5317 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5319 goto out_kmem_cache;
5321 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5323 goto out_dst_entries;
5325 ret = register_pernet_subsys(&ip6_route_net_ops);
5327 goto out_register_inetpeer;
5329 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5333 goto out_register_subsys;
5339 ret = fib6_rules_init();
5343 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5345 goto fib6_rules_init;
5347 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5348 inet6_rtm_newroute, NULL, 0);
5350 goto out_register_late_subsys;
5352 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5353 inet6_rtm_delroute, NULL, 0);
5355 goto out_register_late_subsys;
5357 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5358 inet6_rtm_getroute, NULL,
5359 RTNL_FLAG_DOIT_UNLOCKED);
5361 goto out_register_late_subsys;
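/*
 * RTNL_FLAG_DOIT_UNLOCKED above registers inet6_rtm_getroute to run without
 * the rtnl mutex held; the handler is expected to take the locks it needs
 * (e.g. RCU read-side protection) itself.
 */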
5363 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5365 goto out_register_late_subsys;
5367 for_each_possible_cpu(cpu) {
5368 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5370 INIT_LIST_HEAD(&ul->head);
5371 spin_lock_init(&ul->lock);
5377 out_register_late_subsys:
5378 rtnl_unregister_all(PF_INET6);
5379 unregister_pernet_subsys(&ip6_route_net_late_ops);
5381 fib6_rules_cleanup();
5386 out_register_subsys:
5387 unregister_pernet_subsys(&ip6_route_net_ops);
5388 out_register_inetpeer:
5389 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5391 dst_entries_destroy(&ip6_dst_blackhole_ops);
5393 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5397 void ip6_route_cleanup(void)
5399 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5400 unregister_pernet_subsys(&ip6_route_net_late_ops);
5401 fib6_rules_cleanup();
5404 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5405 unregister_pernet_subsys(&ip6_route_net_ops);
5406 dst_entries_destroy(&ip6_dst_blackhole_ops);
5407 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
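/*
 * ip6_route_cleanup() unwinds in roughly the reverse order of
 * ip6_route_init(): the netdevice notifier, the late pernet ops, fib6 rules,
 * the inetpeer and route pernet ops, then the blackhole dst counters and the
 * rt6_info slab cache.
 */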