/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable; otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>
#include <linux/sysctl.h>

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
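/* Illustrative note (added; not in the original source): this is
 * hash-threshold multipath. Each sibling nexthop owns a slice of the
 * hash space bounded above by its nh_upper_bound. With two equal-weight
 * nexthops, for example, the first takes flows whose mp_hash falls at or
 * below roughly half the range and the second takes the remainder, so a
 * given flow consistently maps to one nexthop.
 */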
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now; however, we need to check if it
	 * is really so, aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
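/* Note (added; not in the original source): the probe interval checked
 * against neigh->updated above comes from the per-device
 * rtr_probe_interval setting, which defaults to 60 seconds - that is
 * what enforces the "no more than one per minute" rule cited in the
 * comment at the top of rt6_probe().
 */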
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * the device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the last case is netif_is_l3_master(dev) being true, in
		 * which case we want the device itself to be returned
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_BLACKHOLE]	  = -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	  = -EACCES,
	[RTN_THROW]	  = -EAGAIN,
	[RTN_XRESOLVE]	  = -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
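/* Example (added; not in the original source): a route installed as
 * "ip -6 route add prohibit <prefix>" carries fib6_type == RTN_PROHIBIT,
 * so ip6_rt_type_to_error() maps it to -EACCES; RTN_BLACKHOLE and
 * RTN_THROW map to -EINVAL and -EAGAIN respectively, per fib6_prop above.
 */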
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
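/* Illustrative note (added; not in the original source): the jhash of the
 * destination (and, under CONFIG_IPV6_SUBTREES, the source) is folded by
 * hash_32() down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, so every
 * (daddr, saddr) pair selects exactly one of the exception bucket chains
 * that the helpers below walk.
 */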
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
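/* Worked example (added; not in the original source): if the route PMTU
 * is 1400 and the device MTU drops to 1280, the update is allowed
 * (1400 >= 1280, a decrease). If the device MTU rises from 1500 to 9000
 * and the route PMTU equals the old local MTU of 1500, the update is also
 * allowed, since this link was the path bottleneck. But a route PMTU of
 * 1280 learned from a remote bottleneck is left untouched by a local
 * MTU increase.
 */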
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
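/* Note (added; not in the original source): both return paths shift the
 * 32-bit flow hash right by one bit. The resulting value fits in the
 * non-negative range of a signed int, which appears to be what the
 * atomic nh_upper_bound comparisons in fib6_multipath_select() expect;
 * treat that rationale as an assumption rather than documented behavior.
 */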
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
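/* Note (added; not in the original source): the blackhole dst keeps the
 * metrics and addressing of the original route but discards every packet
 * via dst_discard/dst_discard_out. It is used as a stand-in dst - for
 * example on the xfrm output path - when a flow must hold a valid
 * dst_entry without actually transmitting.
 */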
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || from_set);
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
}
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2576 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2578 struct net_device *dev = dst->dev;
2579 unsigned int mtu = dst_mtu(dst);
2580 struct net *net = dev_net(dev);
2582 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2584 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2585 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2588 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2589 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2590 * IPV6_MAXPLEN is also valid and means: "any MSS,
2591 * rely only on pmtu discovery"
2593 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
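/* Worked example (illustrative): for a standard Ethernet device with a
 * 1500 byte MTU,
 *
 *	advmss = 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *	       = 1500 - 40 - 20 = 1440
 *
 * clamped below by the ip6_rt_min_advmss sysctl and above by
 * IPV6_MAXPLEN - sizeof(struct tcphdr) = 65535 - 20 = 65515.
 */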
2598 static unsigned int ip6_mtu(const struct dst_entry *dst)
2600 struct inet6_dev *idev;
2603 mtu = dst_metric_raw(dst, RTAX_MTU);
2610 idev = __in6_dev_get(dst->dev);
2612 mtu = idev->cnf.mtu6;
2616 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2618 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2622 * 1. mtu on route is locked - use it
2623 * 2. mtu from nexthop exception
2624 * 3. mtu from egress device
2626 * based on ip6_dst_mtu_forward and exception logic of
2627 * rt6_find_cached_rt; called with rcu_read_lock
2629 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2630 struct in6_addr *saddr)
2632 struct rt6_exception_bucket *bucket;
2633 struct rt6_exception *rt6_ex;
2634 struct in6_addr *src_key;
2635 struct inet6_dev *idev;
2638 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2639 mtu = f6i->fib6_pmtu;
2645 #ifdef CONFIG_IPV6_SUBTREES
2646 if (f6i->fib6_src.plen)
2650 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2651 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2652 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2653 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2656 struct net_device *dev = fib6_info_nh_dev(f6i);
2659 idev = __in6_dev_get(dev);
2660 if (idev && idev->cnf.mtu6 > mtu)
2661 mtu = idev->cnf.mtu6;
2664 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2666 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
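/* Usage sketch (assumes the caller has already resolved f6i; this is
 * illustrative, not a real call site):
 *
 *	rcu_read_lock();
 *	mtu = ip6_mtu_from_fib6(f6i, &daddr, &saddr);
 *	rcu_read_unlock();
 *
 * The RCU read lock is mandatory: both the exception bucket and the
 * inet6_dev above are reached via rcu_dereference().
 */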
2669 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2672 struct dst_entry *dst;
2673 struct rt6_info *rt;
2674 struct inet6_dev *idev = in6_dev_get(dev);
2675 struct net *net = dev_net(dev);
2677 if (unlikely(!idev))
2678 return ERR_PTR(-ENODEV);
2680 rt = ip6_dst_alloc(net, dev, 0);
2681 if (unlikely(!rt)) {
2683 dst = ERR_PTR(-ENOMEM);
2687 rt->dst.flags |= DST_HOST;
2688 rt->dst.input = ip6_input;
2689 rt->dst.output = ip6_output;
2690 rt->rt6i_gateway = fl6->daddr;
2691 rt->rt6i_dst.addr = fl6->daddr;
2692 rt->rt6i_dst.plen = 128;
2693 rt->rt6i_idev = idev;
2694 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2696 /* Add this dst into uncached_list so that rt6_disable_ip() can
2697 * do proper release of the net_device
2699 rt6_uncached_list_add(rt);
2700 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2702 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2708 static int ip6_dst_gc(struct dst_ops *ops)
2710 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2711 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2712 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2713 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2714 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2715 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2718 entries = dst_entries_get_fast(ops);
2719 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2720 entries <= rt_max_size)
2723 net->ipv6.ip6_rt_gc_expire++;
2724 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2725 entries = dst_entries_get_slow(ops);
2726 if (entries < ops->gc_thresh)
2727 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2729 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2730 return entries > rt_max_size;
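/* The bookkeeping above means ip6_rt_gc_expire grows by one on every
 * GC pass while the table stays full, drops back to half of gc_timeout
 * once the entry count falls under gc_thresh, and otherwise decays
 * geometrically each call; e.g. with a gc_elasticity of 9 (illustrative
 * arithmetic):
 *
 *	expire -= expire >> 9;		// keeps roughly 511/512 per pass
 *
 * The knobs are exposed through the net.ipv6.route.* sysctls registered
 * at the bottom of this file.
 */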
2733 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2734 struct fib6_config *cfg)
2736 struct dst_metrics *p;
2741 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2745 refcount_set(&p->refcnt, 1);
2746 rt->fib6_metrics = p;
2748 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2751 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2752 struct fib6_config *cfg,
2753 const struct in6_addr *gw_addr,
2754 u32 tbid, int flags)
2756 struct flowi6 fl6 = {
2757 .flowi6_oif = cfg->fc_ifindex,
2759 .saddr = cfg->fc_prefsrc,
2761 struct fib6_table *table;
2762 struct rt6_info *rt;
2764 table = fib6_get_table(net, tbid);
2768 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2769 flags |= RT6_LOOKUP_F_HAS_SADDR;
2771 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2772 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2774 /* if table lookup failed, fall back to full lookup */
2775 if (rt == net->ipv6.ip6_null_entry) {
2783 static int ip6_route_check_nh_onlink(struct net *net,
2784 struct fib6_config *cfg,
2785 const struct net_device *dev,
2786 struct netlink_ext_ack *extack)
2788 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2789 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2790 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2791 struct rt6_info *grt;
2795 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2797 if (!grt->dst.error &&
2798 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2799 NL_SET_ERR_MSG(extack,
2800 "Nexthop has invalid gateway or device mismatch");
2810 static int ip6_route_check_nh(struct net *net,
2811 struct fib6_config *cfg,
2812 struct net_device **_dev,
2813 struct inet6_dev **idev)
2815 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2816 struct net_device *dev = _dev ? *_dev : NULL;
2817 struct rt6_info *grt = NULL;
2818 int err = -EHOSTUNREACH;
2820 if (cfg->fc_table) {
2821 int flags = RT6_LOOKUP_F_IFACE;
2823 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2824 cfg->fc_table, flags);
2826 if (grt->rt6i_flags & RTF_GATEWAY ||
2827 (dev && dev != grt->dst.dev)) {
2835 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2841 if (dev != grt->dst.dev) {
2846 *_dev = dev = grt->dst.dev;
2847 *idev = grt->rt6i_idev;
2849 in6_dev_hold(grt->rt6i_idev);
2852 if (!(grt->rt6i_flags & RTF_GATEWAY))
2861 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2862 struct net_device **_dev, struct inet6_dev **idev,
2863 struct netlink_ext_ack *extack)
2865 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2866 int gwa_type = ipv6_addr_type(gw_addr);
2867 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2868 const struct net_device *dev = *_dev;
2869 bool need_addr_check = !dev;
2872 /* if gw_addr is local we will fail to detect this in case
2873 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2874 * will return already-added prefix route via interface that
2875 * prefix route was assigned to, which might be non-loopback.
2878 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2879 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2883 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2884 /* IPv6 strictly inhibits using non-link-local
2885 * addresses as nexthop address.
2886 * Otherwise, the router will not be able to send redirects.
2887 * It is very good, but in some (rare!) circumstances
2888 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2889 * some exceptions. --ANK
2890 * We allow IPv4-mapped nexthops to support RFC4798-type
2891 * addressing.
2893 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2894 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2898 if (cfg->fc_flags & RTNH_F_ONLINK)
2899 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2901 err = ip6_route_check_nh(net, cfg, _dev, idev);
2907 /* reload in case device was changed */
2912 NL_SET_ERR_MSG(extack, "Egress device not specified");
2914 } else if (dev->flags & IFF_LOOPBACK) {
2915 NL_SET_ERR_MSG(extack,
2916 "Egress device can not be loopback device for this route");
2920 /* if we did not check gw_addr above, do so now that the
2921 * egress device has been resolved.
2923 if (need_addr_check &&
2924 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2925 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
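/* Condensed decision order of the validation above (an illustrative
 * sketch, not additional checks):
 *
 *	if (dev && gw_addr is one of our local addresses)  -> EINVAL
 *	if (gw_addr is not link-local unicast) {
 *		if (!(unicast || v4-mapped))               -> EINVAL
 *		resolve nexthop (onlink check or route lookup);
 *	}
 *	if (!dev)                                          -> EINVAL
 *	if (dev->flags & IFF_LOOPBACK)                     -> EINVAL
 *	if (need_addr_check && gw_addr is now local)       -> EINVAL
 */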
2934 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2936 struct netlink_ext_ack *extack)
2938 struct net *net = cfg->fc_nlinfo.nl_net;
2939 struct fib6_info *rt = NULL;
2940 struct net_device *dev = NULL;
2941 struct inet6_dev *idev = NULL;
2942 struct fib6_table *table;
2946 /* RTF_PCPU is an internal flag; can not be set by userspace */
2947 if (cfg->fc_flags & RTF_PCPU) {
2948 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2952 /* RTF_CACHE is an internal flag; can not be set by userspace */
2953 if (cfg->fc_flags & RTF_CACHE) {
2954 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2958 if (cfg->fc_type > RTN_MAX) {
2959 NL_SET_ERR_MSG(extack, "Invalid route type");
2963 if (cfg->fc_dst_len > 128) {
2964 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2967 if (cfg->fc_src_len > 128) {
2968 NL_SET_ERR_MSG(extack, "Invalid source address length");
2971 #ifndef CONFIG_IPV6_SUBTREES
2972 if (cfg->fc_src_len) {
2973 NL_SET_ERR_MSG(extack,
2974 "Specifying source address requires IPV6_SUBTREES to be enabled");
2978 if (cfg->fc_ifindex) {
2980 dev = dev_get_by_index(net, cfg->fc_ifindex);
2983 idev = in6_dev_get(dev);
2988 if (cfg->fc_metric == 0)
2989 cfg->fc_metric = IP6_RT_PRIO_USER;
2991 if (cfg->fc_flags & RTNH_F_ONLINK) {
2993 NL_SET_ERR_MSG(extack,
2994 "Nexthop device required for onlink");
2999 if (!(dev->flags & IFF_UP)) {
3000 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3007 if (cfg->fc_nlinfo.nlh &&
3008 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3009 table = fib6_get_table(net, cfg->fc_table);
3011 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3012 table = fib6_new_table(net, cfg->fc_table);
3015 table = fib6_new_table(net, cfg->fc_table);
3022 rt = fib6_info_alloc(gfp_flags);
3026 if (cfg->fc_flags & RTF_ADDRCONF)
3027 rt->dst_nocount = true;
3029 err = ip6_convert_metrics(net, rt, cfg);
3033 if (cfg->fc_flags & RTF_EXPIRES)
3034 fib6_set_expires(rt, jiffies +
3035 clock_t_to_jiffies(cfg->fc_expires));
3037 fib6_clean_expires(rt);
3039 if (cfg->fc_protocol == RTPROT_UNSPEC)
3040 cfg->fc_protocol = RTPROT_BOOT;
3041 rt->fib6_protocol = cfg->fc_protocol;
3043 addr_type = ipv6_addr_type(&cfg->fc_dst);
3045 if (cfg->fc_encap) {
3046 struct lwtunnel_state *lwtstate;
3048 err = lwtunnel_build_state(cfg->fc_encap_type,
3049 cfg->fc_encap, AF_INET6, cfg,
3050 &lwtstate, extack);
3053 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3056 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3057 rt->fib6_dst.plen = cfg->fc_dst_len;
3058 if (rt->fib6_dst.plen == 128)
3059 rt->dst_host = true;
3061 #ifdef CONFIG_IPV6_SUBTREES
3062 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3063 rt->fib6_src.plen = cfg->fc_src_len;
3066 rt->fib6_metric = cfg->fc_metric;
3067 rt->fib6_nh.nh_weight = 1;
3069 rt->fib6_type = cfg->fc_type;
3071 /* We cannot add true routes via loopback here,
3072 they would result in kernel looping; promote them to reject routes
3074 if ((cfg->fc_flags & RTF_REJECT) ||
3075 (dev && (dev->flags & IFF_LOOPBACK) &&
3076 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3077 !(cfg->fc_flags & RTF_LOCAL))) {
3078 /* hold loopback dev/idev if we haven't done so. */
3079 if (dev != net->loopback_dev) {
3084 dev = net->loopback_dev;
3086 idev = in6_dev_get(dev);
3092 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3096 if (cfg->fc_flags & RTF_GATEWAY) {
3097 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3101 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3108 if (idev->cnf.disable_ipv6) {
3109 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3114 if (!(dev->flags & IFF_UP)) {
3115 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3120 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3121 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3122 NL_SET_ERR_MSG(extack, "Invalid source address");
3126 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3127 rt->fib6_prefsrc.plen = 128;
3129 rt->fib6_prefsrc.plen = 0;
3131 rt->fib6_flags = cfg->fc_flags;
3134 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3135 !netif_carrier_ok(dev))
3136 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3137 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3138 rt->fib6_nh.nh_dev = dev;
3139 rt->fib6_table = table;
3141 cfg->fc_nlinfo.nl_net = dev_net(dev);
3153 fib6_info_release(rt);
3154 return ERR_PTR(err);
3157 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3158 struct netlink_ext_ack *extack)
3160 struct fib6_info *rt;
3163 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3167 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3168 fib6_info_release(rt);
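/* Minimal in-kernel caller sketch (illustrative only; error handling
 * elided), similar in spirit to how addrconf installs a prefix route:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP,
 *		.fc_type	= RTN_UNICAST,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 */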
3173 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3175 struct net *net = info->nl_net;
3176 struct fib6_table *table;
3179 if (rt == net->ipv6.fib6_null_entry) {
3184 table = rt->fib6_table;
3185 spin_lock_bh(&table->tb6_lock);
3186 err = fib6_del(rt, info);
3187 spin_unlock_bh(&table->tb6_lock);
3190 fib6_info_release(rt);
3194 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3196 struct nl_info info = { .nl_net = net };
3198 return __ip6_del_rt(rt, &info);
3201 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3203 struct nl_info *info = &cfg->fc_nlinfo;
3204 struct net *net = info->nl_net;
3205 struct sk_buff *skb = NULL;
3206 struct fib6_table *table;
3209 if (rt == net->ipv6.fib6_null_entry)
3211 table = rt->fib6_table;
3212 spin_lock_bh(&table->tb6_lock);
3214 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3215 struct fib6_info *sibling, *next_sibling;
3217 /* prefer to send a single notification with all hops */
3218 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3220 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3222 if (rt6_fill_node(net, skb, rt, NULL,
3223 NULL, NULL, 0, RTM_DELROUTE,
3224 info->portid, seq, 0) < 0) {
3228 info->skip_notify = 1;
3231 list_for_each_entry_safe(sibling, next_sibling,
3234 err = fib6_del(sibling, info);
3240 err = fib6_del(rt, info);
3242 spin_unlock_bh(&table->tb6_lock);
3244 fib6_info_release(rt);
3247 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3248 info->nlh, gfp_any());
3253 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3257 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3260 if (cfg->fc_flags & RTF_GATEWAY &&
3261 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3263 if (dst_hold_safe(&rt->dst))
3264 rc = rt6_remove_exception_rt(rt);
3269 static int ip6_route_del(struct fib6_config *cfg,
3270 struct netlink_ext_ack *extack)
3272 struct rt6_info *rt_cache;
3273 struct fib6_table *table;
3274 struct fib6_info *rt;
3275 struct fib6_node *fn;
3278 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3280 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3286 fn = fib6_locate(&table->tb6_root,
3287 &cfg->fc_dst, cfg->fc_dst_len,
3288 &cfg->fc_src, cfg->fc_src_len,
3289 !(cfg->fc_flags & RTF_CACHE));
3292 for_each_fib6_node_rt_rcu(fn) {
3293 if (cfg->fc_flags & RTF_CACHE) {
3296 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3299 rc = ip6_del_cached_rt(rt_cache, cfg);
3307 if (cfg->fc_ifindex &&
3308 (!rt->fib6_nh.nh_dev ||
3309 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3311 if (cfg->fc_flags & RTF_GATEWAY &&
3312 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3314 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3316 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3318 if (!fib6_info_hold_safe(rt))
3322 /* if gateway was specified only delete the one hop */
3323 if (cfg->fc_flags & RTF_GATEWAY)
3324 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3326 return __ip6_del_rt_siblings(rt, cfg);
3334 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3336 struct netevent_redirect netevent;
3337 struct rt6_info *rt, *nrt = NULL;
3338 struct ndisc_options ndopts;
3339 struct inet6_dev *in6_dev;
3340 struct neighbour *neigh;
3341 struct fib6_info *from;
3343 int optlen, on_link;
3346 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3347 optlen -= sizeof(*msg);
3350 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3354 msg = (struct rd_msg *)icmp6_hdr(skb);
3356 if (ipv6_addr_is_multicast(&msg->dest)) {
3357 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3362 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3364 } else if (ipv6_addr_type(&msg->target) !=
3365 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3366 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3370 in6_dev = __in6_dev_get(skb->dev);
3373 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3377 * The IP source address of the Redirect MUST be the same as the current
3378 * first-hop router for the specified ICMP Destination Address.
3381 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3382 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3387 if (ndopts.nd_opts_tgt_lladdr) {
3388 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3391 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3396 rt = (struct rt6_info *) dst;
3397 if (rt->rt6i_flags & RTF_REJECT) {
3398 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3402 /* Redirect received -> path was valid.
3403 * Look, redirects are sent only in response to data packets,
3404 * so this nexthop apparently is reachable. --ANK
3406 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3408 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3413 * We have finally decided to accept it.
3416 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3417 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3418 NEIGH_UPDATE_F_OVERRIDE|
3419 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3420 NEIGH_UPDATE_F_ISROUTER)),
3421 NDISC_REDIRECT, &ndopts);
3424 from = rcu_dereference(rt->from);
3425 /* This fib6_info_hold() is safe here because we hold a reference to rt
3426 * and rt already holds a reference to fib6_info.
3428 fib6_info_hold(from);
3431 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3435 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3437 nrt->rt6i_flags &= ~RTF_GATEWAY;
3439 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3441 /* No need to remove rt from the exception table if rt is
3442 * a cached route because rt6_insert_exception() will
3443 * take care of it.
3445 if (rt6_insert_exception(nrt, from)) {
3446 dst_release_immediate(&nrt->dst);
3450 netevent.old = &rt->dst;
3451 netevent.new = &nrt->dst;
3452 netevent.daddr = &msg->dest;
3453 netevent.neigh = neigh;
3454 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3457 fib6_info_release(from);
3458 neigh_release(neigh);
3461 #ifdef CONFIG_IPV6_ROUTE_INFO
3462 static struct fib6_info *rt6_get_route_info(struct net *net,
3463 const struct in6_addr *prefix, int prefixlen,
3464 const struct in6_addr *gwaddr,
3465 struct net_device *dev)
3467 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3468 int ifindex = dev->ifindex;
3469 struct fib6_node *fn;
3470 struct fib6_info *rt = NULL;
3471 struct fib6_table *table;
3473 table = fib6_get_table(net, tb_id);
3478 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3482 for_each_fib6_node_rt_rcu(fn) {
3483 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3485 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3487 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3489 if (!fib6_info_hold_safe(rt))
3498 static struct fib6_info *rt6_add_route_info(struct net *net,
3499 const struct in6_addr *prefix, int prefixlen,
3500 const struct in6_addr *gwaddr,
3501 struct net_device *dev,
3504 struct fib6_config cfg = {
3505 .fc_metric = IP6_RT_PRIO_USER,
3506 .fc_ifindex = dev->ifindex,
3507 .fc_dst_len = prefixlen,
3508 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3509 RTF_UP | RTF_PREF(pref),
3510 .fc_protocol = RTPROT_RA,
3511 .fc_type = RTN_UNICAST,
3512 .fc_nlinfo.portid = 0,
3513 .fc_nlinfo.nlh = NULL,
3514 .fc_nlinfo.nl_net = net,
3517 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3518 cfg.fc_dst = *prefix;
3519 cfg.fc_gateway = *gwaddr;
3521 /* We should treat it as a default route if prefix length is 0. */
3523 cfg.fc_flags |= RTF_DEFAULT;
3525 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3527 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3531 struct fib6_info *rt6_get_dflt_router(struct net *net,
3532 const struct in6_addr *addr,
3533 struct net_device *dev)
3535 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3536 struct fib6_info *rt;
3537 struct fib6_table *table;
3539 table = fib6_get_table(net, tb_id);
3544 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3545 if (dev == rt->fib6_nh.nh_dev &&
3546 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3547 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3550 if (rt && !fib6_info_hold_safe(rt))
3556 struct fib6_info *rt6_add_dflt_router(struct net *net,
3557 const struct in6_addr *gwaddr,
3558 struct net_device *dev,
3561 struct fib6_config cfg = {
3562 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3563 .fc_metric = IP6_RT_PRIO_USER,
3564 .fc_ifindex = dev->ifindex,
3565 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3566 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3567 .fc_protocol = RTPROT_RA,
3568 .fc_type = RTN_UNICAST,
3569 .fc_nlinfo.portid = 0,
3570 .fc_nlinfo.nlh = NULL,
3571 .fc_nlinfo.nl_net = net,
3574 cfg.fc_gateway = *gwaddr;
3576 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3577 struct fib6_table *table;
3579 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3581 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3584 return rt6_get_dflt_router(net, gwaddr, dev);
3587 static void __rt6_purge_dflt_routers(struct net *net,
3588 struct fib6_table *table)
3590 struct fib6_info *rt;
3594 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3595 struct net_device *dev = fib6_info_nh_dev(rt);
3596 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3598 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3599 (!idev || idev->cnf.accept_ra != 2) &&
3600 fib6_info_hold_safe(rt)) {
3602 ip6_del_rt(net, rt);
3608 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3611 void rt6_purge_dflt_routers(struct net *net)
3613 struct fib6_table *table;
3614 struct hlist_head *head;
3619 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3620 head = &net->ipv6.fib_table_hash[h];
3621 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3622 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3623 __rt6_purge_dflt_routers(net, table);
3630 static void rtmsg_to_fib6_config(struct net *net,
3631 struct in6_rtmsg *rtmsg,
3632 struct fib6_config *cfg)
3634 memset(cfg, 0, sizeof(*cfg));
3636 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3638 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3639 cfg->fc_metric = rtmsg->rtmsg_metric;
3640 cfg->fc_expires = rtmsg->rtmsg_info;
3641 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3642 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3643 cfg->fc_flags = rtmsg->rtmsg_flags;
3644 cfg->fc_type = rtmsg->rtmsg_type;
3646 cfg->fc_nlinfo.nl_net = net;
3648 cfg->fc_dst = rtmsg->rtmsg_dst;
3649 cfg->fc_src = rtmsg->rtmsg_src;
3650 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3653 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3655 struct fib6_config cfg;
3656 struct in6_rtmsg rtmsg;
3660 case SIOCADDRT: /* Add a route */
3661 case SIOCDELRT: /* Delete a route */
3662 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3664 err = copy_from_user(&rtmsg, arg,
3665 sizeof(struct in6_rtmsg));
3669 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3674 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3677 err = ip6_route_del(&cfg, NULL);
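/* Userspace reaches this handler through the legacy route ioctls; a
 * sketch using the documentation prefix (error handling elided):
 *
 *	struct in6_rtmsg rtm = { 0 };
 *
 *	inet_pton(AF_INET6, "2001:db8::", &rtm.rtmsg_dst);
 *	rtm.rtmsg_dst_len = 64;
 *	rtm.rtmsg_ifindex = if_nametoindex("eth0");
 *	rtm.rtmsg_flags   = RTF_UP;
 *	rtm.rtmsg_metric  = 1024;
 *	ioctl(fd, SIOCADDRT, &rtm);	// fd: any AF_INET6 socket
 */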
3691 * Drop the packet on the floor
3694 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3697 struct dst_entry *dst = skb_dst(skb);
3698 switch (ipstats_mib_noroutes) {
3699 case IPSTATS_MIB_INNOROUTES:
3700 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3701 if (type == IPV6_ADDR_ANY) {
3702 IP6_INC_STATS(dev_net(dst->dev),
3703 __in6_dev_get_safely(skb->dev),
3704 IPSTATS_MIB_INADDRERRORS);
3708 case IPSTATS_MIB_OUTNOROUTES:
3709 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3710 ipstats_mib_noroutes);
3713 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3718 static int ip6_pkt_discard(struct sk_buff *skb)
3720 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3723 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3725 skb->dev = skb_dst(skb)->dev;
3726 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3729 static int ip6_pkt_prohibit(struct sk_buff *skb)
3731 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3734 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3736 skb->dev = skb_dst(skb)->dev;
3737 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3741 * Allocate a dst for local (unicast / anycast) address.
3744 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3745 struct inet6_dev *idev,
3746 const struct in6_addr *addr,
3747 bool anycast, gfp_t gfp_flags)
3750 struct net_device *dev = idev->dev;
3751 struct fib6_info *f6i;
3753 f6i = fib6_info_alloc(gfp_flags);
3755 return ERR_PTR(-ENOMEM);
3757 f6i->dst_nocount = true;
3758 f6i->dst_host = true;
3759 f6i->fib6_protocol = RTPROT_KERNEL;
3760 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3762 f6i->fib6_type = RTN_ANYCAST;
3763 f6i->fib6_flags |= RTF_ANYCAST;
3765 f6i->fib6_type = RTN_LOCAL;
3766 f6i->fib6_flags |= RTF_LOCAL;
3769 f6i->fib6_nh.nh_gw = *addr;
3771 f6i->fib6_nh.nh_dev = dev;
3772 f6i->fib6_dst.addr = *addr;
3773 f6i->fib6_dst.plen = 128;
3774 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3775 f6i->fib6_table = fib6_get_table(net, tb_id);
3780 /* remove deleted ip from prefsrc entries */
3781 struct arg_dev_net_ip {
3782 struct net_device *dev;
3784 struct in6_addr *addr;
3787 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3789 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3790 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3791 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3793 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3794 rt != net->ipv6.fib6_null_entry &&
3795 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3796 spin_lock_bh(&rt6_exception_lock);
3797 /* remove prefsrc entry */
3798 rt->fib6_prefsrc.plen = 0;
3799 /* need to update cache as well */
3800 rt6_exceptions_remove_prefsrc(rt);
3801 spin_unlock_bh(&rt6_exception_lock);
3806 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3808 struct net *net = dev_net(ifp->idev->dev);
3809 struct arg_dev_net_ip adni = {
3810 .dev = ifp->idev->dev,
3814 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3817 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3819 /* Remove routers and update dst entries when a gateway turns into a host. */
3820 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3822 struct in6_addr *gateway = (struct in6_addr *)arg;
3824 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3825 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3829 /* Further clean up cached routes in exception table.
3830 * This is needed because a cached route may have a different
3831 * gateway than its 'parent' in the case of an ip redirect.
3833 rt6_exceptions_clean_tohost(rt, gateway);
3838 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3840 fib6_clean_all(net, fib6_clean_tohost, gateway);
3843 struct arg_netdev_event {
3844 const struct net_device *dev;
3846 unsigned int nh_flags;
3847 unsigned long event;
3851 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3853 struct fib6_info *iter;
3854 struct fib6_node *fn;
3856 fn = rcu_dereference_protected(rt->fib6_node,
3857 lockdep_is_held(&rt->fib6_table->tb6_lock));
3858 iter = rcu_dereference_protected(fn->leaf,
3859 lockdep_is_held(&rt->fib6_table->tb6_lock));
3861 if (iter->fib6_metric == rt->fib6_metric &&
3862 rt6_qualify_for_ecmp(iter))
3864 iter = rcu_dereference_protected(iter->fib6_next,
3865 lockdep_is_held(&rt->fib6_table->tb6_lock));
3871 static bool rt6_is_dead(const struct fib6_info *rt)
3873 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3874 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3875 fib6_ignore_linkdown(rt)))
3881 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3883 struct fib6_info *iter;
3886 if (!rt6_is_dead(rt))
3887 total += rt->fib6_nh.nh_weight;
3889 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3890 if (!rt6_is_dead(iter))
3891 total += iter->fib6_nh.nh_weight;
3897 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3899 int upper_bound = -1;
3901 if (!rt6_is_dead(rt)) {
3902 *weight += rt->fib6_nh.nh_weight;
3903 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3904 total) - 1;
3906 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3909 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3911 struct fib6_info *iter;
3914 rt6_upper_bound_set(rt, &weight, total);
3916 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3917 rt6_upper_bound_set(iter, &weight, total);
3920 void rt6_multipath_rebalance(struct fib6_info *rt)
3922 struct fib6_info *first;
3925 /* In case the entire multipath route was marked for flushing,
3926 * then there is no need to rebalance upon the removal of every
3927 * sibling route.
3929 if (!rt->fib6_nsiblings || rt->should_flush)
3932 /* During lookup routes are evaluated in order, so we need to
3933 * make sure upper bounds are assigned from the first sibling
3934 * onwards.
3936 first = rt6_multipath_first_sibling(rt);
3937 if (WARN_ON_ONCE(!first))
3940 total = rt6_multipath_total_weight(first);
3941 rt6_multipath_upper_bound_set(first, total);
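/* Worked example for the hash-threshold upper bounds (illustrative):
 * three live nexthops with weights 1, 1 and 2 give total = 4 and
 * cumulative weights 1, 2 and 4, hence
 *
 *	bound[0] = (1ULL << 31) * 1 / 4 - 1 = 0x1fffffff
 *	bound[1] = (1ULL << 31) * 2 / 4 - 1 = 0x3fffffff
 *	bound[2] = (1ULL << 31) * 4 / 4 - 1 = 0x7fffffff
 *
 * A 31-bit flow hash is compared against the bounds in order and the
 * first nexthop with hash <= bound wins, so the hops above carry 25%,
 * 25% and 50% of flows.  Dead hops keep the -1 bound and are skipped.
 */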
3944 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3946 const struct arg_netdev_event *arg = p_arg;
3947 struct net *net = dev_net(arg->dev);
3949 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3950 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3951 fib6_update_sernum_upto_root(net, rt);
3952 rt6_multipath_rebalance(rt);
3958 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3960 struct arg_netdev_event arg = {
3963 .nh_flags = nh_flags,
3967 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3968 arg.nh_flags |= RTNH_F_LINKDOWN;
3970 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3973 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3974 const struct net_device *dev)
3976 struct fib6_info *iter;
3978 if (rt->fib6_nh.nh_dev == dev)
3980 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3981 if (iter->fib6_nh.nh_dev == dev)
3987 static void rt6_multipath_flush(struct fib6_info *rt)
3989 struct fib6_info *iter;
3991 rt->should_flush = 1;
3992 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3993 iter->should_flush = 1;
3996 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3997 const struct net_device *down_dev)
3999 struct fib6_info *iter;
4000 unsigned int dead = 0;
4002 if (rt->fib6_nh.nh_dev == down_dev ||
4003 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4005 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4006 if (iter->fib6_nh.nh_dev == down_dev ||
4007 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4013 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4014 const struct net_device *dev,
4015 unsigned int nh_flags)
4017 struct fib6_info *iter;
4019 if (rt->fib6_nh.nh_dev == dev)
4020 rt->fib6_nh.nh_flags |= nh_flags;
4021 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4022 if (iter->fib6_nh.nh_dev == dev)
4023 iter->fib6_nh.nh_flags |= nh_flags;
4026 /* called with write lock held for table with rt */
4027 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4029 const struct arg_netdev_event *arg = p_arg;
4030 const struct net_device *dev = arg->dev;
4031 struct net *net = dev_net(dev);
4033 if (rt == net->ipv6.fib6_null_entry)
4036 switch (arg->event) {
4037 case NETDEV_UNREGISTER:
4038 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4040 if (rt->should_flush)
4042 if (!rt->fib6_nsiblings)
4043 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4044 if (rt6_multipath_uses_dev(rt, dev)) {
4047 count = rt6_multipath_dead_count(rt, dev);
4048 if (rt->fib6_nsiblings + 1 == count) {
4049 rt6_multipath_flush(rt);
4052 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4053 RTNH_F_LINKDOWN);
4054 fib6_update_sernum(net, rt);
4055 rt6_multipath_rebalance(rt);
4059 if (rt->fib6_nh.nh_dev != dev ||
4060 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4062 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4063 rt6_multipath_rebalance(rt);
4070 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4072 struct arg_netdev_event arg = {
4079 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4082 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4084 rt6_sync_down_dev(dev, event);
4085 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4086 neigh_ifdown(&nd_tbl, dev);
4089 struct rt6_mtu_change_arg {
4090 struct net_device *dev;
4094 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4096 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4097 struct inet6_dev *idev;
4099 /* In IPv6 PMTU discovery is not optional,
4100 so the RTAX_MTU lock cannot disable it.
4101 We still use this lock to block changes
4102 caused by addrconf/ndisc.
4105 idev = __in6_dev_get(arg->dev);
4109 /* For an administrative MTU increase, there is no way to discover
4110 a corresponding IPv6 PMTU increase, so the PMTU must be updated here.
4111 Since RFC 1981 doesn't cover administrative MTU increases,
4112 updating the PMTU on increase is a MUST (e.g. for jumbo frames).
4114 if (rt->fib6_nh.nh_dev == arg->dev &&
4115 !fib6_metric_locked(rt, RTAX_MTU)) {
4116 u32 mtu = rt->fib6_pmtu;
4118 if (mtu >= arg->mtu ||
4119 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4120 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4122 spin_lock_bh(&rt6_exception_lock);
4123 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4124 spin_unlock_bh(&rt6_exception_lock);
4129 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4131 struct rt6_mtu_change_arg arg = {
4136 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
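/* This is driven from addrconf's netdevice notifier: an administrative
 * MTU change such as
 *
 *	ip link set dev eth0 mtu 9000
 *
 * raises NETDEV_CHANGEMTU, and addrconf ends up calling
 * rt6_mtu_change(dev, dev->mtu) to walk the FIB (illustrative; the
 * exact notifier plumbing lives in addrconf.c).
 */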
4139 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4140 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4141 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4142 [RTA_OIF] = { .type = NLA_U32 },
4143 [RTA_IIF] = { .type = NLA_U32 },
4144 [RTA_PRIORITY] = { .type = NLA_U32 },
4145 [RTA_METRICS] = { .type = NLA_NESTED },
4146 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4147 [RTA_PREF] = { .type = NLA_U8 },
4148 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4149 [RTA_ENCAP] = { .type = NLA_NESTED },
4150 [RTA_EXPIRES] = { .type = NLA_U32 },
4151 [RTA_UID] = { .type = NLA_U32 },
4152 [RTA_MARK] = { .type = NLA_U32 },
4153 [RTA_TABLE] = { .type = NLA_U32 },
4154 [RTA_IP_PROTO] = { .type = NLA_U8 },
4155 [RTA_SPORT] = { .type = NLA_U16 },
4156 [RTA_DPORT] = { .type = NLA_U16 },
4159 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4160 struct fib6_config *cfg,
4161 struct netlink_ext_ack *extack)
4164 struct nlattr *tb[RTA_MAX+1];
4168 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4174 rtm = nlmsg_data(nlh);
4175 memset(cfg, 0, sizeof(*cfg));
4177 cfg->fc_table = rtm->rtm_table;
4178 cfg->fc_dst_len = rtm->rtm_dst_len;
4179 cfg->fc_src_len = rtm->rtm_src_len;
4180 cfg->fc_flags = RTF_UP;
4181 cfg->fc_protocol = rtm->rtm_protocol;
4182 cfg->fc_type = rtm->rtm_type;
4184 if (rtm->rtm_type == RTN_UNREACHABLE ||
4185 rtm->rtm_type == RTN_BLACKHOLE ||
4186 rtm->rtm_type == RTN_PROHIBIT ||
4187 rtm->rtm_type == RTN_THROW)
4188 cfg->fc_flags |= RTF_REJECT;
4190 if (rtm->rtm_type == RTN_LOCAL)
4191 cfg->fc_flags |= RTF_LOCAL;
4193 if (rtm->rtm_flags & RTM_F_CLONED)
4194 cfg->fc_flags |= RTF_CACHE;
4196 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4198 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4199 cfg->fc_nlinfo.nlh = nlh;
4200 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4202 if (tb[RTA_GATEWAY]) {
4203 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4204 cfg->fc_flags |= RTF_GATEWAY;
4208 int plen = (rtm->rtm_dst_len + 7) >> 3;
4210 if (nla_len(tb[RTA_DST]) < plen)
4213 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4217 int plen = (rtm->rtm_src_len + 7) >> 3;
4219 if (nla_len(tb[RTA_SRC]) < plen)
4222 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4225 if (tb[RTA_PREFSRC])
4226 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4229 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4231 if (tb[RTA_PRIORITY])
4232 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4234 if (tb[RTA_METRICS]) {
4235 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4236 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4240 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4242 if (tb[RTA_MULTIPATH]) {
4243 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4244 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4246 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4247 cfg->fc_mp_len, extack);
4253 pref = nla_get_u8(tb[RTA_PREF]);
4254 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4255 pref != ICMPV6_ROUTER_PREF_HIGH)
4256 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4257 cfg->fc_flags |= RTF_PREF(pref);
4261 cfg->fc_encap = tb[RTA_ENCAP];
4263 if (tb[RTA_ENCAP_TYPE]) {
4264 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4266 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4271 if (tb[RTA_EXPIRES]) {
4272 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4274 if (addrconf_finite_timeout(timeout)) {
4275 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4276 cfg->fc_flags |= RTF_EXPIRES;
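/* The attribute handling above mirrors what "ip -6 route" emits; e.g.
 * (illustrative)
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * arrives as RTM_NEWROUTE carrying RTA_DST, RTA_GATEWAY, RTA_OIF and
 * RTA_PRIORITY, which land in fc_dst, fc_gateway, fc_ifindex and
 * fc_metric respectively.
 */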
4286 struct fib6_info *fib6_info;
4287 struct fib6_config r_cfg;
4288 struct list_head next;
4291 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4295 list_for_each_entry(nh, rt6_nh_list, next) {
4296 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4297 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4298 nh->r_cfg.fc_ifindex);
4302 static int ip6_route_info_append(struct net *net,
4303 struct list_head *rt6_nh_list,
4304 struct fib6_info *rt,
4305 struct fib6_config *r_cfg)
4310 list_for_each_entry(nh, rt6_nh_list, next) {
4311 /* check if fib6_info already exists */
4312 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4316 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4320 err = ip6_convert_metrics(net, rt, r_cfg);
4325 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4326 list_add_tail(&nh->next, rt6_nh_list);
4331 static void ip6_route_mpath_notify(struct fib6_info *rt,
4332 struct fib6_info *rt_last,
4333 struct nl_info *info,
4336 /* if this is an APPEND route, then rt points to the first route
4337 * inserted and rt_last points to last route inserted. Userspace
4338 * wants a consistent dump of the route which starts at the first
4339 * nexthop. Since sibling routes are always added at the end of
4340 * the list, find the first sibling of the last route appended
4342 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4343 rt = list_first_entry(&rt_last->fib6_siblings,
4349 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4352 static int ip6_route_multipath_add(struct fib6_config *cfg,
4353 struct netlink_ext_ack *extack)
4355 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4356 struct nl_info *info = &cfg->fc_nlinfo;
4357 struct fib6_config r_cfg;
4358 struct rtnexthop *rtnh;
4359 struct fib6_info *rt;
4360 struct rt6_nh *err_nh;
4361 struct rt6_nh *nh, *nh_safe;
4367 int replace = (cfg->fc_nlinfo.nlh &&
4368 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4369 LIST_HEAD(rt6_nh_list);
4371 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4372 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4373 nlflags |= NLM_F_APPEND;
4375 remaining = cfg->fc_mp_len;
4376 rtnh = (struct rtnexthop *)cfg->fc_mp;
4378 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4379 * fib6_info structs per nexthop
4381 while (rtnh_ok(rtnh, remaining)) {
4382 memcpy(&r_cfg, cfg, sizeof(*cfg));
4383 if (rtnh->rtnh_ifindex)
4384 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4386 attrlen = rtnh_attrlen(rtnh);
4388 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4390 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4392 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4393 r_cfg.fc_flags |= RTF_GATEWAY;
4395 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4396 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4398 r_cfg.fc_encap_type = nla_get_u16(nla);
4401 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4402 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4408 if (!rt6_qualify_for_ecmp(rt)) {
4410 NL_SET_ERR_MSG(extack,
4411 "Device only routes can not be added for IPv6 using the multipath API.");
4412 fib6_info_release(rt);
4416 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4418 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4421 fib6_info_release(rt);
4425 rtnh = rtnh_next(rtnh, &remaining);
4428 /* for add and replace send one notification with all nexthops.
4429 * Skip the notification in fib6_add_rt2node and send one with
4430 * the full route when done
4432 info->skip_notify = 1;
4435 list_for_each_entry(nh, &rt6_nh_list, next) {
4436 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4437 fib6_info_release(nh->fib6_info);
4440 /* save reference to last route successfully inserted */
4441 rt_last = nh->fib6_info;
4443 /* save reference to first route for notification */
4445 rt_notif = nh->fib6_info;
4448 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4449 nh->fib6_info = NULL;
4452 ip6_print_replace_route_err(&rt6_nh_list);
4457 /* Because each route is added like a single route we remove
4458 * these flags after the first nexthop: if there is a collision,
4459 * we have already failed to add the first nexthop:
4460 * fib6_add_rt2node() has rejected it; when replacing, old
4461 * nexthops have been replaced by first new, the rest should
4462 * be added to it.
4464 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4465 NLM_F_REPLACE);
4469 /* success ... tell user about new route */
4470 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4474 /* send notification for routes that were added so that
4475 * the delete notifications sent by ip6_route_del are
4476 * coherent
4479 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4481 /* Delete routes that were already added */
4482 list_for_each_entry(nh, &rt6_nh_list, next) {
4485 ip6_route_del(&nh->r_cfg, extack);
4489 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4491 fib6_info_release(nh->fib6_info);
4492 list_del(&nh->next);
4499 static int ip6_route_multipath_del(struct fib6_config *cfg,
4500 struct netlink_ext_ack *extack)
4502 struct fib6_config r_cfg;
4503 struct rtnexthop *rtnh;
4506 int err = 1, last_err = 0;
4508 remaining = cfg->fc_mp_len;
4509 rtnh = (struct rtnexthop *)cfg->fc_mp;
4511 /* Parse a Multipath Entry */
4512 while (rtnh_ok(rtnh, remaining)) {
4513 memcpy(&r_cfg, cfg, sizeof(*cfg));
4514 if (rtnh->rtnh_ifindex)
4515 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4517 attrlen = rtnh_attrlen(rtnh);
4519 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4521 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4523 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4524 r_cfg.fc_flags |= RTF_GATEWAY;
4527 err = ip6_route_del(&r_cfg, extack);
4531 rtnh = rtnh_next(rtnh, &remaining);
4537 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4538 struct netlink_ext_ack *extack)
4540 struct fib6_config cfg;
4543 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4548 return ip6_route_multipath_del(&cfg, extack);
4550 cfg.fc_delete_all_nh = 1;
4551 return ip6_route_del(&cfg, extack);
4555 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4556 struct netlink_ext_ack *extack)
4558 struct fib6_config cfg;
4561 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4566 return ip6_route_multipath_add(&cfg, extack);
4568 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4571 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4573 int nexthop_len = 0;
4575 if (rt->fib6_nsiblings) {
4576 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4577 + NLA_ALIGN(sizeof(struct rtnexthop))
4578 + nla_total_size(16) /* RTA_GATEWAY */
4579 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4581 nexthop_len *= rt->fib6_nsiblings;
4584 return NLMSG_ALIGN(sizeof(struct rtmsg))
4585 + nla_total_size(16) /* RTA_SRC */
4586 + nla_total_size(16) /* RTA_DST */
4587 + nla_total_size(16) /* RTA_GATEWAY */
4588 + nla_total_size(16) /* RTA_PREFSRC */
4589 + nla_total_size(4) /* RTA_TABLE */
4590 + nla_total_size(4) /* RTA_IIF */
4591 + nla_total_size(4) /* RTA_OIF */
4592 + nla_total_size(4) /* RTA_PRIORITY */
4593 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4594 + nla_total_size(sizeof(struct rta_cacheinfo))
4595 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4596 + nla_total_size(1) /* RTA_PREF */
4597 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
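/* Callers pair this estimate with nlmsg_new(), as inet6_rt_notify()
 * below does:
 *
 *	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
 *
 * If rt6_fill_node() then returns -EMSGSIZE, the estimate here has
 * fallen out of sync with the attributes actually emitted (hence the
 * WARN_ON in inet6_rt_notify()).
 */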
4601 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4602 unsigned int *flags, bool skip_oif)
4604 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4605 *flags |= RTNH_F_DEAD;
4607 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4608 *flags |= RTNH_F_LINKDOWN;
4611 if (fib6_ignore_linkdown(rt))
4612 *flags |= RTNH_F_DEAD;
4616 if (rt->fib6_flags & RTF_GATEWAY) {
4617 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4618 goto nla_put_failure;
4621 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4622 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4623 *flags |= RTNH_F_OFFLOAD;
4625 /* not needed for multipath encoding because it has a rtnexthop struct */
4626 if (!skip_oif && rt->fib6_nh.nh_dev &&
4627 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4628 goto nla_put_failure;
4630 if (rt->fib6_nh.nh_lwtstate &&
4631 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4632 goto nla_put_failure;
4640 /* add multipath next hop */
4641 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4643 const struct net_device *dev = rt->fib6_nh.nh_dev;
4644 struct rtnexthop *rtnh;
4645 unsigned int flags = 0;
4647 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4649 goto nla_put_failure;
4651 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4652 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4654 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4655 goto nla_put_failure;
4657 rtnh->rtnh_flags = flags;
4659 /* length of rtnetlink header + attributes */
4660 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4668 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4669 struct fib6_info *rt, struct dst_entry *dst,
4670 struct in6_addr *dest, struct in6_addr *src,
4671 int iif, int type, u32 portid, u32 seq,
4675 struct nlmsghdr *nlh;
4680 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4684 rtm = nlmsg_data(nlh);
4685 rtm->rtm_family = AF_INET6;
4686 rtm->rtm_dst_len = rt->fib6_dst.plen;
4687 rtm->rtm_src_len = rt->fib6_src.plen;
4690 table = rt->fib6_table->tb6_id;
4692 table = RT6_TABLE_UNSPEC;
4693 rtm->rtm_table = table;
4694 if (nla_put_u32(skb, RTA_TABLE, table))
4695 goto nla_put_failure;
4697 rtm->rtm_type = rt->fib6_type;
4699 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4700 rtm->rtm_protocol = rt->fib6_protocol;
4702 if (rt->fib6_flags & RTF_CACHE)
4703 rtm->rtm_flags |= RTM_F_CLONED;
4706 if (nla_put_in6_addr(skb, RTA_DST, dest))
4707 goto nla_put_failure;
4708 rtm->rtm_dst_len = 128;
4709 } else if (rtm->rtm_dst_len)
4710 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4711 goto nla_put_failure;
4712 #ifdef CONFIG_IPV6_SUBTREES
4714 if (nla_put_in6_addr(skb, RTA_SRC, src))
4715 goto nla_put_failure;
4716 rtm->rtm_src_len = 128;
4717 } else if (rtm->rtm_src_len &&
4718 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4719 goto nla_put_failure;
4722 #ifdef CONFIG_IPV6_MROUTE
4723 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4724 int err = ip6mr_get_route(net, skb, rtm, portid);
4729 goto nla_put_failure;
4732 if (nla_put_u32(skb, RTA_IIF, iif))
4733 goto nla_put_failure;
4735 struct in6_addr saddr_buf;
4736 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4737 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4738 goto nla_put_failure;
4741 if (rt->fib6_prefsrc.plen) {
4742 struct in6_addr saddr_buf;
4743 saddr_buf = rt->fib6_prefsrc.addr;
4744 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4745 goto nla_put_failure;
4748 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4749 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4750 goto nla_put_failure;
4752 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4753 goto nla_put_failure;
4755 /* For multipath routes, walk the siblings list and add
4756 * each as a nexthop within RTA_MULTIPATH.
4758 if (rt->fib6_nsiblings) {
4759 struct fib6_info *sibling, *next_sibling;
4762 mp = nla_nest_start(skb, RTA_MULTIPATH);
4764 goto nla_put_failure;
4766 if (rt6_add_nexthop(skb, rt) < 0)
4767 goto nla_put_failure;
4769 list_for_each_entry_safe(sibling, next_sibling,
4770 &rt->fib6_siblings, fib6_siblings) {
4771 if (rt6_add_nexthop(skb, sibling) < 0)
4772 goto nla_put_failure;
4775 nla_nest_end(skb, mp);
4777 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4778 goto nla_put_failure;
4781 if (rt->fib6_flags & RTF_EXPIRES) {
4782 expires = dst ? dst->expires : rt->expires;
4786 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4787 goto nla_put_failure;
4789 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4790 goto nla_put_failure;
4793 nlmsg_end(skb, nlh);
4797 nlmsg_cancel(skb, nlh);
4801 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4803 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4804 struct net *net = arg->net;
4806 if (rt == net->ipv6.fib6_null_entry)
4809 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4810 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4812 /* user wants prefix routes only */
4813 if (rtm->rtm_flags & RTM_F_PREFIX &&
4814 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4815 /* success since this is not a prefix route */
4820 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4821 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4822 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4825 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4826 struct netlink_ext_ack *extack)
4828 struct net *net = sock_net(in_skb->sk);
4829 struct nlattr *tb[RTA_MAX+1];
4830 int err, iif = 0, oif = 0;
4831 struct fib6_info *from;
4832 struct dst_entry *dst;
4833 struct rt6_info *rt;
4834 struct sk_buff *skb;
4839 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4845 memset(&fl6, 0, sizeof(fl6));
4846 rtm = nlmsg_data(nlh);
4847 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4848 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4851 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4854 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4858 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4861 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4865 iif = nla_get_u32(tb[RTA_IIF]);
4868 oif = nla_get_u32(tb[RTA_OIF]);
4871 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4874 fl6.flowi6_uid = make_kuid(current_user_ns(),
4875 nla_get_u32(tb[RTA_UID]));
4877 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4880 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4883 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4885 if (tb[RTA_IP_PROTO]) {
4886 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4887 &fl6.flowi6_proto, extack);
4893 struct net_device *dev;
4898 dev = dev_get_by_index_rcu(net, iif);
4905 fl6.flowi6_iif = iif;
4907 if (!ipv6_addr_any(&fl6.saddr))
4908 flags |= RT6_LOOKUP_F_HAS_SADDR;
4910 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4914 fl6.flowi6_oif = oif;
4916 dst = ip6_route_output(net, NULL, &fl6);
4920 rt = container_of(dst, struct rt6_info, dst);
4921 if (rt->dst.error) {
4922 err = rt->dst.error;
4927 if (rt == net->ipv6.ip6_null_entry) {
4928 err = rt->dst.error;
4933 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4940 skb_dst_set(skb, &rt->dst);
4943 from = rcu_dereference(rt->from);
4946 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4947 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4950 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4951 &fl6.saddr, iif, RTM_NEWROUTE,
4952 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4961 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4966 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4967 unsigned int nlm_flags)
4969 struct sk_buff *skb;
4970 struct net *net = info->nl_net;
4975 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4977 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4981 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4982 event, info->portid, seq, nlm_flags);
4984 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4985 WARN_ON(err == -EMSGSIZE);
4989 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4990 info->nlh, gfp_any());
4994 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4997 static int ip6_route_dev_notify(struct notifier_block *this,
4998 unsigned long event, void *ptr)
5000 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5001 struct net *net = dev_net(dev);
5003 if (!(dev->flags & IFF_LOOPBACK))
5006 if (event == NETDEV_REGISTER) {
5007 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5008 net->ipv6.ip6_null_entry->dst.dev = dev;
5009 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5010 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5011 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5012 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5013 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5014 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5016 } else if (event == NETDEV_UNREGISTER &&
5017 dev->reg_state != NETREG_UNREGISTERED) {
5018 /* NETDEV_UNREGISTER can be fired multiple times by
5019 * netdev_wait_allrefs(). Make sure we only call this once.
5021 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5022 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5023 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5024 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5035 #ifdef CONFIG_PROC_FS
5036 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5038 struct net *net = (struct net *)seq->private;
5039 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5040 net->ipv6.rt6_stats->fib_nodes,
5041 net->ipv6.rt6_stats->fib_route_nodes,
5042 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5043 net->ipv6.rt6_stats->fib_rt_entries,
5044 net->ipv6.rt6_stats->fib_rt_cache,
5045 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5046 net->ipv6.rt6_stats->fib_discarded_routes);
5050 #endif /* CONFIG_PROC_FS */
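/* The show function above backs /proc/net/rt6_stats; the seven hex
 * columns are, in order: fib nodes, route nodes, route allocs, route
 * entries, cached routes, dst entries (slow count) and discarded
 * routes.  Sample output (values are illustrative):
 *
 *	$ cat /proc/net/rt6_stats
 *	0042 0019 0021 003c 0000 0009 0002
 */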
5052 #ifdef CONFIG_SYSCTL
5055 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5056 void __user *buffer, size_t *lenp, loff_t *ppos)
5063 net = (struct net *)ctl->extra1;
5064 delay = net->ipv6.sysctl.flush_delay;
5065 proc_dointvec(ctl, write, buffer, lenp, ppos);
5066 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
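/*
 * Note on the fixups above: the table[i].data assignments are positional
 * and must stay in sync with the entry order of
 * ipv6_route_table_template[]. Entry 9 ("gc_min_interval_ms")
 * deliberately points at the same storage as entry 3 ("gc_min_interval")
 * and merely exposes it in milliseconds via proc_dointvec_ms_jiffies.
 */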
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
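/*
 * The unwind labels above free resources in reverse order of allocation,
 * so a failure at any step releases exactly what was already set up;
 * ip6_route_net_exit() below performs the same teardown for a fully
 * initialized namespace.
 */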
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
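/*
 * Netdevice notifiers run in descending priority order, so a priority of
 * ADDRCONF_NOTIFY_PRIORITY - 10 makes ip6_route_dev_notify() run after
 * the addrconf notifier has processed the same event.
 */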
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info will not have been taken;
	 * do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}