2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
124 struct uncached_list {
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Link an uncached (dst-held, not-in-fib) rt6_info onto this CPU's
 * rt6_uncached_list so it can be found and retargeted when its device
 * goes away (see rt6_uncached_list_flush_dev()).
 * NOTE(review): this dump is missing the original brace/blank lines.
 */
131 static void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
/* remember which per-cpu list we joined so _del() takes the right lock */
135 rt->rt6i_uncached_list = ul;
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the per-cpu uncached list it was added to, if any,
 * and drop the namespace's uncached-route counter.
 * Safe to call on routes that were never added (list_empty() check).
 * NOTE(review): this dump is missing the original brace/blank lines.
 */
142 static void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
/* ul was recorded by rt6_uncached_list_add(); may differ from this CPU's */
145 struct uncached_list *ul = rt->rt6i_uncached_list;
146 struct net *net = dev_net(rt->dst.dev);
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 spin_unlock_bh(&ul->lock);
/* Device-teardown helper: walk every CPU's uncached-route list and move
 * any route still referencing @dev (via rt6i_idev or dst.dev) over to the
 * namespace's loopback device, taking/dropping the matching refcounts.
 * Nothing to do when the loopback device itself is going away.
 * NOTE(review): this dump is missing the original brace/blank lines and
 * at least one condition around the dst.dev reassignment (orig lines
 * 175-177) — presumably "if (rt_dev == dev)"; verify against full source.
 */
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 struct net_device *loopback_dev = net->loopback_dev;
160 if (dev == loopback_dev)
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
/* retarget the inet6_dev reference at loopback */
172 if (rt_idev->dev == dev) {
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
/* retarget the dst's net_device reference at loopback */
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
183 spin_unlock_bh(&ul->lock);
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
189 return dst_metrics_write_ptr(rt->dst.from);
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
194 struct rt6_info *rt = (struct rt6_info *)dst;
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
201 return dst_cow_metrics_generic(dst, old);
/* Pick the address to use for neighbour resolution: the route's gateway
 * when one is set, otherwise fall back to the packet's destination
 * address taken from the IPv6 header of @skb.
 * NOTE(review): dump is missing intervening lines (orig 205-212),
 * including the remaining parameters and an skb NULL check — callers
 * visibly pass skb == NULL (ip6_confirm_neigh), so the ipv6_hdr(skb)
 * fallback presumably sits behind such a check; verify against full source.
 */
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
208 struct in6_addr *p = &rt->rt6i_gateway;
210 if (!ipv6_addr_any(p))
211 return (const void *) p;
213 return &ipv6_hdr(skb)->daddr;
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
221 struct rt6_info *rt = (struct rt6_info *) dst;
224 daddr = choose_neigh_daddr(rt, skb, daddr);
225 n = __ipv6_neigh_lookup(dst->dev, daddr);
228 return neigh_create(&nd_tbl, daddr, dst->dev);
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
243 __ipv6_confirm_neigh(dev, daddr);
246 static struct dst_ops ip6_dst_ops_template = {
250 .check = ip6_dst_check,
251 .default_advmss = ip6_default_advmss,
253 .cow_metrics = ipv6_cow_metrics,
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
259 .redirect = rt6_do_redirect,
260 .local_out = __ip6_local_out,
261 .neigh_lookup = ip6_neigh_lookup,
262 .confirm_neigh = ip6_confirm_neigh,
/* dst_ops.mtu for the blackhole dst: report the raw RTAX_MTU metric,
 * or the underlying device MTU when no metric is set (GNU "?:" idiom).
 */
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
269 return mtu ? : dst->dev->mtu;
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
282 static struct dst_ops ip6_dst_blackhole_ops = {
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
286 .mtu = ip6_blackhole_mtu,
287 .default_advmss = ip6_default_advmss,
288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
289 .redirect = ip6_rt_blackhole_redirect,
290 .cow_metrics = dst_cow_metrics_generic,
291 .neigh_lookup = ip6_neigh_lookup,
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 [RTAX_HOPLIMIT - 1] = 0,
298 static const struct rt6_info ip6_null_entry_template = {
300 .__refcnt = ATOMIC_INIT(1),
302 .obsolete = DST_OBSOLETE_FORCE_CHK,
303 .error = -ENETUNREACH,
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
308 .rt6i_protocol = RTPROT_KERNEL,
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
315 static const struct rt6_info ip6_prohibit_entry_template = {
317 .__refcnt = ATOMIC_INIT(1),
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
325 .rt6i_protocol = RTPROT_KERNEL,
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
330 static const struct rt6_info ip6_blk_hole_entry_template = {
332 .__refcnt = ATOMIC_INIT(1),
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
340 .rt6i_protocol = RTPROT_KERNEL,
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
/* Zero the rt6_info-specific tail of a freshly allocated entry (everything
 * after the embedded dst_entry, which dst_alloc() already initialized)
 * and set up the sibling/uncached list heads.
 */
347 static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst;
/* dst + 1 == first byte past the dst_entry member */
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 1, DST_OBSOLETE_FORCE_CHK, flags);
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
383 for_each_possible_cpu(cpu) {
386 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
387 /* no one shares rt */
391 dst_release_immediate(&rt->dst);
398 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops.destroy callback: release everything an rt6_info owns —
 * generic metrics, the per-cpu route cache, its slot on the uncached
 * list, the inet6_dev reference, and the exception bucket.
 * NOTE(review): dump is missing lines (orig 410-425) that presumably
 * drop the idev/from references and free the bucket; the
 * rcu_dereference_protected(..., 1) implies no further concurrent
 * access at destroy time.
 */
400 static void ip6_dst_destroy(struct dst_entry *dst)
402 struct rt6_info *rt = (struct rt6_info *)dst;
403 struct rt6_exception_bucket *bucket;
404 struct dst_entry *from = dst->from;
405 struct inet6_dev *idev;
407 dst_destroy_metrics_generic(dst);
408 free_percpu(rt->rt6i_pcpu);
409 rt6_uncached_list_del(rt);
411 idev = rt->rt6i_idev;
413 rt->rt6i_idev = NULL;
416 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
418 rt->rt6i_exception_bucket = NULL;
426 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
429 struct rt6_info *rt = (struct rt6_info *)dst;
430 struct inet6_dev *idev = rt->rt6i_idev;
431 struct net_device *loopback_dev =
432 dev_net(dev)->loopback_dev;
434 if (idev && idev->dev != loopback_dev) {
435 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
437 rt->rt6i_idev = loopback_idev;
/* Non-recursive expiry test: true iff the route carries RTF_EXPIRES and
 * its dst.expires deadline has passed. (Contrast rt6_check_expired(),
 * which also follows dst.from.)
 */
443 static bool __rt6_check_expired(const struct rt6_info *rt)
445 if (rt->rt6i_flags & RTF_EXPIRES)
446 return time_after(jiffies, rt->dst.expires)
/* Full expiry test: expired if this route's own RTF_EXPIRES deadline
 * passed, or — for cloned routes — if the parent (dst.from) is stale
 * (obsolete marker changed) or itself expired, checked recursively.
 */
451 static bool rt6_check_expired(const struct rt6_info *rt)
453 if (rt->rt6i_flags & RTF_EXPIRES) {
454 if (time_after(jiffies, rt->dst.expires))
456 } else if (rt->dst.from) {
457 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
458 rt6_check_expired((struct rt6_info *)rt->dst.from);
463 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
464 struct flowi6 *fl6, int oif,
467 struct rt6_info *sibling, *next_sibling;
470 /* We might have already computed the hash for ICMPv6 errors. In such
471 * case it will always be non-zero. Otherwise now is the time to do it.
474 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
476 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
477 /* Don't change the route, if route_choosen == 0
478 * (siblings does not include ourself)
481 list_for_each_entry_safe(sibling, next_sibling,
482 &match->rt6i_siblings, rt6i_siblings) {
484 if (route_choosen == 0) {
485 if (rt6_score_route(sibling, oif, strict) < 0)
495 * Route lookup. rcu_read_lock() should be held.
498 static inline struct rt6_info *rt6_device_match(struct net *net,
500 const struct in6_addr *saddr,
504 struct rt6_info *local = NULL;
505 struct rt6_info *sprt;
507 if (!oif && ipv6_addr_any(saddr))
510 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
511 struct net_device *dev = sprt->dst.dev;
514 if (dev->ifindex == oif)
516 if (dev->flags & IFF_LOOPBACK) {
517 if (!sprt->rt6i_idev ||
518 sprt->rt6i_idev->dev->ifindex != oif) {
519 if (flags & RT6_LOOKUP_F_IFACE)
522 local->rt6i_idev->dev->ifindex == oif)
528 if (ipv6_chk_addr(net, saddr, dev,
529 flags & RT6_LOOKUP_F_IFACE))
538 if (flags & RT6_LOOKUP_F_IFACE)
539 return net->ipv6.ip6_null_entry;
545 #ifdef CONFIG_IPV6_ROUTER_PREF
546 struct __rt6_probe_work {
547 struct work_struct work;
548 struct in6_addr target;
549 struct net_device *dev;
552 static void rt6_probe_deferred(struct work_struct *w)
554 struct in6_addr mcaddr;
555 struct __rt6_probe_work *work =
556 container_of(w, struct __rt6_probe_work, work);
558 addrconf_addr_solict_mult(&work->target, &mcaddr);
559 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
564 static void rt6_probe(struct rt6_info *rt)
566 struct __rt6_probe_work *work;
567 struct neighbour *neigh;
569 * Okay, this does not seem to be appropriate
570 * for now, however, we need to check if it
571 * is really so; aka Router Reachability Probing.
573 * Router Reachability Probe MUST be rate-limited
574 * to no more than one per minute.
576 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
579 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
581 if (neigh->nud_state & NUD_VALID)
585 write_lock(&neigh->lock);
586 if (!(neigh->nud_state & NUD_VALID) &&
589 rt->rt6i_idev->cnf.rtr_probe_interval)) {
590 work = kmalloc(sizeof(*work), GFP_ATOMIC);
592 __neigh_set_probe_once(neigh);
594 write_unlock(&neigh->lock);
596 work = kmalloc(sizeof(*work), GFP_ATOMIC);
600 INIT_WORK(&work->work, rt6_probe_deferred);
601 work->target = rt->rt6i_gateway;
602 dev_hold(rt->dst.dev);
603 work->dev = rt->dst.dev;
604 schedule_work(&work->work);
608 rcu_read_unlock_bh();
611 static inline void rt6_probe(struct rt6_info *rt)
617 * Default Router Selection (RFC 2461 6.3.6)
/* Score a route's device against the requested output interface @oif:
 * matches when no oif was asked for, when the device index matches, or
 * when the route sits on loopback but its idev belongs to @oif.
 * NOTE(review): dump is missing the return-value lines (orig 623/626-628);
 * per caller rt6_score_route(), non-zero means "device acceptable".
 */
619 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
621 struct net_device *dev = rt->dst.dev;
622 if (!oif || dev->ifindex == oif)
624 if ((dev->flags & IFF_LOOPBACK) &&
625 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Classify the reachability of a route's next hop for router selection
 * (RFC 2461 6.3.6): routes without a gateway trivially succeed; for
 * gateway routes, consult the neighbour cache entry's NUD state.
 * With CONFIG_IPV6_ROUTER_PREF, a known-but-not-FAILED neighbour still
 * succeeds and a FAILED one asks for a probe (RT6_NUD_FAIL_PROBE);
 * without it, an invalid neighbour requests round-robin fallback.
 * NOTE(review): dump is missing brace/blank/#else/#endif framing lines;
 * the rcu_read_unlock_bh() implies a matching rcu_read_lock_bh() in a
 * missing line before the neighbour lookup.
 */
630 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
632 struct neighbour *neigh;
633 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
635 if (rt->rt6i_flags & RTF_NONEXTHOP ||
636 !(rt->rt6i_flags & RTF_GATEWAY))
637 return RT6_NUD_SUCCEED;
640 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
642 read_lock(&neigh->lock);
643 if (neigh->nud_state & NUD_VALID)
644 ret = RT6_NUD_SUCCEED;
645 #ifdef CONFIG_IPV6_ROUTER_PREF
646 else if (!(neigh->nud_state & NUD_FAILED))
647 ret = RT6_NUD_SUCCEED;
649 ret = RT6_NUD_FAIL_PROBE;
651 read_unlock(&neigh->lock);
/* no neighbour entry at all: succeed (router-pref) or round-robin */
653 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
654 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
656 rcu_read_unlock_bh();
661 static int rt6_score_route(struct rt6_info *rt, int oif,
666 m = rt6_check_dev(rt, oif);
667 if (!m && (strict & RT6_LOOKUP_F_IFACE))
668 return RT6_NUD_FAIL_HARD;
669 #ifdef CONFIG_IPV6_ROUTER_PREF
670 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
672 if (strict & RT6_LOOKUP_F_REACHABLE) {
673 int n = rt6_check_neigh(rt);
680 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
681 int *mpri, struct rt6_info *match,
685 bool match_do_rr = false;
686 struct inet6_dev *idev = rt->rt6i_idev;
687 struct net_device *dev = rt->dst.dev;
689 if (dev && !netif_carrier_ok(dev) &&
690 idev->cnf.ignore_routes_with_linkdown &&
691 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
694 if (rt6_check_expired(rt))
697 m = rt6_score_route(rt, oif, strict);
698 if (m == RT6_NUD_FAIL_DO_RR) {
700 m = 0; /* lowest valid score */
701 } else if (m == RT6_NUD_FAIL_HARD) {
705 if (strict & RT6_LOOKUP_F_REACHABLE)
708 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
710 *do_rr = match_do_rr;
718 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
719 struct rt6_info *leaf,
720 struct rt6_info *rr_head,
721 u32 metric, int oif, int strict,
724 struct rt6_info *rt, *match, *cont;
729 for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
730 if (rt->rt6i_metric != metric) {
735 match = find_match(rt, oif, strict, &mpri, match, do_rr);
738 for (rt = leaf; rt && rt != rr_head;
739 rt = rcu_dereference(rt->dst.rt6_next)) {
740 if (rt->rt6i_metric != metric) {
745 match = find_match(rt, oif, strict, &mpri, match, do_rr);
751 for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
752 match = find_match(rt, oif, strict, &mpri, match, do_rr);
757 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
760 struct rt6_info *leaf = rcu_dereference(fn->leaf);
761 struct rt6_info *match, *rt0;
766 return net->ipv6.ip6_null_entry;
768 rt0 = rcu_dereference(fn->rr_ptr);
772 /* Double check to make sure fn is not an intermediate node
773 * and fn->leaf does not points to its child's leaf
774 * (This might happen if all routes under fn are deleted from
775 * the tree and fib6_repair_tree() is called on the node.)
777 key_plen = rt0->rt6i_dst.plen;
778 #ifdef CONFIG_IPV6_SUBTREES
779 if (rt0->rt6i_src.plen)
780 key_plen = rt0->rt6i_src.plen;
782 if (fn->fn_bit != key_plen)
783 return net->ipv6.ip6_null_entry;
785 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
789 struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
791 /* no entries matched; do round-robin */
792 if (!next || next->rt6i_metric != rt0->rt6i_metric)
796 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
797 /* make sure next is not being deleted from the tree */
799 rcu_assign_pointer(fn->rr_ptr, next);
800 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
804 return match ? match : net->ipv6.ip6_null_entry;
807 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
809 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
812 #ifdef CONFIG_IPV6_ROUTE_INFO
813 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
814 const struct in6_addr *gwaddr)
816 struct net *net = dev_net(dev);
817 struct route_info *rinfo = (struct route_info *) opt;
818 struct in6_addr prefix_buf, *prefix;
820 unsigned long lifetime;
823 if (len < sizeof(struct route_info)) {
827 /* Sanity check for prefix_len and length */
828 if (rinfo->length > 3) {
830 } else if (rinfo->prefix_len > 128) {
832 } else if (rinfo->prefix_len > 64) {
833 if (rinfo->length < 2) {
836 } else if (rinfo->prefix_len > 0) {
837 if (rinfo->length < 1) {
842 pref = rinfo->route_pref;
843 if (pref == ICMPV6_ROUTER_PREF_INVALID)
846 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
848 if (rinfo->length == 3)
849 prefix = (struct in6_addr *)rinfo->prefix;
851 /* this function is safe */
852 ipv6_addr_prefix(&prefix_buf,
853 (struct in6_addr *)rinfo->prefix,
855 prefix = &prefix_buf;
858 if (rinfo->prefix_len == 0)
859 rt = rt6_get_dflt_router(gwaddr, dev);
861 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
864 if (rt && !lifetime) {
870 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
873 rt->rt6i_flags = RTF_ROUTEINFO |
874 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
877 if (!addrconf_finite_timeout(lifetime))
878 rt6_clean_expires(rt);
880 rt6_set_expires(rt, jiffies + HZ * lifetime);
888 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
889 struct in6_addr *saddr)
891 struct fib6_node *pn, *sn;
893 if (fn->fn_flags & RTN_TL_ROOT)
895 pn = rcu_dereference(fn->parent);
896 sn = FIB6_SUBTREE(pn);
898 fn = fib6_lookup(sn, NULL, saddr);
901 if (fn->fn_flags & RTN_RTINFO)
906 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
909 struct rt6_info *rt = *prt;
911 if (dst_hold_safe(&rt->dst))
914 rt = net->ipv6.ip6_null_entry;
923 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
924 struct fib6_table *table,
925 struct flowi6 *fl6, int flags)
927 struct rt6_info *rt, *rt_cache;
928 struct fib6_node *fn;
931 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
933 rt = rcu_dereference(fn->leaf);
935 rt = net->ipv6.ip6_null_entry;
937 rt = rt6_device_match(net, rt, &fl6->saddr,
938 fl6->flowi6_oif, flags);
939 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
940 rt = rt6_multipath_select(rt, fl6,
941 fl6->flowi6_oif, flags);
943 if (rt == net->ipv6.ip6_null_entry) {
944 fn = fib6_backtrack(fn, &fl6->saddr);
948 /* Search through exception table */
949 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
953 if (ip6_hold_safe(net, &rt, true))
954 dst_use_noref(&rt->dst, jiffies);
958 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
964 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
967 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
969 EXPORT_SYMBOL_GPL(ip6_route_lookup);
971 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
972 const struct in6_addr *saddr, int oif, int strict)
974 struct flowi6 fl6 = {
978 struct dst_entry *dst;
979 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
982 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983 flags |= RT6_LOOKUP_F_HAS_SADDR;
986 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
988 return (struct rt6_info *) dst;
994 EXPORT_SYMBOL(rt6_lookup);
996 /* ip6_ins_rt is called with FREE table->tb6_lock.
997 * It takes new route entry, the addition fails by any reason the
999 * Caller must hold dst before calling it.
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003 struct mx6_config *mxc,
1004 struct netlink_ext_ack *extack)
1007 struct fib6_table *table;
1009 table = rt->rt6i_table;
1010 spin_lock_bh(&table->tb6_lock);
1011 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012 spin_unlock_bh(&table->tb6_lock);
1017 int ip6_ins_rt(struct rt6_info *rt)
1019 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020 struct mx6_config mxc = { .mx = NULL, };
1022 /* Hold dst to account for the reference from the fib6 tree */
1024 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1030 struct net_device *dev = rt->dst.dev;
1032 if (rt->rt6i_flags & RTF_LOCAL) {
1033 /* for copies of local routes, dst->dev needs to be the
1034 * device if it is a master device, the master device if
1035 * device is enslaved, and the loopback as the default
1037 if (netif_is_l3_slave(dev) &&
1038 !rt6_need_strict(&rt->rt6i_dst.addr))
1039 dev = l3mdev_master_dev_rcu(dev);
1040 else if (!netif_is_l3_master(dev))
1041 dev = dev_net(dev)->loopback_dev;
1042 /* last case is netif_is_l3_master(dev) is true in which
1043 * case we want dev returned to be dev
1050 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051 const struct in6_addr *daddr,
1052 const struct in6_addr *saddr)
1054 struct net_device *dev;
1055 struct rt6_info *rt;
1061 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062 ort = (struct rt6_info *)ort->dst.from;
1065 dev = ip6_rt_get_dev_rcu(ort);
1066 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1071 ip6_rt_copy_init(rt, ort);
1072 rt->rt6i_flags |= RTF_CACHE;
1073 rt->rt6i_metric = 0;
1074 rt->dst.flags |= DST_HOST;
1075 rt->rt6i_dst.addr = *daddr;
1076 rt->rt6i_dst.plen = 128;
1078 if (!rt6_is_gw_or_nonexthop(ort)) {
1079 if (ort->rt6i_dst.plen != 128 &&
1080 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081 rt->rt6i_flags |= RTF_ANYCAST;
1082 #ifdef CONFIG_IPV6_SUBTREES
1083 if (rt->rt6i_src.plen && saddr) {
1084 rt->rt6i_src.addr = *saddr;
1085 rt->rt6i_src.plen = 128;
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1095 struct net_device *dev;
1096 struct rt6_info *pcpu_rt;
1099 dev = ip6_rt_get_dev_rcu(rt);
1100 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1104 ip6_rt_copy_init(pcpu_rt, rt);
1105 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106 pcpu_rt->rt6i_flags |= RTF_PCPU;
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1113 struct rt6_info *pcpu_rt, **p;
1115 p = this_cpu_ptr(rt->rt6i_pcpu);
1118 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119 rt6_dst_from_metrics_check(pcpu_rt);
1124 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1126 struct rt6_info *pcpu_rt, *prev, **p;
1128 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1130 struct net *net = dev_net(rt->dst.dev);
1132 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133 return net->ipv6.ip6_null_entry;
1136 dst_hold(&pcpu_rt->dst);
1137 p = this_cpu_ptr(rt->rt6i_pcpu);
1138 prev = cmpxchg(p, NULL, pcpu_rt);
1141 rt6_dst_from_metrics_check(pcpu_rt);
1145 /* exception hash table implementation
1147 static DEFINE_SPINLOCK(rt6_exception_lock);
1149 /* Remove rt6_ex from hash table and free the memory
1150 * Caller must hold rt6_exception_lock
1152 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153 struct rt6_exception *rt6_ex)
1155 struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
1157 if (!bucket || !rt6_ex)
1159 rt6_ex->rt6i->rt6i_node = NULL;
1160 hlist_del_rcu(&rt6_ex->hlist);
1161 rt6_release(rt6_ex->rt6i);
1162 kfree_rcu(rt6_ex, rcu);
1163 WARN_ON_ONCE(!bucket->depth);
1165 net->ipv6.rt6_stats->fib_rt_cache--;
1168 /* Remove oldest rt6_ex in bucket and free the memory
1169 * Caller must hold rt6_exception_lock
1171 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1173 struct rt6_exception *rt6_ex, *oldest = NULL;
1178 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1179 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1182 rt6_remove_exception(bucket, oldest);
/* Hash a (dst, src) address pair into an exception-table bucket index.
 * Uses a lazily initialized random seed so bucket placement is not
 * predictable across boots; src only participates under
 * CONFIG_IPV6_SUBTREES (and, per the missing orig line 1195, presumably
 * only when src is non-NULL — verify against full source).
 */
1185 static u32 rt6_exception_hash(const struct in6_addr *dst,
1186 const struct in6_addr *src)
1188 static u32 seed __read_mostly;
1191 net_get_random_once(&seed, sizeof(seed));
1192 val = jhash(dst, sizeof(*dst), seed);
1194 #ifdef CONFIG_IPV6_SUBTREES
1196 val = jhash(src, sizeof(*src), val);
/* fold the 32-bit hash down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits */
1198 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1201 /* Helper function to find the cached rt in the hash table
1202 * and update bucket pointer to point to the bucket for this
1203 * (daddr, saddr) pair
1204 * Caller must hold rt6_exception_lock
1206 static struct rt6_exception *
1207 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1208 const struct in6_addr *daddr,
1209 const struct in6_addr *saddr)
1211 struct rt6_exception *rt6_ex;
1214 if (!(*bucket) || !daddr)
1217 hval = rt6_exception_hash(daddr, saddr);
1220 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1221 struct rt6_info *rt6 = rt6_ex->rt6i;
1222 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1224 #ifdef CONFIG_IPV6_SUBTREES
1225 if (matched && saddr)
1226 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1234 /* Helper function to find the cached rt in the hash table
1235 * and update bucket pointer to point to the bucket for this
1236 * (daddr, saddr) pair
1237 * Caller must hold rcu_read_lock()
1239 static struct rt6_exception *
1240 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1241 const struct in6_addr *daddr,
1242 const struct in6_addr *saddr)
1244 struct rt6_exception *rt6_ex;
1247 WARN_ON_ONCE(!rcu_read_lock_held());
1249 if (!(*bucket) || !daddr)
1252 hval = rt6_exception_hash(daddr, saddr);
1255 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1256 struct rt6_info *rt6 = rt6_ex->rt6i;
1257 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1259 #ifdef CONFIG_IPV6_SUBTREES
1260 if (matched && saddr)
1261 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1269 static int rt6_insert_exception(struct rt6_info *nrt,
1270 struct rt6_info *ort)
1272 struct net *net = dev_net(ort->dst.dev);
1273 struct rt6_exception_bucket *bucket;
1274 struct in6_addr *src_key = NULL;
1275 struct rt6_exception *rt6_ex;
1278 /* ort can't be a cache or pcpu route */
1279 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1280 ort = (struct rt6_info *)ort->dst.from;
1281 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1283 spin_lock_bh(&rt6_exception_lock);
1285 if (ort->exception_bucket_flushed) {
1290 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1291 lockdep_is_held(&rt6_exception_lock));
1293 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1299 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1302 #ifdef CONFIG_IPV6_SUBTREES
1303 /* rt6i_src.plen != 0 indicates ort is in subtree
1304 * and exception table is indexed by a hash of
1305 * both rt6i_dst and rt6i_src.
1306 * Otherwise, the exception table is indexed by
1307 * a hash of only rt6i_dst.
1309 if (ort->rt6i_src.plen)
1310 src_key = &nrt->rt6i_src.addr;
1313 /* Update rt6i_prefsrc as it could be changed
1314 * in rt6_remove_prefsrc()
1316 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1317 /* rt6_mtu_change() might lower mtu on ort.
1318 * Only insert this exception route if its mtu
1319 * is less than ort's mtu value.
1321 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1326 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1329 rt6_remove_exception(bucket, rt6_ex);
1331 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1337 rt6_ex->stamp = jiffies;
1338 atomic_inc(&nrt->rt6i_ref);
1339 nrt->rt6i_node = ort->rt6i_node;
1340 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1342 net->ipv6.rt6_stats->fib_rt_cache++;
1344 if (bucket->depth > FIB6_MAX_DEPTH)
1345 rt6_exception_remove_oldest(bucket);
1348 spin_unlock_bh(&rt6_exception_lock);
1350 /* Update fn->fn_sernum to invalidate all cached dst */
1352 fib6_update_sernum(ort);
1357 void rt6_flush_exceptions(struct rt6_info *rt)
1359 struct rt6_exception_bucket *bucket;
1360 struct rt6_exception *rt6_ex;
1361 struct hlist_node *tmp;
1364 spin_lock_bh(&rt6_exception_lock);
1365 /* Prevent rt6_insert_exception() to recreate the bucket list */
1366 rt->exception_bucket_flushed = 1;
1368 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1369 lockdep_is_held(&rt6_exception_lock));
1373 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1374 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1375 rt6_remove_exception(bucket, rt6_ex);
1376 WARN_ON_ONCE(bucket->depth);
1381 spin_unlock_bh(&rt6_exception_lock);
1384 /* Find cached rt in the hash table inside passed in rt
1385 * Caller has to hold rcu_read_lock()
1387 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1388 struct in6_addr *daddr,
1389 struct in6_addr *saddr)
1391 struct rt6_exception_bucket *bucket;
1392 struct in6_addr *src_key = NULL;
1393 struct rt6_exception *rt6_ex;
1394 struct rt6_info *res = NULL;
1396 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1398 #ifdef CONFIG_IPV6_SUBTREES
1399 /* rt6i_src.plen != 0 indicates rt is in subtree
1400 * and exception table is indexed by a hash of
1401 * both rt6i_dst and rt6i_src.
1402 * Otherwise, the exception table is indexed by
1403 * a hash of only rt6i_dst.
1405 if (rt->rt6i_src.plen)
1408 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1410 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1416 /* Remove the passed in cached rt from the hash table that contains it */
/* Returns 0 on success, -EINVAL/-ESRCH style errors otherwise.
 * 'rt' must be an RTF_CACHE clone; its parent is rt->dst.from.
 */
1417 int rt6_remove_exception_rt(struct rt6_info *rt)
1419 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1420 struct rt6_exception_bucket *bucket;
1421 struct in6_addr *src_key = NULL;
1422 struct rt6_exception *rt6_ex;
/* BUGFIX: was '| RTF_CACHE', which is always nonzero, so the
 * negated test could never reject a non-cached route. Use '&'
 * so only RTF_CACHE clones are accepted here.
 */
1426 !(rt->rt6i_flags & RTF_CACHE))
1429 if (!rcu_access_pointer(from->rt6i_exception_bucket))
/* Serialize against concurrent insert/remove on the bucket */
1432 spin_lock_bh(&rt6_exception_lock);
1433 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1434 lockdep_is_held(&rt6_exception_lock));
1435 #ifdef CONFIG_IPV6_SUBTREES
1436 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1437 * and exception table is indexed by a hash of
1438 * both rt6i_dst and rt6i_src.
1439 * Otherwise, the exception table is indexed by
1440 * a hash of only rt6i_dst.
1442 if (from->rt6i_src.plen)
1443 src_key = &rt->rt6i_src.addr;
1445 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1449 rt6_remove_exception(bucket, rt6_ex);
1455 spin_unlock_bh(&rt6_exception_lock);
1459 /* Find rt6_ex which contains the passed in rt cache and
1462 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1464 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1465 struct rt6_exception_bucket *bucket;
1466 struct in6_addr *src_key = NULL;
1467 struct rt6_exception *rt6_ex;
1470 !(rt->rt6i_flags | RTF_CACHE))
1474 bucket = rcu_dereference(from->rt6i_exception_bucket);
1476 #ifdef CONFIG_IPV6_SUBTREES
1477 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1478 * and exception table is indexed by a hash of
1479 * both rt6i_dst and rt6i_src.
1480 * Otherwise, the exception table is indexed by
1481 * a hash of only rt6i_dst.
1483 if (from->rt6i_src.plen)
1484 src_key = &rt->rt6i_src.addr;
1486 rt6_ex = __rt6_find_exception_rcu(&bucket,
1490 rt6_ex->stamp = jiffies;
/* Clear the preferred-source address (plen = 0) on every cached clone in
 * rt's exception table. Caller holds rt6_exception_lock (lockdep-checked).
 */
1495 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1497 struct rt6_exception_bucket *bucket;
1498 struct rt6_exception *rt6_ex;
1501 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1502 lockdep_is_held(&rt6_exception_lock));
/* walk every bucket and every chained exception entry */
1505 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1506 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1507 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* Lower the cached PMTU of every exception-table clone whose stored
 * rt6i_pmtu exceeds the new 'mtu'. Caller holds rt6_exception_lock.
 */
1514 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1516 struct rt6_exception_bucket *bucket;
1517 struct rt6_exception *rt6_ex;
1520 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1521 lockdep_is_held(&rt6_exception_lock));
1524 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1525 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1526 struct rt6_info *entry = rt6_ex->rt6i;
1527 /* For RTF_CACHE with rt6i_pmtu == 0
1528 * (i.e. a redirected route),
1529 * the metrics of its rt->dst.from has already
/* only shrink an explicitly-set per-route PMTU */
1532 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1533 entry->rt6i_pmtu = mtu;
/* Mask matching clones that are both gateway routes and cache entries */
1540 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* Drop every cached gateway clone whose nexthop equals 'gateway'.
 * Used when a host route supersedes routes via that gateway.
 */
1542 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1543 struct in6_addr *gateway)
1545 struct rt6_exception_bucket *bucket;
1546 struct rt6_exception *rt6_ex;
1547 struct hlist_node *tmp;
/* cheap RCU check before taking the lock: no bucket, nothing to do */
1550 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1553 spin_lock_bh(&rt6_exception_lock);
1554 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1555 lockdep_is_held(&rt6_exception_lock));
1558 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
/* _safe variant: entries may be removed while iterating */
1559 hlist_for_each_entry_safe(rt6_ex, tmp,
1560 &bucket->chain, hlist) {
1561 struct rt6_info *entry = rt6_ex->rt6i;
1563 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1564 RTF_CACHE_GATEWAY &&
1565 ipv6_addr_equal(gateway,
1566 &entry->rt6i_gateway)) {
1567 rt6_remove_exception(bucket, rt6_ex);
1574 spin_unlock_bh(&rt6_exception_lock);
/* GC one exception entry: remove it if unreferenced and idle past the GC
 * timeout, or if it points via a neighbour that is no longer a router.
 * Called with rt6_exception_lock held (see rt6_age_exceptions()).
 */
1577 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1578 struct rt6_exception *rt6_ex,
1579 struct fib6_gc_args *gc_args,
1582 struct rt6_info *rt = rt6_ex->rt6i;
/* refcnt == 1 means only the exception table holds it */
1584 if (atomic_read(&rt->dst.__refcnt) == 1 &&
1585 time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1586 RT6_TRACE("aging clone %p\n", rt);
1587 rt6_remove_exception(bucket, rt6_ex);
1589 } else if (rt->rt6i_flags & RTF_GATEWAY) {
1590 struct neighbour *neigh;
1591 __u8 neigh_flags = 0;
1593 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1595 neigh_flags = neigh->flags;
1596 neigh_release(neigh);
/* gateway clone via a neighbour without NTF_ROUTER: purge */
1598 if (!(neigh_flags & NTF_ROUTER)) {
1599 RT6_TRACE("purging route %p via non-router but gateway\n",
1601 rt6_remove_exception(bucket, rt6_ex);
/* Walk rt's whole exception table under rt6_exception_lock and let
 * rt6_age_examine_exception() decide per entry whether to drop it.
 */
1608 void rt6_age_exceptions(struct rt6_info *rt,
1609 struct fib6_gc_args *gc_args,
1612 struct rt6_exception_bucket *bucket;
1613 struct rt6_exception *rt6_ex;
1614 struct hlist_node *tmp;
/* fast path: no exception bucket allocated for this route */
1617 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1620 spin_lock_bh(&rt6_exception_lock);
1621 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1622 lockdep_is_held(&rt6_exception_lock));
1625 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1626 hlist_for_each_entry_safe(rt6_ex, tmp,
1627 &bucket->chain, hlist) {
1628 rt6_age_examine_exception(bucket, rt6_ex,
1634 spin_unlock_bh(&rt6_exception_lock);
/* Core policy-routing lookup for one table. Selects a route (retrying with
 * backtracking and, if needed, without the REACHABLE restriction), prefers a
 * cached clone from the exception table, and returns a route with a reference
 * taken. Distinct result paths: null entry, RTF_CACHE clone, an uncached
 * clone for FLOWI_FLAG_KNOWN_NH, or a per-cpu copy.
 */
1637 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1638 int oif, struct flowi6 *fl6, int flags)
1640 struct fib6_node *fn, *saved_fn;
1641 struct rt6_info *rt, *rt_cache;
1644 strict |= flags & RT6_LOOKUP_F_IFACE;
1645 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* when forwarding is globally off, insist on a reachable nexthop */
1646 if (net->ipv6.devconf_all->forwarding == 0)
1647 strict |= RT6_LOOKUP_F_REACHABLE;
1651 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1654 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1658 rt = rt6_select(net, fn, oif, strict);
1659 if (rt->rt6i_nsiblings)
1660 rt = rt6_multipath_select(rt, fl6, oif, strict);
1661 if (rt == net->ipv6.ip6_null_entry) {
/* no match at this node: back up the tree and retry */
1662 fn = fib6_backtrack(fn, &fl6->saddr);
1664 goto redo_rt6_select;
1665 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1666 /* also consider unreachable route */
1667 strict &= ~RT6_LOOKUP_F_REACHABLE;
1669 goto redo_rt6_select;
1673 /*Search through exception table */
1674 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1678 if (rt == net->ipv6.ip6_null_entry) {
1681 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1683 } else if (rt->rt6i_flags & RTF_CACHE) {
1684 if (ip6_hold_safe(net, &rt, true)) {
1685 dst_use_noref(&rt->dst, jiffies);
1686 rt6_dst_from_metrics_check(rt);
1689 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1691 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1692 !(rt->rt6i_flags & RTF_GATEWAY))) {
1693 /* Create a RTF_CACHE clone which will not be
1694 * owned by the fib6 tree. It is for the special case where
1695 * the daddr in the skb during the neighbor look-up is different
1696 * from the fl6->daddr used to look-up route here.
1699 struct rt6_info *uncached_rt;
1701 if (ip6_hold_safe(net, &rt, true)) {
1702 dst_use_noref(&rt->dst, jiffies);
1706 goto uncached_rt_out;
1710 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1711 dst_release(&rt->dst);
1714 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1715 * No need for another dst_hold()
1717 rt6_uncached_list_add(uncached_rt);
1718 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
/* allocation failed: fall back to the held null entry */
1720 uncached_rt = net->ipv6.ip6_null_entry;
1721 dst_hold(&uncached_rt->dst);
1725 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1729 /* Get a percpu copy */
1731 struct rt6_info *pcpu_rt;
1733 dst_use_noref(&rt->dst, jiffies);
1735 pcpu_rt = rt6_get_pcpu_route(rt);
1738 /* atomic_inc_not_zero() is needed when using rcu */
1739 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1740 /* No dst_hold() on rt is needed because grabbing
1741 * rt->rt6i_ref makes sure rt can't be released.
1743 pcpu_rt = rt6_make_pcpu_route(rt);
1746 /* rt is already removed from tree */
1747 pcpu_rt = net->ipv6.ip6_null_entry;
1748 dst_hold(&pcpu_rt->dst);
1753 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1757 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* fib6_rule_lookup() callback for the input path: oif = incoming ifindex */
1759 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1760 struct flowi6 *fl6, int flags)
1762 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input-path route lookup entry point. Forces strict interface matching
 * for link-local/multicast destinations, except on PIM register devices.
 */
1765 struct dst_entry *ip6_route_input_lookup(struct net *net,
1766 struct net_device *dev,
1767 struct flowi6 *fl6, int flags)
1769 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1770 flags |= RT6_LOOKUP_F_IFACE;
1772 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1774 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Fill 'keys' (L3 addresses, flow label, protocol) for multipath hashing.
 * For ICMPv6 error messages the embedded offending header is hashed instead
 * of the outer one, so errors follow the same path as the original flow.
 */
1776 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1777 struct flow_keys *keys)
1779 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1780 const struct ipv6hdr *key_iph = outer_iph;
1781 const struct ipv6hdr *inner_iph;
1782 const struct icmp6hdr *icmph;
1783 struct ipv6hdr _inner_iph;
1785 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1788 icmph = icmp6_hdr(skb);
/* only error-type ICMPv6 carries the original packet's header */
1789 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1790 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1791 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1792 icmph->icmp6_type != ICMPV6_PARAMPROB)
/* copy may be needed if the inner header is not linear in the skb */
1795 inner_iph = skb_header_pointer(skb,
1796 skb_transport_offset(skb) + sizeof(*icmph),
1797 sizeof(_inner_iph), &_inner_iph);
1801 key_iph = inner_iph;
1803 memset(keys, 0, sizeof(*keys));
1804 keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1805 keys->addrs.v6addrs.src = key_iph->saddr;
1806 keys->addrs.v6addrs.dst = key_iph->daddr;
1807 keys->tags.flow_label = ip6_flowinfo(key_iph);
1808 keys->basic.ip_proto = key_iph->nexthdr;
1811 /* if skb is set it will be used and fl6 can be NULL */
/* Multipath hash: derived from the skb's L3 keys when a packet is
 * available, otherwise from the flowi6 alone.
 */
1812 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1814 struct flow_keys hash_keys;
1817 ip6_multipath_l3_keys(skb, &hash_keys);
1818 return flow_hash_from_keys(&hash_keys);
1821 return get_hash_from_flowi6(fl6);
/* Route an incoming skb: build a flowi6 from its IPv6 header (plus tunnel
 * id and, for ICMPv6, a multipath hash) and attach the resulting dst.
 */
1824 void ip6_route_input(struct sk_buff *skb)
1826 const struct ipv6hdr *iph = ipv6_hdr(skb);
1827 struct net *net = dev_net(skb->dev);
1828 int flags = RT6_LOOKUP_F_HAS_SADDR;
1829 struct ip_tunnel_info *tun_info;
1830 struct flowi6 fl6 = {
1831 .flowi6_iif = skb->dev->ifindex,
1832 .daddr = iph->daddr,
1833 .saddr = iph->saddr,
1834 .flowlabel = ip6_flowinfo(iph),
1835 .flowi6_mark = skb->mark,
1836 .flowi6_proto = iph->nexthdr,
/* RX-side tunnel metadata keys the lookup by tunnel id */
1839 tun_info = skb_tunnel_info(skb);
1840 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1841 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1842 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1843 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1845 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* fib6_rule_lookup() callback for the output path: oif = outgoing ifindex */
1848 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1849 struct flowi6 *fl6, int flags)
1851 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Output-path route lookup. Link-local destinations are resolved via the
 * L3 master device; otherwise strictness and source-address flags are
 * derived from the socket and flow before the policy lookup.
 */
1854 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1855 struct flowi6 *fl6, int flags)
1859 if (rt6_need_strict(&fl6->daddr)) {
1860 struct dst_entry *dst;
1862 dst = l3mdev_link_scope_lookup(net, fl6);
1867 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1869 any_src = ipv6_addr_any(&fl6->saddr);
/* bound device, strict daddr, or oif with unspecified source all
 * demand interface-strict matching
 */
1870 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1871 (fl6->flowi6_oif && any_src))
1872 flags |= RT6_LOOKUP_F_IFACE;
1875 flags |= RT6_LOOKUP_F_HAS_SADDR;
1877 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1879 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1881 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone 'dst_orig' into a blackhole dst (input/output discard) bound to the
 * loopback device, copying metrics and route keys. Releases dst_orig and
 * returns the new dst, or ERR_PTR(-ENOMEM) if allocation failed.
 */
1883 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1885 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1886 struct net_device *loopback_dev = net->loopback_dev;
1887 struct dst_entry *new = NULL;
1889 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1890 DST_OBSOLETE_NONE, 0);
1893 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1897 new->input = dst_discard;
1898 new->output = dst_discard_out;
1900 dst_copy_metrics(new, &ort->dst);
1902 rt->rt6i_idev = in6_dev_get(loopback_dev);
1903 rt->rt6i_gateway = ort->rt6i_gateway;
/* RTF_PCPU is per-origin-route state; must not be copied */
1904 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1905 rt->rt6i_metric = 0;
1907 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1908 #ifdef CONFIG_IPV6_SUBTREES
1909 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1913 dst_release(dst_orig);
1914 return new ? new : ERR_PTR(-ENOMEM);
1918 * Destination cache support functions
/* Re-point rt's metrics at its parent's (dst.from) if they have diverged */
1921 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1924 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1925 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate rt against the caller's cookie and its expiry; invalid routes
 * cause a NULL-ish result so the caller re-looks-up.
 */
1928 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1932 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1935 if (rt6_check_expired(rt))
/* Validate a clone by checking itself for expiry and its parent
 * (dst.from) against the cookie.
 */
1941 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1943 if (!__rt6_check_expired(rt) &&
1944 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1945 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check hook: every IPv6 dst forces validation here. Per-cpu and
 * uncached clones are validated via their parent; others directly.
 */
1951 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1953 struct rt6_info *rt;
1955 rt = (struct rt6_info *) dst;
1957 /* All IPV6 dsts are created with ->obsolete set to the value
1958 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1959 * into this function always.
1962 rt6_dst_from_metrics_check(rt);
1964 if (rt->rt6i_flags & RTF_PCPU ||
1965 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1966 return rt6_dst_from_check(rt, cookie);
1968 return rt6_check(rt, cookie);
/* dst_ops->negative_advice hook: drop an expired RTF_CACHE clone when the
 * upper layer reports the route is misbehaving.
 */
1971 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1973 struct rt6_info *rt = (struct rt6_info *) dst;
1976 if (rt->rt6i_flags & RTF_CACHE) {
1977 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure hook: report unreachability via ICMPv6 and mark
 * the offending cached clone / default route for removal or re-selection.
 */
1989 static void ip6_link_failure(struct sk_buff *skb)
1991 struct rt6_info *rt;
1993 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1995 rt = (struct rt6_info *) skb_dst(skb);
1997 if (rt->rt6i_flags & RTF_CACHE) {
1998 if (dst_hold_safe(&rt->dst))
2001 struct fib6_node *fn;
2004 fn = rcu_dereference(rt->rt6i_node);
2005 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Record a new path MTU on rt and (re)arm its expiry using the per-netns
 * ip6_rt_mtu_expires sysctl.
 */
2012 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2014 struct net *net = dev_net(rt->dst.dev);
2016 rt->rt6i_flags |= RTF_MODIFIED;
2017 rt->rt6i_pmtu = mtu;
2018 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* True when a PMTU update must be stored in a new cached clone: rt is not
 * itself a cache entry but is a per-cpu copy or still linked in the tree.
 */
2021 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2023 return !(rt->rt6i_flags & RTF_CACHE) &&
2024 (rt->rt6i_flags & RTF_PCPU ||
2025 rcu_access_pointer(rt->rt6i_node));
/* Apply a PMTU update to a dst: either update the route in place (and
 * refresh its exception stamp if cached) or allocate a cache clone carrying
 * the new MTU and insert it into the exception table. Addresses come from
 * the packet header when given, else from the socket.
 */
2028 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2029 const struct ipv6hdr *iph, u32 mtu)
2031 const struct in6_addr *daddr, *saddr;
2032 struct rt6_info *rt6 = (struct rt6_info *)dst;
2034 if (rt6->rt6i_flags & RTF_LOCAL)
/* locked MTU metric means administratively pinned; don't touch */
2037 if (dst_metric_locked(dst, RTAX_MTU))
2041 daddr = &iph->daddr;
2042 saddr = &iph->saddr;
2044 daddr = &sk->sk_v6_daddr;
2045 saddr = &inet6_sk(sk)->saddr;
2050 dst_confirm_neigh(dst, daddr);
/* never go below the IPv6 minimum MTU (1280) */
2051 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2052 if (mtu >= dst_mtu(dst))
2055 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2056 rt6_do_update_pmtu(rt6, mtu);
2057 /* update rt6_ex->stamp for cache */
2058 if (rt6->rt6i_flags & RTF_CACHE)
2059 rt6_update_exception_stamp_rt(rt6);
2061 struct rt6_info *nrt6;
2063 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2065 rt6_do_update_pmtu(nrt6, mtu);
/* insert failure means a duplicate exists; drop our clone */
2066 if (rt6_insert_exception(nrt6, rt6))
2067 dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu hook: thin wrapper that forwards the skb's IPv6
 * header (or NULL) to __ip6_rt_update_pmtu().
 */
2072 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2073 struct sk_buff *skb, u32 mtu)
2075 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Update PMTU for the flow described by the (failed) packet in 'skb':
 * rebuild the flow, look up the output route, and apply the new MTU.
 * Note: 'mtu' arrives in network byte order (ntohl below).
 */
2078 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2079 int oif, u32 mark, kuid_t uid)
2081 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2082 struct dst_entry *dst;
2085 memset(&fl6, 0, sizeof(fl6));
2086 fl6.flowi6_oif = oif;
2087 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2088 fl6.daddr = iph->daddr;
2089 fl6.saddr = iph->saddr;
2090 fl6.flowlabel = ip6_flowinfo(iph);
2091 fl6.flowi6_uid = uid;
2093 dst = ip6_route_output(net, NULL, &fl6);
2095 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2098 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-flavoured PMTU update: apply the update, then refresh the
 * socket's cached dst if the update invalidated it.
 */
2100 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2102 struct dst_entry *dst;
2104 ip6_update_pmtu(skb, sock_net(sk), mtu,
2105 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2107 dst = __sk_dst_get(sk);
2108 if (!dst || !dst->obsolete ||
2109 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
/* skip if the socket is locked or is an IPv4-mapped flow */
2113 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2114 ip6_datagram_dst_update(sk, false);
2117 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2119 /* Handle redirects */
/* flowi6 extended with the redirecting gateway address, passed through
 * fib6_rule_lookup() to __ip6_route_redirect().
 */
2120 struct ip6rd_flowi {
2122 struct in6_addr gateway;
/* Resolve the route a redirect applies to: scan the fib6 node for a
 * non-expired gateway route on the right interface whose gateway (or a
 * cached clone's gateway) matches the redirecting router. Returns a held
 * route, possibly the null entry.
 */
2125 static struct rt6_info *__ip6_route_redirect(struct net *net,
2126 struct fib6_table *table,
2130 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2131 struct rt6_info *rt, *rt_cache;
2132 struct fib6_node *fn;
2134 /* Get the "current" route for this destination and
2135 * check if the redirect has come from appropriate router.
2137 * RFC 4861 specifies that redirects should only be
2138 * accepted if they come from the nexthop to the target.
2139 * Due to the way the routes are chosen, this notion
2140 * is a bit fuzzy and one might need to check all possible
2145 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2147 for_each_fib6_node_rt_rcu(fn) {
2148 if (rt6_check_expired(rt))
2152 if (!(rt->rt6i_flags & RTF_GATEWAY))
2154 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2156 /* rt_cache's gateway might be different from its 'parent'
2157 * in the case of an ip redirect.
2158 * So we keep searching in the exception table if the gateway
2161 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2162 rt_cache = rt6_find_cached_rt(rt,
2166 ipv6_addr_equal(&rdfl->gateway,
2167 &rt_cache->rt6i_gateway)) {
2177 rt = net->ipv6.ip6_null_entry;
2178 else if (rt->dst.error) {
2179 rt = net->ipv6.ip6_null_entry;
/* nothing matched here: backtrack up the tree and retry */
2183 if (rt == net->ipv6.ip6_null_entry) {
2184 fn = fib6_backtrack(fn, &fl6->saddr);
2190 ip6_hold_safe(net, &rt, true);
2194 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wrap the flow in an ip6rd_flowi carrying the redirecting gateway and run
 * the policy lookup with __ip6_route_redirect() as the resolver.
 */
2198 static struct dst_entry *ip6_route_redirect(struct net *net,
2199 const struct flowi6 *fl6,
2200 const struct in6_addr *gateway)
2202 int flags = RT6_LOOKUP_F_HAS_SADDR;
2203 struct ip6rd_flowi rdfl;
2206 rdfl.gateway = *gateway;
2208 return fib6_rule_lookup(net, &rdfl.fl6,
2209 flags, __ip6_route_redirect);
/* Process an ICMPv6 redirect for the flow described by the packet in skb:
 * rebuild the flow from its IPv6 header and apply the redirect to the
 * resolved dst.
 */
2212 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2215 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2216 struct dst_entry *dst;
2219 memset(&fl6, 0, sizeof(fl6));
2220 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2221 fl6.flowi6_oif = oif;
2222 fl6.flowi6_mark = mark;
2223 fl6.daddr = iph->daddr;
2224 fl6.saddr = iph->saddr;
2225 fl6.flowlabel = ip6_flowinfo(iph);
2226 fl6.flowi6_uid = uid;
/* the redirecting router is the outer packet's source address */
2228 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2229 rt6_do_redirect(dst, NULL, skb);
2232 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Redirect variant for when the embedded original header is absent: the
 * flow is rebuilt from the redirect (rd_msg) destination and the outer
 * header's addresses instead.
 */
2234 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2237 const struct ipv6hdr *iph = ipv6_hdr(skb);
2238 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2239 struct dst_entry *dst;
2242 memset(&fl6, 0, sizeof(fl6));
2243 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2244 fl6.flowi6_oif = oif;
2245 fl6.flowi6_mark = mark;
2246 fl6.daddr = msg->dest;
2247 fl6.saddr = iph->daddr;
2248 fl6.flowi6_uid = sock_net_uid(net, NULL);
2250 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2251 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: redirect using the socket's bound device
 * and mark.
 */
2255 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2257 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2260 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: advertised MSS = path MTU minus IPv6+TCP
 * headers, clamped below by the ip6_rt_min_advmss sysctl and above by the
 * non-jumbo payload limit.
 */
2262 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2264 struct net_device *dev = dst->dev;
2265 unsigned int mtu = dst_mtu(dst);
2266 struct net *net = dev_net(dev);
2268 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2270 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2271 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2274 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2275 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2276 * IPV6_MAXPLEN is also valid and means: "any MSS,
2277 * rely only on pmtu discovery"
2279 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: prefer the cached PMTU, then the raw MTU metric, then
 * the device's IPv6 MTU; clamp to IP6_MAX_MTU and subtract any lwtunnel
 * encapsulation headroom.
 */
2284 static unsigned int ip6_mtu(const struct dst_entry *dst)
2286 const struct rt6_info *rt = (const struct rt6_info *)dst;
2287 unsigned int mtu = rt->rt6i_pmtu;
2288 struct inet6_dev *idev;
2293 mtu = dst_metric_raw(dst, RTAX_MTU);
2300 idev = __in6_dev_get(dst->dev);
2302 mtu = idev->cnf.mtu6;
2306 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2308 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a transient host dst for sending ICMPv6, outside the fib6 tree.
 * The dst is placed on the uncached list so device teardown can release it,
 * and is passed through xfrm_lookup() before being returned.
 */
2311 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2314 struct dst_entry *dst;
2315 struct rt6_info *rt;
2316 struct inet6_dev *idev = in6_dev_get(dev);
2317 struct net *net = dev_net(dev);
2319 if (unlikely(!idev))
2320 return ERR_PTR(-ENODEV);
2322 rt = ip6_dst_alloc(net, dev, 0);
2323 if (unlikely(!rt)) {
2325 dst = ERR_PTR(-ENOMEM);
2329 rt->dst.flags |= DST_HOST;
2330 rt->dst.output = ip6_output;
2331 rt->rt6i_gateway = fl6->daddr;
2332 rt->rt6i_dst.addr = fl6->daddr;
2333 rt->rt6i_dst.plen = 128;
2334 rt->rt6i_idev = idev;
/* HOPLIMIT 0 metric: use the device/socket default hop limit */
2335 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2337 /* Add this dst into uncached_list so that rt6_ifdown() can
2338 * do proper release of the net_device
2340 rt6_uncached_list_add(rt);
2341 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2343 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc hook: run fib6 garbage collection when the entry count
 * exceeds ip6_rt_max_size or the minimum GC interval has elapsed; adapt
 * ip6_rt_gc_expire based on how full the table remains. Returns nonzero
 * when the table is still over the limit (allocation should fail).
 */
2349 static int ip6_dst_gc(struct dst_ops *ops)
2351 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2352 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2353 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2354 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2355 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2356 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2359 entries = dst_entries_get_fast(ops);
2360 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2361 entries <= rt_max_size)
/* more aggressive expiry each time GC is forced */
2364 net->ipv6.ip6_rt_gc_expire++;
2365 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2366 entries = dst_entries_get_slow(ops);
2367 if (entries < ops->gc_thresh)
2368 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2370 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2371 return entries > rt_max_size;
/* Convert netlink RTAX_* metric attributes from cfg into an mx6_config
 * array. Validates attribute types and values (CC algorithm name, hop
 * limit <= 255, feature mask) and sets the ECN-CA feature bit when the
 * selected congestion-control algorithm requires it.
 */
2374 static int ip6_convert_metrics(struct mx6_config *mxc,
2375 const struct fib6_config *cfg)
2377 bool ecn_ca = false;
2385 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2389 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2390 int type = nla_type(nla);
2395 if (unlikely(type > RTAX_MAX))
2398 if (type == RTAX_CC_ALGO) {
2399 char tmp[TCP_CA_NAME_MAX];
2401 nla_strlcpy(tmp, nla, sizeof(tmp));
2402 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2403 if (val == TCP_CA_UNSPEC)
2406 val = nla_get_u32(nla);
2408 if (type == RTAX_HOPLIMIT && val > 255)
2410 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
/* metric slots are 0-based; netlink types are 1-based */
2414 __set_bit(type - 1, mxc->mx_valid);
2418 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2419 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Look up the nexthop gateway in the route's own configured table (used to
 * validate a gateway when adding a route). A null-entry result signals the
 * caller to fall back to a full lookup.
 */
2429 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2430 struct fib6_config *cfg,
2431 const struct in6_addr *gw_addr)
2433 struct flowi6 fl6 = {
2434 .flowi6_oif = cfg->fc_ifindex,
2436 .saddr = cfg->fc_prefsrc,
2438 struct fib6_table *table;
2439 struct rt6_info *rt;
2440 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2442 table = fib6_get_table(net, cfg->fc_table);
2446 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2447 flags |= RT6_LOOKUP_F_HAS_SADDR;
2449 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2451 /* if table lookup failed, fall back to full lookup */
2452 if (rt == net->ipv6.ip6_null_entry) {
/* Build (but do not insert) an rt6_info from a netlink/ioctl fib6_config.
 * Validates flags, prefix lengths, device, table and gateway; sets up
 * input/output handlers (including lwtunnel redirection) and reject-route
 * semantics. Returns the new route or ERR_PTR(err); netlink error details
 * go through 'extack'.
 */
2460 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2461 struct netlink_ext_ack *extack)
2463 struct net *net = cfg->fc_nlinfo.nl_net;
2464 struct rt6_info *rt = NULL;
2465 struct net_device *dev = NULL;
2466 struct inet6_dev *idev = NULL;
2467 struct fib6_table *table;
2471 /* RTF_PCPU is an internal flag; can not be set by userspace */
2472 if (cfg->fc_flags & RTF_PCPU) {
2473 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2477 if (cfg->fc_dst_len > 128) {
2478 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2481 if (cfg->fc_src_len > 128) {
2482 NL_SET_ERR_MSG(extack, "Invalid source address length");
2485 #ifndef CONFIG_IPV6_SUBTREES
2486 if (cfg->fc_src_len) {
2487 NL_SET_ERR_MSG(extack,
2488 "Specifying source address requires IPV6_SUBTREES to be enabled");
2492 if (cfg->fc_ifindex) {
2494 dev = dev_get_by_index(net, cfg->fc_ifindex);
2497 idev = in6_dev_get(dev);
2502 if (cfg->fc_metric == 0)
2503 cfg->fc_metric = IP6_RT_PRIO_USER;
/* without NLM_F_CREATE only pre-existing tables should be used */
2506 if (cfg->fc_nlinfo.nlh &&
2507 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2508 table = fib6_get_table(net, cfg->fc_table);
2510 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2511 table = fib6_new_table(net, cfg->fc_table);
2514 table = fib6_new_table(net, cfg->fc_table);
2520 rt = ip6_dst_alloc(net, NULL,
2521 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2528 if (cfg->fc_flags & RTF_EXPIRES)
2529 rt6_set_expires(rt, jiffies +
2530 clock_t_to_jiffies(cfg->fc_expires));
2532 rt6_clean_expires(rt);
2534 if (cfg->fc_protocol == RTPROT_UNSPEC)
2535 cfg->fc_protocol = RTPROT_BOOT;
2536 rt->rt6i_protocol = cfg->fc_protocol;
2538 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* choose the input handler based on destination type */
2540 if (addr_type & IPV6_ADDR_MULTICAST)
2541 rt->dst.input = ip6_mc_input;
2542 else if (cfg->fc_flags & RTF_LOCAL)
2543 rt->dst.input = ip6_input;
2545 rt->dst.input = ip6_forward;
2547 rt->dst.output = ip6_output;
2549 if (cfg->fc_encap) {
2550 struct lwtunnel_state *lwtstate;
2552 err = lwtunnel_build_state(cfg->fc_encap_type,
2553 cfg->fc_encap, AF_INET6, cfg,
2557 rt->dst.lwtstate = lwtstate_get(lwtstate);
2558 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2559 rt->dst.lwtstate->orig_output = rt->dst.output;
2560 rt->dst.output = lwtunnel_output;
2562 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2563 rt->dst.lwtstate->orig_input = rt->dst.input;
2564 rt->dst.input = lwtunnel_input;
2568 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2569 rt->rt6i_dst.plen = cfg->fc_dst_len;
2570 if (rt->rt6i_dst.plen == 128)
2571 rt->dst.flags |= DST_HOST;
2573 #ifdef CONFIG_IPV6_SUBTREES
2574 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2575 rt->rt6i_src.plen = cfg->fc_src_len;
2578 rt->rt6i_metric = cfg->fc_metric;
2580 /* We cannot add true routes via loopback here,
2581 they would result in kernel looping; promote them to reject routes
2583 if ((cfg->fc_flags & RTF_REJECT) ||
2584 (dev && (dev->flags & IFF_LOOPBACK) &&
2585 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2586 !(cfg->fc_flags & RTF_LOCAL))) {
2587 /* hold loopback dev/idev if we haven't done so. */
2588 if (dev != net->loopback_dev) {
2593 dev = net->loopback_dev;
2595 idev = in6_dev_get(dev);
2601 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* reject semantics depend on the route type */
2602 switch (cfg->fc_type) {
2604 rt->dst.error = -EINVAL;
2605 rt->dst.output = dst_discard_out;
2606 rt->dst.input = dst_discard;
2609 rt->dst.error = -EACCES;
2610 rt->dst.output = ip6_pkt_prohibit_out;
2611 rt->dst.input = ip6_pkt_prohibit;
2614 case RTN_UNREACHABLE:
2616 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2617 : (cfg->fc_type == RTN_UNREACHABLE)
2618 ? -EHOSTUNREACH : -ENETUNREACH;
2619 rt->dst.output = ip6_pkt_discard_out;
2620 rt->dst.input = ip6_pkt_discard;
2626 if (cfg->fc_flags & RTF_GATEWAY) {
2627 const struct in6_addr *gw_addr;
2630 gw_addr = &cfg->fc_gateway;
2631 gwa_type = ipv6_addr_type(gw_addr);
2633 /* if gw_addr is local we will fail to detect this in case
2634 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2635 * will return already-added prefix route via interface that
2636 * prefix route was assigned to, which might be non-loopback.
2639 if (ipv6_chk_addr_and_flags(net, gw_addr,
2640 gwa_type & IPV6_ADDR_LINKLOCAL ?
2641 dev : NULL, 0, 0)) {
2642 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2645 rt->rt6i_gateway = *gw_addr;
2647 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2648 struct rt6_info *grt = NULL;
2650 /* IPv6 strictly inhibits using not link-local
2651 addresses as nexthop address.
2652 Otherwise, router will not able to send redirects.
2653 It is very good, but in some (rare!) circumstances
2654 (SIT, PtP, NBMA NOARP links) it is handy to allow
2655 some exceptions. --ANK
2656 We allow IPv4-mapped nexthops to support RFC4798-type
2659 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2660 IPV6_ADDR_MAPPED))) {
2661 NL_SET_ERR_MSG(extack,
2662 "Invalid gateway address");
2666 if (cfg->fc_table) {
2667 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2670 if (grt->rt6i_flags & RTF_GATEWAY ||
2671 (dev && dev != grt->dst.dev)) {
2679 grt = rt6_lookup(net, gw_addr, NULL,
2680 cfg->fc_ifindex, 1);
2682 err = -EHOSTUNREACH;
2686 if (dev != grt->dst.dev) {
2692 idev = grt->rt6i_idev;
2694 in6_dev_hold(grt->rt6i_idev);
2696 if (!(grt->rt6i_flags & RTF_GATEWAY))
2705 NL_SET_ERR_MSG(extack, "Egress device not specified");
2707 } else if (dev->flags & IFF_LOOPBACK) {
2708 NL_SET_ERR_MSG(extack,
2709 "Egress device can not be loopback device for this route");
2718 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2719 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2720 NL_SET_ERR_MSG(extack, "Invalid source address");
2724 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2725 rt->rt6i_prefsrc.plen = 128;
2727 rt->rt6i_prefsrc.plen = 0;
2729 rt->rt6i_flags = cfg->fc_flags;
2733 rt->rt6i_idev = idev;
2734 rt->rt6i_table = table;
2736 cfg->fc_nlinfo.nl_net = dev_net(dev);
2745 dst_release_immediate(&rt->dst);
2747 return ERR_PTR(err);
/* Create a route from cfg, convert its metrics, and insert it into the
 * FIB. On insert failure the route is released immediately.
 */
2750 int ip6_route_add(struct fib6_config *cfg,
2751 struct netlink_ext_ack *extack)
2753 struct mx6_config mxc = { .mx = NULL, };
2754 struct rt6_info *rt;
2757 rt = ip6_route_info_create(cfg, extack);
2764 err = ip6_convert_metrics(&mxc, cfg);
2768 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2775 dst_release_immediate(&rt->dst);
/* Delete a single route from its table under tb6_lock. The null entry is
 * never deletable.
 */
2780 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2783 struct fib6_table *table;
2784 struct net *net = dev_net(rt->dst.dev);
2786 if (rt == net->ipv6.ip6_null_entry) {
2791 table = rt->rt6i_table;
2792 spin_lock_bh(&table->tb6_lock);
2793 err = fib6_del(rt, info);
2794 spin_unlock_bh(&table->tb6_lock);
/* Public single-route delete: wraps __ip6_del_rt() with a default nl_info */
2801 int ip6_del_rt(struct rt6_info *rt)
2803 struct nl_info info = {
2804 .nl_net = dev_net(rt->dst.dev),
2806 return __ip6_del_rt(rt, &info);
/* Delete a multipath route and, when fc_delete_all_nh is set, all of its
 * sibling nexthops under one tb6_lock hold, emitting a single RTM_DELROUTE
 * notification covering every hop.
 */
2809 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2811 struct nl_info *info = &cfg->fc_nlinfo;
2812 struct net *net = info->nl_net;
2813 struct sk_buff *skb = NULL;
2814 struct fib6_table *table;
2817 if (rt == net->ipv6.ip6_null_entry)
2819 table = rt->rt6i_table;
2820 spin_lock_bh(&table->tb6_lock);
2822 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2823 struct rt6_info *sibling, *next_sibling;
2825 /* prefer to send a single notification with all hops */
2826 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2828 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2830 if (rt6_fill_node(net, skb, rt,
2831 NULL, NULL, 0, RTM_DELROUTE,
2832 info->portid, seq, 0) < 0) {
2836 info->skip_notify = 1;
2839 list_for_each_entry_safe(sibling, next_sibling,
2842 err = fib6_del(sibling, info);
2848 err = fib6_del(rt, info);
2850 spin_unlock_bh(&table->tb6_lock);
/* the combined notification is sent after the lock is dropped */
2855 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2856 info->nlh, gfp_any());
/* Delete the route matching cfg: locate the fib6 node, filter candidates
 * by device, gateway, metric and protocol (consulting the exception table
 * for RTF_CACHE deletions), then delete one hop or all siblings.
 */
2861 static int ip6_route_del(struct fib6_config *cfg,
2862 struct netlink_ext_ack *extack)
2864 struct rt6_info *rt, *rt_cache;
2865 struct fib6_table *table;
2866 struct fib6_node *fn;
2869 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2871 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2877 fn = fib6_locate(&table->tb6_root,
2878 &cfg->fc_dst, cfg->fc_dst_len,
2879 &cfg->fc_src, cfg->fc_src_len,
2880 !(cfg->fc_flags & RTF_CACHE));
2883 for_each_fib6_node_rt_rcu(fn) {
2884 if (cfg->fc_flags & RTF_CACHE) {
2885 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
/* skip candidates that don't match the requested filters */
2891 if (cfg->fc_ifindex &&
2893 rt->dst.dev->ifindex != cfg->fc_ifindex))
2895 if (cfg->fc_flags & RTF_GATEWAY &&
2896 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2898 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2900 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2902 if (!dst_hold_safe(&rt->dst))
2906 /* if gateway was specified only delete the one hop */
2907 if (cfg->fc_flags & RTF_GATEWAY)
2908 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2910 return __ip6_del_rt_siblings(rt, cfg);
/* Process an ICMPv6 Redirect for @dst: validate the message per the
 * neighbour-discovery rules, update the neighbour cache, and install a
 * cached-route exception pointing at the new first hop.
 * NOTE(review): various early-return/out paths are elided from this
 * listing.
 */
2918 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2920 struct netevent_redirect netevent;
2921 struct rt6_info *rt, *nrt = NULL;
2922 struct ndisc_options ndopts;
2923 struct inet6_dev *in6_dev;
2924 struct neighbour *neigh;
2926 int optlen, on_link;
/* Length of ND options carried after the fixed redirect header. */
2929 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2930 optlen -= sizeof(*msg);
2933 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2937 msg = (struct rd_msg *)icmp6_hdr(skb);
2939 if (ipv6_addr_is_multicast(&msg->dest)) {
2940 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means on-link redirect; otherwise the target must be
 * a link-local unicast router address.
 */
2945 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2947 } else if (ipv6_addr_type(&msg->target) !=
2948 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2949 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2953 in6_dev = __in6_dev_get(skb->dev);
/* Routers forward, they don't accept redirects; also honor the
 * accept_redirects sysctl.
 */
2956 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2960 * The IP source address of the Redirect MUST be the same as the current
2961 * first-hop router for the specified ICMP Destination Address.
2964 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2965 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2970 if (ndopts.nd_opts_tgt_lladdr) {
2971 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2974 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2979 rt = (struct rt6_info *) dst;
2980 if (rt->rt6i_flags & RTF_REJECT) {
2981 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2985 /* Redirect received -> path was valid.
2986 * Look, redirects are sent only in response to data packets,
2987 * so that this nexthop apparently is reachable. --ANK
2989 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2991 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2996 * We have finally decided to accept it.
/* Mark the redirect target STALE; it becomes a router unless this is
 * an on-link redirect.
 */
2999 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3000 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3001 NEIGH_UPDATE_F_OVERRIDE|
3002 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3003 NEIGH_UPDATE_F_ISROUTER)),
3004 NDISC_REDIRECT, &ndopts);
/* Clone a cache route for the destination with the new gateway. */
3006 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3010 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3012 nrt->rt6i_flags &= ~RTF_GATEWAY;
3014 nrt->rt6i_protocol = RTPROT_REDIRECT;
3015 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3017 /* No need to remove rt from the exception table if rt is
3018 * a cached route because rt6_insert_exception() will
3021 if (rt6_insert_exception(nrt, rt)) {
3022 dst_release_immediate(&nrt->dst);
/* Let interested parties (e.g. XFRM) know the path changed. */
3026 netevent.old = &rt->dst;
3027 netevent.new = &nrt->dst;
3028 netevent.daddr = &msg->dest;
3029 netevent.neigh = neigh;
3030 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3033 neigh_release(neigh);
3037 * Misc support functions
/* Link @rt to its originating route @from: take a reference on
 * from->dst, record it in rt->dst.from and share its metrics.
 * BUG_ON guards against chaining through an already-derived route.
 */
3040 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3042 BUG_ON(from->dst.from);
3044 rt->rt6i_flags &= ~RTF_EXPIRES;
3045 dst_hold(&from->dst);
3046 rt->dst.from = &from->dst;
/* true => metrics are shared read-only with the parent route. */
3047 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialize @rt as a copy of @ort: duplicates dst ops, addresses,
 * flags, device references and lwtunnel state, and ties rt's metrics
 * to ort via rt6_set_from().
 */
3050 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3052 rt->dst.input = ort->dst.input;
3053 rt->dst.output = ort->dst.output;
3054 rt->rt6i_dst = ort->rt6i_dst;
3055 rt->dst.error = ort->dst.error;
3056 rt->rt6i_idev = ort->rt6i_idev;
3058 in6_dev_hold(rt->rt6i_idev);
3059 rt->dst.lastuse = jiffies;
3060 rt->rt6i_gateway = ort->rt6i_gateway;
3061 rt->rt6i_flags = ort->rt6i_flags;
3062 rt6_set_from(rt, ort);
3063 rt->rt6i_metric = ort->rt6i_metric;
3064 #ifdef CONFIG_IPV6_SUBTREES
3065 rt->rt6i_src = ort->rt6i_src;
3067 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3068 rt->rt6i_table = ort->rt6i_table;
/* Take a reference on the shared lightweight-tunnel state. */
3069 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3072 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA Route-Information route (RTF_ROUTEINFO|RTF_GATEWAY)
 * for @prefix/@prefixlen via @gwaddr on @dev.  Returns the route with
 * a reference held, or NULL (error/exit lines elided in this listing).
 */
3073 static struct rt6_info *rt6_get_route_info(struct net *net,
3074 const struct in6_addr *prefix, int prefixlen,
3075 const struct in6_addr *gwaddr,
3076 struct net_device *dev)
/* Route-info routes live in the l3mdev table if any, else RT6_TABLE_INFO. */
3078 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3079 int ifindex = dev->ifindex;
3080 struct fib6_node *fn;
3081 struct rt6_info *rt = NULL;
3082 struct fib6_table *table;
3084 table = fib6_get_table(net, tb_id);
3089 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3093 for_each_fib6_node_rt_rcu(fn) {
3094 if (rt->dst.dev->ifindex != ifindex)
3096 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3098 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3100 ip6_hold_safe(NULL, &rt, false);
/* Install a route learned from an RA Route Information option and
 * return it (via rt6_get_route_info) with a reference held.
 */
3108 static struct rt6_info *rt6_add_route_info(struct net *net,
3109 const struct in6_addr *prefix, int prefixlen,
3110 const struct in6_addr *gwaddr,
3111 struct net_device *dev,
3114 struct fib6_config cfg = {
3115 .fc_metric = IP6_RT_PRIO_USER,
3116 .fc_ifindex = dev->ifindex,
3117 .fc_dst_len = prefixlen,
3118 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3119 RTF_UP | RTF_PREF(pref),
3120 .fc_protocol = RTPROT_RA,
3121 .fc_nlinfo.portid = 0,
3122 .fc_nlinfo.nlh = NULL,
3123 .fc_nlinfo.nl_net = net,
3126 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3127 cfg.fc_dst = *prefix;
3128 cfg.fc_gateway = *gwaddr;
3130 /* We should treat it as a default route if prefix length is 0. */
3132 cfg.fc_flags |= RTF_DEFAULT;
/* Add result deliberately ignored; the follow-up lookup reports state. */
3134 ip6_route_add(&cfg, NULL);
3136 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learned default route (RTF_ADDRCONF|RTF_DEFAULT) via
 * gateway @addr on @dev; returns it with a reference held.
 */
3140 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3142 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3143 struct rt6_info *rt;
3144 struct fib6_table *table;
3146 table = fib6_get_table(dev_net(dev), tb_id);
3151 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3152 if (dev == rt->dst.dev &&
3153 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3154 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3158 ip6_hold_safe(NULL, &rt, false);
/* Install a default router learned from a Router Advertisement and
 * mark the owning table as holding a default route, then return the
 * installed route via rt6_get_dflt_router().
 */
3163 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3164 struct net_device *dev,
3167 struct fib6_config cfg = {
3168 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3169 .fc_metric = IP6_RT_PRIO_USER,
3170 .fc_ifindex = dev->ifindex,
3171 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3172 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3173 .fc_protocol = RTPROT_RA,
3174 .fc_nlinfo.portid = 0,
3175 .fc_nlinfo.nlh = NULL,
3176 .fc_nlinfo.nl_net = dev_net(dev),
3179 cfg.fc_gateway = *gwaddr;
3181 if (!ip6_route_add(&cfg, NULL)) {
3182 struct fib6_table *table;
3184 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* Flag lets rt6_purge_dflt_routers skip tables with no default route. */
3186 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3189 return rt6_get_dflt_router(gwaddr, dev);
/* Remove every RA-learned default/addrconf route from @table, unless
 * the interface is configured with accept_ra == 2 (accept RA even when
 * forwarding).  Clears the table's HAS_DFLT_ROUTER flag when done.
 * NOTE(review): the deletion/restart lines inside the loop are elided
 * from this listing.
 */
3192 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3194 struct rt6_info *rt;
3198 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3199 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3200 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3201 if (dst_hold_safe(&rt->dst)) {
3212 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge RA-learned default routers from every FIB table in @net that
 * is flagged as holding one.
 */
3215 void rt6_purge_dflt_routers(struct net *net)
3217 struct fib6_table *table;
3218 struct hlist_head *head;
3223 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3224 head = &net->ipv6.fib_table_hash[h];
3225 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3226 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3227 __rt6_purge_dflt_routers(table);
/* Translate a legacy ioctl struct in6_rtmsg into a fib6_config so the
 * common ip6_route_add/del paths can consume it.
 */
3234 static void rtmsg_to_fib6_config(struct net *net,
3235 struct in6_rtmsg *rtmsg,
3236 struct fib6_config *cfg)
3238 memset(cfg, 0, sizeof(*cfg));
/* Prefer the l3mdev (VRF) table for the given ifindex when present. */
3240 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3242 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3243 cfg->fc_metric = rtmsg->rtmsg_metric;
3244 cfg->fc_expires = rtmsg->rtmsg_info;
3245 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3246 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3247 cfg->fc_flags = rtmsg->rtmsg_flags;
3249 cfg->fc_nlinfo.nl_net = net;
3251 cfg->fc_dst = rtmsg->rtmsg_dst;
3252 cfg->fc_src = rtmsg->rtmsg_src;
3253 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls: requires
 * CAP_NET_ADMIN, copies the in6_rtmsg from userspace, converts it and
 * dispatches to ip6_route_add/del.
 */
3256 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3258 struct fib6_config cfg;
3259 struct in6_rtmsg rtmsg;
3263 case SIOCADDRT: /* Add a route */
3264 case SIOCDELRT: /* Delete a route */
3265 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3267 err = copy_from_user(&rtmsg, arg,
3268 sizeof(struct in6_rtmsg));
3272 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3277 err = ip6_route_add(&cfg, NULL);
3280 err = ip6_route_del(&cfg, NULL);
3294 * Drop the packet on the floor
/* Drop @skb, bump the appropriate SNMP noroutes/addr-error counter and
 * send an ICMPv6 Destination Unreachable with the given @code.
 */
3297 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3300 struct dst_entry *dst = skb_dst(skb);
3301 switch (ipstats_mib_noroutes) {
3302 case IPSTATS_MIB_INNOROUTES:
3303 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
/* Unspecified destination counts as an address error, not no-route. */
3304 if (type == IPV6_ADDR_ANY) {
3305 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3306 IPSTATS_MIB_INADDRERRORS);
3310 case IPSTATS_MIB_OUTNOROUTES:
3311 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3312 ipstats_mib_noroutes);
3315 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for blackhole/unreachable routes on the input path. */
3320 static int ip6_pkt_discard(struct sk_buff *skb)
3322 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler counterpart of ip6_pkt_discard. */
3325 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3327 skb->dev = skb_dst(skb)->dev;
3328 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst input handler for administratively prohibited routes. */
3331 static int ip6_pkt_prohibit(struct sk_buff *skb)
3333 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst output handler counterpart of ip6_pkt_prohibit. */
3336 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3338 skb->dev = skb_dst(skb)->dev;
3339 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3343 * Allocate a dst for local (unicast / anycast) address.
/* Allocate a host (/128) rt6_info for a local unicast or anycast
 * address on @idev; the route is kernel-owned (RTPROT_KERNEL) and has
 * no nexthop.
 */
3346 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3347 const struct in6_addr *addr,
3351 struct net *net = dev_net(idev->dev);
3352 struct net_device *dev = idev->dev;
3353 struct rt6_info *rt;
3355 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3357 return ERR_PTR(-ENOMEM);
3361 rt->dst.flags |= DST_HOST;
3362 rt->dst.input = ip6_input;
3363 rt->dst.output = ip6_output;
3364 rt->rt6i_idev = idev;
3366 rt->rt6i_protocol = RTPROT_KERNEL;
3367 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
/* Anycast vs. local flag selection (condition line elided here). */
3369 rt->rt6i_flags |= RTF_ANYCAST;
3371 rt->rt6i_flags |= RTF_LOCAL;
3373 rt->rt6i_gateway = *addr;
3374 rt->rt6i_dst.addr = *addr;
3375 rt->rt6i_dst.plen = 128;
3376 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3377 rt->rt6i_table = fib6_get_table(net, tb_id);
3382 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6_remove_prefsrc tree walk below. */
3383 struct arg_dev_net_ip {
3384 struct net_device *dev;
3386 struct in6_addr *addr;
/* fib6_clean_all callback: clear the preferred-source setting on any
 * route whose prefsrc matches the deleted address (device match is
 * skipped when arg->dev is NULL), including its cached exceptions.
 */
3389 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3391 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3392 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3393 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3395 if (((void *)rt->dst.dev == dev || !dev) &&
3396 rt != net->ipv6.ip6_null_entry &&
3397 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3398 spin_lock_bh(&rt6_exception_lock);
3399 /* remove prefsrc entry */
3400 rt->rt6i_prefsrc.plen = 0;
3401 /* need to update cache as well */
3402 rt6_exceptions_remove_prefsrc(rt);
3403 spin_unlock_bh(&rt6_exception_lock);
/* Walk all FIB tables and strip references to the address @ifp that is
 * being removed from its interface (see fib6_remove_prefsrc).
 */
3408 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3410 struct net *net = dev_net(ifp->idev->dev);
3411 struct arg_dev_net_ip adni = {
3412 .dev = ifp->idev->dev,
3416 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3419 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3421 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all callback: flags RA-router routes whose gateway equals
 * the address that just became local, and scrubs matching cached
 * exception routes (the removal decision line is elided here).
 */
3422 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3424 struct in6_addr *gateway = (struct in6_addr *)arg;
3426 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3427 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3431 /* Further clean up cached routes in exception table.
3432 * This is needed because cached route may have a different
3433 * gateway than its 'parent' in the case of an ip redirect.
3435 rt6_exceptions_clean_tohost(rt, gateway);
/* Entry point: apply fib6_clean_tohost across all tables for @gateway. */
3440 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3442 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for the fib6_ifdown tree walk below. */
3445 struct arg_dev_net {
3446 struct net_device *dev;
3450 /* called with write lock held for table with rt */
/* fib6_clean_all callback used on interface down: select routes on
 * @dev (or all devices when dev is NULL) for removal.  Multipath
 * siblings are kept unless the device is unregistering or
 * ignore_routes_with_linkdown is off.
 */
3451 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3453 const struct arg_dev_net *adn = arg;
3454 const struct net_device *dev = adn->dev;
3456 if ((rt->dst.dev == dev || !dev) &&
3457 rt != adn->net->ipv6.ip6_null_entry &&
3458 (rt->rt6i_nsiblings == 0 ||
3459 (dev && netdev_unregistering(dev)) ||
3460 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
/* Remove routes bound to @dev from all tables and flush any uncached
 * dst entries that reference the device.
 */
3466 void rt6_ifdown(struct net *net, struct net_device *dev)
3468 struct arg_dev_net adn = {
3473 fib6_clean_all(net, fib6_ifdown, &adn);
3475 rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for the rt6_mtu_change_route tree walk below. */
3478 struct rt6_mtu_change_arg {
3479 struct net_device *dev;
/* fib6_clean_all callback: propagate a device MTU change into the
 * RTAX_MTU metric of affected routes and their cached exceptions,
 * per the PMTU-update policy described in the comment below.
 */
3483 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3485 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3486 struct inet6_dev *idev;
3488 /* In IPv6 pmtu discovery is not optional,
3489 so that RTAX_MTU lock cannot disable it.
3490 We still use this lock to block changes
3491 caused by addrconf/ndisc.
3494 idev = __in6_dev_get(arg->dev);
3498 /* For administrative MTU increase, there is no way to discover
3499 IPv6 PMTU increase, so PMTU increase should be updated here.
3500 Since RFC 1981 doesn't include administrative MTU increase
3501 update PMTU increase is a MUST. (i.e. jumbo frame)
3504 If new MTU is less than route PMTU, this new MTU will be the
3505 lowest MTU in the path, update the route PMTU to reflect PMTU
3506 decreases; if new MTU is greater than route PMTU, and the
3507 old MTU is the lowest MTU in the path, update the route PMTU
3508 to reflect the increase. In this case if the other nodes' MTU
3509 also have the lowest MTU, TOO BIG MESSAGE will be lead to
3512 if (rt->dst.dev == arg->dev &&
3513 dst_metric_raw(&rt->dst, RTAX_MTU) &&
3514 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3515 spin_lock_bh(&rt6_exception_lock)
3516 if (dst_mtu(&rt->dst) >= arg->mtu ||
3517 (dst_mtu(&rt->dst) < arg->mtu &&
3518 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3519 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Cached exception routes track the new MTU under the same lock. */
3521 rt6_exceptions_update_pmtu(rt, arg->mtu);
3522 spin_unlock_bh(&rt6_exception_lock);
/* Entry point for device MTU changes: walk all tables applying
 * rt6_mtu_change_route with the new @mtu.
 */
3527 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3529 struct rt6_mtu_change_arg arg = {
3534 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* route messages. */
3537 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3538 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3539 [RTA_OIF] = { .type = NLA_U32 },
3540 [RTA_IIF] = { .type = NLA_U32 },
3541 [RTA_PRIORITY] = { .type = NLA_U32 },
3542 [RTA_METRICS] = { .type = NLA_NESTED },
3543 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3544 [RTA_PREF] = { .type = NLA_U8 },
3545 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3546 [RTA_ENCAP] = { .type = NLA_NESTED },
3547 [RTA_EXPIRES] = { .type = NLA_U32 },
3548 [RTA_UID] = { .type = NLA_U32 },
3549 [RTA_MARK] = { .type = NLA_U32 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config: validates attributes against rtm_ipv6_policy, maps
 * rtm_type to RTF_* flags and copies all supported RTA_* attributes.
 * NOTE(review): several error-return lines are elided in this listing.
 */
3552 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3553 struct fib6_config *cfg,
3554 struct netlink_ext_ack *extack)
3557 struct nlattr *tb[RTA_MAX+1];
3561 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3567 rtm = nlmsg_data(nlh);
3568 memset(cfg, 0, sizeof(*cfg));
3570 cfg->fc_table = rtm->rtm_table;
3571 cfg->fc_dst_len = rtm->rtm_dst_len;
3572 cfg->fc_src_len = rtm->rtm_src_len;
3573 cfg->fc_flags = RTF_UP;
3574 cfg->fc_protocol = rtm->rtm_protocol;
3575 cfg->fc_type = rtm->rtm_type;
/* Reject-type routes (unreachable/blackhole/prohibit/throw). */
3577 if (rtm->rtm_type == RTN_UNREACHABLE ||
3578 rtm->rtm_type == RTN_BLACKHOLE ||
3579 rtm->rtm_type == RTN_PROHIBIT ||
3580 rtm->rtm_type == RTN_THROW)
3581 cfg->fc_flags |= RTF_REJECT;
3583 if (rtm->rtm_type == RTN_LOCAL)
3584 cfg->fc_flags |= RTF_LOCAL;
3586 if (rtm->rtm_flags & RTM_F_CLONED)
3587 cfg->fc_flags |= RTF_CACHE;
3589 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3590 cfg->fc_nlinfo.nlh = nlh;
3591 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3593 if (tb[RTA_GATEWAY]) {
3594 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3595 cfg->fc_flags |= RTF_GATEWAY;
/* dst/src prefixes: copy only the bytes covered by the prefix length. */
3599 int plen = (rtm->rtm_dst_len + 7) >> 3;
3601 if (nla_len(tb[RTA_DST]) < plen)
3604 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3608 int plen = (rtm->rtm_src_len + 7) >> 3;
3610 if (nla_len(tb[RTA_SRC]) < plen)
3613 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3616 if (tb[RTA_PREFSRC])
3617 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3620 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3622 if (tb[RTA_PRIORITY])
3623 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3625 if (tb[RTA_METRICS]) {
3626 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3627 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* RTA_TABLE overrides the table id from the rtmsg header. */
3631 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3633 if (tb[RTA_MULTIPATH]) {
3634 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3635 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3637 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3638 cfg->fc_mp_len, extack);
/* Router preference: unknown values fall back to medium. */
3644 pref = nla_get_u8(tb[RTA_PREF]);
3645 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3646 pref != ICMPV6_ROUTER_PREF_HIGH)
3647 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3648 cfg->fc_flags |= RTF_PREF(pref);
3652 cfg->fc_encap = tb[RTA_ENCAP];
3654 if (tb[RTA_ENCAP_TYPE]) {
3655 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3657 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3662 if (tb[RTA_EXPIRES]) {
3663 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3665 if (addrconf_finite_timeout(timeout)) {
3666 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3667 cfg->fc_flags |= RTF_EXPIRES;
/* NOTE(review): the struct header line is elided in this listing —
 * these are the members of the per-nexthop bookkeeping struct used by
 * the multipath add path (presumably `struct rt6_nh`; confirm against
 * the full file).
 */
3677 struct rt6_info *rt6_info;
3678 struct fib6_config r_cfg;
3679 struct mx6_config mxc;
3680 struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can
 * audit what was (partially) installed.
 */
3683 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3687 list_for_each_entry(nh, rt6_nh_list, next) {
3688 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3689 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3690 nh->r_cfg.fc_ifindex);
/* Append (@rt, @r_cfg) to the pending multipath nexthop list, skipping
 * duplicates.  Allocates the list node and converts config metrics.
 * NOTE(review): the error-cleanup lines after ip6_convert_metrics are
 * elided in this listing.
 */
3694 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3695 struct rt6_info *rt, struct fib6_config *r_cfg)
3700 list_for_each_entry(nh, rt6_nh_list, next) {
3701 /* check if rt6_info already exists */
3702 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3706 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3710 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3715 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3716 list_add_tail(&nh->next, rt6_nh_list);
/* Emit the RTM_NEWROUTE notification for a multipath operation,
 * rewinding to the first sibling for APPEND so userspace sees the full
 * route from its first nexthop.
 */
3721 static void ip6_route_mpath_notify(struct rt6_info *rt,
3722 struct rt6_info *rt_last,
3723 struct nl_info *info,
3726 /* if this is an APPEND route, then rt points to the first route
3727 * inserted and rt_last points to last route inserted. Userspace
3728 * wants a consistent dump of the route which starts at the first
3729 * nexthop. Since sibling routes are always added at the end of
3730 * the list, find the first sibling of the last route appended
3732 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3733 rt = list_first_entry(&rt_last->rt6i_siblings,
3739 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add (or replace) a multipath route: parse each rtnexthop into a
 * pending list, insert them one by one with notifications suppressed,
 * then send a single combined notification.  On failure, roll back the
 * routes already inserted.
 * NOTE(review): several labels and error-path lines are elided from
 * this listing.
 */
3742 static int ip6_route_multipath_add(struct fib6_config *cfg,
3743 struct netlink_ext_ack *extack)
3745 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3746 struct nl_info *info = &cfg->fc_nlinfo;
3747 struct fib6_config r_cfg;
3748 struct rtnexthop *rtnh;
3749 struct rt6_info *rt;
3750 struct rt6_nh *err_nh;
3751 struct rt6_nh *nh, *nh_safe;
3757 int replace = (cfg->fc_nlinfo.nlh &&
3758 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3759 LIST_HEAD(rt6_nh_list);
3761 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3762 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3763 nlflags |= NLM_F_APPEND;
3765 remaining = cfg->fc_mp_len;
3766 rtnh = (struct rtnexthop *)cfg->fc_mp;
3768 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3769 * rt6_info structs per nexthop
3771 while (rtnh_ok(rtnh, remaining)) {
/* Each nexthop starts from the base config, then applies its own
 * ifindex/gateway/encap attributes.
 */
3772 memcpy(&r_cfg, cfg, sizeof(*cfg));
3773 if (rtnh->rtnh_ifindex)
3774 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3776 attrlen = rtnh_attrlen(rtnh);
3778 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3780 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3782 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3783 r_cfg.fc_flags |= RTF_GATEWAY;
3785 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3786 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3788 r_cfg.fc_encap_type = nla_get_u16(nla);
3791 rt = ip6_route_info_create(&r_cfg, extack);
3798 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3800 dst_release_immediate(&rt->dst);
3804 rtnh = rtnh_next(rtnh, &remaining);
3807 /* for add and replace send one notification with all nexthops.
3808 * Skip the notification in fib6_add_rt2node and send one with
3809 * the full route when done
3811 info->skip_notify = 1;
3814 list_for_each_entry(nh, &rt6_nh_list, next) {
3815 rt_last = nh->rt6_info;
3816 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3817 /* save reference to first route for notification */
3818 if (!rt_notif && !err)
3819 rt_notif = nh->rt6_info;
3821 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3822 nh->rt6_info = NULL;
3825 ip6_print_replace_route_err(&rt6_nh_list);
3830 /* Because each route is added like a single route we remove
3831 * these flags after the first nexthop: if there is a collision,
3832 * we have already failed to add the first nexthop:
3833 * fib6_add_rt2node() has rejected it; when replacing, old
3834 * nexthops have been replaced by first new, the rest should
3837 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3842 /* success ... tell user about new route */
3843 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3847 /* send notification for routes that were added so that
3848 * the delete notifications sent by ip6_route_del are
3852 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3854 /* Delete routes that were already added */
3855 list_for_each_entry(nh, &rt6_nh_list, next) {
3858 ip6_route_del(&nh->r_cfg, extack);
/* Final cleanup: free unconsumed rt6_infos and list nodes. */
3862 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3864 dst_release_immediate(&nh->rt6_info->dst);
3866 list_del(&nh->next);
/* Delete each nexthop of a multipath route independently; errors are
 * collected per hop (last_err) rather than aborting the walk.
 */
3873 static int ip6_route_multipath_del(struct fib6_config *cfg,
3874 struct netlink_ext_ack *extack)
3876 struct fib6_config r_cfg;
3877 struct rtnexthop *rtnh;
3880 int err = 1, last_err = 0;
3882 remaining = cfg->fc_mp_len;
3883 rtnh = (struct rtnexthop *)cfg->fc_mp;
3885 /* Parse a Multipath Entry */
3886 while (rtnh_ok(rtnh, remaining)) {
3887 memcpy(&r_cfg, cfg, sizeof(*cfg));
3888 if (rtnh->rtnh_ifindex)
3889 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3891 attrlen = rtnh_attrlen(rtnh);
3893 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3895 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3897 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3898 r_cfg.fc_flags |= RTF_GATEWAY;
3901 err = ip6_route_del(&r_cfg, extack);
3905 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE handler: parse the message, dispatch multipath requests
 * to ip6_route_multipath_del, otherwise delete a single route with all
 * of its sibling nexthops (fc_delete_all_nh).
 */
3911 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3912 struct netlink_ext_ack *extack)
3914 struct fib6_config cfg;
3917 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3922 return ip6_route_multipath_del(&cfg, extack);
3924 cfg.fc_delete_all_nh = 1;
3925 return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE handler: parse the message and dispatch to the
 * multipath or single-route add path.
 */
3929 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3930 struct netlink_ext_ack *extack)
3932 struct fib6_config cfg;
3935 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3940 return ip6_route_multipath_add(&cfg, extack);
3942 return ip6_route_add(&cfg, extack);
/* Upper-bound the netlink message size needed by rt6_fill_node() for
 * @rt, including one RTA_MULTIPATH nexthop entry per sibling.
 */
3945 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3947 int nexthop_len = 0;
3949 if (rt->rt6i_nsiblings) {
3950 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3951 + NLA_ALIGN(sizeof(struct rtnexthop))
3952 + nla_total_size(16) /* RTA_GATEWAY */
3953 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3955 nexthop_len *= rt->rt6i_nsiblings;
3958 return NLMSG_ALIGN(sizeof(struct rtmsg))
3959 + nla_total_size(16) /* RTA_SRC */
3960 + nla_total_size(16) /* RTA_DST */
3961 + nla_total_size(16) /* RTA_GATEWAY */
3962 + nla_total_size(16) /* RTA_PREFSRC */
3963 + nla_total_size(4) /* RTA_TABLE */
3964 + nla_total_size(4) /* RTA_IIF */
3965 + nla_total_size(4) /* RTA_OIF */
3966 + nla_total_size(4) /* RTA_PRIORITY */
3967 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3968 + nla_total_size(sizeof(struct rta_cacheinfo))
3969 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3970 + nla_total_size(1) /* RTA_PREF */
3971 + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* Fill per-nexthop attributes (gateway, oif, encap) and dead/linkdown
 * flag bits for a dump of @rt.  @skip_oif suppresses RTA_OIF when the
 * caller encodes the ifindex in a rtnexthop header instead.
 */
3975 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3976 unsigned int *flags, bool skip_oif)
3978 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3979 *flags |= RTNH_F_LINKDOWN;
3980 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3981 *flags |= RTNH_F_DEAD;
3984 if (rt->rt6i_flags & RTF_GATEWAY) {
3985 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3986 goto nla_put_failure;
3989 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3990 *flags |= RTNH_F_OFFLOAD;
3992 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3993 if (!skip_oif && rt->dst.dev &&
3994 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3995 goto nla_put_failure;
3997 if (rt->dst.lwtstate &&
3998 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3999 goto nla_put_failure;
4007 /* add multipath next hop */
/* Encode @rt as one rtnexthop entry inside an RTA_MULTIPATH attribute;
 * rtnh_len is patched after the nested attributes are written.
 */
4008 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4010 struct rtnexthop *rtnh;
4011 unsigned int flags = 0;
4013 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4015 goto nla_put_failure;
4017 rtnh->rtnh_hops = 0;
4018 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4020 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4021 goto nla_put_failure;
4023 rtnh->rtnh_flags = flags;
4025 /* length of rtnetlink header + attributes */
4026 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Serialize @rt into a netlink RTM_* message on @skb: rtmsg header,
 * addresses, table, metrics, nexthop(s) and cache info.  Returns < 0
 * (with the message cancelled) if skb runs out of room.
 * NOTE(review): some branch/label lines are elided in this listing.
 */
4034 static int rt6_fill_node(struct net *net,
4035 struct sk_buff *skb, struct rt6_info *rt,
4036 struct in6_addr *dst, struct in6_addr *src,
4037 int iif, int type, u32 portid, u32 seq,
4040 u32 metrics[RTAX_MAX];
4042 struct nlmsghdr *nlh;
4046 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4050 rtm = nlmsg_data(nlh);
4051 rtm->rtm_family = AF_INET6;
4052 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4053 rtm->rtm_src_len = rt->rt6i_src.plen;
4056 table = rt->rt6i_table->tb6_id;
4058 table = RT6_TABLE_UNSPEC;
4059 rtm->rtm_table = table;
4060 if (nla_put_u32(skb, RTA_TABLE, table))
4061 goto nla_put_failure;
/* Map reject-route dst.error back to the userspace route type. */
4062 if (rt->rt6i_flags & RTF_REJECT) {
4063 switch (rt->dst.error) {
4065 rtm->rtm_type = RTN_BLACKHOLE;
4068 rtm->rtm_type = RTN_PROHIBIT;
4071 rtm->rtm_type = RTN_THROW;
4074 rtm->rtm_type = RTN_UNREACHABLE;
4078 else if (rt->rt6i_flags & RTF_LOCAL)
4079 rtm->rtm_type = RTN_LOCAL;
4080 else if (rt->rt6i_flags & RTF_ANYCAST)
4081 rtm->rtm_type = RTN_ANYCAST;
4082 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4083 rtm->rtm_type = RTN_LOCAL;
4085 rtm->rtm_type = RTN_UNICAST;
4087 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4088 rtm->rtm_protocol = rt->rt6i_protocol;
4090 if (rt->rt6i_flags & RTF_CACHE)
4091 rtm->rtm_flags |= RTM_F_CLONED;
/* Caller-supplied dst/src (route-get) report as full /128 matches. */
4094 if (nla_put_in6_addr(skb, RTA_DST, dst))
4095 goto nla_put_failure;
4096 rtm->rtm_dst_len = 128;
4097 } else if (rtm->rtm_dst_len)
4098 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4099 goto nla_put_failure;
4100 #ifdef CONFIG_IPV6_SUBTREES
4102 if (nla_put_in6_addr(skb, RTA_SRC, src))
4103 goto nla_put_failure;
4104 rtm->rtm_src_len = 128;
4105 } else if (rtm->rtm_src_len &&
4106 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4107 goto nla_put_failure;
4110 #ifdef CONFIG_IPV6_MROUTE
4111 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4112 int err = ip6mr_get_route(net, skb, rtm, portid);
4117 goto nla_put_failure;
4120 if (nla_put_u32(skb, RTA_IIF, iif))
4121 goto nla_put_failure;
4123 struct in6_addr saddr_buf;
4124 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4125 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4126 goto nla_put_failure;
4129 if (rt->rt6i_prefsrc.plen) {
4130 struct in6_addr saddr_buf;
4131 saddr_buf = rt->rt6i_prefsrc.addr;
4132 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4133 goto nla_put_failure;
4136 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
/* Report the discovered PMTU (rt6i_pmtu) instead of the raw metric. */
4138 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4139 if (rtnetlink_put_metrics(skb, metrics) < 0)
4140 goto nla_put_failure;
4142 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4143 goto nla_put_failure;
4145 /* For multipath routes, walk the siblings list and add
4146 * each as a nexthop within RTA_MULTIPATH.
4148 if (rt->rt6i_nsiblings) {
4149 struct rt6_info *sibling, *next_sibling;
4152 mp = nla_nest_start(skb, RTA_MULTIPATH);
4154 goto nla_put_failure;
4156 if (rt6_add_nexthop(skb, rt) < 0)
4157 goto nla_put_failure;
4159 list_for_each_entry_safe(sibling, next_sibling,
4160 &rt->rt6i_siblings, rt6i_siblings) {
4161 if (rt6_add_nexthop(skb, sibling) < 0)
4162 goto nla_put_failure;
4165 nla_nest_end(skb, mp);
4167 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4168 goto nla_put_failure;
4171 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4173 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4174 goto nla_put_failure;
4176 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4177 goto nla_put_failure;
4180 nlmsg_end(skb, nlh);
/* Roll back the partially-built message on any put failure. */
4184 nlmsg_cancel(skb, nlh);
/* fib6 dump callback: emit one RTM_NEWROUTE record for @rt into the
 * dump skb, honoring the RTM_F_PREFIX filter and skipping the null
 * entry.
 */
4188 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4190 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4191 struct net *net = arg->net;
4193 if (rt == net->ipv6.ip6_null_entry)
4196 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4197 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4199 /* user wants prefix routes only */
4200 if (rtm->rtm_flags & RTM_F_PREFIX &&
4201 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4202 /* success since this is not a prefix route */
4207 return rt6_fill_node(net,
4208 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4209 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* RTM_GETROUTE handler: build a flowi6 from the request attributes,
 * perform an input- or output-path route lookup (or a plain FIB lookup
 * when RTM_F_FIB_MATCH is set), and unicast the serialized result back
 * to the requester.
 * NOTE(review): several error-exit lines are elided in this listing.
 */
4213 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4214 struct netlink_ext_ack *extack)
4216 struct net *net = sock_net(in_skb->sk);
4217 struct nlattr *tb[RTA_MAX+1];
4218 int err, iif = 0, oif = 0;
4219 struct dst_entry *dst;
4220 struct rt6_info *rt;
4221 struct sk_buff *skb;
4226 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4232 memset(&fl6, 0, sizeof(fl6));
4233 rtm = nlmsg_data(nlh);
4234 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4235 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4238 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4241 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4245 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4248 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4252 iif = nla_get_u32(tb[RTA_IIF]);
4255 oif = nla_get_u32(tb[RTA_OIF]);
4258 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4261 fl6.flowi6_uid = make_kuid(current_user_ns(),
4262 nla_get_u32(tb[RTA_UID]));
4264 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
/* iif given => simulate the input path on that device. */
4267 struct net_device *dev;
4272 dev = dev_get_by_index_rcu(net, iif);
4279 fl6.flowi6_iif = iif;
4281 if (!ipv6_addr_any(&fl6.saddr))
4282 flags |= RT6_LOOKUP_F_HAS_SADDR;
4285 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4287 dst = ip6_route_lookup(net, &fl6, 0);
4291 fl6.flowi6_oif = oif;
4294 dst = ip6_route_output(net, NULL, &fl6);
4296 dst = ip6_route_lookup(net, &fl6, 0);
4300 rt = container_of(dst, struct rt6_info, dst);
4301 if (rt->dst.error) {
4302 err = rt->dst.error;
4307 if (rt == net->ipv6.ip6_null_entry) {
4308 err = rt->dst.error;
4313 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4320 skb_dst_set(skb, &rt->dst);
/* fibmatch reports the FIB entry itself (no dst/src rewrite). */
4322 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4323 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4326 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4327 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4334 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* inet6_rt_notify - broadcast a route change (event = RTM_NEWROUTE /
 * RTM_DELROUTE etc.) to RTNLGRP_IPV6_ROUTE listeners.
 *
 * gfp_any() is used for allocations because this can be called from
 * both process and atomic context.
 *
 * NOTE(review): excerpt is elided — the err declaration/initial
 * value, the NULL-skb check, goto labels and the closing brace are
 * not visible here.
 */
4339 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4340 unsigned int nlm_flags)
4342 struct sk_buff *skb;
4343 struct net *net = info->nl_net;
/* Echo the originating request's sequence number, if there was one. */
4348 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4350 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4354 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4355 event, info->portid, seq, nlm_flags);
4357 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4358 WARN_ON(err == -EMSGSIZE);
4362 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4363 info->nlh, gfp_any());
/* On failure, record the error against the multicast group so
 * listeners can learn they missed a notification.
 */
4367 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* ip6_route_dev_notify - netdevice notifier callback.
 *
 * Only the per-netns loopback device is of interest: the netns's
 * special routes (ip6_null_entry, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also the prohibit and blackhole
 * entries) are bound to it on NETDEV_REGISTER, and their inet6_dev
 * references are dropped again on NETDEV_UNREGISTER.
 *
 * NOTE(review): excerpt is elided — the early-return for
 * non-loopback devices, #endif lines and the final return are not
 * visible here.
 */
4370 static int ip6_route_dev_notify(struct notifier_block *this,
4371 unsigned long event, void *ptr)
4373 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4374 struct net *net = dev_net(dev);
4376 if (!(dev->flags & IFF_LOOPBACK))
4379 if (event == NETDEV_REGISTER) {
/* Attach the special dst entries to loopback and take idev refs. */
4380 net->ipv6.ip6_null_entry->dst.dev = dev;
4381 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4382 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4383 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4384 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4385 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4386 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4388 } else if (event == NETDEV_UNREGISTER &&
4389 dev->reg_state != NETREG_UNREGISTERED) {
4390 /* NETDEV_UNREGISTER could be fired for multiple times by
4391 * netdev_wait_allrefs(). Make sure we only call this once.
 */
4393 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4394 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4395 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4396 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4407 #ifdef CONFIG_PROC_FS
/* /proc/net/ipv6_route file operations (seq_file based).
 * NOTE(review): the .read member appears to be elided in this
 * excerpt.
 */
4409 static const struct file_operations ipv6_route_proc_fops = {
4410 .owner = THIS_MODULE,
4411 .open = ipv6_route_open,
4413 .llseek = seq_lseek,
4414 .release = seq_release_net,
/* rt6_stats_seq_show - emit one line of per-netns FIB6 statistics
 * for /proc/net/rt6_stats: node counts, route entries, cache size,
 * live dst entries and discarded routes, all as %04x fields.
 * NOTE(review): the trailing "return 0;" and closing brace are not
 * visible in this excerpt.
 */
4417 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4419 struct net *net = (struct net *)seq->private;
4420 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4421 net->ipv6.rt6_stats->fib_nodes,
4422 net->ipv6.rt6_stats->fib_route_nodes,
4423 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4424 net->ipv6.rt6_stats->fib_rt_entries,
4425 net->ipv6.rt6_stats->fib_rt_cache,
4426 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4427 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats: single-shot, netns-aware seq file. */
4432 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4434 return single_open_net(inode, file, rt6_stats_seq_show);
/* /proc/net/rt6_stats file operations.
 * NOTE(review): the .read member appears to be elided in this
 * excerpt.
 */
4437 static const struct file_operations rt6_stats_seq_fops = {
4438 .owner = THIS_MODULE,
4439 .open = rt6_stats_seq_open,
4441 .llseek = seq_lseek,
4442 .release = single_release_net,
4444 #endif /* CONFIG_PROC_FS */
4446 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush.
 *
 * Writing to the file runs the fib6 garbage collector: a delay <= 0
 * flushes immediately (gc timeout forced to 0), a positive delay is
 * passed through in jiffies.
 *
 * NOTE(review): excerpt is elided — the local declarations, the
 * write-only (-EINVAL) check and the final return are not visible
 * here.
 */
4449 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4450 void __user *buffer, size_t *lenp, loff_t *ppos)
/* The owning netns was stashed in ->extra1 at table-clone time. */
4457 net = (struct net *)ctl->extra1;
4458 delay = net->ipv6.sysctl.flush_delay;
4459 proc_dointvec(ctl, write, buffer, lenp, ppos);
4460 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table.
 *
 * The .data pointers initially reference init_net / template storage
 * and are re-targeted at each netns's own fields in
 * ipv6_route_sysctl_init() — the entry order here must match the
 * table[i] indices used there.
 *
 * NOTE(review): .mode members, struct-entry braces and the
 * terminating empty entry appear to be elided in this excerpt.
 */
4464 struct ctl_table ipv6_route_table_template[] = {
4466 .procname = "flush",
4467 .data = &init_net.ipv6.sysctl.flush_delay,
4468 .maxlen = sizeof(int),
4470 .proc_handler = ipv6_sysctl_rtcache_flush
4473 .procname = "gc_thresh",
4474 .data = &ip6_dst_ops_template.gc_thresh,
4475 .maxlen = sizeof(int),
4477 .proc_handler = proc_dointvec,
4480 .procname = "max_size",
4481 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4482 .maxlen = sizeof(int),
4484 .proc_handler = proc_dointvec,
4487 .procname = "gc_min_interval",
4488 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4489 .maxlen = sizeof(int),
4491 .proc_handler = proc_dointvec_jiffies,
4494 .procname = "gc_timeout",
4495 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4496 .maxlen = sizeof(int),
4498 .proc_handler = proc_dointvec_jiffies,
4501 .procname = "gc_interval",
4502 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4503 .maxlen = sizeof(int),
4505 .proc_handler = proc_dointvec_jiffies,
4508 .procname = "gc_elasticity",
4509 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4510 .maxlen = sizeof(int),
4512 .proc_handler = proc_dointvec,
4515 .procname = "mtu_expires",
4516 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4517 .maxlen = sizeof(int),
4519 .proc_handler = proc_dointvec_jiffies,
4522 .procname = "min_adv_mss",
4523 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4524 .maxlen = sizeof(int),
4526 .proc_handler = proc_dointvec,
/* Same variable as gc_min_interval, but exposed in milliseconds. */
4529 .procname = "gc_min_interval_ms",
4530 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4531 .maxlen = sizeof(int),
4533 .proc_handler = proc_dointvec_ms_jiffies,
/* ipv6_route_sysctl_init - clone the sysctl template for a new netns.
 *
 * Duplicates ipv6_route_table_template and points each entry's .data
 * at the corresponding per-netns field (indices must match the
 * template's entry order). Returns the table; the caller registers
 * it and is responsible for freeing it later.
 *
 * NOTE(review): excerpt is elided — the kmemdup GFP argument, the
 * NULL-check on the copy, the return statement and closing brace are
 * not visible here.
 */
4538 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4540 struct ctl_table *table;
4542 table = kmemdup(ipv6_route_table_template,
4543 sizeof(ipv6_route_table_template),
4547 table[0].data = &net->ipv6.sysctl.flush_delay;
/* "flush" needs the owning netns; handler reads it from ->extra1. */
4548 table[0].extra1 = net;
4549 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4550 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4551 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4552 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4553 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4554 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4555 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4556 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4557 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4559 /* Don't export sysctls to unprivileged users */
4560 if (net->user_ns != &init_user_ns)
4561 table[0].procname = NULL;
/* ip6_route_net_init - per-netns routing state constructor.
 *
 * Copies the dst_ops template, initialises the dst entry counter,
 * clones the special route templates (null, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also prohibit and blackhole), and
 * seeds the per-netns routing sysctl defaults. Unwinds allocations
 * via goto labels on failure.
 *
 * NOTE(review): excerpt is elided — GFP arguments to the kmemdup
 * calls, "ret = -ENOMEM", the success return, some labels/#endif
 * lines and the closing brace are not visible here.
 */
4568 static int __net_init ip6_route_net_init(struct net *net)
4572 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4573 sizeof(net->ipv6.ip6_dst_ops));
4575 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4576 goto out_ip6_dst_ops;
/* Clone the "no route" dst; path points back at itself and the ops
 * are this netns's own dst_ops.
 */
4578 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4579 sizeof(*net->ipv6.ip6_null_entry),
4581 if (!net->ipv6.ip6_null_entry)
4582 goto out_ip6_dst_entries;
4583 net->ipv6.ip6_null_entry->dst.path =
4584 (struct dst_entry *)net->ipv6.ip6_null_entry;
4585 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4586 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4587 ip6_template_metrics, true);
4589 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4590 net->ipv6.fib6_has_custom_rules = false;
/* Prohibit entry (policy routing "prohibit" action). */
4591 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4592 sizeof(*net->ipv6.ip6_prohibit_entry),
4594 if (!net->ipv6.ip6_prohibit_entry)
4595 goto out_ip6_null_entry;
4596 net->ipv6.ip6_prohibit_entry->dst.path =
4597 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4598 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4599 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4600 ip6_template_metrics, true);
/* Blackhole entry (policy routing "blackhole" action). */
4602 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4603 sizeof(*net->ipv6.ip6_blk_hole_entry),
4605 if (!net->ipv6.ip6_blk_hole_entry)
4606 goto out_ip6_prohibit_entry;
4607 net->ipv6.ip6_blk_hole_entry->dst.path =
4608 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4609 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4610 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4611 ip6_template_metrics, true);
/* Per-netns routing sysctl defaults (jiffies-based values use HZ). */
4614 net->ipv6.sysctl.flush_delay = 0;
4615 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4616 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4617 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4618 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4619 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4620 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* 20 = sizeof TCP header w/o options, 40 = sizeof IPv6 header. */
4621 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4623 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwind: free in reverse order of allocation. */
4629 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4630 out_ip6_prohibit_entry:
4631 kfree(net->ipv6.ip6_prohibit_entry);
4633 kfree(net->ipv6.ip6_null_entry);
4635 out_ip6_dst_entries:
4636 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* ip6_route_net_exit - per-netns routing state destructor; frees the
 * special route entries cloned in ip6_route_net_init() and tears
 * down the dst entry counter.
 */
4641 static void __net_exit ip6_route_net_exit(struct net *net)
4643 kfree(net->ipv6.ip6_null_entry);
4644 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4645 kfree(net->ipv6.ip6_prohibit_entry);
4646 kfree(net->ipv6.ip6_blk_hole_entry);
4648 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries (runs after the main pernet init).
 */
4651 static int __net_init ip6_route_net_init_late(struct net *net)
4653 #ifdef CONFIG_PROC_FS
4654 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4655 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the proc entries created in
 * ip6_route_net_init_late().
 */
4660 static void __net_exit ip6_route_net_exit_late(struct net *net)
4662 #ifdef CONFIG_PROC_FS
4663 remove_proc_entry("ipv6_route", net->proc_net);
4664 remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-netns routing init/exit hooks. */
4668 static struct pernet_operations ip6_route_net_ops = {
4669 .init = ip6_route_net_init,
4670 .exit = ip6_route_net_exit,
/* Allocate and initialise the per-netns IPv6 inetpeer base.
 * NOTE(review): the NULL-check/-ENOMEM path and return are elided in
 * this excerpt.
 */
4673 static int __net_init ipv6_inetpeer_init(struct net *net)
4675 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4679 inet_peer_base_init(bp);
4680 net->ipv6.peers = bp;
/* Tear down the per-netns IPv6 inetpeer base: detach it first, then
 * invalidate the peer tree.
 * NOTE(review): the kfree(bp) call appears to be elided in this
 * excerpt.
 */
4684 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4686 struct inet_peer_base *bp = net->ipv6.peers;
4688 net->ipv6.peers = NULL;
4689 inetpeer_invalidate_tree(bp);
/* Per-netns inetpeer init/exit hooks. */
4693 static struct pernet_operations ipv6_inetpeer_ops = {
4694 .init = ipv6_inetpeer_init,
4695 .exit = ipv6_inetpeer_exit,
/* Late per-netns hooks (proc entries); registered after the main
 * routing pernet subsystem.
 */
4698 static struct pernet_operations ip6_route_net_late_ops = {
4699 .init = ip6_route_net_init_late,
4700 .exit = ip6_route_net_exit_late,
/* Netdevice notifier for the routing code. The priority is set just
 * below ADDRCONF_NOTIFY_PRIORITY, presumably so addrconf processes
 * device events before this handler — TODO confirm ordering intent.
 */
4703 static struct notifier_block ip6_route_dev_notifier = {
4704 .notifier_call = ip6_route_dev_notify,
4705 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* ip6_route_init_special_entries - bind init_net's special routes to
 * its loopback device at boot. For netns created later this binding
 * happens via the NETDEV_REGISTER notifier (ip6_route_dev_notify);
 * init_net's loopback registers before the notifier exists, so do it
 * by hand here.
 */
4708 void __init ip6_route_init_special_entries(void)
4710 /* Registering of the loopback is done before this portion of code,
4711 * the loopback reference in rt6_info will not be taken, do it
4712 * manually for init_net */
4713 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4714 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4715 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4716 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4717 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4718 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4719 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* ip6_route_init - boot-time initialisation of the IPv6 routing
 * subsystem: dst slab cache, pernet subsystems, fib6/xfrm6/rules
 * setup, rtnetlink message handlers and the netdevice notifier.
 * Errors unwind via the goto labels at the bottom in reverse order
 * of registration.
 *
 * NOTE(review): excerpt is elided — several "if (ret)" checks,
 * intermediate init calls (e.g. fib6/xfrm6), some labels, the
 * success return and the closing brace are not visible here.
 */
4723 int __init ip6_route_init(void)
4729 ip6_dst_ops_template.kmem_cachep =
4730 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4731 SLAB_HWCACHE_ALIGN, NULL);
4732 if (!ip6_dst_ops_template.kmem_cachep)
4735 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4737 goto out_kmem_cache;
4739 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4741 goto out_dst_entries;
4743 ret = register_pernet_subsys(&ip6_route_net_ops);
4745 goto out_register_inetpeer;
/* Blackhole dsts share the same slab cache as regular rt6_info. */
4747 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4751 goto out_register_subsys;
4757 ret = fib6_rules_init();
4761 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4763 goto fib6_rules_init;
/* RTM_GETROUTE runs without the rtnl lock (DOIT_UNLOCKED). */
4766 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4767 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4768 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4769 RTNL_FLAG_DOIT_UNLOCKED))
4770 goto out_register_late_subsys;
4772 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4774 goto out_register_late_subsys;
/* Initialise the per-CPU lists of uncached routes. */
4776 for_each_possible_cpu(cpu) {
4777 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4779 INIT_LIST_HEAD(&ul->head);
4780 spin_lock_init(&ul->lock);
/* Error unwind, reverse order of the registrations above. */
4786 out_register_late_subsys:
4787 unregister_pernet_subsys(&ip6_route_net_late_ops);
4789 fib6_rules_cleanup();
4794 out_register_subsys:
4795 unregister_pernet_subsys(&ip6_route_net_ops);
4796 out_register_inetpeer:
4797 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4799 dst_entries_destroy(&ip6_dst_blackhole_ops);
4801 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4805 void ip6_route_cleanup(void)
4807 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4808 unregister_pernet_subsys(&ip6_route_net_late_ops);
4809 fib6_rules_cleanup();
4812 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4813 unregister_pernet_subsys(&ip6_route_net_ops);
4814 dst_entries_destroy(&ip6_dst_blackhole_ops);
4815 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);