2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
57 #include <net/dst_metadata.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
/* Fragment of enum rt6_nud_state: negative scores returned by
 * rt6_check_neigh() / rt6_score_route() when the next-hop neighbour is not
 * currently usable.  NOTE(review): the enum header and success value are
 * not visible in this extract — confirm against the full file.
 */
74 RT6_NUD_FAIL_HARD = -3,	/* route must not be used */
75 RT6_NUD_FAIL_PROBE = -2,	/* neighbour must be probed first */
76 RT6_NUD_FAIL_DO_RR = -1,	/* usable, but round-robin to next route */
/* Forward declarations: dst_ops callbacks, drop/prohibit handlers, and
 * fib6/netlink helpers defined later in this file.
 */
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void ip6_dst_destroy(struct dst_entry *);
86 static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88 static int ip6_dst_gc(struct dst_ops *ops);
/* Input/output handlers wired into the REJECT route templates below. */
90 static int ip6_pkt_discard(struct sk_buff *skb);
91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int ip6_pkt_prohibit(struct sk_buff *skb);
93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void ip6_link_failure(struct sk_buff *skb);
95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu);
97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103 struct sk_buff *skb, struct rt6_info *rt,
104 struct in6_addr *dst, struct in6_addr *src,
105 int iif, int type, u32 portid, u32 seq,
/* RFC 4191 route-information option support. */
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110 const struct in6_addr *prefix, int prefixlen,
111 const struct in6_addr *gwaddr,
112 struct net_device *dev,
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115 const struct in6_addr *prefix, int prefixlen,
116 const struct in6_addr *gwaddr,
117 struct net_device *dev);
/* Per-CPU list of "uncached" rt6_info entries, i.e. dsts that are not
 * owned by the fib6 tree and must be flushed manually on device teardown.
 */
120 struct uncached_list {
122 struct list_head head;
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
127 static void rt6_uncached_list_add(struct rt6_info *rt)
129 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
131 rt->rt6i_uncached_list = ul;
133 spin_lock_bh(&ul->lock);
134 list_add_tail(&rt->rt6i_uncached, &ul->head);
135 spin_unlock_bh(&ul->lock);
138 static void rt6_uncached_list_del(struct rt6_info *rt)
140 if (!list_empty(&rt->rt6i_uncached)) {
141 struct uncached_list *ul = rt->rt6i_uncached_list;
143 spin_lock_bh(&ul->lock);
144 list_del(&rt->rt6i_uncached);
145 spin_unlock_bh(&ul->lock);
/* On device unregister, walk every CPU's uncached list and repoint any
 * route still referencing @dev at the netns loopback device, so the
 * device's refcount can drop to zero.
 */
149 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
151 struct net_device *loopback_dev = net->loopback_dev;
/* Nothing to do when the loopback device itself goes away. */
154 if (dev == loopback_dev)
157 for_each_possible_cpu(cpu) {
158 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
161 spin_lock_bh(&ul->lock);
162 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
163 struct inet6_dev *rt_idev = rt->rt6i_idev;
164 struct net_device *rt_dev = rt->dst.dev;
/* Swap the inet6_dev reference over to loopback. */
166 if (rt_idev->dev == dev) {
167 rt->rt6i_idev = in6_dev_get(loopback_dev);
168 in6_dev_put(rt_idev);
/* NOTE(review): upstream guards this with `if (rt_dev == dev)` and drops
 * the old dev reference; that guard is not visible in this extract —
 * confirm before relying on the unconditional form below.
 */
172 rt->dst.dev = loopback_dev;
173 dev_hold(rt->dst.dev);
177 spin_unlock_bh(&ul->lock);
/* Per-cpu clones share metrics with their parent route (dst.from). */
181 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
183 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics: pick a writable metrics block depending on the
 * route flavour (per-cpu clone, cache clone, or plain fib6 route).
 */
186 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188 struct rt6_info *rt = (struct rt6_info *)dst;
190 if (rt->rt6i_flags & RTF_PCPU)
191 return rt6_pcpu_cow_metrics(rt);
192 else if (rt->rt6i_flags & RTF_CACHE)
195 return dst_cow_metrics_generic(dst, old);
/* Neighbour key selection: prefer the route's gateway, else fall back to
 * the packet's destination address (gap in extract hides the skb check).
 */
198 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
202 struct in6_addr *p = &rt->rt6i_gateway;
204 if (!ipv6_addr_any(p))
205 return (const void *) p;
207 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup: find or create the ND neighbour entry for the
 * chosen next-hop address on dst->dev.
 */
211 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
215 struct rt6_info *rt = (struct rt6_info *) dst;
218 daddr = choose_neigh_daddr(rt, skb, daddr);
219 n = __ipv6_neigh_lookup(dst->dev, daddr);
222 return neigh_create(&nd_tbl, daddr, dst->dev);
/* dst_ops->confirm_neigh: refresh neighbour reachability confirmation,
 * skipping devices/addresses that have no ND state.
 */
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 struct net_device *dev = dst->dev;
228 struct rt6_info *rt = (struct rt6_info *)dst;
230 daddr = choose_neigh_daddr(rt, NULL, daddr);
/* No ND on NOARP/loopback devices; multicast has no neighbour entry. */
233 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237 __ipv6_confirm_neigh(dev, daddr);
/* dst_ops vtable cloned into each netns (net->ipv6.ip6_dst_ops). */
240 static struct dst_ops ip6_dst_ops_template = {
244 .check = ip6_dst_check,
245 .default_advmss = ip6_default_advmss,
247 .cow_metrics = ipv6_cow_metrics,
248 .destroy = ip6_dst_destroy,
249 .ifdown = ip6_dst_ifdown,
250 .negative_advice = ip6_negative_advice,
251 .link_failure = ip6_link_failure,
252 .update_pmtu = ip6_rt_update_pmtu,
253 .redirect = rt6_do_redirect,
254 .local_out = __ip6_local_out,
255 .neigh_lookup = ip6_neigh_lookup,
256 .confirm_neigh = ip6_confirm_neigh,
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263 return mtu ? : dst->dev->mtu;
/* Blackhole dsts deliberately ignore PMTU updates and redirects. */
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267 struct sk_buff *skb, u32 mtu)
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops vtable for routes converted by ip6_blackhole_route(). */
276 static struct dst_ops ip6_dst_blackhole_ops = {
278 .destroy = ip6_dst_destroy,
279 .check = ip6_dst_check,
280 .mtu = ip6_blackhole_mtu,
281 .default_advmss = ip6_default_advmss,
282 .update_pmtu = ip6_rt_blackhole_update_pmtu,
283 .redirect = ip6_rt_blackhole_redirect,
284 .cow_metrics = dst_cow_metrics_generic,
285 .neigh_lookup = ip6_neigh_lookup,
/* Metric defaults for the template routes below. */
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-netns "null" route: rejects with -ENETUNREACH and
 * silently discards packets.  Copied into each netns at init time.
 */
292 static const struct rt6_info ip6_null_entry_template = {
294 .__refcnt = ATOMIC_INIT(1),
296 .obsolete = DST_OBSOLETE_FORCE_CHK,
297 .error = -ENETUNREACH,
298 .input = ip6_pkt_discard,
299 .output = ip6_pkt_discard_out,
301 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
302 .rt6i_protocol = RTPROT_KERNEL,
/* Worst possible metric so it never wins over a real route. */
303 .rt6i_metric = ~(u32) 0,
304 .rt6i_ref = ATOMIC_INIT(1),
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* "prohibit" route: like null entry but answers with admin-prohibited. */
309 static const struct rt6_info ip6_prohibit_entry_template = {
311 .__refcnt = ATOMIC_INIT(1),
313 .obsolete = DST_OBSOLETE_FORCE_CHK,
315 .input = ip6_pkt_prohibit,
316 .output = ip6_pkt_prohibit_out,
318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
319 .rt6i_protocol = RTPROT_KERNEL,
320 .rt6i_metric = ~(u32) 0,
321 .rt6i_ref = ATOMIC_INIT(1),
/* "blackhole" route: drop packets without any ICMP error. */
324 static const struct rt6_info ip6_blk_hole_entry_template = {
326 .__refcnt = ATOMIC_INIT(1),
328 .obsolete = DST_OBSOLETE_FORCE_CHK,
330 .input = dst_discard,
331 .output = dst_discard_out,
333 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
334 .rt6i_protocol = RTPROT_KERNEL,
335 .rt6i_metric = ~(u32) 0,
336 .rt6i_ref = ATOMIC_INIT(1),
341 static void rt6_info_init(struct rt6_info *rt)
343 struct dst_entry *dst = &rt->dst;
345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346 INIT_LIST_HEAD(&rt->rt6i_siblings);
347 INIT_LIST_HEAD(&rt->rt6i_uncached);
350 /* allocate dst with ip6_dst_ops */
351 static struct rt6_info *__ip6_dst_alloc(struct net *net,
352 struct net_device *dev,
/* initial refcount 1; DST_OBSOLETE_FORCE_CHK forces ip6_dst_check(). */
355 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356 1, DST_OBSOLETE_FORCE_CHK, flags);
/* Public allocator: also sets up the per-cpu clone pointer array.
 * On percpu allocation failure the dst is released (gap in extract hides
 * the error path details).
 */
364 struct rt6_info *ip6_dst_alloc(struct net *net,
365 struct net_device *dev,
368 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
371 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
375 for_each_possible_cpu(cpu) {
378 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
379 /* no one shares rt */
383 dst_release_immediate(&rt->dst);
390 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy: drop metrics, percpu clones, uncached-list linkage
 * and the inet6_dev reference.
 */
392 static void ip6_dst_destroy(struct dst_entry *dst)
394 struct rt6_info *rt = (struct rt6_info *)dst;
395 struct dst_entry *from = dst->from;
396 struct inet6_dev *idev;
398 dst_destroy_metrics_generic(dst);
399 free_percpu(rt->rt6i_pcpu);
400 rt6_uncached_list_del(rt);
402 idev = rt->rt6i_idev;
404 rt->rt6i_idev = NULL;
/* dst_ops->ifdown: when @dev goes away, repoint rt6i_idev at loopback so
 * the device reference can be released.
 */
412 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
415 struct rt6_info *rt = (struct rt6_info *)dst;
416 struct inet6_dev *idev = rt->rt6i_idev;
417 struct net_device *loopback_dev =
418 dev_net(dev)->loopback_dev;
420 if (idev && idev->dev != loopback_dev) {
421 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
423 rt->rt6i_idev = loopback_idev;
429 static bool __rt6_check_expired(const struct rt6_info *rt)
431 if (rt->rt6i_flags & RTF_EXPIRES)
432 return time_after(jiffies, rt->dst.expires);
/* Like __rt6_check_expired(), but clones also inherit expiry from the
 * parent route they were copied from (dst.from).
 */
437 static bool rt6_check_expired(const struct rt6_info *rt)
439 if (rt->rt6i_flags & RTF_EXPIRES) {
440 if (time_after(jiffies, rt->dst.expires))
442 } else if (rt->dst.from) {
/* A clone is also stale when its parent was obsoleted. */
443 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
444 rt6_check_expired((struct rt6_info *)rt->dst.from);
/* Pick one route out of an ECMP sibling group using the flow hash,
 * skipping siblings whose next hop scores as unusable.
 */
449 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
450 struct flowi6 *fl6, int oif,
453 struct rt6_info *sibling, *next_sibling;
456 /* We might have already computed the hash for ICMPv6 errors. In such
457 * case it will always be non-zero. Otherwise now is the time to do it.
460 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
462 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
463 /* Don't change the route, if route_choosen == 0
464 * (siblings does not include ourself)
467 list_for_each_entry_safe(sibling, next_sibling,
468 &match->rt6i_siblings, rt6i_siblings) {
470 if (route_choosen == 0) {
471 if (rt6_score_route(sibling, oif, strict) < 0)
481 * Route lookup. Any table->tb6_lock is implied.
/* Walk the leaf chain and pick the route matching the requested outgoing
 * interface (oif) and/or source address.  Falls back to the null entry
 * when RT6_LOOKUP_F_IFACE demands a device match that cannot be met.
 */
484 static inline struct rt6_info *rt6_device_match(struct net *net,
486 const struct in6_addr *saddr,
490 struct rt6_info *local = NULL;
491 struct rt6_info *sprt;
/* No constraints at all: the head route wins. */
493 if (!oif && ipv6_addr_any(saddr))
496 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
497 struct net_device *dev = sprt->dst.dev;
500 if (dev->ifindex == oif)
/* Loopback routes may stand in for a local address on @oif. */
502 if (dev->flags & IFF_LOOPBACK) {
503 if (!sprt->rt6i_idev ||
504 sprt->rt6i_idev->dev->ifindex != oif) {
505 if (flags & RT6_LOOKUP_F_IFACE)
508 local->rt6i_idev->dev->ifindex == oif)
/* Source-address match path (oif == 0). */
514 if (ipv6_chk_addr(net, saddr, dev,
515 flags & RT6_LOOKUP_F_IFACE))
524 if (flags & RT6_LOOKUP_F_IFACE)
525 return net->ipv6.ip6_null_entry;
531 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for sending a neighbour solicitation to a
 * possibly-stale default router (Router Reachability Probing).
 */
532 struct __rt6_probe_work {
533 struct work_struct work;
534 struct in6_addr target;
535 struct net_device *dev;
/* Workqueue handler: multicast an NS to the router's solicited-node
 * address.  (Cleanup of work/dev refs is in lines missing from this
 * extract.)
 */
538 static void rt6_probe_deferred(struct work_struct *w)
540 struct in6_addr mcaddr;
541 struct __rt6_probe_work *work =
542 container_of(w, struct __rt6_probe_work, work);
544 addrconf_addr_solict_mult(&work->target, &mcaddr);
545 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Schedule a reachability probe for @rt's gateway if its neighbour entry
 * is missing or not in a VALID state, rate-limited by
 * rtr_probe_interval.  Runs under rcu_read_lock_bh (unlock visible at
 * the tail; the lock line is missing from this extract).
 */
550 static void rt6_probe(struct rt6_info *rt)
552 struct __rt6_probe_work *work;
553 struct neighbour *neigh;
555 * Okay, this does not seem to be appropriate
556 * for now, however, we need to check if it
557 * is really so; aka Router Reachability Probing.
559 * Router Reachability Probe MUST be rate-limited
560 * to no more than one per minute.
562 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
565 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
567 if (neigh->nud_state & NUD_VALID)
571 write_lock(&neigh->lock);
572 if (!(neigh->nud_state & NUD_VALID) &&
575 rt->rt6i_idev->cnf.rtr_probe_interval)) {
576 work = kmalloc(sizeof(*work), GFP_ATOMIC);
578 __neigh_set_probe_once(neigh);
580 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally. */
582 work = kmalloc(sizeof(*work), GFP_ATOMIC);
586 INIT_WORK(&work->work, rt6_probe_deferred);
587 work->target = rt->rt6i_gateway;
588 dev_hold(rt->dst.dev);
589 work->dev = rt->dst.dev;
590 schedule_work(&work->work);
594 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF stub: probing compiled out. */
597 static inline void rt6_probe(struct rt6_info *rt)
603 * Default Router Selection (RFC 2461 6.3.6)
/* Device part of the route score: non-zero when @rt can emit on @oif
 * (directly or via a loopback route bound to it), zero otherwise.
 */
605 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
607 struct net_device *dev = rt->dst.dev;
608 if (!oif || dev->ifindex == oif)
610 if ((dev->flags & IFF_LOOPBACK) &&
611 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour part of the score: map the gateway's NUD state onto the
 * rt6_nud_state scale.  Non-gateway routes always succeed.
 */
616 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
618 struct neighbour *neigh;
619 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
621 if (rt->rt6i_flags & RTF_NONEXTHOP ||
622 !(rt->rt6i_flags & RTF_GATEWAY))
623 return RT6_NUD_SUCCEED;
626 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
628 read_lock(&neigh->lock);
629 if (neigh->nud_state & NUD_VALID)
630 ret = RT6_NUD_SUCCEED;
631 #ifdef CONFIG_IPV6_ROUTER_PREF
/* With router-pref support an unresolved (but not FAILED) neighbour is
 * still acceptable; otherwise it must be probed first.
 */
632 else if (!(neigh->nud_state & NUD_FAILED))
633 ret = RT6_NUD_SUCCEED;
635 ret = RT6_NUD_FAIL_PROBE;
637 read_unlock(&neigh->lock);
639 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
640 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
642 rcu_read_unlock_bh();
/* Combine device match, RFC 4191 preference bits, and (under
 * RT6_LOOKUP_F_REACHABLE) neighbour reachability into one score.
 * Negative values are rt6_nud_state failures.
 */
647 static int rt6_score_route(struct rt6_info *rt, int oif,
652 m = rt6_check_dev(rt, oif);
653 if (!m && (strict & RT6_LOOKUP_F_IFACE))
654 return RT6_NUD_FAIL_HARD;
655 #ifdef CONFIG_IPV6_ROUTER_PREF
656 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
658 if (strict & RT6_LOOKUP_F_REACHABLE) {
659 int n = rt6_check_neigh(rt);
/* Compare @rt against the best match so far (*mpri): returns the new
 * best route, setting *do_rr when round-robin rotation is wanted.
 */
666 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
667 int *mpri, struct rt6_info *match,
671 bool match_do_rr = false;
672 struct inet6_dev *idev = rt->rt6i_idev;
673 struct net_device *dev = rt->dst.dev;
/* Optionally skip routes whose device has no carrier. */
675 if (dev && !netif_carrier_ok(dev) &&
676 idev->cnf.ignore_routes_with_linkdown &&
677 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
680 if (rt6_check_expired(rt))
683 m = rt6_score_route(rt, oif, strict);
684 if (m == RT6_NUD_FAIL_DO_RR) {
686 m = 0; /* lowest valid score */
687 } else if (m == RT6_NUD_FAIL_HARD) {
691 if (strict & RT6_LOOKUP_F_REACHABLE)
694 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
696 *do_rr = match_do_rr;
/* Scan all routes of the given @metric in the leaf, starting at the
 * round-robin head, wrapping back to fn->leaf.
 */
704 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
705 struct rt6_info *rr_head,
706 u32 metric, int oif, int strict,
709 struct rt6_info *rt, *match, *cont;
714 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
715 if (rt->rt6i_metric != metric) {
720 match = find_match(rt, oif, strict, &mpri, match, do_rr);
723 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
724 if (rt->rt6i_metric != metric) {
729 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Continuation list: routes of other metrics seen during the scan. */
735 for (rt = cont; rt; rt = rt->dst.rt6_next)
736 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Top-level default-router selection for a fib6 node: pick the best
 * route, and rotate fn->rr_ptr when round-robin was requested.
 */
741 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
743 struct rt6_info *match, *rt0;
749 fn->rr_ptr = rt0 = fn->leaf;
751 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
755 struct rt6_info *next = rt0->dst.rt6_next;
757 /* no entries matched; do round-robin */
758 if (!next || next->rt6i_metric != rt0->rt6i_metric)
765 net = dev_net(rt0->dst.dev);
766 return match ? match : net->ipv6.ip6_null_entry;
769 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
771 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
774 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information option received in a Router
 * Advertisement from @gwaddr on @dev: validate it, then add, refresh,
 * re-prefer, or expire the corresponding RTF_ROUTEINFO route.
 */
775 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
776 const struct in6_addr *gwaddr)
778 struct net *net = dev_net(dev);
779 struct route_info *rinfo = (struct route_info *) opt;
780 struct in6_addr prefix_buf, *prefix;
782 unsigned long lifetime;
785 if (len < sizeof(struct route_info)) {
789 /* Sanity check for prefix_len and length */
790 if (rinfo->length > 3) {
792 } else if (rinfo->prefix_len > 128) {
794 } else if (rinfo->prefix_len > 64) {
795 if (rinfo->length < 2) {
798 } else if (rinfo->prefix_len > 0) {
799 if (rinfo->length < 1) {
804 pref = rinfo->route_pref;
805 if (pref == ICMPV6_ROUTER_PREF_INVALID)
808 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means a full 16-byte prefix is present in the option. */
810 if (rinfo->length == 3)
811 prefix = (struct in6_addr *)rinfo->prefix;
813 /* this function is safe */
814 ipv6_addr_prefix(&prefix_buf,
815 (struct in6_addr *)rinfo->prefix,
817 prefix = &prefix_buf;
/* ::/0 means the default router entry itself. */
820 if (rinfo->prefix_len == 0)
821 rt = rt6_get_dflt_router(gwaddr, dev);
823 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
826 if (rt && !lifetime) {
832 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
835 rt->rt6i_flags = RTF_ROUTEINFO |
836 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
839 if (!addrconf_finite_timeout(lifetime))
840 rt6_clean_expires(rt);
842 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk back up the tree (including any source-routing subtree) to find
 * the next candidate node after a failed lookup in @fn.
 */
850 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
851 struct in6_addr *saddr)
853 struct fib6_node *pn;
855 if (fn->fn_flags & RTN_TL_ROOT)
858 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
859 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
862 if (fn->fn_flags & RTN_RTINFO)
/* Table lookup without clone creation: fib6 lookup, device/saddr match,
 * optional multipath selection, with backtracking until a route (or the
 * null entry) is found.  Runs under tb6_lock read lock.
 */
867 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
868 struct fib6_table *table,
869 struct flowi6 *fl6, int flags)
871 struct fib6_node *fn;
874 read_lock_bh(&table->tb6_lock);
875 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
878 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
879 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
880 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
881 if (rt == net->ipv6.ip6_null_entry) {
882 fn = fib6_backtrack(fn, &fl6->saddr);
/* Bump refcount + lastuse before dropping the table lock. */
886 dst_use(&rt->dst, jiffies);
887 read_unlock_bh(&table->tb6_lock);
889 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Policy-routing aware front end for ip6_pol_route_lookup(). */
895 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
898 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
900 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience wrapper: build a flowi6 from daddr/saddr/oif and look the
 * route up, returning NULL-like error dsts as rt6_info pointers.
 */
902 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
903 const struct in6_addr *saddr, int oif, int strict)
905 struct flowi6 fl6 = {
909 struct dst_entry *dst;
910 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
913 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
914 flags |= RT6_LOOKUP_F_HAS_SADDR;
917 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
919 return (struct rt6_info *) dst;
925 EXPORT_SYMBOL(rt6_lookup);
927 /* ip6_ins_rt is called with FREE table->tb6_lock.
928 * It takes new route entry, the addition fails by any reason the
930 * Caller must hold dst before calling it.
/* Insert @rt into its fib6 table under the table write lock. */
933 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
934 struct mx6_config *mxc,
935 struct netlink_ext_ack *extack)
938 struct fib6_table *table;
940 table = rt->rt6i_table;
941 write_lock_bh(&table->tb6_lock);
942 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
943 write_unlock_bh(&table->tb6_lock);
/* Kernel-internal insertion helper with default netlink info and no
 * extra metrics.
 */
948 int ip6_ins_rt(struct rt6_info *rt)
950 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
951 struct mx6_config mxc = { .mx = NULL, };
953 /* Hold dst to account for the reference from the fib6 tree */
955 return __ip6_ins_rt(rt, &info, &mxc, NULL);
958 /* called with rcu_lock held */
/* Pick the device that a clone of @rt should use: for local routes this
 * is the l3mdev master (for VRF slaves), the loopback device, or the
 * master itself; otherwise the route's own device.
 */
959 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
961 struct net_device *dev = rt->dst.dev;
963 if (rt->rt6i_flags & RTF_LOCAL) {
964 /* for copies of local routes, dst->dev needs to be the
965 * device if it is a master device, the master device if
966 * device is enslaved, and the loopback as the default
968 if (netif_is_l3_slave(dev) &&
969 !rt6_need_strict(&rt->rt6i_dst.addr))
970 dev = l3mdev_master_dev_rcu(dev);
971 else if (!netif_is_l3_master(dev))
972 dev = dev_net(dev)->loopback_dev;
973 /* last case is netif_is_l3_master(dev) is true in which
974 * case we want dev returned to be dev
/* Create an RTF_CACHE clone of @ort for a specific (daddr[, saddr])
 * pair.  The clone is host-scoped (plen 128) and not owned by the fib6
 * tree.
 */
981 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
982 const struct in6_addr *daddr,
983 const struct in6_addr *saddr)
985 struct net_device *dev;
/* Clone from the original fib6 route, not from another clone. */
992 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
993 ort = (struct rt6_info *)ort->dst.from;
996 dev = ip6_rt_get_dev_rcu(ort);
997 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1002 ip6_rt_copy_init(rt, ort);
1003 rt->rt6i_flags |= RTF_CACHE;
1004 rt->rt6i_metric = 0;
1005 rt->dst.flags |= DST_HOST;
1006 rt->rt6i_dst.addr = *daddr;
1007 rt->rt6i_dst.plen = 128;
1009 if (!rt6_is_gw_or_nonexthop(ort)) {
/* Direct route to an address we own on a shorter prefix: anycast. */
1010 if (ort->rt6i_dst.plen != 128 &&
1011 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1012 rt->rt6i_flags |= RTF_ANYCAST;
1013 #ifdef CONFIG_IPV6_SUBTREES
1014 if (rt->rt6i_src.plen && saddr) {
1015 rt->rt6i_src.addr = *saddr;
1016 rt->rt6i_src.plen = 128;
/* Create an RTF_PCPU clone of @rt for per-CPU caching. */
1024 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1026 struct net_device *dev;
1027 struct rt6_info *pcpu_rt;
1030 dev = ip6_rt_get_dev_rcu(rt);
1031 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1035 ip6_rt_copy_init(pcpu_rt, rt);
1036 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1037 pcpu_rt->rt6i_flags |= RTF_PCPU;
1041 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this CPU's cached clone of @rt (with a fresh dst reference),
 * or NULL if none has been created yet.
 */
1042 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1044 struct rt6_info *pcpu_rt, **p;
1046 p = this_cpu_ptr(rt->rt6i_pcpu);
1050 dst_hold(&pcpu_rt->dst);
1051 rt6_dst_from_metrics_check(pcpu_rt);
/* Allocate and publish a per-CPU clone of @rt, racing with other CPUs
 * via cmpxchg on the per-cpu slot.  Falls back to the null entry when
 * allocation fails, and discards the clone when @rt was removed from the
 * fib6 tree in the meantime.
 */
1056 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1058 struct fib6_table *table = rt->rt6i_table;
1059 struct rt6_info *pcpu_rt, *prev, **p;
1061 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1063 struct net *net = dev_net(rt->dst.dev);
1065 dst_hold(&net->ipv6.ip6_null_entry->dst);
1066 return net->ipv6.ip6_null_entry;
1069 read_lock_bh(&table->tb6_lock);
1070 if (rt->rt6i_pcpu) {
1071 p = this_cpu_ptr(rt->rt6i_pcpu);
1072 prev = cmpxchg(p, NULL, pcpu_rt);
1074 /* If someone did it before us, return prev instead */
1075 dst_release_immediate(&pcpu_rt->dst);
1079 /* rt has been removed from the fib6 tree
1080 * before we have a chance to acquire the read_lock.
1081 * In this case, don't brother to create a pcpu rt
1082 * since rt is going away anyway. The next
1083 * dst_check() will trigger a re-lookup.
1085 dst_release_immediate(&pcpu_rt->dst);
1088 dst_hold(&pcpu_rt->dst);
1089 rt6_dst_from_metrics_check(pcpu_rt);
1090 read_unlock(&table->tb6_lock);
/* Core policy-routing lookup: select a route (with backtracking,
 * reachability relaxation and multipath), then hand back either the
 * fib6 route itself, an uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH
 * without gateway), or a per-CPU clone.  tb6_lock is taken/released
 * internally on every exit path.
 */
1094 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1095 int oif, struct flowi6 *fl6, int flags)
1097 struct fib6_node *fn, *saved_fn;
1098 struct rt6_info *rt;
1101 strict |= flags & RT6_LOOKUP_F_IFACE;
1102 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* Hosts (forwarding off) insist on reachable routers first. */
1103 if (net->ipv6.devconf_all->forwarding == 0)
1104 strict |= RT6_LOOKUP_F_REACHABLE;
1106 read_lock_bh(&table->tb6_lock);
1108 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1111 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1115 rt = rt6_select(fn, oif, strict);
1116 if (rt->rt6i_nsiblings)
1117 rt = rt6_multipath_select(rt, fl6, oif, strict);
1118 if (rt == net->ipv6.ip6_null_entry) {
1119 fn = fib6_backtrack(fn, &fl6->saddr);
1121 goto redo_rt6_select;
1122 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1123 /* also consider unreachable route */
1124 strict &= ~RT6_LOOKUP_F_REACHABLE;
1126 goto redo_rt6_select;
/* Null entry or existing cache clone: hand it back directly. */
1131 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1132 dst_use(&rt->dst, jiffies);
1133 read_unlock_bh(&table->tb6_lock);
1135 rt6_dst_from_metrics_check(rt);
1137 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1139 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1140 !(rt->rt6i_flags & RTF_GATEWAY))) {
1141 /* Create a RTF_CACHE clone which will not be
1142 * owned by the fib6 tree. It is for the special case where
1143 * the daddr in the skb during the neighbor look-up is different
1144 * from the fl6->daddr used to look-up route here.
1147 struct rt6_info *uncached_rt;
1149 dst_use(&rt->dst, jiffies);
1150 read_unlock_bh(&table->tb6_lock);
1152 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1153 dst_release(&rt->dst);
1156 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1157 * No need for another dst_hold()
1159 rt6_uncached_list_add(uncached_rt);
1161 uncached_rt = net->ipv6.ip6_null_entry;
1162 dst_hold(&uncached_rt->dst);
1165 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1169 /* Get a percpu copy */
1171 struct rt6_info *pcpu_rt;
1173 rt->dst.lastuse = jiffies;
1175 pcpu_rt = rt6_get_pcpu_route(rt);
1178 read_unlock_bh(&table->tb6_lock);
1180 /* We have to do the read_unlock first
1181 * because rt6_make_pcpu_route() may trigger
1182 * ip6_dst_gc() which will take the write_lock.
1185 read_unlock_bh(&table->tb6_lock);
1186 pcpu_rt = rt6_make_pcpu_route(rt);
1187 dst_release(&rt->dst);
1190 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1195 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path lookup: key the policy lookup on the incoming interface. */
1197 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1198 struct flowi6 *fl6, int flags)
1200 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input lookup front end; strict-scope destinations (link-local etc.)
 * force an interface match unless the device is a PIM register.
 */
1203 struct dst_entry *ip6_route_input_lookup(struct net *net,
1204 struct net_device *dev,
1205 struct flowi6 *fl6, int flags)
1207 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1208 flags |= RT6_LOOKUP_F_IFACE;
1210 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1212 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Extract the L3 flow keys for multipath hashing.  For ICMPv6 errors,
 * hash on the embedded (offending) header so the error follows the same
 * path as the original flow.
 */
1214 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1215 struct flow_keys *keys)
1217 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1218 const struct ipv6hdr *key_iph = outer_iph;
1219 const struct ipv6hdr *inner_iph;
1220 const struct icmp6hdr *icmph;
1221 struct ipv6hdr _inner_iph;
1223 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1226 icmph = icmp6_hdr(skb);
/* Only error messages embed the original packet's header. */
1227 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1228 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1229 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1230 icmph->icmp6_type != ICMPV6_PARAMPROB)
1233 inner_iph = skb_header_pointer(skb,
1234 skb_transport_offset(skb) + sizeof(*icmph),
1235 sizeof(_inner_iph), &_inner_iph);
1239 key_iph = inner_iph;
1241 memset(keys, 0, sizeof(*keys));
1242 keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1243 keys->addrs.v6addrs.src = key_iph->saddr;
1244 keys->addrs.v6addrs.dst = key_iph->daddr;
1245 keys->tags.flow_label = ip6_flowinfo(key_iph);
1246 keys->basic.ip_proto = key_iph->nexthdr;
1249 /* if skb is set it will be used and fl6 can be NULL */
/* Compute the ECMP hash either from the skb's L3 keys (ICMPv6 path) or
 * directly from the flowi6.
 */
1250 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1252 struct flow_keys hash_keys;
1255 ip6_multipath_l3_keys(skb, &hash_keys);
1256 return flow_hash_from_keys(&hash_keys);
1259 return get_hash_from_flowi6(fl6);
/* Receive-path entry point: build a flowi6 from the packet (including
 * any collect_md tunnel key), pre-hash ICMPv6 errors for multipath, and
 * attach the looked-up dst to the skb.
 */
1262 void ip6_route_input(struct sk_buff *skb)
1264 const struct ipv6hdr *iph = ipv6_hdr(skb);
1265 struct net *net = dev_net(skb->dev);
1266 int flags = RT6_LOOKUP_F_HAS_SADDR;
1267 struct ip_tunnel_info *tun_info;
1268 struct flowi6 fl6 = {
1269 .flowi6_iif = skb->dev->ifindex,
1270 .daddr = iph->daddr,
1271 .saddr = iph->saddr,
1272 .flowlabel = ip6_flowinfo(iph),
1273 .flowi6_mark = skb->mark,
1274 .flowi6_proto = iph->nexthdr,
1277 tun_info = skb_tunnel_info(skb);
1278 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1279 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
/* ICMPv6 errors must hash on the embedded flow; compute it now. */
1280 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1281 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1283 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path lookup: key the policy lookup on the outgoing interface. */
1286 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1287 struct flowi6 *fl6, int flags)
1289 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Main output-route entry point: handle strict-scope destinations via
 * l3mdev, derive lookup flags from the socket and flow, then run the
 * policy lookup.
 */
1292 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1293 struct flowi6 *fl6, int flags)
1297 if (rt6_need_strict(&fl6->daddr)) {
1298 struct dst_entry *dst;
1300 dst = l3mdev_link_scope_lookup(net, fl6);
1305 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1307 any_src = ipv6_addr_any(&fl6->saddr);
/* Bound sockets / strict destinations / oif-with-unspecified-source
 * all require an interface match.
 */
1308 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1309 (fl6->flowi6_oif && any_src))
1310 flags |= RT6_LOOKUP_F_IFACE;
1313 flags |= RT6_LOOKUP_F_HAS_SADDR;
1315 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1317 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1319 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Convert @dst_orig into a blackhole dst (used e.g. by xfrm larval
 * routes): copy addressing and metrics onto a new dst that discards all
 * traffic and ignores PMTU/redirects.  Consumes the @dst_orig reference.
 */
1321 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1323 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1324 struct net_device *loopback_dev = net->loopback_dev;
1325 struct dst_entry *new = NULL;
1327 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1328 DST_OBSOLETE_NONE, 0);
1334 new->input = dst_discard;
1335 new->output = dst_discard_out;
1337 dst_copy_metrics(new, &ort->dst);
1339 rt->rt6i_idev = in6_dev_get(loopback_dev);
1340 rt->rt6i_gateway = ort->rt6i_gateway;
/* A blackhole clone must not look like a per-cpu clone. */
1341 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1342 rt->rt6i_metric = 0;
1344 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1345 #ifdef CONFIG_IPV6_SUBTREES
1346 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1350 dst_release(dst_orig);
1351 return new ? new : ERR_PTR(-ENOMEM);
1355 * Destination cache support functions
/* Keep a clone's metrics pointer in sync with its parent (dst.from). */
1358 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1361 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1362 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a plain fib6 route against the caller's cookie and expiry. */
1365 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1369 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1372 if (rt6_check_expired(rt))
/* Validate a clone by checking itself and then its parent route. */
1378 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1380 if (!__rt6_check_expired(rt) &&
1381 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1382 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check: dispatch to the clone or plain-route validator. */
1388 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1390 struct rt6_info *rt;
1392 rt = (struct rt6_info *) dst;
1394 /* All IPV6 dsts are created with ->obsolete set to the value
1395 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1396 * into this function always.
1399 rt6_dst_from_metrics_check(rt);
1401 if (rt->rt6i_flags & RTF_PCPU ||
1402 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1403 return rt6_dst_from_check(rt, cookie);
1405 return rt6_check(rt, cookie);
/* dst_ops->negative_advice: drop expired RTF_CACHE clones so the socket
 * re-looks-up its route.
 */
1408 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1410 struct rt6_info *rt = (struct rt6_info *) dst;
1413 if (rt->rt6i_flags & RTF_CACHE) {
1414 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: report unreachability to the sender and
 * invalidate the offending cached/default route.
 */
1426 static void ip6_link_failure(struct sk_buff *skb)
1428 struct rt6_info *rt;
1430 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1432 rt = (struct rt6_info *) skb_dst(skb);
1434 if (rt->rt6i_flags & RTF_CACHE) {
1435 if (dst_hold_safe(&rt->dst))
1438 struct fib6_node *fn;
1441 fn = rcu_dereference(rt->rt6i_node);
1442 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Record a new path MTU on @rt and (re)arm its expiry using the per-netns
 * ip6_rt_mtu_expires sysctl, so the learned PMTU eventually ages out.
 */
1449 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1451 struct net *net = dev_net(rt->dst.dev);
1453 rt->rt6i_flags |= RTF_MODIFIED;
1454 rt->rt6i_pmtu = mtu;
1456 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* True when a PMTU update should be stored in a separate RTF_CACHE clone:
 * @rt is not itself a cache entry, and it is either a per-cpu copy or is
 * still linked into the fib tree.
 */
1458 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1460 return !(rt->rt6i_flags & RTF_CACHE) &&
1461 (rt->rt6i_flags & RTF_PCPU ||
1462 rcu_access_pointer(rt->rt6i_node));
/* Core PMTU update.  Addresses come from @iph when present, otherwise from
 * @sk (an ICMP-driven update supplies the header; a socket-driven one does
 * not).  If the update cannot be applied to @dst directly, a new RTF_CACHE
 * clone carrying the PMTU is created and inserted instead.
 */
1465 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1466 const struct ipv6hdr *iph, u32 mtu)
1468 const struct in6_addr *daddr, *saddr;
1469 struct rt6_info *rt6 = (struct rt6_info *)dst;
/* Local routes never need PMTU state; locked MTU metric forbids changes. */
1471 if (rt6->rt6i_flags & RTF_LOCAL)
1474 if (dst_metric_locked(dst, RTAX_MTU))
1478 daddr = &iph->daddr;
1479 saddr = &iph->saddr;
1481 daddr = &sk->sk_v6_daddr;
1482 saddr = &inet6_sk(sk)->saddr;
1487 dst_confirm_neigh(dst, daddr);
/* Never shrink below the IPv6 minimum MTU; ignore non-decreasing updates. */
1488 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1489 if (mtu >= dst_mtu(dst))
1492 if (!rt6_cache_allowed_for_pmtu(rt6)) {
1493 rt6_do_update_pmtu(rt6, mtu);
1495 struct rt6_info *nrt6;
1497 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1499 rt6_do_update_pmtu(nrt6, mtu);
1501 /* ip6_ins_rt(nrt6) will bump the
1502 * rt6->rt6i_node->fn_sernum
1503 * which will fail the next rt6_check() and
1504 * invalidate the sk->sk_dst_cache.
1507 /* Release the reference taken in
1508 * ip6_rt_cache_alloc()
1510 dst_release(&nrt6->dst);
/* dst_ops->update_pmtu hook: thin wrapper that extracts the IPv6 header
 * from @skb (when present) and defers to __ip6_rt_update_pmtu().
 */
1515 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1516 struct sk_buff *skb, u32 mtu)
1518 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Apply a PMTU update for the flow described by the packet in @skb:
 * build a flow key from the embedded IPv6 header, look up the matching
 * route, and push the (network-order) @mtu into it.
 */
1521 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1522 int oif, u32 mark, kuid_t uid)
1524 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1525 struct dst_entry *dst;
1528 memset(&fl6, 0, sizeof(fl6));
1529 fl6.flowi6_oif = oif;
/* Fall back to the reply-mark derived from the skb when no mark given. */
1530 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1531 fl6.daddr = iph->daddr;
1532 fl6.saddr = iph->saddr;
1533 fl6.flowlabel = ip6_flowinfo(iph);
1534 fl6.flowi6_uid = uid;
1536 dst = ip6_route_output(net, NULL, &fl6);
1538 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1541 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-scoped PMTU update: apply the update using the socket's bound
 * device/mark/uid, then refresh the socket's cached dst if the update
 * invalidated it (connected datagram sockets re-route here).
 */
1543 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1545 struct dst_entry *dst;
1547 ip6_update_pmtu(skb, sock_net(sk), mtu,
1548 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
/* If the cached dst still validates, there is nothing to refresh. */
1550 dst = __sk_dst_get(sk);
1551 if (!dst || !dst->obsolete ||
1552 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
/* Avoid re-routing under lock or for v4-mapped (IPv4) destinations. */
1556 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1557 ip6_datagram_dst_update(sk, false);
1560 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1562 /* Handle redirects */
1563 struct ip6rd_flowi {
1565 struct in6_addr gateway;
/* Table-lookup callback used when processing an ICMPv6 redirect: find the
 * route currently used for the destination and accept the redirect only if
 * it was sent by that route's gateway (RFC 4861).  Falls back to the null
 * entry when no acceptable route exists.
 */
1568 static struct rt6_info *__ip6_route_redirect(struct net *net,
1569 struct fib6_table *table,
1573 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1574 struct rt6_info *rt;
1575 struct fib6_node *fn;
1577 /* Get the "current" route for this destination and
1578 * check if the redirect has come from appropriate router.
1580 * RFC 4861 specifies that redirects should only be
1581 * accepted if they come from the nexthop to the target.
1582 * Due to the way the routes are chosen, this notion
1583 * is a bit fuzzy and one might need to check all possible
1587 read_lock_bh(&table->tb6_lock);
1588 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1590 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1591 if (rt6_check_expired(rt))
1595 if (!(rt->rt6i_flags & RTF_GATEWAY))
1597 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
/* Only the route whose gateway sent the redirect qualifies. */
1599 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1605 rt = net->ipv6.ip6_null_entry;
1606 else if (rt->dst.error) {
1607 rt = net->ipv6.ip6_null_entry;
/* No match at this node: backtrack up the tree and retry. */
1611 if (rt == net->ipv6.ip6_null_entry) {
1612 fn = fib6_backtrack(fn, &fl6->saddr);
1620 read_unlock_bh(&table->tb6_lock);
1622 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wrap @fl6 and the redirecting @gateway into an ip6rd_flowi and run the
 * policy-rule lookup with __ip6_route_redirect as the table callback.
 */
1626 static struct dst_entry *ip6_route_redirect(struct net *net,
1627 const struct flowi6 *fl6,
1628 const struct in6_addr *gateway)
1630 int flags = RT6_LOOKUP_F_HAS_SADDR;
1631 struct ip6rd_flowi rdfl;
1634 rdfl.gateway = *gateway;
1636 return fib6_rule_lookup(net, &rdfl.fl6,
1637 flags, __ip6_route_redirect);
/* Process an ICMPv6 redirect for the packet in @skb: build the flow from
 * the embedded IPv6 header, find the affected route (keyed also by the
 * redirecting router's source address) and apply the redirect to it.
 */
1640 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1643 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1644 struct dst_entry *dst;
1647 memset(&fl6, 0, sizeof(fl6));
1648 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1649 fl6.flowi6_oif = oif;
1650 fl6.flowi6_mark = mark;
1651 fl6.daddr = iph->daddr;
1652 fl6.saddr = iph->saddr;
1653 fl6.flowlabel = ip6_flowinfo(iph);
1654 fl6.flowi6_uid = uid;
/* The redirect must come from the current first-hop router (outer header src). */
1656 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1657 rt6_do_redirect(dst, NULL, skb);
1660 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect() for redirects whose ICMP payload carries no
 * usable inner header: the flow is keyed on the redirect message's target
 * destination and the outer header's addresses instead.
 */
1662 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1665 const struct ipv6hdr *iph = ipv6_hdr(skb);
1666 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1667 struct dst_entry *dst;
1670 memset(&fl6, 0, sizeof(fl6));
1671 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1672 fl6.flowi6_oif = oif;
1673 fl6.flowi6_mark = mark;
1674 fl6.daddr = msg->dest;
1675 fl6.saddr = iph->daddr;
1676 fl6.flowi6_uid = sock_net_uid(net, NULL);
1678 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1679 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: process a redirect using the socket's netns,
 * bound device and mark.
 */
1683 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1685 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1688 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: derive the advertised TCP MSS from the
 * route MTU, clamped below by the ip6_rt_min_advmss sysctl and above by
 * the maximal non-jumbo payload.
 */
1690 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1692 struct net_device *dev = dst->dev;
1693 unsigned int mtu = dst_mtu(dst);
1694 struct net *net = dev_net(dev);
/* MSS = MTU minus fixed IPv6 + TCP header overhead. */
1696 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1698 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1699 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1702 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1703 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1704 * IPV6_MAXPLEN is also valid and means: "any MSS,
1705 * rely only on pmtu discovery"
1707 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: effective MTU for this route.  Preference order is
 * the learned PMTU (rt6i_pmtu), then the raw RTAX_MTU metric, then the
 * device's IPv6 MTU; the result is capped and reduced by any lwtunnel
 * encapsulation headroom.
 */
1712 static unsigned int ip6_mtu(const struct dst_entry *dst)
1714 const struct rt6_info *rt = (const struct rt6_info *)dst;
1715 unsigned int mtu = rt->rt6i_pmtu;
1716 struct inet6_dev *idev;
1721 mtu = dst_metric_raw(dst, RTAX_MTU);
1728 idev = __in6_dev_get(dst->dev);
1730 mtu = idev->cnf.mtu6;
1734 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1736 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a standalone host route used for sending ICMPv6/ndisc packets
 * on @dev.  The route is not inserted into the fib tree; it is tracked on
 * the uncached list so device teardown can release it.  Returns an
 * xfrm-resolved dst or an ERR_PTR.
 */
1739 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1742 struct dst_entry *dst;
1743 struct rt6_info *rt;
1744 struct inet6_dev *idev = in6_dev_get(dev);
1745 struct net *net = dev_net(dev);
1747 if (unlikely(!idev))
1748 return ERR_PTR(-ENODEV);
1750 rt = ip6_dst_alloc(net, dev, 0);
1751 if (unlikely(!rt)) {
1753 dst = ERR_PTR(-ENOMEM);
1757 rt->dst.flags |= DST_HOST;
1758 rt->dst.output = ip6_output;
/* Host route straight to the destination; no gateway involved. */
1759 rt->rt6i_gateway = fl6->daddr;
1760 rt->rt6i_dst.addr = fl6->daddr;
1761 rt->rt6i_dst.plen = 128;
1762 rt->rt6i_idev = idev;
1763 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1765 /* Add this dst into uncached_list so that rt6_ifdown() can
1766 * do proper release of the net_device
1768 rt6_uncached_list_add(rt);
1770 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc hook: garbage-collect IPv6 routing cache entries.  Skips
 * work when called again too soon (rate limited by ip6_rt_gc_min_interval)
 * unless the entry count exceeds ip6_rt_max_size; otherwise runs the fib
 * GC with an adaptive aggressiveness (ip6_rt_gc_expire).  Returns non-zero
 * when the cache is still over its limit.
 */
1776 static int ip6_dst_gc(struct dst_ops *ops)
1778 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1779 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1780 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1781 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1782 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1783 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1786 entries = dst_entries_get_fast(ops);
1787 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1788 entries <= rt_max_size)
/* Each pressured pass becomes more aggressive... */
1791 net->ipv6.ip6_rt_gc_expire++;
1792 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1793 entries = dst_entries_get_slow(ops);
1794 if (entries < ops->gc_thresh)
1795 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
/* ...and decays geometrically once pressure is relieved. */
1797 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1798 return entries > rt_max_size;
/* Convert the RTA_METRICS netlink attributes in @cfg into an mx6_config
 * metrics array (@mxc).  Validates metric types and values; RTAX_CC_ALGO
 * names are resolved to congestion-control keys, and an ECN-capable CA
 * sets the DST_FEATURE_ECN_CA feature bit.
 */
1801 static int ip6_convert_metrics(struct mx6_config *mxc,
1802 const struct fib6_config *cfg)
1804 bool ecn_ca = false;
1812 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1816 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1817 int type = nla_type(nla);
1822 if (unlikely(type > RTAX_MAX))
1825 if (type == RTAX_CC_ALGO) {
1826 char tmp[TCP_CA_NAME_MAX];
1828 nla_strlcpy(tmp, nla, sizeof(tmp));
1829 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1830 if (val == TCP_CA_UNSPEC)
1833 val = nla_get_u32(nla);
/* Hop limit is an 8-bit field; feature bits must be known. */
1835 if (type == RTAX_HOPLIMIT && val > 255)
1837 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1841 __set_bit(type - 1, mxc->mx_valid);
1845 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1846 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolve a next-hop gateway (@gw_addr) by looking it up in the specific
 * table named by @cfg, rather than via the full policy lookup.  Used to
 * validate a route's gateway during route addition.
 */
1856 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1857 struct fib6_config *cfg,
1858 const struct in6_addr *gw_addr)
1860 struct flowi6 fl6 = {
1861 .flowi6_oif = cfg->fc_ifindex,
1863 .saddr = cfg->fc_prefsrc,
1865 struct fib6_table *table;
1866 struct rt6_info *rt;
/* Ignore link state so a not-yet-up nexthop device still resolves. */
1867 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1869 table = fib6_get_table(net, cfg->fc_table);
1873 if (!ipv6_addr_any(&cfg->fc_prefsrc))
1874 flags |= RT6_LOOKUP_F_HAS_SADDR;
1876 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1878 /* if table lookup failed, fall back to full lookup */
1879 if (rt == net->ipv6.ip6_null_entry) {
/* Build (but do not insert) a struct rt6_info from a fib6_config parsed
 * from netlink or ioctl.  Validates prefix lengths, flags, the egress
 * device, the gateway and the preferred source address; configures the
 * dst input/output handlers, lwtunnel state and reject-route semantics.
 * Returns the new route or ERR_PTR(err).  Errors are also reported via
 * @extack.  NOTE(review): this chunk is a lossy extraction — error gotos,
 * some else-branches and closing braces are not visible here.
 */
1887 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1888 struct netlink_ext_ack *extack)
1890 struct net *net = cfg->fc_nlinfo.nl_net;
1891 struct rt6_info *rt = NULL;
1892 struct net_device *dev = NULL;
1893 struct inet6_dev *idev = NULL;
1894 struct fib6_table *table;
1898 /* RTF_PCPU is an internal flag; can not be set by userspace */
1899 if (cfg->fc_flags & RTF_PCPU) {
1900 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1904 if (cfg->fc_dst_len > 128) {
1905 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1908 if (cfg->fc_src_len > 128) {
1909 NL_SET_ERR_MSG(extack, "Invalid source address length");
1912 #ifndef CONFIG_IPV6_SUBTREES
1913 if (cfg->fc_src_len) {
1914 NL_SET_ERR_MSG(extack,
1915 "Specifying source address requires IPV6_SUBTREES to be enabled");
/* Resolve the egress device and its IPv6 state, if one was named. */
1919 if (cfg->fc_ifindex) {
1921 dev = dev_get_by_index(net, cfg->fc_ifindex);
1924 idev = in6_dev_get(dev);
1929 if (cfg->fc_metric == 0)
1930 cfg->fc_metric = IP6_RT_PRIO_USER;
/* Without NLM_F_CREATE only an existing table may be used (with a
 * deprecation warning fallback to creating one). */
1933 if (cfg->fc_nlinfo.nlh &&
1934 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1935 table = fib6_get_table(net, cfg->fc_table);
1937 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1938 table = fib6_new_table(net, cfg->fc_table);
1941 table = fib6_new_table(net, cfg->fc_table);
1947 rt = ip6_dst_alloc(net, NULL,
1948 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1955 if (cfg->fc_flags & RTF_EXPIRES)
1956 rt6_set_expires(rt, jiffies +
1957 clock_t_to_jiffies(cfg->fc_expires));
1959 rt6_clean_expires(rt);
1961 if (cfg->fc_protocol == RTPROT_UNSPEC)
1962 cfg->fc_protocol = RTPROT_BOOT;
1963 rt->rt6i_protocol = cfg->fc_protocol;
/* Pick the input handler from the destination's address class. */
1965 addr_type = ipv6_addr_type(&cfg->fc_dst);
1967 if (addr_type & IPV6_ADDR_MULTICAST)
1968 rt->dst.input = ip6_mc_input;
1969 else if (cfg->fc_flags & RTF_LOCAL)
1970 rt->dst.input = ip6_input;
1972 rt->dst.input = ip6_forward;
1974 rt->dst.output = ip6_output;
/* Lightweight tunnel encap: build state and redirect input/output. */
1976 if (cfg->fc_encap) {
1977 struct lwtunnel_state *lwtstate;
1979 err = lwtunnel_build_state(cfg->fc_encap_type,
1980 cfg->fc_encap, AF_INET6, cfg,
1984 rt->dst.lwtstate = lwtstate_get(lwtstate);
1985 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1986 rt->dst.lwtstate->orig_output = rt->dst.output;
1987 rt->dst.output = lwtunnel_output;
1989 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1990 rt->dst.lwtstate->orig_input = rt->dst.input;
1991 rt->dst.input = lwtunnel_input;
1995 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1996 rt->rt6i_dst.plen = cfg->fc_dst_len;
1997 if (rt->rt6i_dst.plen == 128)
1998 rt->dst.flags |= DST_HOST;
2000 #ifdef CONFIG_IPV6_SUBTREES
2001 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2002 rt->rt6i_src.plen = cfg->fc_src_len;
2005 rt->rt6i_metric = cfg->fc_metric;
2007 /* We cannot add true routes via loopback here,
2008 they would result in kernel looping; promote them to reject routes
2010 if ((cfg->fc_flags & RTF_REJECT) ||
2011 (dev && (dev->flags & IFF_LOOPBACK) &&
2012 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2013 !(cfg->fc_flags & RTF_LOCAL))) {
2014 /* hold loopback dev/idev if we haven't done so. */
2015 if (dev != net->loopback_dev) {
2020 dev = net->loopback_dev;
2022 idev = in6_dev_get(dev);
2028 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Reject routes: error code and discard handlers depend on fc_type. */
2029 switch (cfg->fc_type) {
2031 rt->dst.error = -EINVAL;
2032 rt->dst.output = dst_discard_out;
2033 rt->dst.input = dst_discard;
2036 rt->dst.error = -EACCES;
2037 rt->dst.output = ip6_pkt_prohibit_out;
2038 rt->dst.input = ip6_pkt_prohibit;
2041 case RTN_UNREACHABLE:
2043 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2044 : (cfg->fc_type == RTN_UNREACHABLE)
2045 ? -EHOSTUNREACH : -ENETUNREACH;
2046 rt->dst.output = ip6_pkt_discard_out;
2047 rt->dst.input = ip6_pkt_discard;
/* Gateway validation. */
2053 if (cfg->fc_flags & RTF_GATEWAY) {
2054 const struct in6_addr *gw_addr;
2057 gw_addr = &cfg->fc_gateway;
2058 gwa_type = ipv6_addr_type(gw_addr);
2060 /* if gw_addr is local we will fail to detect this in case
2061 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2062 * will return already-added prefix route via interface that
2063 * prefix route was assigned to, which might be non-loopback.
2066 if (ipv6_chk_addr_and_flags(net, gw_addr,
2067 gwa_type & IPV6_ADDR_LINKLOCAL ?
2068 dev : NULL, 0, 0)) {
2069 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2072 rt->rt6i_gateway = *gw_addr;
2074 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2075 struct rt6_info *grt = NULL;
2077 /* IPv6 strictly inhibits using not link-local
2078 addresses as nexthop address.
2079 Otherwise, router will not able to send redirects.
2080 It is very good, but in some (rare!) circumstances
2081 (SIT, PtP, NBMA NOARP links) it is handy to allow
2082 some exceptions. --ANK
2083 We allow IPv4-mapped nexthops to support RFC4798-type
2086 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2087 IPV6_ADDR_MAPPED))) {
2088 NL_SET_ERR_MSG(extack,
2089 "Invalid gateway address");
/* Resolve the gateway to a device: via the named table first,
 * then via a full lookup. */
2093 if (cfg->fc_table) {
2094 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2097 if (grt->rt6i_flags & RTF_GATEWAY ||
2098 (dev && dev != grt->dst.dev)) {
2106 grt = rt6_lookup(net, gw_addr, NULL,
2107 cfg->fc_ifindex, 1);
2109 err = -EHOSTUNREACH;
2113 if (dev != grt->dst.dev) {
2119 idev = grt->rt6i_idev;
2121 in6_dev_hold(grt->rt6i_idev);
2123 if (!(grt->rt6i_flags & RTF_GATEWAY))
2132 NL_SET_ERR_MSG(extack, "Egress device not specified");
2134 } else if (dev->flags & IFF_LOOPBACK) {
2135 NL_SET_ERR_MSG(extack,
2136 "Egress device can not be loopback device for this route");
/* Preferred source address must exist on the chosen device. */
2145 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2146 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2147 NL_SET_ERR_MSG(extack, "Invalid source address");
2151 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2152 rt->rt6i_prefsrc.plen = 128;
2154 rt->rt6i_prefsrc.plen = 0;
2156 rt->rt6i_flags = cfg->fc_flags;
2160 rt->rt6i_idev = idev;
2161 rt->rt6i_table = table;
2163 cfg->fc_nlinfo.nl_net = dev_net(dev);
/* Error path: free the half-built route. */
2172 dst_release_immediate(&rt->dst);
2174 return ERR_PTR(err);
/* Create a route from @cfg, convert its metrics, and insert it into the
 * fib tree.  On insert failure the half-built route is released.
 * Returns 0 or a negative errno; errors are reported via @extack.
 */
2177 int ip6_route_add(struct fib6_config *cfg,
2178 struct netlink_ext_ack *extack)
2180 struct mx6_config mxc = { .mx = NULL, };
2181 struct rt6_info *rt;
2184 rt = ip6_route_info_create(cfg, extack);
2191 err = ip6_convert_metrics(&mxc, cfg);
2195 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2202 dst_release_immediate(&rt->dst);
/* Delete a single route from its fib table under the table write lock.
 * The null entry is never deletable.
 */
2207 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2210 struct fib6_table *table;
2211 struct net *net = dev_net(rt->dst.dev);
2213 if (rt == net->ipv6.ip6_null_entry) {
2218 table = rt->rt6i_table;
2219 write_lock_bh(&table->tb6_lock);
2220 err = fib6_del(rt, info);
2221 write_unlock_bh(&table->tb6_lock);
/* Public single-route delete: wraps __ip6_del_rt() with a default nl_info
 * carrying only the route's netns.
 */
2228 int ip6_del_rt(struct rt6_info *rt)
2230 struct nl_info info = {
2231 .nl_net = dev_net(rt->dst.dev),
2233 return __ip6_del_rt(rt, &info);
/* Delete a multipath route: when fc_delete_all_nh is set, remove @rt and
 * all of its siblings under one table lock, emitting a single RTM_DELROUTE
 * notification that covers every hop (per-hop notifications suppressed).
 */
2236 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2238 struct nl_info *info = &cfg->fc_nlinfo;
2239 struct net *net = info->nl_net;
2240 struct sk_buff *skb = NULL;
2241 struct fib6_table *table;
2244 if (rt == net->ipv6.ip6_null_entry)
2246 table = rt->rt6i_table;
2247 write_lock_bh(&table->tb6_lock);
2249 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2250 struct rt6_info *sibling, *next_sibling;
2252 /* prefer to send a single notification with all hops */
2253 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2255 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2257 if (rt6_fill_node(net, skb, rt,
2258 NULL, NULL, 0, RTM_DELROUTE,
2259 info->portid, seq, 0) < 0) {
/* Single combined skb prepared: suppress per-hop notifications. */
2263 info->skip_notify = 1;
2266 list_for_each_entry_safe(sibling, next_sibling,
2269 err = fib6_del(sibling, info);
2275 err = fib6_del(rt, info);
2277 write_unlock_bh(&table->tb6_lock);
2282 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2283 info->nlh, gfp_any());
/* RTM_DELROUTE handler: locate the route matching @cfg (prefix, source,
 * device, gateway, metric, protocol) and delete it.  If a gateway was
 * specified only that single hop is removed; otherwise sibling multipath
 * hops may be removed together.
 */
2288 static int ip6_route_del(struct fib6_config *cfg,
2289 struct netlink_ext_ack *extack)
2291 struct fib6_table *table;
2292 struct fib6_node *fn;
2293 struct rt6_info *rt;
2296 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2298 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2302 read_lock_bh(&table->tb6_lock);
2304 fn = fib6_locate(&table->tb6_root,
2305 &cfg->fc_dst, cfg->fc_dst_len,
2306 &cfg->fc_src, cfg->fc_src_len);
2309 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/* Cached clones are only deleted when explicitly requested. */
2310 if ((rt->rt6i_flags & RTF_CACHE) &&
2311 !(cfg->fc_flags & RTF_CACHE))
2313 if (cfg->fc_ifindex &&
2315 rt->dst.dev->ifindex != cfg->fc_ifindex))
2317 if (cfg->fc_flags & RTF_GATEWAY &&
2318 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2320 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2322 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2325 read_unlock_bh(&table->tb6_lock);
2327 /* if gateway was specified only delete the one hop */
2328 if (cfg->fc_flags & RTF_GATEWAY)
2329 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2331 return __ip6_del_rt_siblings(rt, cfg);
2334 read_unlock_bh(&table->tb6_lock);
/* Apply a validated ICMPv6 redirect to @dst: sanity-check the redirect
 * message and ND options, update the neighbour cache for the new next hop,
 * create and insert an RTF_CACHE clone pointing at it, notify listeners,
 * and retire the old cache entry.
 * NOTE(review): lossy extraction — several early-exit branches and the
 * on_link determination are not visible in this chunk.
 */
2339 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2341 struct netevent_redirect netevent;
2342 struct rt6_info *rt, *nrt = NULL;
2343 struct ndisc_options ndopts;
2344 struct inet6_dev *in6_dev;
2345 struct neighbour *neigh;
2347 int optlen, on_link;
2350 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2351 optlen -= sizeof(*msg);
2354 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2358 msg = (struct rd_msg *)icmp6_hdr(skb);
2360 if (ipv6_addr_is_multicast(&msg->dest)) {
2361 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is now on-link. */
2366 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2368 } else if (ipv6_addr_type(&msg->target) !=
2369 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2370 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2374 in6_dev = __in6_dev_get(skb->dev);
/* Routers and redirect-disabled interfaces ignore redirects. */
2377 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2381 * The IP source address of the Redirect MUST be the same as the current
2382 * first-hop router for the specified ICMP Destination Address.
2385 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2386 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2391 if (ndopts.nd_opts_tgt_lladdr) {
2392 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2395 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2400 rt = (struct rt6_info *) dst;
2401 if (rt->rt6i_flags & RTF_REJECT) {
2402 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2406 /* Redirect received -> path was valid.
2407 * Look, redirects are sent only in response to data packets,
2408 * so that this nexthop apparently is reachable. --ANK
2410 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2412 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2417 * We have finally decided to accept it.
2420 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2421 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2422 NEIGH_UPDATE_F_OVERRIDE|
2423 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2424 NEIGH_UPDATE_F_ISROUTER)),
2425 NDISC_REDIRECT, &ndopts);
/* Build the replacement cached route toward the new next hop. */
2427 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2431 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2433 nrt->rt6i_flags &= ~RTF_GATEWAY;
2435 nrt->rt6i_protocol = RTPROT_REDIRECT;
2436 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2438 if (ip6_ins_rt(nrt))
2441 netevent.old = &rt->dst;
2442 netevent.new = &nrt->dst;
2443 netevent.daddr = &msg->dest;
2444 netevent.neigh = neigh;
2445 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2447 if (rt->rt6i_flags & RTF_CACHE) {
2448 rt = (struct rt6_info *) dst_clone(&rt->dst);
2453 /* Release the reference taken in
2454 * ip6_rt_cache_alloc()
2456 dst_release(&nrt->dst);
2459 neigh_release(neigh);
2463 * Misc support functions
/* Link @rt to its parent @from: take a reference on the parent's dst,
 * record it in rt->dst.from, and share the parent's metrics.  The parent
 * must not itself be a clone (BUG_ON).
 */
2466 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2468 BUG_ON(from->dst.from);
2470 rt->rt6i_flags &= ~RTF_EXPIRES;
2471 dst_hold(&from->dst);
2472 rt->dst.from = &from->dst;
2473 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialize a freshly-allocated route @rt as a copy of @ort: handlers,
 * keys, flags, idev (with an extra reference), table, lwtunnel state, and
 * parent linkage via rt6_set_from().
 */
2476 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2478 rt->dst.input = ort->dst.input;
2479 rt->dst.output = ort->dst.output;
2480 rt->rt6i_dst = ort->rt6i_dst;
2481 rt->dst.error = ort->dst.error;
2482 rt->rt6i_idev = ort->rt6i_idev;
2484 in6_dev_hold(rt->rt6i_idev);
2485 rt->dst.lastuse = jiffies;
2486 rt->rt6i_gateway = ort->rt6i_gateway;
2487 rt->rt6i_flags = ort->rt6i_flags;
2488 rt6_set_from(rt, ort);
2489 rt->rt6i_metric = ort->rt6i_metric;
2490 #ifdef CONFIG_IPV6_SUBTREES
2491 rt->rt6i_src = ort->rt6i_src;
2493 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2494 rt->rt6i_table = ort->rt6i_table;
2495 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2498 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA route-information route for @prefix/@prefixlen via
 * @gwaddr on @dev (RTF_ROUTEINFO|RTF_GATEWAY must both be set).  Returns
 * the route or NULL.
 */
2499 static struct rt6_info *rt6_get_route_info(struct net *net,
2500 const struct in6_addr *prefix, int prefixlen,
2501 const struct in6_addr *gwaddr,
2502 struct net_device *dev)
/* RA routes live in the l3mdev table when one exists, else RT6_TABLE_INFO. */
2504 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2505 int ifindex = dev->ifindex;
2506 struct fib6_node *fn;
2507 struct rt6_info *rt = NULL;
2508 struct fib6_table *table;
2510 table = fib6_get_table(net, tb_id);
2514 read_lock_bh(&table->tb6_lock);
2515 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2519 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2520 if (rt->dst.dev->ifindex != ifindex)
2522 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2524 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2530 read_unlock_bh(&table->tb6_lock);
/* Install a route learned from an RA route-information option, then look
 * it up again and return it (ip6_route_add does not return the route).
 * A zero-length prefix is treated as a default route.
 */
2534 static struct rt6_info *rt6_add_route_info(struct net *net,
2535 const struct in6_addr *prefix, int prefixlen,
2536 const struct in6_addr *gwaddr,
2537 struct net_device *dev,
2540 struct fib6_config cfg = {
2541 .fc_metric = IP6_RT_PRIO_USER,
2542 .fc_ifindex = dev->ifindex,
2543 .fc_dst_len = prefixlen,
2544 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2545 RTF_UP | RTF_PREF(pref),
2546 .fc_protocol = RTPROT_RA,
2547 .fc_nlinfo.portid = 0,
2548 .fc_nlinfo.nlh = NULL,
2549 .fc_nlinfo.nl_net = net,
2552 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2553 cfg.fc_dst = *prefix;
2554 cfg.fc_gateway = *gwaddr;
2556 /* We should treat it as a default route if prefix length is 0. */
2558 cfg.fc_flags |= RTF_DEFAULT;
/* Return value deliberately ignored; the lookup below reports success. */
2560 ip6_route_add(&cfg, NULL);
2562 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-installed default router entry for gateway @addr on @dev
 * (both RTF_ADDRCONF and RTF_DEFAULT set).  Returns the route or NULL.
 */
2566 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2568 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2569 struct rt6_info *rt;
2570 struct fib6_table *table;
2572 table = fib6_get_table(dev_net(dev), tb_id);
2576 read_lock_bh(&table->tb6_lock);
2577 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2578 if (dev == rt->dst.dev &&
2579 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2580 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2585 read_unlock_bh(&table->tb6_lock);
/* Install a default router learned from an RA (expiring, preference
 * @pref), mark its table as holding a default router, and return the
 * freshly-inserted route via rt6_get_dflt_router().
 */
2589 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2590 struct net_device *dev,
2593 struct fib6_config cfg = {
2594 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2595 .fc_metric = IP6_RT_PRIO_USER,
2596 .fc_ifindex = dev->ifindex,
2597 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2598 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2599 .fc_protocol = RTPROT_RA,
2600 .fc_nlinfo.portid = 0,
2601 .fc_nlinfo.nlh = NULL,
2602 .fc_nlinfo.nl_net = dev_net(dev),
2605 cfg.fc_gateway = *gwaddr;
2607 if (!ip6_route_add(&cfg, NULL)) {
2608 struct fib6_table *table;
2610 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* Flag lets rt6_purge_dflt_routers() skip tables with no defaults. */
2612 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2615 return rt6_get_dflt_router(gwaddr, dev);
/* Remove all RA-learned default routers from @table, except on interfaces
 * configured with accept_ra == 2 (accept RAs even when forwarding).
 * Restarts the scan after each deletion since the lock is dropped.
 */
2618 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2620 struct rt6_info *rt;
2623 read_lock_bh(&table->tb6_lock);
2624 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2625 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2626 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
/* Must drop the read lock before deleting; loop restarts after. */
2628 read_unlock_bh(&table->tb6_lock);
2633 read_unlock_bh(&table->tb6_lock);
2635 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge RA default routers from every fib table in @net that is flagged
 * as holding one (RT6_TABLE_HAS_DFLT_ROUTER).
 */
2638 void rt6_purge_dflt_routers(struct net *net)
2640 struct fib6_table *table;
2641 struct hlist_head *head;
2646 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2647 head = &net->ipv6.fib_table_hash[h];
2648 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2649 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2650 __rt6_purge_dflt_routers(table);
/* Translate a legacy ioctl in6_rtmsg into a fib6_config, field by field,
 * for use by the route add/del paths.
 */
2657 static void rtmsg_to_fib6_config(struct net *net,
2658 struct in6_rtmsg *rtmsg,
2659 struct fib6_config *cfg)
2661 memset(cfg, 0, sizeof(*cfg));
2663 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2665 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2666 cfg->fc_metric = rtmsg->rtmsg_metric;
2667 cfg->fc_expires = rtmsg->rtmsg_info;
2668 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2669 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2670 cfg->fc_flags = rtmsg->rtmsg_flags;
2672 cfg->fc_nlinfo.nl_net = net;
2674 cfg->fc_dst = rtmsg->rtmsg_dst;
2675 cfg->fc_src = rtmsg->rtmsg_src;
2676 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN,
 * copies the in6_rtmsg from userspace, converts it to a fib6_config and
 * dispatches to ip6_route_add()/ip6_route_del().
 */
2679 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2681 struct fib6_config cfg;
2682 struct in6_rtmsg rtmsg;
2686 case SIOCADDRT: /* Add a route */
2687 case SIOCDELRT: /* Delete a route */
2688 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2690 err = copy_from_user(&rtmsg, arg,
2691 sizeof(struct in6_rtmsg));
2695 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2700 err = ip6_route_add(&cfg, NULL);
2703 err = ip6_route_del(&cfg, NULL);
2717 * Drop the packet on the floor
/* Drop a packet that matched a discard/reject route: bump the appropriate
 * per-direction "no route" (or address-error) counter and send an ICMPv6
 * destination-unreachable with @code.
 */
2720 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2723 struct dst_entry *dst = skb_dst(skb);
2724 switch (ipstats_mib_noroutes) {
2725 case IPSTATS_MIB_INNOROUTES:
/* An unspecified destination counts as an address error instead. */
2726 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2727 if (type == IPV6_ADDR_ANY) {
2728 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2729 IPSTATS_MIB_INADDRERRORS);
2733 case IPSTATS_MIB_OUTNOROUTES:
2734 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2735 ipstats_mib_noroutes);
2738 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for blackhole/unreachable routes (inbound). */
2743 static int ip6_pkt_discard(struct sk_buff *skb)
2745 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for blackhole/unreachable routes (outbound). */
2748 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2750 skb->dev = skb_dst(skb)->dev;
2751 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst.input handler for prohibit routes (inbound, admin prohibited). */
2754 static int ip6_pkt_prohibit(struct sk_buff *skb)
2756 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for prohibit routes (outbound, admin prohibited). */
2759 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2761 skb->dev = skb_dst(skb)->dev;
2762 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2766 * Allocate a dst for local (unicast / anycast) address.
/* Allocate the local host route (/128) for a unicast or anycast address
 * owned by @idev, as used by addrconf.  The route delivers locally via
 * ip6_input and lands in the (possibly l3mdev-overridden) local table.
 */
2769 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2770 const struct in6_addr *addr,
2774 struct net *net = dev_net(idev->dev);
2775 struct net_device *dev = idev->dev;
2776 struct rt6_info *rt;
2778 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2780 return ERR_PTR(-ENOMEM);
2784 rt->dst.flags |= DST_HOST;
2785 rt->dst.input = ip6_input;
2786 rt->dst.output = ip6_output;
2787 rt->rt6i_idev = idev;
2789 rt->rt6i_protocol = RTPROT_KERNEL;
2790 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
/* Anycast vs local flag chosen by the caller (branch condition not
 * visible in this chunk). */
2792 rt->rt6i_flags |= RTF_ANYCAST;
2794 rt->rt6i_flags |= RTF_LOCAL;
2796 rt->rt6i_gateway = *addr;
2797 rt->rt6i_dst.addr = *addr;
2798 rt->rt6i_dst.plen = 128;
2799 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2800 rt->rt6i_table = fib6_get_table(net, tb_id);
2805 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(): the device and address being
 * removed, passed through fib6_clean_all().
 */
2806 struct arg_dev_net_ip {
2807 struct net_device *dev;
2809 struct in6_addr *addr;
/* fib6_clean_all() callback: clear the preferred-source setting from any
 * route that references the address being deleted (on the given device,
 * or on any device when arg->dev is NULL).
 */
2812 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2814 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2815 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2816 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2818 if (((void *)rt->dst.dev == dev || !dev) &&
2819 rt != net->ipv6.ip6_null_entry &&
2820 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2821 /* remove prefsrc entry */
2822 rt->rt6i_prefsrc.plen = 0;
/* An address @ifp is going away: walk all fib tables and strip it from
 * any route using it as the preferred source.
 */
2827 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2829 struct net *net = dev_net(ifp->idev->dev);
2830 struct arg_dev_net_ip adni = {
2831 .dev = ifp->idev->dev,
2835 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2838 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2839 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2841 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: a former router @gateway became a plain host;
 * select for removal the RA default-router entries and gateway cache
 * entries that point at it.
 */
2842 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2844 struct in6_addr *gateway = (struct in6_addr *)arg;
2846 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2847 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2848 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
/* Purge router-derived state for @gateway across all fib tables. */
2854 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2856 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for fib6_ifdown(): the device going down (NULL means
 * every device) plus its netns.
 */
2859 struct arg_dev_net {
2860 struct net_device *dev;
2864 /* called with write lock held for table with rt */
2864 /* called with write lock held for table with rt */
/* fib6_clean_all() callback on device shutdown: select routes bound to
 * the dying device for removal.  Multipath routes are spared unless the
 * device is being unregistered or ignore_routes_with_linkdown is off.
 */
2865 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2867 const struct arg_dev_net *adn = arg;
2868 const struct net_device *dev = adn->dev;
2870 if ((rt->dst.dev == dev || !dev) &&
2871 rt != adn->net->ipv6.ip6_null_entry &&
2872 (rt->rt6i_nsiblings == 0 ||
2873 (dev && netdev_unregistering(dev)) ||
2874 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
/* Device-down handler: remove fib routes using @dev and flush matching
 * entries from the uncached dst list.
 */
2880 void rt6_ifdown(struct net *net, struct net_device *dev)
2882 struct arg_dev_net adn = {
2887 fib6_clean_all(net, fib6_ifdown, &adn);
2889 rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for rt6_mtu_change_route(): the device whose MTU changed
 * (new MTU field not visible in this chunk).
 */
2892 struct rt6_mtu_change_arg {
2893 struct net_device *dev;
/* fib6_clean_all() callback: propagate a device MTU change into routes on
 * that device.  Learned PMTUs on cache entries are only lowered; the
 * RTAX_MTU metric is raised as well when the old value equaled the old
 * device MTU (i.e. the route was just tracking the device).
 */
2897 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2899 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2900 struct inet6_dev *idev;
2902 /* In IPv6 pmtu discovery is not optional,
2903 so that RTAX_MTU lock cannot disable it.
2904 We still use this lock to block changes
2905 caused by addrconf/ndisc.
2908 idev = __in6_dev_get(arg->dev);
2912 /* For administrative MTU increase, there is no way to discover
2913 IPv6 PMTU increase, so PMTU increase should be updated here.
2914 Since RFC 1981 doesn't include administrative MTU increase
2915 update PMTU increase is a MUST. (i.e. jumbo frame)
2918 If new MTU is less than route PMTU, this new MTU will be the
2919 lowest MTU in the path, update the route PMTU to reflect PMTU
2920 decreases; if new MTU is greater than route PMTU, and the
2921 old MTU is the lowest MTU in the path, update the route PMTU
2922 to reflect the increase. In this case if the other nodes' MTU
2923 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2926 if (rt->dst.dev == arg->dev &&
2927 dst_metric_raw(&rt->dst, RTAX_MTU) &&
2928 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2929 if (rt->rt6i_flags & RTF_CACHE) {
2930 /* For RTF_CACHE with rt6i_pmtu == 0
2931 * (i.e. a redirected route),
2932 * the metrics of its rt->dst.from has already
2935 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2936 rt->rt6i_pmtu = arg->mtu;
2937 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2938 (dst_mtu(&rt->dst) < arg->mtu &&
2939 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2940 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Entry point: walk all FIB tables of @dev's netns and update route MTU
 * metrics after the device MTU changed to @mtu.
 */
2946 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2948 struct rt6_mtu_change_arg arg = {
2953 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for IPv6 RTM_NEWROUTE/DELROUTE/
 * GETROUTE requests (consumed by nlmsg_parse in rtm_to_fib6_config and
 * inet6_rtm_getroute).
 */
2956 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2957 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2958 [RTA_OIF] = { .type = NLA_U32 },
2959 [RTA_IIF] = { .type = NLA_U32 },
2960 [RTA_PRIORITY] = { .type = NLA_U32 },
2961 [RTA_METRICS] = { .type = NLA_NESTED },
2962 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2963 [RTA_PREF] = { .type = NLA_U8 },
2964 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2965 [RTA_ENCAP] = { .type = NLA_NESTED },
2966 [RTA_EXPIRES] = { .type = NLA_U32 },
2967 [RTA_UID] = { .type = NLA_U32 },
2968 [RTA_MARK] = { .type = NLA_U32 },
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.  Parses the rtmsg header plus attributes under
 * rtm_ipv6_policy, maps RTN_* route types to RTF_* flags, and copies
 * gateway/dst/src/prefsrc/metrics/multipath/encap/expires settings.
 * Returns 0 or a negative errno (error paths partially elided in this
 * excerpt).  cfg->fc_mx and fc_mp point INTO the skb attributes — they
 * are only valid while the message is alive.
 */
2971 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2972 struct fib6_config *cfg,
2973 struct netlink_ext_ack *extack)
2976 struct nlattr *tb[RTA_MAX+1];
2980 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2986 rtm = nlmsg_data(nlh);
2987 memset(cfg, 0, sizeof(*cfg));
2989 cfg->fc_table = rtm->rtm_table;
2990 cfg->fc_dst_len = rtm->rtm_dst_len;
2991 cfg->fc_src_len = rtm->rtm_src_len;
2992 cfg->fc_flags = RTF_UP;
2993 cfg->fc_protocol = rtm->rtm_protocol;
2994 cfg->fc_type = rtm->rtm_type;
/* Reject-style route types all set RTF_REJECT; fc_type keeps the
 * specific variant for error-code selection at insert time. */
2996 if (rtm->rtm_type == RTN_UNREACHABLE ||
2997 rtm->rtm_type == RTN_BLACKHOLE ||
2998 rtm->rtm_type == RTN_PROHIBIT ||
2999 rtm->rtm_type == RTN_THROW)
3000 cfg->fc_flags |= RTF_REJECT;
3002 if (rtm->rtm_type == RTN_LOCAL)
3003 cfg->fc_flags |= RTF_LOCAL;
3005 if (rtm->rtm_flags & RTM_F_CLONED)
3006 cfg->fc_flags |= RTF_CACHE;
3008 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3009 cfg->fc_nlinfo.nlh = nlh;
3010 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3012 if (tb[RTA_GATEWAY]) {
3013 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3014 cfg->fc_flags |= RTF_GATEWAY;
/* dst/src prefixes are copied only up to their prefix length in bytes. */
3018 int plen = (rtm->rtm_dst_len + 7) >> 3;
3020 if (nla_len(tb[RTA_DST]) < plen)
3023 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3027 int plen = (rtm->rtm_src_len + 7) >> 3;
3029 if (nla_len(tb[RTA_SRC]) < plen)
3032 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3035 if (tb[RTA_PREFSRC])
3036 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3039 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3041 if (tb[RTA_PRIORITY])
3042 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3044 if (tb[RTA_METRICS]) {
3045 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3046 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* RTA_TABLE overrides the 8-bit rtm_table field (32-bit table ids). */
3050 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3052 if (tb[RTA_MULTIPATH]) {
3053 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3054 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3056 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3057 cfg->fc_mp_len, extack);
/* Unknown router-preference values fall back to MEDIUM (RFC 4191). */
3063 pref = nla_get_u8(tb[RTA_PREF]);
3064 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3065 pref != ICMPV6_ROUTER_PREF_HIGH)
3066 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3067 cfg->fc_flags |= RTF_PREF(pref);
3071 cfg->fc_encap = tb[RTA_ENCAP];
3073 if (tb[RTA_ENCAP_TYPE]) {
3074 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3076 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3081 if (tb[RTA_EXPIRES]) {
3082 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3084 if (addrconf_finite_timeout(timeout)) {
3085 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3086 cfg->fc_flags |= RTF_EXPIRES;
/* Per-nexthop bookkeeping entry built by ip6_route_multipath_add():
 * the created route, its config, converted metrics, and list linkage.
 * NOTE(review): the 'struct rt6_nh {' opening line is missing from this
 * excerpt.
 */
3096 struct rt6_info *rt6_info;
3097 struct fib6_config r_cfg;
3098 struct mx6_config mxc;
3099 struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can audit
 * which routes may be left in an inconsistent state.
 */
3102 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3106 list_for_each_entry(nh, rt6_nh_list, next) {
3107 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3108 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3109 nh->r_cfg.fc_ifindex);
/* Append (rt, r_cfg) to @rt6_nh_list unless an equivalent nexthop is
 * already present (rt6_duplicate_nexthop).  Converts the config metrics
 * into nh->mxc.  Returns 0 on success; the duplicate/alloc-failure
 * return statements are on lines elided from this excerpt.
 */
3113 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3114 struct rt6_info *rt, struct fib6_config *r_cfg)
3119 list_for_each_entry(nh, rt6_nh_list, next) {
3120 /* check if rt6_info already exists */
3121 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3125 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3129 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3134 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3135 list_add_tail(&nh->next, rt6_nh_list);
/* Send one RTM_NEWROUTE notification for a multipath add/append.
 * For NLM_F_APPEND the notification must start at the first nexthop of
 * the sibling group, so rewind from the last-inserted route.
 */
3140 static void ip6_route_mpath_notify(struct rt6_info *rt,
3141 struct rt6_info *rt_last,
3142 struct nl_info *info,
3145 /* if this is an APPEND route, then rt points to the first route
3146 * inserted and rt_last points to last route inserted. Userspace
3147 * wants a consistent dump of the route which starts at the first
3148 * nexthop. Since sibling routes are always added at the end of
3149 * the list, find the first sibling of the last route appended
3151 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3152 rt = list_first_entry(&rt_last->rt6i_siblings,
3158 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add (or, with NLM_F_REPLACE, replace) a multipath route.  Two phases:
 * 1) parse each rtnexthop in cfg->fc_mp into an rt6_info and collect
 *    them in rt6_nh_list; 2) insert them one by one with notifications
 *    suppressed, then send a single notification for the whole group.
 * On insert failure the already-added routes are rolled back (deleted).
 * Several error-path lines are elided in this excerpt.
 */
3161 static int ip6_route_multipath_add(struct fib6_config *cfg,
3162 struct netlink_ext_ack *extack)
3164 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3165 struct nl_info *info = &cfg->fc_nlinfo;
3166 struct fib6_config r_cfg;
3167 struct rtnexthop *rtnh;
3168 struct rt6_info *rt;
3169 struct rt6_nh *err_nh;
3170 struct rt6_nh *nh, *nh_safe;
3176 int replace = (cfg->fc_nlinfo.nlh &&
3177 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3178 LIST_HEAD(rt6_nh_list);
3180 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3181 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3182 nlflags |= NLM_F_APPEND;
3184 remaining = cfg->fc_mp_len;
3185 rtnh = (struct rtnexthop *)cfg->fc_mp;
3187 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3188 * rt6_info structs per nexthop
3190 while (rtnh_ok(rtnh, remaining)) {
/* Each nexthop starts from the shared cfg, then overrides per-hop
 * ifindex/gateway/encap attributes. */
3191 memcpy(&r_cfg, cfg, sizeof(*cfg));
3192 if (rtnh->rtnh_ifindex)
3193 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3195 attrlen = rtnh_attrlen(rtnh);
3197 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3199 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3201 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3202 r_cfg.fc_flags |= RTF_GATEWAY;
3204 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3205 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3207 r_cfg.fc_encap_type = nla_get_u16(nla);
3210 rt = ip6_route_info_create(&r_cfg, extack);
3217 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3219 dst_release_immediate(&rt->dst);
3223 rtnh = rtnh_next(rtnh, &remaining);
3226 /* for add and replace send one notification with all nexthops.
3227 * Skip the notification in fib6_add_rt2node and send one with
3228 * the full route when done
3230 info->skip_notify = 1;
3233 list_for_each_entry(nh, &rt6_nh_list, next) {
3234 rt_last = nh->rt6_info;
3235 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3236 /* save reference to first route for notification */
3237 if (!rt_notif && !err)
3238 rt_notif = nh->rt6_info;
3240 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3241 nh->rt6_info = NULL;
3244 ip6_print_replace_route_err(&rt6_nh_list);
3249 /* Because each route is added like a single route we remove
3250 * these flags after the first nexthop: if there is a collision,
3251 * we have already failed to add the first nexthop:
3252 * fib6_add_rt2node() has rejected it; when replacing, old
3253 * nexthops have been replaced by first new, the rest should
3256 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3261 /* success ... tell user about new route */
3262 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3266 /* send notification for routes that were added so that
3267 * the delete notifications sent by ip6_route_del are
3271 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3273 /* Delete routes that were already added */
3274 list_for_each_entry(nh, &rt6_nh_list, next) {
3277 ip6_route_del(&nh->r_cfg, extack);
/* Final cleanup: free every remaining list entry (and any rt6_info
 * that was never inserted). */
3281 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3283 dst_release_immediate(&nh->rt6_info->dst);
3285 list_del(&nh->next);
/* Delete each nexthop of a multipath route individually via
 * ip6_route_del().  Per-hop ifindex and gateway from the rtnexthop
 * override the shared cfg.  The last_err accumulation/return lines are
 * elided in this excerpt.
 */
3292 static int ip6_route_multipath_del(struct fib6_config *cfg,
3293 struct netlink_ext_ack *extack)
3295 struct fib6_config r_cfg;
3296 struct rtnexthop *rtnh;
3299 int err = 1, last_err = 0;
3301 remaining = cfg->fc_mp_len;
3302 rtnh = (struct rtnexthop *)cfg->fc_mp;
3304 /* Parse a Multipath Entry */
3305 while (rtnh_ok(rtnh, remaining)) {
3306 memcpy(&r_cfg, cfg, sizeof(*cfg));
3307 if (rtnh->rtnh_ifindex)
3308 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3310 attrlen = rtnh_attrlen(rtnh);
3312 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3314 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3316 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3317 r_cfg.fc_flags |= RTF_GATEWAY;
3320 err = ip6_route_del(&r_cfg, extack);
3324 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE doit handler: parse the request, then dispatch to the
 * multipath or single-route delete path.  fc_delete_all_nh tells
 * ip6_route_del to remove the whole sibling group.
 */
3330 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3331 struct netlink_ext_ack *extack)
3333 struct fib6_config cfg;
3336 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3341 return ip6_route_multipath_del(&cfg, extack);
3343 cfg.fc_delete_all_nh = 1;
3344 return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE doit handler: parse the request, then dispatch to the
 * multipath or single-route add path.
 */
3348 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3349 struct netlink_ext_ack *extack)
3351 struct fib6_config cfg;
3354 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3359 return ip6_route_multipath_add(&cfg, extack);
3361 return ip6_route_add(&cfg, extack);
/* Worst-case netlink message size for dumping @rt, used to size the skb
 * in inet6_rt_notify().  Multipath routes add an RTA_MULTIPATH nest with
 * one rtnexthop (+gateway +encap) per sibling.  Must stay in sync with
 * what rt6_fill_node() actually emits, else the -EMSGSIZE WARN fires.
 */
3364 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3366 int nexthop_len = 0;
3368 if (rt->rt6i_nsiblings) {
3369 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3370 + NLA_ALIGN(sizeof(struct rtnexthop))
3371 + nla_total_size(16) /* RTA_GATEWAY */
3372 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3374 nexthop_len *= rt->rt6i_nsiblings;
3377 return NLMSG_ALIGN(sizeof(struct rtmsg))
3378 + nla_total_size(16) /* RTA_SRC */
3379 + nla_total_size(16) /* RTA_DST */
3380 + nla_total_size(16) /* RTA_GATEWAY */
3381 + nla_total_size(16) /* RTA_PREFSRC */
3382 + nla_total_size(4) /* RTA_TABLE */
3383 + nla_total_size(4) /* RTA_IIF */
3384 + nla_total_size(4) /* RTA_OIF */
3385 + nla_total_size(4) /* RTA_PRIORITY */
3386 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3387 + nla_total_size(sizeof(struct rta_cacheinfo))
3388 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3389 + nla_total_size(1) /* RTA_PREF */
3390 + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* Emit the nexthop attributes for @rt into @skb and accumulate RTNH_F_*
 * status bits into *flags (LINKDOWN/DEAD from carrier state, OFFLOAD
 * from rt6i_nh_flags).  @skip_oif is true for multipath encoding where
 * the rtnexthop struct already carries the ifindex.  Returns 0 or jumps
 * to nla_put_failure on message overflow.
 */
3394 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3395 unsigned int *flags, bool skip_oif)
3397 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3398 *flags |= RTNH_F_LINKDOWN;
3399 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3400 *flags |= RTNH_F_DEAD;
3403 if (rt->rt6i_flags & RTF_GATEWAY) {
3404 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3405 goto nla_put_failure;
3408 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3409 *flags |= RTNH_F_OFFLOAD;
3411 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3412 if (!skip_oif && rt->dst.dev &&
3413 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3414 goto nla_put_failure;
3416 if (rt->dst.lwtstate &&
3417 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3418 goto nla_put_failure;
3426 /* add multipath next hop */
/* Reserve an rtnexthop header in @skb, fill it from @rt (via
 * rt6_nexthop_info with skip_oif=true), then back-patch rtnh_len with
 * the total bytes emitted for this hop.
 */
3427 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3429 struct rtnexthop *rtnh;
3430 unsigned int flags = 0;
3432 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3434 goto nla_put_failure;
3436 rtnh->rtnh_hops = 0;
3437 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3439 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3440 goto nla_put_failure;
3442 rtnh->rtnh_flags = flags;
3444 /* length of rtnetlink header + attributes */
3445 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Serialize @rt into an RTM_* netlink message in @skb: rtmsg header,
 * table, route type (derived from RTF_REJECT + dst.error or RTF_LOCAL/
 * ANYCAST/loopback), dst/src prefixes, prefsrc, metrics (with RTF_CACHE
 * PMTU override), RTA_MULTIPATH nest for sibling groups, cacheinfo and
 * router preference.  Returns 0 or -EMSGSIZE-style failure after
 * nlmsg_cancel.  @dst/@src non-NULL means a specific-address GETROUTE
 * reply (full /128 encoded); NULL means a table dump entry.
 */
3453 static int rt6_fill_node(struct net *net,
3454 struct sk_buff *skb, struct rt6_info *rt,
3455 struct in6_addr *dst, struct in6_addr *src,
3456 int iif, int type, u32 portid, u32 seq,
3459 u32 metrics[RTAX_MAX];
3461 struct nlmsghdr *nlh;
3465 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3469 rtm = nlmsg_data(nlh);
3470 rtm->rtm_family = AF_INET6;
3471 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3472 rtm->rtm_src_len = rt->rt6i_src.plen;
3475 table = rt->rt6i_table->tb6_id;
3477 table = RT6_TABLE_UNSPEC;
3478 rtm->rtm_table = table;
3479 if (nla_put_u32(skb, RTA_TABLE, table))
3480 goto nla_put_failure;
/* Map a reject route's dst.error back to the RTN_* type userspace set. */
3481 if (rt->rt6i_flags & RTF_REJECT) {
3482 switch (rt->dst.error) {
3484 rtm->rtm_type = RTN_BLACKHOLE;
3487 rtm->rtm_type = RTN_PROHIBIT;
3490 rtm->rtm_type = RTN_THROW;
3493 rtm->rtm_type = RTN_UNREACHABLE;
3497 else if (rt->rt6i_flags & RTF_LOCAL)
3498 rtm->rtm_type = RTN_LOCAL;
3499 else if (rt->rt6i_flags & RTF_ANYCAST)
3500 rtm->rtm_type = RTN_ANYCAST;
3501 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3502 rtm->rtm_type = RTN_LOCAL;
3504 rtm->rtm_type = RTN_UNICAST;
3506 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3507 rtm->rtm_protocol = rt->rt6i_protocol;
3509 if (rt->rt6i_flags & RTF_CACHE)
3510 rtm->rtm_flags |= RTM_F_CLONED;
3513 if (nla_put_in6_addr(skb, RTA_DST, dst))
3514 goto nla_put_failure;
3515 rtm->rtm_dst_len = 128;
3516 } else if (rtm->rtm_dst_len)
3517 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3518 goto nla_put_failure;
3519 #ifdef CONFIG_IPV6_SUBTREES
3521 if (nla_put_in6_addr(skb, RTA_SRC, src))
3522 goto nla_put_failure;
3523 rtm->rtm_src_len = 128;
3524 } else if (rtm->rtm_src_len &&
3525 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3526 goto nla_put_failure;
3529 #ifdef CONFIG_IPV6_MROUTE
3530 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3531 int err = ip6mr_get_route(net, skb, rtm, portid);
3536 goto nla_put_failure;
3539 if (nla_put_u32(skb, RTA_IIF, iif))
3540 goto nla_put_failure;
3542 struct in6_addr saddr_buf;
3543 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3544 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3545 goto nla_put_failure;
3548 if (rt->rt6i_prefsrc.plen) {
3549 struct in6_addr saddr_buf;
3550 saddr_buf = rt->rt6i_prefsrc.addr;
3551 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3552 goto nla_put_failure;
/* RTF_CACHE routes report the discovered PMTU instead of the metric. */
3555 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3557 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3558 if (rtnetlink_put_metrics(skb, metrics) < 0)
3559 goto nla_put_failure;
3561 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3562 goto nla_put_failure;
3564 /* For multipath routes, walk the siblings list and add
3565 * each as a nexthop within RTA_MULTIPATH.
3567 if (rt->rt6i_nsiblings) {
3568 struct rt6_info *sibling, *next_sibling;
3571 mp = nla_nest_start(skb, RTA_MULTIPATH);
3573 goto nla_put_failure;
3575 if (rt6_add_nexthop(skb, rt) < 0)
3576 goto nla_put_failure;
3578 list_for_each_entry_safe(sibling, next_sibling,
3579 &rt->rt6i_siblings, rt6i_siblings) {
3580 if (rt6_add_nexthop(skb, sibling) < 0)
3581 goto nla_put_failure;
3584 nla_nest_end(skb, mp);
3586 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3587 goto nla_put_failure;
3590 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3592 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3593 goto nla_put_failure;
3595 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3596 goto nla_put_failure;
3599 nlmsg_end(skb, nlh);
3603 nlmsg_cancel(skb, nlh);
/* fib6 walker callback for RTM_GETROUTE dumps: skip the null entry,
 * honor the RTM_F_PREFIX filter (prefix routes only), and emit the
 * route with rt6_fill_node using the dump's portid/seq.
 */
3607 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3609 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3610 struct net *net = arg->net;
3612 if (rt == net->ipv6.ip6_null_entry)
3615 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3616 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3618 /* user wants prefix routes only */
3619 if (rtm->rtm_flags & RTM_F_PREFIX &&
3620 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3621 /* success since this is not a prefix route */
3626 return rt6_fill_node(net,
3627 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3628 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* RTM_GETROUTE doit handler: build a flow from the request attributes
 * (src/dst/iif/oif/mark/uid), perform either an input-path lookup (iif
 * given) or an output lookup, and reply with the resulting route.  With
 * RTM_F_FIB_MATCH the raw FIB entry is returned (ip6_route_lookup)
 * instead of the resolved dst, and the reply omits the queried addrs.
 */
3632 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3633 struct netlink_ext_ack *extack)
3635 struct net *net = sock_net(in_skb->sk);
3636 struct nlattr *tb[RTA_MAX+1];
3637 int err, iif = 0, oif = 0;
3638 struct dst_entry *dst;
3639 struct rt6_info *rt;
3640 struct sk_buff *skb;
3645 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3651 memset(&fl6, 0, sizeof(fl6));
3652 rtm = nlmsg_data(nlh);
3653 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3654 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3657 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3660 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3664 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3667 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3671 iif = nla_get_u32(tb[RTA_IIF]);
3674 oif = nla_get_u32(tb[RTA_OIF]);
3677 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3680 fl6.flowi6_uid = make_kuid(current_user_ns(),
3681 nla_get_u32(tb[RTA_UID]));
3683 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
/* Input-path lookup: resolve iif to a device under RCU and simulate
 * a received packet. */
3686 struct net_device *dev;
3691 dev = dev_get_by_index_rcu(net, iif);
3698 fl6.flowi6_iif = iif;
3700 if (!ipv6_addr_any(&fl6.saddr))
3701 flags |= RT6_LOOKUP_F_HAS_SADDR;
3704 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3706 dst = ip6_route_lookup(net, &fl6, 0);
3710 fl6.flowi6_oif = oif;
3713 dst = ip6_route_output(net, NULL, &fl6);
3715 dst = ip6_route_lookup(net, &fl6, 0);
3719 rt = container_of(dst, struct rt6_info, dst);
3720 if (rt->dst.error) {
3721 err = rt->dst.error;
3726 if (rt == net->ipv6.ip6_null_entry) {
3727 err = rt->dst.error;
3732 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3739 skb_dst_set(skb, &rt->dst);
3741 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3742 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3745 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3746 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3753 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE event for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group.  Allocates the skb with
 * rt6_nlmsg_size(); a -EMSGSIZE from rt6_fill_node means that size
 * estimate is wrong (hence the WARN_ON).  On failure the listeners'
 * socket error is set via rtnl_set_sk_err.
 */
3758 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3759 unsigned int nlm_flags)
3761 struct sk_buff *skb;
3762 struct net *net = info->nl_net;
3767 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3769 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3773 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3774 event, info->portid, seq, nlm_flags);
3776 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3777 WARN_ON(err == -EMSGSIZE);
3781 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3782 info->nlh, gfp_any());
3786 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier: keep the special routes (null / prohibit /
 * blackhole entries) bound to the netns loopback device.  On REGISTER
 * take idev references; on the first UNREGISTER drop them (guarded
 * against the repeated UNREGISTER events netdev_wait_allrefs can fire).
 */
3789 static int ip6_route_dev_notify(struct notifier_block *this,
3790 unsigned long event, void *ptr)
3792 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3793 struct net *net = dev_net(dev);
3795 if (!(dev->flags & IFF_LOOPBACK))
3798 if (event == NETDEV_REGISTER) {
3799 net->ipv6.ip6_null_entry->dst.dev = dev;
3800 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3801 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3802 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3803 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3804 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3805 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3807 } else if (event == NETDEV_UNREGISTER &&
3808 dev->reg_state != NETREG_UNREGISTERED) {
3809 /* NETDEV_UNREGISTER could be fired for multiple times by
3810 * netdev_wait_allrefs(). Make sure we only call this once.
3812 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
3813 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3814 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
3815 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3826 #ifdef CONFIG_PROC_FS
/* /proc/net/ipv6_route: seq_file ops (open handler defined earlier in
 * the file, outside this excerpt).
 */
3828 static const struct file_operations ipv6_route_proc_fops = {
3829 .owner = THIS_MODULE,
3830 .open = ipv6_route_open,
3832 .llseek = seq_lseek,
3833 .release = seq_release_net,
/* /proc/net/rt6_stats: one line of hex counters — fib nodes, route
 * nodes, rt allocs, rt entries, cache entries, dst entries in use,
 * discarded routes.
 */
3836 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3838 struct net *net = (struct net *)seq->private;
3839 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3840 net->ipv6.rt6_stats->fib_nodes,
3841 net->ipv6.rt6_stats->fib_route_nodes,
3842 net->ipv6.rt6_stats->fib_rt_alloc,
3843 net->ipv6.rt6_stats->fib_rt_entries,
3844 net->ipv6.rt6_stats->fib_rt_cache,
3845 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3846 net->ipv6.rt6_stats->fib_discarded_routes,
/* open() for /proc/net/rt6_stats — netns-aware single_open wrapper. */
3851 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3853 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats. */
3856 static const struct file_operations rt6_stats_seq_fops = {
3857 .owner = THIS_MODULE,
3858 .open = rt6_stats_seq_open,
3860 .llseek = seq_lseek,
3861 .release = single_release_net,
3863 #endif /* CONFIG_PROC_FS */
3865 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush: writing triggers a FIB
 * garbage-collection run.  The pre-write flush_delay value selects
 * between an immediate flush (<= 0) and a delayed one.  The netns is
 * recovered from ctl->extra1 (set up in ipv6_route_sysctl_init).
 * NOTE(review): the write-only guard and return are on elided lines.
 */
3868 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3869 void __user *buffer, size_t *lenp, loff_t *ppos)
3876 net = (struct net *)ctl->extra1;
3877 delay = net->ipv6.sysctl.flush_delay;
3878 proc_dointvec(ctl, write, buffer, lenp, ppos);
3879 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table; the .data
 * pointers reference init_net and are re-pointed per namespace in
 * ipv6_route_sysctl_init() (which indexes this array by position, so
 * entry order matters).
 */
3883 struct ctl_table ipv6_route_table_template[] = {
3885 .procname = "flush",
3886 .data = &init_net.ipv6.sysctl.flush_delay,
3887 .maxlen = sizeof(int),
3889 .proc_handler = ipv6_sysctl_rtcache_flush
3892 .procname = "gc_thresh",
3893 .data = &ip6_dst_ops_template.gc_thresh,
3894 .maxlen = sizeof(int),
3896 .proc_handler = proc_dointvec,
3899 .procname = "max_size",
3900 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3901 .maxlen = sizeof(int),
3903 .proc_handler = proc_dointvec,
3906 .procname = "gc_min_interval",
3907 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3908 .maxlen = sizeof(int),
3910 .proc_handler = proc_dointvec_jiffies,
3913 .procname = "gc_timeout",
3914 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3915 .maxlen = sizeof(int),
3917 .proc_handler = proc_dointvec_jiffies,
3920 .procname = "gc_interval",
3921 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3922 .maxlen = sizeof(int),
3924 .proc_handler = proc_dointvec_jiffies,
3927 .procname = "gc_elasticity",
3928 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3929 .maxlen = sizeof(int),
3931 .proc_handler = proc_dointvec,
3934 .procname = "mtu_expires",
3935 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3936 .maxlen = sizeof(int),
3938 .proc_handler = proc_dointvec_jiffies,
3941 .procname = "min_adv_mss",
3942 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3943 .maxlen = sizeof(int),
3945 .proc_handler = proc_dointvec,
3948 .procname = "gc_min_interval_ms",
3949 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3950 .maxlen = sizeof(int),
3952 .proc_handler = proc_dointvec_ms_jiffies,
/* Clone the sysctl template for @net and re-point each entry's .data at
 * the per-netns counterpart (indices must match the template order).
 * table[0].extra1 carries the netns for ipv6_sysctl_rtcache_flush.
 * The "flush" entry is hidden from non-init user namespaces.
 */
3957 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3959 struct ctl_table *table;
3961 table = kmemdup(ipv6_route_table_template,
3962 sizeof(ipv6_route_table_template),
3966 table[0].data = &net->ipv6.sysctl.flush_delay;
3967 table[0].extra1 = net;
3968 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3969 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3970 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3971 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3972 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3973 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3974 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3975 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3976 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3978 /* Don't export sysctls to unprivileged users */
3979 if (net->user_ns != &init_user_ns)
3980 table[0].procname = NULL;
/* Per-netns init: set up dst ops, allocate the special route entries
 * (null, and with multiple-tables also prohibit and blackhole), wire
 * their dst.path/ops/metrics, and seed routing sysctl defaults.  Error
 * paths unwind in reverse via goto labels (some lines elided in this
 * excerpt).
 */
3987 static int __net_init ip6_route_net_init(struct net *net)
3991 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3992 sizeof(net->ipv6.ip6_dst_ops));
3994 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3995 goto out_ip6_dst_ops;
3997 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3998 sizeof(*net->ipv6.ip6_null_entry),
4000 if (!net->ipv6.ip6_null_entry)
4001 goto out_ip6_dst_entries;
4002 net->ipv6.ip6_null_entry->dst.path =
4003 (struct dst_entry *)net->ipv6.ip6_null_entry;
4004 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4005 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4006 ip6_template_metrics, true);
4008 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4009 net->ipv6.fib6_has_custom_rules = false;
4010 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4011 sizeof(*net->ipv6.ip6_prohibit_entry),
4013 if (!net->ipv6.ip6_prohibit_entry)
4014 goto out_ip6_null_entry;
4015 net->ipv6.ip6_prohibit_entry->dst.path =
4016 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4017 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4018 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4019 ip6_template_metrics, true);
4021 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4022 sizeof(*net->ipv6.ip6_blk_hole_entry),
4024 if (!net->ipv6.ip6_blk_hole_entry)
4025 goto out_ip6_prohibit_entry;
4026 net->ipv6.ip6_blk_hole_entry->dst.path =
4027 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4028 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4029 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4030 ip6_template_metrics, true);
/* Default routing sysctl values for the new namespace. */
4033 net->ipv6.sysctl.flush_delay = 0;
4034 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4035 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4036 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4037 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4038 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4039 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4040 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4042 net->ipv6.ip6_rt_gc_expire = 30*HZ;
4048 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4049 out_ip6_prohibit_entry:
4050 kfree(net->ipv6.ip6_prohibit_entry);
4052 kfree(net->ipv6.ip6_null_entry);
4054 out_ip6_dst_entries:
4055 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init and release the dst entry counter.
 */
4060 static void __net_exit ip6_route_net_exit(struct net *net)
4062 kfree(net->ipv6.ip6_null_entry);
4063 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4064 kfree(net->ipv6.ip6_prohibit_entry);
4065 kfree(net->ipv6.ip6_blk_hole_entry);
4067 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: register the /proc/net entries. */
4070 static int __net_init ip6_route_net_init_late(struct net *net)
4072 #ifdef CONFIG_PROC_FS
4073 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4074 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the /proc/net entries. */
4079 static void __net_exit ip6_route_net_exit_late(struct net *net)
4081 #ifdef CONFIG_PROC_FS
4082 remove_proc_entry("ipv6_route", net->proc_net);
4083 remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-netns lifecycle hooks for the IPv6 routing subsystem. */
4087 static struct pernet_operations ip6_route_net_ops = {
4088 .init = ip6_route_net_init,
4089 .exit = ip6_route_net_exit,
/* Per-netns init of the IPv6 inetpeer base (shared peer metadata
 * cache).  Error return on allocation failure is on an elided line.
 */
4092 static int __net_init ipv6_inetpeer_init(struct net *net)
4094 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4098 inet_peer_base_init(bp);
4099 net->ipv6.peers = bp;
/* Per-netns teardown of the inetpeer base: detach, invalidate, free. */
4103 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4105 struct inet_peer_base *bp = net->ipv6.peers;
4107 net->ipv6.peers = NULL;
4108 inetpeer_invalidate_tree(bp);
/* Per-netns lifecycle hooks for the IPv6 inetpeer cache. */
4112 static struct pernet_operations ipv6_inetpeer_ops = {
4113 .init = ipv6_inetpeer_init,
4114 .exit = ipv6_inetpeer_exit,
/* Late-stage per-netns hooks (procfs entries). */
4117 static struct pernet_operations ip6_route_net_late_ops = {
4118 .init = ip6_route_net_init_late,
4119 .exit = ip6_route_net_exit_late,
/* Netdevice notifier; priority below addrconf so addrconf runs first. */
4122 static struct notifier_block ip6_route_dev_notifier = {
4123 .notifier_call = ip6_route_dev_notify,
4124 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* Bind init_net's special routes to its loopback device.  Needed because
 * loopback registers before ip6_route_dev_notify exists, so the
 * NETDEV_REGISTER path never ran for init_net.
 */
4127 void __init ip6_route_init_special_entries(void)
4129 /* Registering of the loopback is done before this portion of code,
4130 * the loopback reference in rt6_info will not be taken, do it
4131 * manually for init_net */
4132 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4133 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4134 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4135 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4136 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4137 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4138 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* Subsystem init: create the rt6_info slab cache, register pernet ops
 * (inetpeer, main, late), fib6 core + rules, the RTM_NEWROUTE/DELROUTE/
 * GETROUTE rtnetlink handlers, the netdevice notifier, and initialize
 * the per-cpu uncached-route lists.  Errors unwind in strict reverse
 * order via goto labels (some labels elided in this excerpt).
 */
4142 int __init ip6_route_init(void)
4148 ip6_dst_ops_template.kmem_cachep =
4149 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4150 SLAB_HWCACHE_ALIGN, NULL);
4151 if (!ip6_dst_ops_template.kmem_cachep)
4154 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4156 goto out_kmem_cache;
4158 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4160 goto out_dst_entries;
4162 ret = register_pernet_subsys(&ip6_route_net_ops);
4164 goto out_register_inetpeer;
4166 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4170 goto out_register_subsys;
4176 ret = fib6_rules_init();
4180 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4182 goto fib6_rules_init;
4185 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4186 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4187 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4188 RTNL_FLAG_DOIT_UNLOCKED))
4189 goto out_register_late_subsys;
4191 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4193 goto out_register_late_subsys;
4195 for_each_possible_cpu(cpu) {
4196 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4198 INIT_LIST_HEAD(&ul->head);
4199 spin_lock_init(&ul->lock);
4205 out_register_late_subsys:
4206 unregister_pernet_subsys(&ip6_route_net_late_ops);
4208 fib6_rules_cleanup();
4213 out_register_subsys:
4214 unregister_pernet_subsys(&ip6_route_net_ops);
4215 out_register_inetpeer:
4216 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4218 dst_entries_destroy(&ip6_dst_blackhole_ops);
4220 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4224 void ip6_route_cleanup(void)
4226 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4227 unregister_pernet_subsys(&ip6_route_net_late_ops);
4228 fib6_rules_cleanup();
4231 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4232 unregister_pernet_subsys(&ip6_route_net_ops);
4233 dst_entries_destroy(&ip6_dst_blackhole_ops);
4234 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);