2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
64 #include <linux/sysctl.h>
/* Forward declarations for the dst_ops callbacks and internal helpers
 * defined later in this file, plus the CONFIG_IPV6_ROUTE_INFO (RFC 4191
 * route information option) helpers.
 * NOTE(review): this extraction is missing interleaved source lines (the
 * embedded original line numbers are non-contiguous); all code below is
 * kept byte-identical, only comments are added. */
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68 const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void ip6_dst_destroy(struct dst_entry *);
74 static void ip6_dst_ifdown(struct dst_entry *,
75 struct net_device *dev, int how);
76 static int ip6_dst_gc(struct dst_ops *ops);
78 static int ip6_pkt_discard(struct sk_buff *skb);
79 static int ip6_pkt_discard_out(struct sk_buff *skb);
80 static void ip6_link_failure(struct sk_buff *skb);
81 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85 const struct in6_addr *prefix, int prefixlen,
86 const struct in6_addr *gwaddr, int ifindex,
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89 const struct in6_addr *prefix, int prefixlen,
90 const struct in6_addr *gwaddr, int ifindex);
/* dst_ops->cow_metrics hook: copy-on-write the shared metrics array into
 * per-peer writable storage before the first metric write.  The swap is
 * done with cmpxchg() on dst->_metrics so a concurrent writer that wins
 * the race leaves only one live copy (losing copy handling is on lines
 * missing from this extraction). */
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
95 struct rt6_info *rt = (struct rt6_info *) dst;
96 struct inet_peer *peer;
/* Only DST_HOST (per-destination cached) routes get per-peer metrics. */
99 if (!(rt->dst.flags & DST_HOST))
102 peer = rt6_get_peer_create(rt);
104 u32 *old_p = __DST_METRICS_PTR(old);
105 unsigned long prev, new;
/* Seed the fresh peer metrics from the current (read-only) values. */
108 if (inet_metrics_new(peer))
109 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111 new = (unsigned long) p;
112 prev = cmpxchg(&dst->_metrics, old, new);
115 p = __DST_METRICS_PTR(prev);
116 if (prev & DST_METRICS_READ_ONLY)
/* Prefer the route's gateway as the neighbour key when one is set;
 * otherwise the caller-supplied daddr is used (fallback return is on a
 * line missing from this extraction). */
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 struct in6_addr *p = &rt->rt6i_gateway;
127 if (!ipv6_addr_any(p))
128 return (const void *) p;
/* dst_ops->neigh_lookup hook: look up (and on miss create) the ND
 * neighbour entry for this dst, keyed on the gateway when present. */
132 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 struct rt6_info *rt = (struct rt6_info *) dst;
137 daddr = choose_neigh_daddr(rt, daddr);
138 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
141 return neigh_create(&nd_tbl, daddr, dst->dev);
/* Attach an ND neighbour for rt->rt6i_gateway to the route's dst,
 * creating the entry if the lookup misses. */
144 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
152 dst_set_neighbour(&rt->dst, n);
/* Template dst_ops for normal IPv6 routes; copied per-netns into
 * net->ipv6.ip6_dst_ops (see ip6_dst_alloc() below). */
157 static struct dst_ops ip6_dst_ops_template = {
159 .protocol = cpu_to_be16(ETH_P_IPV6),
162 .check = ip6_dst_check,
163 .default_advmss = ip6_default_advmss,
165 .cow_metrics = ipv6_cow_metrics,
166 .destroy = ip6_dst_destroy,
167 .ifdown = ip6_dst_ifdown,
168 .negative_advice = ip6_negative_advice,
169 .link_failure = ip6_link_failure,
170 .update_pmtu = ip6_rt_update_pmtu,
171 .local_out = __ip6_local_out,
172 .neigh_lookup = ip6_neigh_lookup,
/* Blackhole dst helpers: MTU falls back to the device MTU when no
 * RTAX_MTU metric is set; update_pmtu and cow_metrics are no-op stubs
 * (their empty bodies are on lines missing from this extraction). */
175 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179 return mtu ? : dst->dev->mtu;
182 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
186 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops used by ip6_blackhole_route(): same check/destroy as normal
 * routes but inert PMTU/metrics handling. */
192 static struct dst_ops ip6_dst_blackhole_ops = {
194 .protocol = cpu_to_be16(ETH_P_IPV6),
195 .destroy = ip6_dst_destroy,
196 .check = ip6_dst_check,
197 .mtu = ip6_blackhole_mtu,
198 .default_advmss = ip6_default_advmss,
199 .update_pmtu = ip6_rt_blackhole_update_pmtu,
200 .cow_metrics = ip6_rt_blackhole_cow_metrics,
201 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics for the template routes below: hop limit 255 only. */
204 static const u32 ip6_template_metrics[RTAX_MAX] = {
205 [RTAX_HOPLIMIT - 1] = 255,
/* The "null" route: terminal reject entry returned when no route
 * matches; discards traffic with -ENETUNREACH. */
208 static struct rt6_info ip6_null_entry_template = {
210 .__refcnt = ATOMIC_INIT(1),
213 .error = -ENETUNREACH,
214 .input = ip6_pkt_discard,
215 .output = ip6_pkt_discard_out,
217 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
218 .rt6i_protocol = RTPROT_KERNEL,
219 .rt6i_metric = ~(u32) 0,
220 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing (multiple tables) two more terminal entries exist:
 * "prohibit" (administratively denied) and "blackhole" (silent drop via
 * dst_discard). */
223 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225 static int ip6_pkt_prohibit(struct sk_buff *skb);
226 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
228 static struct rt6_info ip6_prohibit_entry_template = {
230 .__refcnt = ATOMIC_INIT(1),
234 .input = ip6_pkt_prohibit,
235 .output = ip6_pkt_prohibit_out,
237 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
238 .rt6i_protocol = RTPROT_KERNEL,
239 .rt6i_metric = ~(u32) 0,
240 .rt6i_ref = ATOMIC_INIT(1),
243 static struct rt6_info ip6_blk_hole_entry_template = {
245 .__refcnt = ATOMIC_INIT(1),
249 .input = dst_discard,
250 .output = dst_discard,
252 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
253 .rt6i_protocol = RTPROT_KERNEL,
254 .rt6i_metric = ~(u32) 0,
255 .rt6i_ref = ATOMIC_INIT(1),
260 /* allocate dst with ip6_dst_ops */
/* Zeroes the rt6_info-specific tail (everything past the embedded
 * dst_entry) and initialises the peer pointer from the table's peer
 * base when a table is given, else the per-netns base. */
261 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
262 struct net_device *dev,
264 struct fib6_table *table)
266 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
270 memset(&rt->rt6i_table, 0,
271 sizeof(*rt) - sizeof(struct dst_entry));
272 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
/* dst_ops->destroy hook: release the route's metrics (non-DST_HOST
 * only), idev reference, dst->from reference and inet_peer reference.
 * NOTE(review): the actual in6_dev_put/inet_putpeer calls are on lines
 * missing from this extraction. */
277 static void ip6_dst_destroy(struct dst_entry *dst)
279 struct rt6_info *rt = (struct rt6_info *)dst;
280 struct inet6_dev *idev = rt->rt6i_idev;
282 if (!(rt->dst.flags & DST_HOST))
283 dst_destroy_metrics_generic(dst);
286 rt->rt6i_idev = NULL;
/* dst->from holds the parent route when expiry is inherited. */
290 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
291 dst_release(dst->from);
293 if (rt6_has_peer(rt)) {
294 struct inet_peer *peer = rt6_peer_ptr(rt);
/* Global generation counter for peer bindings; ip6_dst_check() compares
 * a route's cached genid against this to re-bind stale peers. */
299 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
301 static u32 rt6_peer_genid(void)
303 return atomic_read(&__rt6_peer_genid);
/* Bind (optionally creating) the inet_peer entry for this route's
 * destination and record the current genid. */
306 void rt6_bind_peer(struct rt6_info *rt, int create)
308 struct inet_peer_base *base;
309 struct inet_peer *peer;
311 base = inetpeer_base_ptr(rt->_rt6i_peer);
315 peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
/* Lost the race to another binder: drop our peer (handling on a
 * missing line). */
317 if (!rt6_set_peer(rt, peer))
320 rt->rt6i_peer_genid = rt6_peer_genid();
/* dst_ops->ifdown hook: when the route's device goes away, re-home its
 * idev reference onto the per-netns loopback device so the dst stays
 * usable until it is destroyed. */
324 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
327 struct rt6_info *rt = (struct rt6_info *)dst;
328 struct inet6_dev *idev = rt->rt6i_idev;
329 struct net_device *loopback_dev =
330 dev_net(dev)->loopback_dev;
332 if (dev != loopback_dev && idev && idev->dev == dev) {
333 struct inet6_dev *loopback_idev =
334 in6_dev_get(loopback_dev);
336 rt->rt6i_idev = loopback_idev;
/* True if the route has expired: either its own RTF_EXPIRES timer has
 * passed, or (for clones) the parent route in dst->from has expired. */
342 static bool rt6_check_expired(const struct rt6_info *rt)
344 struct rt6_info *ort = NULL;
346 if (rt->rt6i_flags & RTF_EXPIRES) {
347 if (time_after(jiffies, rt->dst.expires))
349 } else if (rt->dst.from) {
350 ort = (struct rt6_info *) rt->dst.from;
351 return (ort->rt6i_flags & RTF_EXPIRES) &&
352 time_after(jiffies, ort->dst.expires);
/* True when daddr's scope (multicast, link-local or loopback) requires
 * strict matching on the outgoing interface. */
357 static bool rt6_need_strict(const struct in6_addr *daddr)
359 return ipv6_addr_type(daddr) &
360 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
364 * Route lookup. Any table->tb6_lock is implied.
/* Walk the sibling list of a fib6 node and pick the route matching the
 * requested output interface (oif); loopback routes and a saddr-owning
 * device are handled specially.  Returns ip6_null_entry when strict
 * interface matching (RT6_LOOKUP_F_IFACE) finds nothing. */
367 static inline struct rt6_info *rt6_device_match(struct net *net,
369 const struct in6_addr *saddr,
373 struct rt6_info *local = NULL;
374 struct rt6_info *sprt;
/* No interface and no source constraint: first route wins. */
376 if (!oif && ipv6_addr_any(saddr))
379 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
380 struct net_device *dev = sprt->dst.dev;
383 if (dev->ifindex == oif)
385 if (dev->flags & IFF_LOOPBACK) {
386 if (!sprt->rt6i_idev ||
387 sprt->rt6i_idev->dev->ifindex != oif) {
388 if (flags & RT6_LOOKUP_F_IFACE && oif)
390 if (local && (!oif ||
391 local->rt6i_idev->dev->ifindex == oif))
/* Fall back to a device that owns the source address. */
397 if (ipv6_chk_addr(net, saddr, dev,
398 flags & RT6_LOOKUP_F_IFACE))
407 if (flags & RT6_LOOKUP_F_IFACE)
408 return net->ipv6.ip6_null_entry;
/* Router Reachability Probing (RFC 4191 / RFC 4861): if the next hop's
 * neighbour entry is not NUD_VALID and the probe interval has elapsed,
 * send a unicast-solicited NS to the router.  Compiled out (empty stub
 * at the bottom) without CONFIG_IPV6_ROUTER_PREF. */
414 #ifdef CONFIG_IPV6_ROUTER_PREF
415 static void rt6_probe(struct rt6_info *rt)
417 struct neighbour *neigh;
419 * Okay, this does not seem to be appropriate
420 * for now, however, we need to check if it
421 * is really so; aka Router Reachability Probing.
423 * Router Reachability Probe MUST be rate-limited
424 * to no more than one per minute.
427 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
428 if (!neigh || (neigh->nud_state & NUD_VALID))
430 read_lock_bh(&neigh->lock);
431 if (!(neigh->nud_state & NUD_VALID) &&
432 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
433 struct in6_addr mcaddr;
434 struct in6_addr *target;
/* Touch ->updated first so concurrent callers rate-limit. */
436 neigh->updated = jiffies;
437 read_unlock_bh(&neigh->lock);
439 target = (struct in6_addr *)&neigh->primary_key;
440 addrconf_addr_solict_mult(target, &mcaddr);
441 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
443 read_unlock_bh(&neigh->lock);
449 static inline void rt6_probe(struct rt6_info *rt)
455 * Default Router Selection (RFC 2461 6.3.6)
/* Interface-match score: nonzero when the route's device (or, for
 * loopback routes, its idev) matches oif, or no oif was requested. */
457 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
459 struct net_device *dev = rt->dst.dev;
460 if (!oif || dev->ifindex == oif)
462 if ((dev->flags & IFF_LOOPBACK) &&
463 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Reachability score for the route's next hop: non-gateway and
 * RTF_NONEXTHOP routes need no neighbour; otherwise the neighbour's NUD
 * state decides (NUD_FAILED handled only with router preferences). */
468 static inline int rt6_check_neigh(struct rt6_info *rt)
470 struct neighbour *neigh;
474 neigh = dst_get_neighbour_noref(&rt->dst);
475 if (rt->rt6i_flags & RTF_NONEXTHOP ||
476 !(rt->rt6i_flags & RTF_GATEWAY))
479 read_lock_bh(&neigh->lock);
480 if (neigh->nud_state & NUD_VALID)
482 #ifdef CONFIG_IPV6_ROUTER_PREF
483 else if (neigh->nud_state & NUD_FAILED)
488 read_unlock_bh(&neigh->lock);
/* Combined route score: device match in the low bits, decoded router
 * preference (RFC 4191) shifted in above, vetoed by the strict flags
 * when interface or reachability requirements are not met. */
495 static int rt6_score_route(struct rt6_info *rt, int oif,
500 m = rt6_check_dev(rt, oif);
501 if (!m && (strict & RT6_LOOKUP_F_IFACE))
503 #ifdef CONFIG_IPV6_ROUTER_PREF
504 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
506 n = rt6_check_neigh(rt);
507 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/* Score one candidate against the current best (*mpri/match); expired
 * routes are skipped, and unreachable candidates may trigger a probe
 * when RT6_LOOKUP_F_REACHABLE is set (probe call is on missing lines). */
512 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
513 int *mpri, struct rt6_info *match)
517 if (rt6_check_expired(rt))
520 m = rt6_score_route(rt, oif, strict);
525 if (strict & RT6_LOOKUP_F_REACHABLE)
529 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/* Round-robin scan: score every sibling with the same metric, starting
 * at rr_head and wrapping around from the node's leaf. */
537 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
538 struct rt6_info *rr_head,
539 u32 metric, int oif, int strict)
541 struct rt6_info *rt, *match;
545 for (rt = rr_head; rt && rt->rt6i_metric == metric;
546 rt = rt->dst.rt6_next)
547 match = find_match(rt, oif, strict, &mpri, match);
548 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
549 rt = rt->dst.rt6_next)
550 match = find_match(rt, oif, strict, &mpri, match);
/* Default router selection for a fib6 node: pick the best-scoring route
 * at the lowest metric; when nothing matched under the reachability
 * requirement, advance the round-robin pointer (fn->rr_ptr) so the next
 * lookup tries a different router.  Falls back to ip6_null_entry. */
555 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
557 struct rt6_info *match, *rt0;
562 fn->rr_ptr = rt0 = fn->leaf;
564 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
567 (strict & RT6_LOOKUP_F_REACHABLE)) {
568 struct rt6_info *next = rt0->dst.rt6_next;
570 /* no entries matched; do round-robin */
571 if (!next || next->rt6i_metric != rt0->rt6i_metric)
578 net = dev_net(rt0->dst.dev);
579 return match ? match : net->ipv6.ip6_null_entry;
582 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement: validate prefix length against the option length,
 * look up / add / remove the corresponding RTF_ROUTEINFO route, and
 * update its preference and lifetime. */
583 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
584 const struct in6_addr *gwaddr)
586 struct net *net = dev_net(dev);
587 struct route_info *rinfo = (struct route_info *) opt;
588 struct in6_addr prefix_buf, *prefix;
590 unsigned long lifetime;
593 if (len < sizeof(struct route_info)) {
597 /* Sanity check for prefix_len and length */
598 if (rinfo->length > 3) {
600 } else if (rinfo->prefix_len > 128) {
602 } else if (rinfo->prefix_len > 64) {
603 if (rinfo->length < 2) {
606 } else if (rinfo->prefix_len > 0) {
607 if (rinfo->length < 1) {
612 pref = rinfo->route_pref;
613 if (pref == ICMPV6_ROUTER_PREF_INVALID)
616 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 carries a full 128-bit prefix in the option. */
618 if (rinfo->length == 3)
619 prefix = (struct in6_addr *)rinfo->prefix;
621 /* this function is safe */
622 ipv6_addr_prefix(&prefix_buf,
623 (struct in6_addr *)rinfo->prefix,
625 prefix = &prefix_buf;
628 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime withdraws an existing route-info route. */
631 if (rt && !lifetime) {
637 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
640 rt->rt6i_flags = RTF_ROUTEINFO |
641 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
644 if (!addrconf_finite_timeout(lifetime))
645 rt6_clean_expires(rt);
647 rt6_set_expires(rt, jiffies + HZ * lifetime);
649 dst_release(&rt->dst);
/* Lookup backtracking helper shared by the ip6_pol_route* functions:
 * when the lookup hit ip6_null_entry, climb towards the tree root
 * (retrying subtrees along the way) until a node with route info
 * (RTN_RTINFO) is found or the root (RTN_TL_ROOT) is reached. */
655 #define BACKTRACK(__net, saddr) \
657 if (rt == __net->ipv6.ip6_null_entry) { \
658 struct fib6_node *pn; \
660 if (fn->fn_flags & RTN_TL_ROOT) \
663 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
664 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
667 if (fn->fn_flags & RTN_RTINFO) \
/* Simple (non-caching) table lookup under tb6_lock: fib6 tree lookup,
 * device match, backtrack on miss, then dst_use() to take a reference
 * and touch the lastuse timestamp. */
673 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
674 struct fib6_table *table,
675 struct flowi6 *fl6, int flags)
677 struct fib6_node *fn;
680 read_lock_bh(&table->tb6_lock);
681 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
684 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
685 BACKTRACK(net, &fl6->saddr);
687 dst_use(&rt->dst, jiffies);
688 read_unlock_bh(&table->tb6_lock);
/* Public wrapper: run ip6_pol_route_lookup through the policy-routing
 * rule engine. */
693 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
696 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
698 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by daddr/saddr/oif; strict!=0 forces interface
 * matching.  Returns a referenced rt6_info (error handling for the
 * returned dst is on lines missing from this extraction). */
700 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
701 const struct in6_addr *saddr, int oif, int strict)
703 struct flowi6 fl6 = {
707 struct dst_entry *dst;
708 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
711 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
712 flags |= RT6_LOOKUP_F_HAS_SADDR;
715 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
717 return (struct rt6_info *) dst;
724 EXPORT_SYMBOL(rt6_lookup);
726 /* ip6_ins_rt is called with FREE table->tb6_lock.
727 It takes new route entry, the addition fails by any reason the
728 route is freed. In any case, if caller does not hold it, it may
/* Insert a route into its fib6 table under the table write lock. */
732 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
735 struct fib6_table *table;
737 table = rt->rt6i_table;
738 write_lock_bh(&table->tb6_lock);
739 err = fib6_add(&table->tb6_root, rt, info);
740 write_unlock_bh(&table->tb6_lock);
/* Public variant that supplies netlink info from the route's netns. */
745 int ip6_ins_rt(struct rt6_info *rt)
747 struct nl_info info = {
748 .nl_net = dev_net(rt->dst.dev),
750 return __ip6_ins_rt(rt, &info);
/* Clone a fib route into a per-destination RTF_CACHE entry with its own
 * neighbour binding.  If neighbour creation fails (table overflow), the
 * GC is retried with temporarily loosened elasticity/interval sysctls
 * before giving up with a rate-limited warning. */
753 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
754 const struct in6_addr *daddr,
755 const struct in6_addr *saddr)
763 rt = ip6_rt_copy(ort, daddr);
/* Retries are only safe outside softirq context. */
766 int attempts = !in_softirq();
768 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
769 if (ort->rt6i_dst.plen != 128 &&
770 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
771 rt->rt6i_flags |= RTF_ANYCAST;
772 rt->rt6i_gateway = *daddr;
775 rt->rt6i_flags |= RTF_CACHE;
777 #ifdef CONFIG_IPV6_SUBTREES
/* Source-routed clones narrow the source prefix to a full /128. */
778 if (rt->rt6i_src.plen && saddr) {
779 rt->rt6i_src.addr = *saddr;
780 rt->rt6i_src.plen = 128;
785 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
786 struct net *net = dev_net(rt->dst.dev);
787 int saved_rt_min_interval =
788 net->ipv6.sysctl.ip6_rt_gc_min_interval;
789 int saved_rt_elasticity =
790 net->ipv6.sysctl.ip6_rt_gc_elasticity;
792 if (attempts-- > 0) {
793 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
794 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
796 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
798 net->ipv6.sysctl.ip6_rt_gc_elasticity =
800 net->ipv6.sysctl.ip6_rt_gc_min_interval =
801 saved_rt_min_interval;
805 net_warn_ratelimited("Neighbour table overflow\n");
/* Lighter clone than rt6_alloc_cow(): copies the route and shares the
 * parent's neighbour entry instead of binding a new one. */
814 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
815 const struct in6_addr *daddr)
817 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
820 rt->rt6i_flags |= RTF_CACHE;
821 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
/* Core caching lookup used for both input and output paths: select the
 * best route, and if it is not already a cached host route, create a
 * CoW clone (rt6_alloc_cow for routes needing a neighbour,
 * rt6_alloc_clone otherwise), insert it, and re-lookup on insert race.
 * Reachability is only required when forwarding is disabled. */
826 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
827 struct flowi6 *fl6, int flags)
829 struct fib6_node *fn;
830 struct rt6_info *rt, *nrt;
834 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
836 strict |= flags & RT6_LOOKUP_F_IFACE;
839 read_lock_bh(&table->tb6_lock);
842 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
845 rt = rt6_select(fn, oif, strict | reachable);
847 BACKTRACK(net, &fl6->saddr);
848 if (rt == net->ipv6.ip6_null_entry ||
849 rt->rt6i_flags & RTF_CACHE)
853 read_unlock_bh(&table->tb6_lock);
/* Need a fresh clone: cow when no neighbour is bound yet. */
855 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
856 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
857 else if (!(rt->dst.flags & DST_HOST))
858 nrt = rt6_alloc_clone(rt, &fl6->daddr);
862 dst_release(&rt->dst);
863 rt = nrt ? : net->ipv6.ip6_null_entry;
867 err = ip6_ins_rt(nrt);
876 * Race condition! In the gap, when table->tb6_lock was
877 * released someone could insert this route. Relookup.
879 dst_release(&rt->dst);
888 read_unlock_bh(&table->tb6_lock);
890 rt->dst.lastuse = jiffies;
/* Input-path policy hook: route on the incoming interface. */
896 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
897 struct flowi6 *fl6, int flags)
899 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input lookup through the rule engine; strict-scope destinations force
 * interface matching except on PIM register pseudo-devices. */
902 static struct dst_entry *ip6_route_input_lookup(struct net *net,
903 struct net_device *dev,
904 struct flowi6 *fl6, int flags)
906 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
907 flags |= RT6_LOOKUP_F_IFACE;
909 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/* Route an incoming skb: build a flowi6 from its IPv6 header and attach
 * the resulting dst to the skb. */
912 void ip6_route_input(struct sk_buff *skb)
914 const struct ipv6hdr *iph = ipv6_hdr(skb);
915 struct net *net = dev_net(skb->dev);
916 int flags = RT6_LOOKUP_F_HAS_SADDR;
917 struct flowi6 fl6 = {
918 .flowi6_iif = skb->dev->ifindex,
/* First 32 bits of the IPv6 header carry the flow label. */
921 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
922 .flowi6_mark = skb->mark,
923 .flowi6_proto = iph->nexthdr,
926 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path policy hook: route on the outgoing interface. */
929 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
930 struct flowi6 *fl6, int flags)
932 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Public output route lookup: derives strictness from the socket's
 * bound device or the destination scope, and source-address preference
 * flags from the socket when present. */
935 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
940 fl6->flowi6_iif = net->loopback_dev->ifindex;
942 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
943 flags |= RT6_LOOKUP_F_IFACE;
945 if (!ipv6_addr_any(&fl6->saddr))
946 flags |= RT6_LOOKUP_F_HAS_SADDR;
948 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
950 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
953 EXPORT_SYMBOL(ip6_route_output);
/* Replace dst_orig with an inert blackhole copy (used by xfrm): same
 * addressing/metrics/idev, but input/output discard packets and PMTU
 * updates are no-ops.  Consumes the dst_orig reference. */
955 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
957 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
958 struct dst_entry *new = NULL;
960 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
962 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
963 rt6_init_peer(rt, net->ipv6.peers);
968 new->input = dst_discard;
969 new->output = dst_discard;
/* Read-only metrics can be shared by pointer; writable ones are
 * deep-copied. */
971 if (dst_metrics_read_only(&ort->dst))
972 new->_metrics = ort->dst._metrics;
974 dst_copy_metrics(new, &ort->dst);
975 rt->rt6i_idev = ort->rt6i_idev;
977 in6_dev_hold(rt->rt6i_idev);
979 rt->rt6i_gateway = ort->rt6i_gateway;
980 rt->rt6i_flags = ort->rt6i_flags;
981 rt6_clean_expires(rt);
984 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
985 #ifdef CONFIG_IPV6_SUBTREES
986 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
992 dst_release(dst_orig);
993 return new ? new : ERR_PTR(-ENOMEM);
997 * Destination cache support functions
/* dst_ops->check hook: a cached dst is still valid while its fib6
 * node's serial number matches the cookie; a stale peer genid triggers
 * a peer re-bind. */
1000 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1002 struct rt6_info *rt;
1004 rt = (struct rt6_info *) dst;
1006 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1007 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1008 if (!rt6_has_peer(rt))
1009 rt6_bind_peer(rt, 0);
1010 rt->rt6i_peer_genid = rt6_peer_genid();
/* dst_ops->negative_advice hook: drop expired RTF_CACHE entries
 * (deletion/release on lines missing from this extraction). */
1017 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1019 struct rt6_info *rt = (struct rt6_info *) dst;
1022 if (rt->rt6i_flags & RTF_CACHE) {
1023 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure hook: report address-unreachable to the sender,
 * then expire the cached route (or invalidate the default route's fib6
 * node serial so it is re-evaluated). */
1035 static void ip6_link_failure(struct sk_buff *skb)
1037 struct rt6_info *rt;
1039 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1041 rt = (struct rt6_info *) skb_dst(skb);
1043 if (rt->rt6i_flags & RTF_CACHE)
1044 rt6_update_expires(rt, 0);
1045 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1046 rt->rt6i_node->fn_sernum = -1;
/* dst_ops->update_pmtu hook: lower the cached MTU on host routes only;
 * below IPV6_MIN_MTU the ALLFRAG feature is set instead of shrinking
 * further (the mtu clamp is on a line missing from this extraction).
 * The entry then expires after ip6_rt_mtu_expires. */
1050 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1052 struct rt6_info *rt6 = (struct rt6_info*)dst;
1055 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1056 struct net *net = dev_net(dst->dev);
1058 rt6->rt6i_flags |= RTF_MODIFIED;
1059 if (mtu < IPV6_MIN_MTU) {
1060 u32 features = dst_metric(dst, RTAX_FEATURES);
1062 features |= RTAX_FEATURE_ALLFRAG;
1063 dst_metric_set(dst, RTAX_FEATURES, features);
1065 dst_metric_set(dst, RTAX_MTU, mtu);
1066 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* Apply an externally learned PMTU (network byte order) to the route
 * for the flow described by the skb's inner IPv6 header. */
1070 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1073 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1074 struct dst_entry *dst;
1077 memset(&fl6, 0, sizeof(fl6));
1078 fl6.flowi6_oif = oif;
1079 fl6.flowi6_mark = mark;
/* Pre-cow metrics so the MTU write below hits a private copy. */
1080 fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1081 fl6.daddr = iph->daddr;
1082 fl6.saddr = iph->saddr;
1083 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1085 dst = ip6_route_output(net, NULL, &fl6);
1087 ip6_rt_update_pmtu(dst, ntohl(mtu));
1090 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-flavoured wrapper: take netns, bound device and mark from sk. */
1092 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1094 ip6_update_pmtu(skb, sock_net(sk), mtu,
1095 sk->sk_bound_dev_if, sk->sk_mark);
1097 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* dst_ops->default_advmss hook: advertised MSS = path MTU minus IPv6 +
 * TCP headers, clamped below by ip6_rt_min_advmss and above per the
 * IPV6_MAXPLEN rule explained in the original comment. */
1099 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1101 struct net_device *dev = dst->dev;
1102 unsigned int mtu = dst_mtu(dst);
1103 struct net *net = dev_net(dev);
1105 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1107 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1108 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1111 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1112 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1113 * IPV6_MAXPLEN is also valid and means: "any MSS,
1114 * rely only on pmtu discovery"
1116 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: use the RTAX_MTU metric when set, otherwise the
 * interface's configured IPv6 MTU (cnf.mtu6). */
1121 static unsigned int ip6_mtu(const struct dst_entry *dst)
1123 struct inet6_dev *idev;
1124 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1132 idev = __in6_dev_get(dst->dev);
1134 mtu = idev->cnf.mtu6;
/* ICMPv6 replies use throwaway dst entries kept on a private list
 * (icmp6_dst_gc_list, guarded by icmp6_dst_lock) and reaped by
 * icmp6_dst_gc() rather than living in the fib. */
1140 static struct dst_entry *icmp6_dst_gc_list;
1141 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Build a host dst for an ICMPv6 reply to fl6->daddr on dev, bind a
 * neighbour, chain it on the GC list and run it through xfrm. */
1143 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1144 struct neighbour *neigh,
1147 struct dst_entry *dst;
1148 struct rt6_info *rt;
1149 struct inet6_dev *idev = in6_dev_get(dev);
1150 struct net *net = dev_net(dev);
1152 if (unlikely(!idev))
1153 return ERR_PTR(-ENODEV);
1155 rt = ip6_dst_alloc(net, dev, 0, NULL);
1156 if (unlikely(!rt)) {
1158 dst = ERR_PTR(-ENOMEM);
1165 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1166 if (IS_ERR(neigh)) {
1169 return ERR_CAST(neigh);
1173 rt->dst.flags |= DST_HOST;
1174 rt->dst.output = ip6_output;
1175 dst_set_neighbour(&rt->dst, neigh);
1176 atomic_set(&rt->dst.__refcnt, 1);
1177 rt->rt6i_dst.addr = fl6->daddr;
1178 rt->rt6i_dst.plen = 128;
1179 rt->rt6i_idev = idev;
1180 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1182 spin_lock_bh(&icmp6_dst_lock);
1183 rt->dst.next = icmp6_dst_gc_list;
1184 icmp6_dst_gc_list = &rt->dst;
1185 spin_unlock_bh(&icmp6_dst_lock);
/* Kick the fib GC so these short-lived entries get reaped. */
1187 fib6_force_start_gc(net);
1189 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Reap unreferenced entries from the ICMPv6 dst list (unlink/free steps
 * are on lines missing from this extraction). */
1195 int icmp6_dst_gc(void)
1197 struct dst_entry *dst, **pprev;
1200 spin_lock_bh(&icmp6_dst_lock);
1201 pprev = &icmp6_dst_gc_list;
1203 while ((dst = *pprev) != NULL) {
1204 if (!atomic_read(&dst->__refcnt)) {
1213 spin_unlock_bh(&icmp6_dst_lock);
/* Walk the ICMPv6 dst list and apply func to each entry; entries for
 * which func returns nonzero are removed (removal on missing lines). */
1218 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1221 struct dst_entry *dst, **pprev;
1223 spin_lock_bh(&icmp6_dst_lock);
1224 pprev = &icmp6_dst_gc_list;
1225 while ((dst = *pprev) != NULL) {
1226 struct rt6_info *rt = (struct rt6_info *) dst;
1227 if (func(rt, arg)) {
1234 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops->gc hook: rate-limited garbage collection driven by the
 * per-netns sysctls.  ip6_rt_gc_expire grows while pressure persists
 * and decays by the elasticity shift once entries drop below the
 * threshold.  Returns nonzero while still over rt_max_size. */
1237 static int ip6_dst_gc(struct dst_ops *ops)
1239 unsigned long now = jiffies;
1240 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1241 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1242 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1243 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1244 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1245 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
/* Skip GC entirely while inside the min interval and under budget. */
1248 entries = dst_entries_get_fast(ops);
1249 if (time_after(rt_last_gc + rt_min_interval, now) &&
1250 entries <= rt_max_size)
1253 net->ipv6.ip6_rt_gc_expire++;
1254 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1255 net->ipv6.ip6_rt_last_gc = now;
1256 entries = dst_entries_get_slow(ops);
1257 if (entries < ops->gc_thresh)
1258 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1260 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1261 return entries > rt_max_size;
1264 /* Clean host part of a prefix. Not necessary in radix tree,
1265 but results in cleaner routing tables.
1267 Remove it only when all the things will work!
/* Effective hop limit for a dst: the RTAX_HOPLIMIT metric if set,
 * otherwise the interface's configured hop limit, otherwise the
 * netns-wide devconf default. */
1270 int ip6_dst_hoplimit(struct dst_entry *dst)
1272 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1273 if (hoplimit == 0) {
1274 struct net_device *dev = dst->dev;
1275 struct inet6_dev *idev;
1278 idev = __in6_dev_get(dev);
1280 hoplimit = idev->cnf.hop_limit;
1282 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1287 EXPORT_SYMBOL(ip6_dst_hoplimit);
/* Add a route from a fib6_config (netlink / ioctl path): validate the
 * config, resolve device/idev and table, allocate the rt6_info, fill in
 * flags/addresses/metrics, validate the gateway (must be link-local or
 * a recursively resolvable unicast next hop), then insert via
 * __ip6_ins_rt().  Error-unwind paths are on lines missing from this
 * extraction. */
1293 int ip6_route_add(struct fib6_config *cfg)
1296 struct net *net = cfg->fc_nlinfo.nl_net;
1297 struct rt6_info *rt = NULL;
1298 struct net_device *dev = NULL;
1299 struct inet6_dev *idev = NULL;
1300 struct fib6_table *table;
1303 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
/* Source-specific routes need CONFIG_IPV6_SUBTREES. */
1305 #ifndef CONFIG_IPV6_SUBTREES
1306 if (cfg->fc_src_len)
1309 if (cfg->fc_ifindex) {
1311 dev = dev_get_by_index(net, cfg->fc_ifindex);
1314 idev = in6_dev_get(dev);
1319 if (cfg->fc_metric == 0)
1320 cfg->fc_metric = IP6_RT_PRIO_USER;
/* Without NLM_F_CREATE only an existing table may be used; warn but
 * create anyway for compatibility. */
1323 if (cfg->fc_nlinfo.nlh &&
1324 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1325 table = fib6_get_table(net, cfg->fc_table);
1327 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1328 table = fib6_new_table(net, cfg->fc_table);
1331 table = fib6_new_table(net, cfg->fc_table);
1337 rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1344 rt->dst.obsolete = -1;
1346 if (cfg->fc_flags & RTF_EXPIRES)
1347 rt6_set_expires(rt, jiffies +
1348 clock_t_to_jiffies(cfg->fc_expires));
1350 rt6_clean_expires(rt);
1352 if (cfg->fc_protocol == RTPROT_UNSPEC)
1353 cfg->fc_protocol = RTPROT_BOOT;
1354 rt->rt6i_protocol = cfg->fc_protocol;
1356 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Pick the input handler by destination type. */
1358 if (addr_type & IPV6_ADDR_MULTICAST)
1359 rt->dst.input = ip6_mc_input;
1360 else if (cfg->fc_flags & RTF_LOCAL)
1361 rt->dst.input = ip6_input;
1363 rt->dst.input = ip6_forward;
1365 rt->dst.output = ip6_output;
1367 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1368 rt->rt6i_dst.plen = cfg->fc_dst_len;
1369 if (rt->rt6i_dst.plen == 128)
1370 rt->dst.flags |= DST_HOST;
/* Non-host routes with explicit metrics need their own array. */
1372 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1373 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1378 dst_init_metrics(&rt->dst, metrics, 0);
1380 #ifdef CONFIG_IPV6_SUBTREES
1381 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1382 rt->rt6i_src.plen = cfg->fc_src_len;
1385 rt->rt6i_metric = cfg->fc_metric;
1387 /* We cannot add true routes via loopback here,
1388 they would result in kernel looping; promote them to reject routes
1390 if ((cfg->fc_flags & RTF_REJECT) ||
1391 (dev && (dev->flags & IFF_LOOPBACK) &&
1392 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1393 !(cfg->fc_flags & RTF_LOCAL))) {
1394 /* hold loopback dev/idev if we haven't done so. */
1395 if (dev != net->loopback_dev) {
1400 dev = net->loopback_dev;
1402 idev = in6_dev_get(dev);
1408 rt->dst.output = ip6_pkt_discard_out;
1409 rt->dst.input = ip6_pkt_discard;
1410 rt->dst.error = -ENETUNREACH;
1411 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1415 if (cfg->fc_flags & RTF_GATEWAY) {
1416 const struct in6_addr *gw_addr;
1419 gw_addr = &cfg->fc_gateway;
1420 rt->rt6i_gateway = *gw_addr;
1421 gwa_type = ipv6_addr_type(gw_addr);
1423 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1424 struct rt6_info *grt;
1426 /* IPv6 strictly inhibits using not link-local
1427 addresses as nexthop address.
1428 Otherwise, router will not able to send redirects.
1429 It is very good, but in some (rare!) circumstances
1430 (SIT, PtP, NBMA NOARP links) it is handy to allow
1431 some exceptions. --ANK
1434 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* Recursively resolve the global gateway; it must itself be
 * directly reachable (not through another gateway). */
1437 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1439 err = -EHOSTUNREACH;
1443 if (dev != grt->dst.dev) {
1444 dst_release(&grt->dst);
1449 idev = grt->rt6i_idev;
1451 in6_dev_hold(grt->rt6i_idev);
1453 if (!(grt->rt6i_flags & RTF_GATEWAY))
1455 dst_release(&grt->dst);
1461 if (!dev || (dev->flags & IFF_LOOPBACK))
/* Preferred source address, if given, must belong to the device. */
1469 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1470 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1474 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1475 rt->rt6i_prefsrc.plen = 128;
1477 rt->rt6i_prefsrc.plen = 0;
1479 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1480 err = rt6_bind_neighbour(rt, dev);
1485 rt->rt6i_flags = cfg->fc_flags;
/* Apply user-supplied RTAX_* metrics from the netlink attributes. */
1492 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1493 int type = nla_type(nla);
1496 if (type > RTAX_MAX) {
1501 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1507 rt->rt6i_idev = idev;
1508 rt->rt6i_table = table;
1510 cfg->fc_nlinfo.nl_net = dev_net(dev);
1512 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/* Delete a route from its table under the write lock; the terminal
 * ip6_null_entry may never be deleted. */
1524 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1527 struct fib6_table *table;
1528 struct net *net = dev_net(rt->dst.dev);
1530 if (rt == net->ipv6.ip6_null_entry)
1533 table = rt->rt6i_table;
1534 write_lock_bh(&table->tb6_lock);
1536 err = fib6_del(rt, info);
1537 dst_release(&rt->dst);
1539 write_unlock_bh(&table->tb6_lock);
/* Public variant supplying netlink info from the route's netns. */
1544 int ip6_del_rt(struct rt6_info *rt)
1546 struct nl_info info = {
1547 .nl_net = dev_net(rt->dst.dev),
1549 return __ip6_del_rt(rt, &info);
/* Delete the route matching a fib6_config: locate the fib6 node for
 * dst/src prefixes, then scan its leaf chain for an entry whose
 * ifindex, gateway and metric all match the request. */
1552 static int ip6_route_del(struct fib6_config *cfg)
1554 struct fib6_table *table;
1555 struct fib6_node *fn;
1556 struct rt6_info *rt;
1559 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1563 read_lock_bh(&table->tb6_lock);
1565 fn = fib6_locate(&table->tb6_root,
1566 &cfg->fc_dst, cfg->fc_dst_len,
1567 &cfg->fc_src, cfg->fc_src_len);
1570 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1571 if (cfg->fc_ifindex &&
1573 rt->dst.dev->ifindex != cfg->fc_ifindex))
1575 if (cfg->fc_flags & RTF_GATEWAY &&
1576 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1578 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Matched: drop the read lock before the write-locked delete. */
1581 read_unlock_bh(&table->tb6_lock);
1583 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1586 read_unlock_bh(&table->tb6_lock);
/*
 * ip6rd_flowi - flow descriptor extended with the redirecting router's
 * address, passed through fib6_rule_lookup() to __ip6_route_redirect()
 * (which casts the flowi6 pointer back to this wrapper).
 */
1594 struct ip6rd_flowi {
1596 struct in6_addr gateway;
/*
 * __ip6_route_redirect - table-lookup callback used to validate an ICMPv6
 * redirect: find the route currently used for the destination and accept
 * the redirect only if it came from that route's next hop on the same
 * interface (RFC 2461/4861 rule).
 */
1599 static struct rt6_info *__ip6_route_redirect(struct net *net,
1600 struct fib6_table *table,
1604 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1605 struct rt6_info *rt;
1606 struct fib6_node *fn;
1609 * Get the "current" route for this destination and
1610 * check if the redirect has come from appropriate router.
1612 * RFC 2461 specifies that redirects should only be
1613 * accepted if they come from the nexthop to the target.
1614 * Due to the way the routes are chosen, this notion
1615 * is a bit fuzzy and one might need to check all possible
1619 read_lock_bh(&table->tb6_lock);
1620 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1622 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1624 * Current route is on-link; redirect is always invalid.
1626 * Seems, previous statement is not true. It could
1627 * be node, which looks for us as on-link (f.e. proxy ndisc)
1628 * But then router serving it might decide, that we should
1629 * know truth 8)8) --ANK (980726).
1631 if (rt6_check_expired(rt))
1633 if (!(rt->rt6i_flags & RTF_GATEWAY))
1635 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1637 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
/* no acceptable route: fall back to the null entry / subtree backtrack */
1643 rt = net->ipv6.ip6_null_entry;
1644 BACKTRACK(net, &fl6->saddr);
1648 read_unlock_bh(&table->tb6_lock);
/*
 * ip6_route_redirect - build an ip6rd_flowi for a received redirect and
 * run it through the policy-rules lookup with __ip6_route_redirect as the
 * per-table resolver.  Link-local / multicast destinations force a strict
 * interface match.
 */
1653 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1654 const struct in6_addr *src,
1655 const struct in6_addr *gateway,
1656 struct net_device *dev)
1658 int flags = RT6_LOOKUP_F_HAS_SADDR;
1659 struct net *net = dev_net(dev);
1660 struct ip6rd_flowi rdfl = {
1662 .flowi6_oif = dev->ifindex,
1668 rdfl.gateway = *gateway;
1670 if (rt6_need_strict(dest))
1671 flags |= RT6_LOOKUP_F_IFACE;
1673 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1674 flags, __ip6_route_redirect);
/*
 * rt6_redirect - process an accepted ICMPv6 redirect: update the neighbour
 * cache with the new link-layer address, clone the current route as a
 * host (/128) cache entry pointing at the new gateway, insert it, and
 * notify netevent listeners.
 * NOTE(review): elided excerpt -- the "out:"/error cleanup paths are not
 * visible here.
 */
1677 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1678 const struct in6_addr *saddr,
1679 struct neighbour *neigh, u8 *lladdr, int on_link)
1681 struct rt6_info *rt, *nrt = NULL;
1682 struct netevent_redirect netevent;
1683 struct net *net = dev_net(neigh->dev);
/* validate the redirect came from our current next hop */
1685 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1687 if (rt == net->ipv6.ip6_null_entry) {
1688 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1693 * We have finally decided to accept it.
/* refresh the neighbour entry; on_link redirects must not set ISROUTER */
1696 neigh_update(neigh, lladdr, NUD_STALE,
1697 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1698 NEIGH_UPDATE_F_OVERRIDE|
1699 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1700 NEIGH_UPDATE_F_ISROUTER))
1704 * Redirect received -> path was valid.
1705 * Look, redirects are sent only in response to data packets,
1706 * so that this nexthop apparently is reachable. --ANK
1708 dst_confirm(&rt->dst);
1710 /* Duplicate redirect: silently ignore. */
1711 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
/* clone the route as a dynamic host cache entry */
1714 nrt = ip6_rt_copy(rt, dest);
1718 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1720 nrt->rt6i_flags &= ~RTF_GATEWAY;
1722 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1723 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1725 if (ip6_ins_rt(nrt))
1728 netevent.old = &rt->dst;
1729 netevent.new = &nrt->dst;
1730 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* a cached clone that was redirected away can be dropped immediately */
1732 if (rt->rt6i_flags & RTF_CACHE) {
1738 dst_release(&rt->dst);
1742 * Misc support functions
/*
 * ip6_rt_copy - allocate a new rt6_info cloned from @ort with its
 * destination rewritten to the host address @dest (plen 128).  Metrics,
 * device, gateway and flags are copied; the clone starts with metric 0
 * and no expiry.  Returns NULL on allocation failure (elided here).
 */
1745 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1746 const struct in6_addr *dest)
1748 struct net *net = dev_net(ort->dst.dev);
1749 struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1753 rt->dst.input = ort->dst.input;
1754 rt->dst.output = ort->dst.output;
1755 rt->dst.flags |= DST_HOST;
/* clone is always a host route for @dest */
1757 rt->rt6i_dst.addr = *dest;
1758 rt->rt6i_dst.plen = 128;
1759 dst_copy_metrics(&rt->dst, &ort->dst);
1760 rt->dst.error = ort->dst.error;
1761 rt->rt6i_idev = ort->rt6i_idev;
1763 in6_dev_hold(rt->rt6i_idev);
1764 rt->dst.lastuse = jiffies;
1766 rt->rt6i_gateway = ort->rt6i_gateway;
1767 rt->rt6i_flags = ort->rt6i_flags;
/* RA-learned default routes keep a "from" link so expiry propagates */
1768 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1769 (RTF_DEFAULT | RTF_ADDRCONF))
1770 rt6_set_from(rt, ort);
1772 rt6_clean_expires(rt);
1773 rt->rt6i_metric = 0;
1775 #ifdef CONFIG_IPV6_SUBTREES
1776 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1778 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1779 rt->rt6i_table = ort->rt6i_table;
1784 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_get_route_info - find an RA Route-Information route (RTF_ROUTEINFO
 * gateway route) in RT6_TABLE_INFO matching prefix, gateway and ifindex.
 * NOTE(review): the dst_hold on the match and the return are elided.
 */
1785 static struct rt6_info *rt6_get_route_info(struct net *net,
1786 const struct in6_addr *prefix, int prefixlen,
1787 const struct in6_addr *gwaddr, int ifindex)
1789 struct fib6_node *fn;
1790 struct rt6_info *rt = NULL;
1791 struct fib6_table *table;
1793 table = fib6_get_table(net, RT6_TABLE_INFO);
1797 write_lock_bh(&table->tb6_lock);
1798 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1802 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1803 if (rt->dst.dev->ifindex != ifindex)
1805 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1807 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1813 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_route_info - install a route learned from an RA Route
 * Information option into RT6_TABLE_INFO, then look it up again to
 * return the installed entry (ip6_route_add does not return the route).
 */
1817 static struct rt6_info *rt6_add_route_info(struct net *net,
1818 const struct in6_addr *prefix, int prefixlen,
1819 const struct in6_addr *gwaddr, int ifindex,
1822 struct fib6_config cfg = {
1823 .fc_table = RT6_TABLE_INFO,
1824 .fc_metric = IP6_RT_PRIO_USER,
1825 .fc_ifindex = ifindex,
1826 .fc_dst_len = prefixlen,
1827 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1828 RTF_UP | RTF_PREF(pref),
1830 .fc_nlinfo.nlh = NULL,
1831 .fc_nlinfo.nl_net = net,
1834 cfg.fc_dst = *prefix;
1835 cfg.fc_gateway = *gwaddr;
1837 /* We should treat it as a default route if prefix length is 0. */
1839 cfg.fc_flags |= RTF_DEFAULT;
1841 ip6_route_add(&cfg);
1843 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router - find the RA-installed default route via @addr on
 * @dev in RT6_TABLE_DFLT (matched by RTF_ADDRCONF|RTF_DEFAULT flags).
 * NOTE(review): the dst_hold on the match is elided from this excerpt.
 */
1847 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1849 struct rt6_info *rt;
1850 struct fib6_table *table;
1852 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1856 write_lock_bh(&table->tb6_lock);
1857 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1858 if (dev == rt->dst.dev &&
1859 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1860 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1865 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router - install a default route learned from a Router
 * Advertisement (RTF_EXPIRES: lifetime-managed) and return the installed
 * entry via rt6_get_dflt_router().
 */
1869 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1870 struct net_device *dev,
1873 struct fib6_config cfg = {
1874 .fc_table = RT6_TABLE_DFLT,
1875 .fc_metric = IP6_RT_PRIO_USER,
1876 .fc_ifindex = dev->ifindex,
1877 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1878 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1880 .fc_nlinfo.nlh = NULL,
1881 .fc_nlinfo.nl_net = dev_net(dev),
1884 cfg.fc_gateway = *gwaddr;
1886 ip6_route_add(&cfg);
1888 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers - delete every RA-learned default route in
 * RT6_TABLE_DFLT.  The read lock is dropped before each deletion (which
 * takes the write lock) and the scan restarts -- elided here.
 */
1891 void rt6_purge_dflt_routers(struct net *net)
1893 struct rt6_info *rt;
1894 struct fib6_table *table;
1896 /* NOTE: Keep consistent with rt6_get_dflt_router */
1897 table = fib6_get_table(net, RT6_TABLE_DFLT);
1902 read_lock_bh(&table->tb6_lock);
1903 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1904 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
/* drop the lock before ip6_del_rt; restart scan afterwards (elided) */
1906 read_unlock_bh(&table->tb6_lock);
1911 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config - translate a legacy ioctl in6_rtmsg into the
 * common fib6_config structure (always targets the main table).
 */
1914 static void rtmsg_to_fib6_config(struct net *net,
1915 struct in6_rtmsg *rtmsg,
1916 struct fib6_config *cfg)
1918 memset(cfg, 0, sizeof(*cfg));
1920 cfg->fc_table = RT6_TABLE_MAIN;
1921 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1922 cfg->fc_metric = rtmsg->rtmsg_metric;
1923 cfg->fc_expires = rtmsg->rtmsg_info;
1924 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1925 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1926 cfg->fc_flags = rtmsg->rtmsg_flags;
1928 cfg->fc_nlinfo.nl_net = net;
1930 cfg->fc_dst = rtmsg->rtmsg_dst;
1931 cfg->fc_src = rtmsg->rtmsg_src;
1932 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * ipv6_route_ioctl - legacy SIOCADDRT/SIOCDELRT handler.  Requires
 * CAP_NET_ADMIN, copies the in6_rtmsg from userspace, converts it and
 * dispatches to ip6_route_add()/ip6_route_del().
 * NOTE(review): rtnl locking and the -EFAULT/default paths are elided.
 */
1935 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1937 struct fib6_config cfg;
1938 struct in6_rtmsg rtmsg;
1942 case SIOCADDRT: /* Add a route */
1943 case SIOCDELRT: /* Delete a route */
1944 if (!capable(CAP_NET_ADMIN))
1946 err = copy_from_user(&rtmsg, arg,
1947 sizeof(struct in6_rtmsg));
1951 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1956 err = ip6_route_add(&cfg);
1959 err = ip6_route_del(&cfg);
1973 * Drop the packet on the floor
/*
 * ip6_pkt_drop - common black-hole handler: bump the appropriate SNMP
 * no-route counter (or INADDRERRORS for an unspecified destination on
 * input) and send an ICMPv6 destination-unreachable with @code.
 */
1976 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1979 struct dst_entry *dst = skb_dst(skb);
1980 switch (ipstats_mib_noroutes) {
1981 case IPSTATS_MIB_INNOROUTES:
1982 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1983 if (type == IPV6_ADDR_ANY) {
1984 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1985 IPSTATS_MIB_INADDRERRORS);
/* fallthrough to the OUT case counter (elided break structure) */
1989 case IPSTATS_MIB_OUTNOROUTES:
1990 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1991 ipstats_mib_noroutes);
1994 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for the null route: drop with "no route" ICMP. */
1999 static int ip6_pkt_discard(struct sk_buff *skb)
2001 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the null route: set skb->dev, then drop. */
2004 static int ip6_pkt_discard_out(struct sk_buff *skb)
2006 skb->dev = skb_dst(skb)->dev;
2007 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2010 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* input handler for the "prohibit" route: drop with admin-prohibited ICMP. */
2012 static int ip6_pkt_prohibit(struct sk_buff *skb)
2014 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* output handler for the "prohibit" route: set skb->dev, then drop. */
2017 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2019 skb->dev = skb_dst(skb)->dev;
2020 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2026 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - allocate a host route for a local unicast or
 * anycast address, bound to the loopback device.  The route is placed
 * in RT6_TABLE_LOCAL with flags RTF_UP|RTF_NONEXTHOP plus RTF_ANYCAST
 * or RTF_LOCAL.  Returns ERR_PTR on allocation/neighbour failure.
 */
2029 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2030 const struct in6_addr *addr,
2033 struct net *net = dev_net(idev->dev);
2034 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2038 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2039 return ERR_PTR(-ENOMEM);
2044 rt->dst.flags |= DST_HOST;
2045 rt->dst.input = ip6_input;
2046 rt->dst.output = ip6_output;
2047 rt->rt6i_idev = idev;
2048 rt->dst.obsolete = -1;
2050 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2052 rt->rt6i_flags |= RTF_ANYCAST;
2054 rt->rt6i_flags |= RTF_LOCAL;
2055 err = rt6_bind_neighbour(rt, rt->dst.dev);
/* neighbour binding failed: release and propagate the error (elided) */
2058 return ERR_PTR(err);
2061 rt->rt6i_dst.addr = *addr;
2062 rt->rt6i_dst.plen = 128;
2063 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2065 atomic_set(&rt->dst.__refcnt, 1);
/*
 * ip6_route_get_saddr - pick the source address for a route: the route's
 * configured preferred source if set, otherwise standard source address
 * selection on the route's interface.
 */
2070 int ip6_route_get_saddr(struct net *net,
2071 struct rt6_info *rt,
2072 const struct in6_addr *daddr,
2074 struct in6_addr *saddr)
2076 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
/* plen != 0 means a prefsrc was configured on the route */
2078 if (rt->rt6i_prefsrc.plen)
2079 *saddr = rt->rt6i_prefsrc.addr;
2081 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2082 daddr, prefs, saddr);
2086 /* remove deleted ip from prefsrc entries */
/* argument bundle for the fib6_remove_prefsrc tree walk */
2087 struct arg_dev_net_ip {
2088 struct net_device *dev;
2090 struct in6_addr *addr;
/*
 * fib6_remove_prefsrc - fib6_clean_all callback: clear the preferred
 * source on any route (except the null entry) whose prefsrc equals the
 * address being removed, optionally restricted to one device.
 */
2093 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2095 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2096 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2097 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2099 if (((void *)rt->dst.dev == dev || !dev) &&
2100 rt != net->ipv6.ip6_null_entry &&
2101 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2102 /* remove prefsrc entry */
2103 rt->rt6i_prefsrc.plen = 0;
/*
 * rt6_remove_prefsrc - when address @ifp is deleted, scrub it from the
 * prefsrc field of every route in its namespace.
 */
2108 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2110 struct net *net = dev_net(ifp->idev->dev);
2111 struct arg_dev_net_ip adni = {
2112 .dev = ifp->idev->dev,
2116 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* argument bundle for the fib6_ifdown tree walk */
2119 struct arg_dev_net {
2120 struct net_device *dev;
/*
 * fib6_ifdown - fib6_clean_all callback selecting routes on a departing
 * device (or all devices when dev is NULL), excluding the null entry.
 * Returning non-zero (elided) tells the walker to delete the route.
 */
2124 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2126 const struct arg_dev_net *adn = arg;
2127 const struct net_device *dev = adn->dev;
2129 if ((rt->dst.dev == dev || !dev) &&
2130 rt != adn->net->ipv6.ip6_null_entry)
/*
 * rt6_ifdown - purge all routes (FIB and ICMP-rate-limit cache) that
 * reference @dev, called when the device goes down/unregisters.
 */
2136 void rt6_ifdown(struct net *net, struct net_device *dev)
2138 struct arg_dev_net adn = {
2143 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2144 icmp6_clean_all(fib6_ifdown, &adn);
/* argument bundle for the rt6_mtu_change_route tree walk */
2147 struct rt6_mtu_change_arg {
2148 struct net_device *dev;
/*
 * rt6_mtu_change_route - fib6_clean_all callback applying a device MTU
 * change to a route's cached PMTU, unless RTAX_MTU is administratively
 * locked.  PMTU is lowered when the new MTU is smaller, and raised only
 * when the old PMTU equalled the device MTU (i.e. this link was the
 * path bottleneck).
 */
2152 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2154 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2155 struct inet6_dev *idev;
2157 /* In IPv6 pmtu discovery is not optional,
2158 so that RTAX_MTU lock cannot disable it.
2159 We still use this lock to block changes
2160 caused by addrconf/ndisc.
2163 idev = __in6_dev_get(arg->dev);
2167 /* For administrative MTU increase, there is no way to discover
2168 IPv6 PMTU increase, so PMTU increase should be updated here.
2169 Since RFC 1981 doesn't include administrative MTU increase
2170 update PMTU increase is a MUST. (i.e. jumbo frame)
2173 If new MTU is less than route PMTU, this new MTU will be the
2174 lowest MTU in the path, update the route PMTU to reflect PMTU
2175 decreases; if new MTU is greater than route PMTU, and the
2176 old MTU is the lowest MTU in the path, update the route PMTU
2177 to reflect the increase. In this case if the other nodes' MTU
2178 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2181 if (rt->dst.dev == arg->dev &&
2182 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2183 (dst_mtu(&rt->dst) >= arg->mtu ||
2184 (dst_mtu(&rt->dst) < arg->mtu &&
2185 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2186 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * rt6_mtu_change - propagate a device MTU change to every affected
 * route in @dev's namespace.
 */
2191 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2193 struct rt6_mtu_change_arg arg = {
2198 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* netlink attribute validation policy for RTM_*ROUTE requests */
2201 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2202 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2203 [RTA_OIF] = { .type = NLA_U32 },
2204 [RTA_IIF] = { .type = NLA_U32 },
2205 [RTA_PRIORITY] = { .type = NLA_U32 },
2206 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config - parse an RTM_NEWROUTE/RTM_DELROUTE netlink
 * message into a fib6_config.  Validates attributes against
 * rtm_ipv6_policy; RTA_TABLE (if present) overrides rtm_table.
 * Prefix attributes are copied with the length implied by the netmask
 * bits, guarding against short attribute payloads.
 */
2209 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2210 struct fib6_config *cfg)
2213 struct nlattr *tb[RTA_MAX+1];
2216 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2221 rtm = nlmsg_data(nlh);
2222 memset(cfg, 0, sizeof(*cfg));
2224 cfg->fc_table = rtm->rtm_table;
2225 cfg->fc_dst_len = rtm->rtm_dst_len;
2226 cfg->fc_src_len = rtm->rtm_src_len;
2227 cfg->fc_flags = RTF_UP;
2228 cfg->fc_protocol = rtm->rtm_protocol;
2230 if (rtm->rtm_type == RTN_UNREACHABLE)
2231 cfg->fc_flags |= RTF_REJECT;
2233 if (rtm->rtm_type == RTN_LOCAL)
2234 cfg->fc_flags |= RTF_LOCAL;
2236 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2237 cfg->fc_nlinfo.nlh = nlh;
2238 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2240 if (tb[RTA_GATEWAY]) {
2241 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2242 cfg->fc_flags |= RTF_GATEWAY;
/* round the dst prefix length up to whole bytes before copying */
2246 int plen = (rtm->rtm_dst_len + 7) >> 3;
2248 if (nla_len(tb[RTA_DST]) < plen)
2251 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2255 int plen = (rtm->rtm_src_len + 7) >> 3;
2257 if (nla_len(tb[RTA_SRC]) < plen)
2260 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2263 if (tb[RTA_PREFSRC])
2264 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2267 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2269 if (tb[RTA_PRIORITY])
2270 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2272 if (tb[RTA_METRICS]) {
2273 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2274 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2278 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the request and delete the route. */
2285 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2287 struct fib6_config cfg;
2290 err = rtm_to_fib6_config(skb, nlh, &cfg);
2294 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the request and add the route. */
2297 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2299 struct fib6_config cfg;
2302 err = rtm_to_fib6_config(skb, nlh, &cfg);
2306 return ip6_route_add(&cfg);
/*
 * rt6_nlmsg_size - worst-case netlink message size for one route dump
 * entry; must stay in sync with the attributes rt6_fill_node() emits
 * (a mismatch triggers the -EMSGSIZE WARN in inet6_rt_notify()).
 */
2309 static inline size_t rt6_nlmsg_size(void)
2311 return NLMSG_ALIGN(sizeof(struct rtmsg))
2312 + nla_total_size(16) /* RTA_SRC */
2313 + nla_total_size(16) /* RTA_DST */
2314 + nla_total_size(16) /* RTA_GATEWAY */
2315 + nla_total_size(16) /* RTA_PREFSRC */
2316 + nla_total_size(4) /* RTA_TABLE */
2317 + nla_total_size(4) /* RTA_IIF */
2318 + nla_total_size(4) /* RTA_OIF */
2319 + nla_total_size(4) /* RTA_PRIORITY */
2320 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2321 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * rt6_fill_node - serialize one rt6_info into a netlink RTM message.
 * @dst/@src non-NULL means "answering a get-route query": the queried
 * host addresses are emitted with /128 lengths instead of the route's
 * own prefixes.  @prefix restricts dumps to RTF_PREFIX_RT routes,
 * @nowait is passed through to ip6mr_get_route() for multicast.
 * Returns nlmsg_end() length on success, -EMSGSIZE on overflow.
 */
2324 static int rt6_fill_node(struct net *net,
2325 struct sk_buff *skb, struct rt6_info *rt,
2326 struct in6_addr *dst, struct in6_addr *src,
2327 int iif, int type, u32 pid, u32 seq,
2328 int prefix, int nowait, unsigned int flags)
2330 const struct inet_peer *peer;
2332 struct nlmsghdr *nlh;
2335 struct neighbour *n;
2338 if (prefix) { /* user wants prefix routes only */
2339 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2340 /* success since this is not a prefix route */
2345 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2349 rtm = nlmsg_data(nlh);
2350 rtm->rtm_family = AF_INET6;
2351 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2352 rtm->rtm_src_len = rt->rt6i_src.plen;
2355 table = rt->rt6i_table->tb6_id;
2357 table = RT6_TABLE_UNSPEC;
2358 rtm->rtm_table = table;
2359 if (nla_put_u32(skb, RTA_TABLE, table))
2360 goto nla_put_failure;
/* derive the route type from the flags / device */
2361 if (rt->rt6i_flags & RTF_REJECT)
2362 rtm->rtm_type = RTN_UNREACHABLE;
2363 else if (rt->rt6i_flags & RTF_LOCAL)
2364 rtm->rtm_type = RTN_LOCAL;
2365 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2366 rtm->rtm_type = RTN_LOCAL;
2368 rtm->rtm_type = RTN_UNICAST;
2370 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2371 rtm->rtm_protocol = rt->rt6i_protocol;
2372 if (rt->rt6i_flags & RTF_DYNAMIC)
2373 rtm->rtm_protocol = RTPROT_REDIRECT;
2374 else if (rt->rt6i_flags & RTF_ADDRCONF)
2375 rtm->rtm_protocol = RTPROT_KERNEL;
2376 else if (rt->rt6i_flags & RTF_DEFAULT)
2377 rtm->rtm_protocol = RTPROT_RA;
2379 if (rt->rt6i_flags & RTF_CACHE)
2380 rtm->rtm_flags |= RTM_F_CLONED;
/* query answers report the asked-for host address, not the prefix */
2383 if (nla_put(skb, RTA_DST, 16, dst))
2384 goto nla_put_failure;
2385 rtm->rtm_dst_len = 128;
2386 } else if (rtm->rtm_dst_len)
2387 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2388 goto nla_put_failure;
2389 #ifdef CONFIG_IPV6_SUBTREES
2391 if (nla_put(skb, RTA_SRC, 16, src))
2392 goto nla_put_failure;
2393 rtm->rtm_src_len = 128;
2394 } else if (rtm->rtm_src_len &&
2395 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2396 goto nla_put_failure;
2399 #ifdef CONFIG_IPV6_MROUTE
2400 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2401 int err = ip6mr_get_route(net, skb, rtm, nowait);
2406 goto nla_put_failure;
2408 if (err == -EMSGSIZE)
2409 goto nla_put_failure;
2414 if (nla_put_u32(skb, RTA_IIF, iif))
2415 goto nla_put_failure;
2417 struct in6_addr saddr_buf;
2418 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2419 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2420 goto nla_put_failure;
2423 if (rt->rt6i_prefsrc.plen) {
2424 struct in6_addr saddr_buf;
2425 saddr_buf = rt->rt6i_prefsrc.addr;
2426 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2427 goto nla_put_failure;
2430 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2431 goto nla_put_failure;
2434 n = dst_get_neighbour_noref(&rt->dst);
2436 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2438 goto nla_put_failure;
2444 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2445 goto nla_put_failure;
2446 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2447 goto nla_put_failure;
/* report remaining lifetime; clamp to INT_MAX for ancient expiries */
2448 if (!(rt->rt6i_flags & RTF_EXPIRES))
2450 else if (rt->dst.expires - jiffies < INT_MAX)
2451 expires = rt->dst.expires - jiffies;
2456 if (rt6_has_peer(rt))
2457 peer = rt6_peer_ptr(rt);
2459 if (peer && peer->tcp_ts_stamp) {
2461 tsage = get_seconds() - peer->tcp_ts_stamp;
2464 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2465 expires, rt->dst.error) < 0)
2466 goto nla_put_failure;
2468 return nlmsg_end(skb, nlh);
2471 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - fib6 dump callback: emit one route as an RTM_NEWROUTE
 * multipart message.  Honors the RTM_F_PREFIX filter if the request
 * header is large enough to carry a struct rtmsg.
 */
2475 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2477 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2480 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2481 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2482 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2486 return rt6_fill_node(arg->net,
2487 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2488 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2489 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - RTM_GETROUTE handler: resolve the route for the
 * given src/dst (input lookup when RTA_IIF is set, output lookup
 * otherwise) and unicast the serialized result back to the requester.
 * NOTE(review): several error-return lines (-EINVAL/-ENODEV/-ENOBUFS)
 * are elided from this excerpt.
 */
2492 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2494 struct net *net = sock_net(in_skb->sk);
2495 struct nlattr *tb[RTA_MAX+1];
2496 struct rt6_info *rt;
2497 struct sk_buff *skb;
2500 int err, iif = 0, oif = 0;
2502 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2507 memset(&fl6, 0, sizeof(fl6));
2510 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2513 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2517 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2520 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2524 iif = nla_get_u32(tb[RTA_IIF]);
2527 oif = nla_get_u32(tb[RTA_OIF]);
/* iif given: emulate packet reception on that device */
2530 struct net_device *dev;
2533 dev = __dev_get_by_index(net, iif);
2539 fl6.flowi6_iif = iif;
2541 if (!ipv6_addr_any(&fl6.saddr))
2542 flags |= RT6_LOOKUP_F_HAS_SADDR;
2544 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
/* no iif: treat as a locally originated output lookup */
2547 fl6.flowi6_oif = oif;
2549 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2552 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2554 dst_release(&rt->dst);
2559 /* Reserve room for dummy headers, this skb can pass
2560 through good chunk of routing engine.
2562 skb_reset_mac_header(skb);
2563 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* skb now owns the rt reference */
2565 skb_dst_set(skb, &rt->dst);
2567 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2568 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2569 nlh->nlmsg_seq, 0, 0, 0);
2575 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify - broadcast an RTM_NEWROUTE/RTM_DELROUTE event to the
 * RTNLGRP_IPV6_ROUTE multicast group; reports the error to listeners
 * via rtnl_set_sk_err() on failure.
 */
2580 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2582 struct sk_buff *skb;
2583 struct net *net = info->nl_net;
2588 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2590 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2594 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2595 event, info->pid, seq, 0, 0, 0);
2597 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2598 WARN_ON(err == -EMSGSIZE);
2602 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2603 info->nlh, gfp_any());
2607 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier: when a namespace's loopback
 * device registers, bind the template null/prohibit/blackhole routes to
 * it so they have a valid device and inet6_dev.
 * NOTE(review): the trailing return (presumably NOTIFY_OK) is elided.
 */
2610 static int ip6_route_dev_notify(struct notifier_block *this,
2611 unsigned long event, void *data)
2613 struct net_device *dev = (struct net_device *)data;
2614 struct net *net = dev_net(dev);
2616 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2617 net->ipv6.ip6_null_entry->dst.dev = dev;
2618 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2619 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2620 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2621 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2622 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2623 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2634 #ifdef CONFIG_PROC_FS
/*
 * rt6_info_route - /proc/net/ipv6_route per-route printer: dst, src (or
 * zeros without subtrees), next hop (or zeros), metric, refcnt, use,
 * flags and device name -- the legacy fixed-column format.
 */
2645 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2647 struct seq_file *m = p_arg;
2648 struct neighbour *n;
2650 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2652 #ifdef CONFIG_IPV6_SUBTREES
2653 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2655 seq_puts(m, "00000000000000000000000000000000 00 ");
2658 n = dst_get_neighbour_noref(&rt->dst);
2660 seq_printf(m, "%pi6", n->primary_key);
2662 seq_puts(m, "00000000000000000000000000000000");
2665 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2666 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2667 rt->dst.__use, rt->rt6i_flags,
2668 rt->dst.dev ? rt->dst.dev->name : "");
/* seq_file show: walk the whole FIB read-only, printing each route. */
2672 static int ipv6_route_show(struct seq_file *m, void *v)
2674 struct net *net = (struct net *)m->private;
2675 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/* open handler for /proc/net/ipv6_route (per-netns single_open). */
2679 static int ipv6_route_open(struct inode *inode, struct file *file)
2681 return single_open_net(inode, file, ipv6_route_show);
/* file_operations for /proc/net/ipv6_route */
2684 static const struct file_operations ipv6_route_proc_fops = {
2685 .owner = THIS_MODULE,
2686 .open = ipv6_route_open,
2688 .llseek = seq_lseek,
2689 .release = single_release_net,
/* /proc/net/rt6_stats: seven hex counters summarizing FIB6 state. */
2692 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2694 struct net *net = (struct net *)seq->private;
2695 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2696 net->ipv6.rt6_stats->fib_nodes,
2697 net->ipv6.rt6_stats->fib_route_nodes,
2698 net->ipv6.rt6_stats->fib_rt_alloc,
2699 net->ipv6.rt6_stats->fib_rt_entries,
2700 net->ipv6.rt6_stats->fib_rt_cache,
2701 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2702 net->ipv6.rt6_stats->fib_discarded_routes);
/* open handler for /proc/net/rt6_stats (per-netns single_open). */
2707 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2709 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats */
2712 static const struct file_operations rt6_stats_seq_fops = {
2713 .owner = THIS_MODULE,
2714 .open = rt6_stats_seq_open,
2716 .llseek = seq_lseek,
2717 .release = single_release_net,
2719 #endif /* CONFIG_PROC_FS */
2721 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush: a write
 * triggers a fib6 garbage-collection run, delayed by the written value
 * (<= 0 means flush immediately, encoded as ~0UL).
 */
2724 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2725 void __user *buffer, size_t *lenp, loff_t *ppos)
2732 net = (struct net *)ctl->extra1;
2733 delay = net->ipv6.sysctl.flush_delay;
2734 proc_dointvec(ctl, write, buffer, lenp, ppos);
2735 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-netns net.ipv6.route.* sysctl table; the .data
 * pointers referencing init_net are re-targeted per namespace in
 * ipv6_route_sysctl_init() (entry order must match the table[N]
 * assignments there).
 */
2739 ctl_table ipv6_route_table_template[] = {
2741 .procname = "flush",
2742 .data = &init_net.ipv6.sysctl.flush_delay,
2743 .maxlen = sizeof(int),
2745 .proc_handler = ipv6_sysctl_rtcache_flush
2748 .procname = "gc_thresh",
2749 .data = &ip6_dst_ops_template.gc_thresh,
2750 .maxlen = sizeof(int),
2752 .proc_handler = proc_dointvec,
2755 .procname = "max_size",
2756 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2757 .maxlen = sizeof(int),
2759 .proc_handler = proc_dointvec,
2762 .procname = "gc_min_interval",
2763 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2764 .maxlen = sizeof(int),
2766 .proc_handler = proc_dointvec_jiffies,
2769 .procname = "gc_timeout",
2770 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2771 .maxlen = sizeof(int),
2773 .proc_handler = proc_dointvec_jiffies,
2776 .procname = "gc_interval",
2777 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2778 .maxlen = sizeof(int),
2780 .proc_handler = proc_dointvec_jiffies,
2783 .procname = "gc_elasticity",
2784 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2785 .maxlen = sizeof(int),
2787 .proc_handler = proc_dointvec,
2790 .procname = "mtu_expires",
2791 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2792 .maxlen = sizeof(int),
2794 .proc_handler = proc_dointvec_jiffies,
2797 .procname = "min_adv_mss",
2798 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2799 .maxlen = sizeof(int),
2801 .proc_handler = proc_dointvec,
2804 .procname = "gc_min_interval_ms",
2805 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2806 .maxlen = sizeof(int),
2808 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - duplicate the sysctl template for a new
 * namespace and point each entry's .data at that namespace's storage.
 * Indices must match the template's entry order.
 */
2813 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2815 struct ctl_table *table;
2817 table = kmemdup(ipv6_route_table_template,
2818 sizeof(ipv6_route_table_template),
2822 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 carries the netns for ipv6_sysctl_rtcache_flush() */
2823 table[0].extra1 = net;
2824 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2825 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2826 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2827 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2828 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2829 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2830 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2831 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2832 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * ip6_route_net_init - per-namespace setup: clone the dst_ops template,
 * duplicate the special null / prohibit / blackhole route templates,
 * and seed the routing sysctl defaults.  Error paths unwind in reverse
 * order via the labels at the bottom.
 */
2839 static int __net_init ip6_route_net_init(struct net *net)
2843 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2844 sizeof(net->ipv6.ip6_dst_ops));
2846 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2847 goto out_ip6_dst_ops;
2849 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2850 sizeof(*net->ipv6.ip6_null_entry),
2852 if (!net->ipv6.ip6_null_entry)
2853 goto out_ip6_dst_entries;
2854 net->ipv6.ip6_null_entry->dst.path =
2855 (struct dst_entry *)net->ipv6.ip6_null_entry;
2856 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2857 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2858 ip6_template_metrics, true);
2860 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2861 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2862 sizeof(*net->ipv6.ip6_prohibit_entry),
2864 if (!net->ipv6.ip6_prohibit_entry)
2865 goto out_ip6_null_entry;
2866 net->ipv6.ip6_prohibit_entry->dst.path =
2867 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2868 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2869 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2870 ip6_template_metrics, true);
2872 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2873 sizeof(*net->ipv6.ip6_blk_hole_entry),
2875 if (!net->ipv6.ip6_blk_hole_entry)
2876 goto out_ip6_prohibit_entry;
2877 net->ipv6.ip6_blk_hole_entry->dst.path =
2878 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2879 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2880 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2881 ip6_template_metrics, true);
/* default tunables for this namespace's routing/GC behavior */
2884 net->ipv6.sysctl.flush_delay = 0;
2885 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2886 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2887 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2888 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2889 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2890 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2891 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2893 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2899 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2900 out_ip6_prohibit_entry:
2901 kfree(net->ipv6.ip6_prohibit_entry);
2903 kfree(net->ipv6.ip6_null_entry);
2905 out_ip6_dst_entries:
2906 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* per-namespace teardown: free the special routes and dst counters. */
2911 static void __net_exit ip6_route_net_exit(struct net *net)
2913 kfree(net->ipv6.ip6_null_entry);
2914 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2915 kfree(net->ipv6.ip6_prohibit_entry);
2916 kfree(net->ipv6.ip6_blk_hole_entry);
2918 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* late per-netns init: register the two /proc/net entries. */
2921 static int __net_init ip6_route_net_init_late(struct net *net)
2923 #ifdef CONFIG_PROC_FS
2924 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2925 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/* late per-netns teardown: remove the /proc/net entries. */
2930 static void __net_exit ip6_route_net_exit_late(struct net *net)
2932 #ifdef CONFIG_PROC_FS
2933 proc_net_remove(net, "ipv6_route");
2934 proc_net_remove(net, "rt6_stats");
/* pernet ops for the core routing state (tables, special entries). */
2938 static struct pernet_operations ip6_route_net_ops = {
2939 .init = ip6_route_net_init,
2940 .exit = ip6_route_net_exit,
/* allocate and install this namespace's IPv6 inetpeer base. */
2943 static int __net_init ipv6_inetpeer_init(struct net *net)
2945 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2949 inet_peer_base_init(bp);
2950 net->ipv6.peers = bp;
/* tear down the namespace's inetpeer tree and free the base (elided). */
2954 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2956 struct inet_peer_base *bp = net->ipv6.peers;
2958 net->ipv6.peers = NULL;
2959 inetpeer_invalidate_tree(bp);
/* pernet ops for the IPv6 inetpeer cache. */
2963 static struct pernet_operations ipv6_inetpeer_ops = {
2964 .init = ipv6_inetpeer_init,
2965 .exit = ipv6_inetpeer_exit,
/* pernet ops for the proc entries (registered after fib6 rules). */
2968 static struct pernet_operations ip6_route_net_late_ops = {
2969 .init = ip6_route_net_init_late,
2970 .exit = ip6_route_net_exit_late,
/* Netdevice event notifier; the ip6_route_dev_notify handler is defined
 * outside this excerpt. NOTE(review): the .priority field and closing
 * "};" (original lines 2975-2976) are elided.
 */
2973 static struct notifier_block ip6_route_dev_notifier = {
2974 .notifier_call = ip6_route_dev_notify,
/* Module init for the IPv6 routing subsystem. Order of bring-up:
 *   1. slab cache for rt6_info dst entries;
 *   2. dst entry accounting for the blackhole ops;
 *   3. inetpeer pernet subsystem, then the main route pernet subsystem;
 *   4. fix up init_net's special routes to reference the loopback
 *      device (registered before this code runs, so the reference must
 *      be taken manually);
 *   5. fib6 init (elided), fib6 policy rules, late pernet subsystem;
 *   6. rtnetlink handlers for NEW/DEL/GETROUTE;
 *   7. netdevice notifier.
 * Errors unwind through the goto labels at the bottom in strict reverse
 * order of acquisition (classic kernel goto-cleanup).
 * NOTE(review): this excerpt is heavily elided — braces, "int ret;",
 * several "if (ret)" checks, the fib6_init() call and its error
 * handling, the xfrm6/fib6_rules intermediate labels, "out:" and the
 * final "return ret;" are all missing (gaps in the original numbering,
 * e.g. 2979-2982, 2987-2988, 3013-3025, 3035-3050, 3055, 3057, 3059-
 * 3060). Comments only; no code altered.
 */
2978 int __init ip6_route_init(void)
2983 ip6_dst_ops_template.kmem_cachep =
2984 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2985 SLAB_HWCACHE_ALIGN, NULL);
2986 if (!ip6_dst_ops_template.kmem_cachep)
2989 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2991 goto out_kmem_cache;
2993 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
2995 goto out_dst_entries;
2997 ret = register_pernet_subsys(&ip6_route_net_ops);
2999 goto out_register_inetpeer;
/* Blackhole dsts share the same slab cache as regular rt6_info. */
3001 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3003 /* Registering of the loopback is done before this portion of code,
3004 * the loopback reference in rt6_info will not be taken, do it
3005 * manually for init_net */
3006 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3007 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3008 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Same manual loopback hookup for the policy-routing special entries. */
3009 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3010 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3011 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3012 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3016 goto out_register_subsys;
3022 ret = fib6_rules_init();
3026 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3028 goto fib6_rules_init;
/* __rtnl_register can only fail with -ENOBUFS here; on any failure
 * fall through to unwinding the late pernet subsystem. */
3031 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3032 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3033 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3034 goto out_register_late_subsys;
3036 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3038 goto out_register_late_subsys;
/* Error unwinding: reverse order of the registrations above. */
3043 out_register_late_subsys:
3044 unregister_pernet_subsys(&ip6_route_net_late_ops);
3046 fib6_rules_cleanup();
3051 out_register_subsys:
3052 unregister_pernet_subsys(&ip6_route_net_ops);
3053 out_register_inetpeer:
3054 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3056 dst_entries_destroy(&ip6_dst_blackhole_ops);
3058 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3062 void ip6_route_cleanup(void)
3064 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3065 unregister_pernet_subsys(&ip6_route_net_late_ops);
3066 fib6_rules_cleanup();
3069 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3070 unregister_pernet_subsys(&ip6_route_net_ops);
3071 dst_entries_destroy(&ip6_dst_blackhole_ops);
3072 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);