2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
64 #include <linux/sysctl.h>
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68 const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void ip6_dst_destroy(struct dst_entry *);
74 static void ip6_dst_ifdown(struct dst_entry *,
75 struct net_device *dev, int how);
76 static int ip6_dst_gc(struct dst_ops *ops);
78 static int ip6_pkt_discard(struct sk_buff *skb);
79 static int ip6_pkt_discard_out(struct sk_buff *skb);
80 static void ip6_link_failure(struct sk_buff *skb);
81 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85 const struct in6_addr *prefix, int prefixlen,
86 const struct in6_addr *gwaddr, int ifindex,
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89 const struct in6_addr *prefix, int prefixlen,
90 const struct in6_addr *gwaddr, int ifindex);
/*
 * ipv6_cow_metrics - copy-on-write the dst metrics array.
 *
 * Only DST_HOST (fully-specified, cached) routes get writable per-peer
 * metrics; a fresh copy of the old read-only array is made and installed
 * with cmpxchg() on dst->_metrics so a concurrent writer wins cleanly.
 *
 * NOTE(review): this extract elides intermediate source lines (original
 * numbering jumps), so some statements of the body are missing here.
 */
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
95 struct rt6_info *rt = (struct rt6_info *) dst;
96 struct inet_peer *peer;
/* non-host routes keep the shared read-only metrics (elided return) */
99 if (!(rt->dst.flags & DST_HOST))
102 peer = rt6_get_peer_create(rt);
104 u32 *old_p = __DST_METRICS_PTR(old);
105 unsigned long prev, new;
/* first user of this peer's metrics seeds them from the old array */
108 if (inet_metrics_new(peer))
109 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111 new = (unsigned long) p;
/* racy install: if another CPU won, re-read the winner's pointer */
112 prev = cmpxchg(&dst->_metrics, old, new);
115 p = __DST_METRICS_PTR(prev);
116 if (prev & DST_METRICS_READ_ONLY)
/*
 * choose_neigh_daddr - pick the neighbour key for this route.
 *
 * If the route has a gateway set, neighbour resolution must target the
 * gateway, not the packet's destination.  Presumably falls back to the
 * caller-supplied daddr otherwise (elided in this extract).
 */
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 struct in6_addr *p = &rt->rt6i_gateway;
127 if (!ipv6_addr_any(p))
128 return (const void *) p;
/*
 * ip6_neigh_lookup - dst_ops->neigh_lookup for IPv6 routes.
 *
 * Resolves the neighbour for this dst in the ND table (nd_tbl), keyed on
 * the gateway when present (see choose_neigh_daddr()); creates a new
 * neighbour entry on lookup miss.
 */
132 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 struct rt6_info *rt = (struct rt6_info *) dst;
137 daddr = choose_neigh_daddr(rt, daddr);
138 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
/* miss: create (error handling for the hit path elided in extract) */
141 return neigh_create(&nd_tbl, daddr, dst->dev);
/*
 * rt6_bind_neighbour - attach an ND neighbour entry for rt's gateway
 * to the dst.  Looks up first, creates on miss; error return for a
 * failed neigh_create() is elided in this extract.
 */
144 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
152 dst_set_neighbour(&rt->dst, n);
/*
 * Template dst_ops for ordinary IPv6 routes; copied per-netns into
 * net->ipv6.ip6_dst_ops.  Wires the routing-cache callbacks defined
 * in this file (some members elided in this extract).
 */
157 static struct dst_ops ip6_dst_ops_template = {
159 .protocol = cpu_to_be16(ETH_P_IPV6),
162 .check = ip6_dst_check,
163 .default_advmss = ip6_default_advmss,
165 .cow_metrics = ipv6_cow_metrics,
166 .destroy = ip6_dst_destroy,
167 .ifdown = ip6_dst_ifdown,
168 .negative_advice = ip6_negative_advice,
169 .link_failure = ip6_link_failure,
170 .update_pmtu = ip6_rt_update_pmtu,
171 .local_out = __ip6_local_out,
172 .neigh_lookup = ip6_neigh_lookup,
/*
 * ip6_blackhole_mtu - MTU for blackhole dsts: the raw RTAX_MTU metric
 * if set, otherwise the output device's MTU.
 */
175 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179 return mtu ? : dst->dev->mtu;
/*
 * Blackhole variants of update_pmtu/cow_metrics: bodies (elided here)
 * are no-ops — a blackhole route never learns PMTU nor COWs metrics.
 */
182 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
186 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/*
 * dst_ops for blackhole routes created by ip6_blackhole_route()
 * (used e.g. by xfrm): mostly shares the normal callbacks but uses
 * the no-op pmtu/metrics variants above.
 */
192 static struct dst_ops ip6_dst_blackhole_ops = {
194 .protocol = cpu_to_be16(ETH_P_IPV6),
195 .destroy = ip6_dst_destroy,
196 .check = ip6_dst_check,
197 .mtu = ip6_blackhole_mtu,
198 .default_advmss = ip6_default_advmss,
199 .update_pmtu = ip6_rt_blackhole_update_pmtu,
200 .cow_metrics = ip6_rt_blackhole_cow_metrics,
201 .neigh_lookup = ip6_neigh_lookup,
204 static const u32 ip6_template_metrics[RTAX_MAX] = {
205 [RTAX_HOPLIMIT - 1] = 255,
/*
 * Template for the per-netns null route: matched when no route exists,
 * drops packets with -ENETUNREACH.  Max metric so real routes always
 * win; RTF_REJECT|RTF_NONEXTHOP marks it unroutable.
 */
208 static struct rt6_info ip6_null_entry_template = {
210 .__refcnt = ATOMIC_INIT(1),
213 .error = -ENETUNREACH,
214 .input = ip6_pkt_discard,
215 .output = ip6_pkt_discard_out,
217 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
218 .rt6i_protocol = RTPROT_KERNEL,
219 .rt6i_metric = ~(u32) 0,
220 .rt6i_ref = ATOMIC_INIT(1),
223 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225 static int ip6_pkt_prohibit(struct sk_buff *skb);
226 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
/*
 * Template for the per-netns "prohibit" route (policy-routing action):
 * like the null entry but replies with an administratively-prohibited
 * error via ip6_pkt_prohibit{,_out}.  Only with MULTIPLE_TABLES.
 */
228 static struct rt6_info ip6_prohibit_entry_template = {
230 .__refcnt = ATOMIC_INIT(1),
234 .input = ip6_pkt_prohibit,
235 .output = ip6_pkt_prohibit_out,
237 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
238 .rt6i_protocol = RTPROT_KERNEL,
239 .rt6i_metric = ~(u32) 0,
240 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Template for the per-netns "blackhole" route: silently discards in
 * both directions (dst_discard), no ICMP error generated.
 */
243 static struct rt6_info ip6_blk_hole_entry_template = {
245 .__refcnt = ATOMIC_INIT(1),
249 .input = dst_discard,
250 .output = dst_discard,
252 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
253 .rt6i_protocol = RTPROT_KERNEL,
254 .rt6i_metric = ~(u32) 0,
255 .rt6i_ref = ATOMIC_INIT(1),
260 /* allocate dst with ip6_dst_ops */
/* allocate dst with ip6_dst_ops */
/*
 * ip6_dst_alloc - allocate an rt6_info and zero everything past the
 * embedded dst_entry (rt6i_table is the first rt6-specific member).
 * NULL check on the dst_alloc() result is elided in this extract.
 */
261 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
262 struct net_device *dev,
265 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
268 memset(&rt->rt6i_table, 0,
269 sizeof(*rt) - sizeof(struct dst_entry));
/*
 * ip6_dst_destroy - dst_ops->destroy: drop references held by the
 * route (idev, inet_peer, and the "from" dst for RTF_EXPIRES-less
 * clones).  Non-host routes free their privately allocated metrics.
 * Release calls between the visible lines are elided in this extract.
 */
274 static void ip6_dst_destroy(struct dst_entry *dst)
276 struct rt6_info *rt = (struct rt6_info *)dst;
277 struct inet6_dev *idev = rt->rt6i_idev;
278 struct inet_peer *peer = rt->rt6i_peer;
/* host routes share metrics with their peer; others own them */
280 if (!(rt->dst.flags & DST_HOST))
281 dst_destroy_metrics_generic(dst);
284 rt->rt6i_idev = NULL;
/* a clone without its own expiry holds a ref on the parent via ->from */
288 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
289 dst_release(dst->from);
292 rt->rt6i_peer = NULL;
297 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
299 static u32 rt6_peer_genid(void)
301 return atomic_read(&__rt6_peer_genid);
/*
 * rt6_bind_peer - attach the inet_peer for this route's destination.
 * cmpxchg() makes the binding race-safe: if another CPU bound a peer
 * first, ours loses (the losing put is elided in this extract).
 */
304 void rt6_bind_peer(struct rt6_info *rt, int create)
306 struct net *net = dev_net(rt->dst.dev);
307 struct inet_peer *peer;
309 peer = inet_getpeer_v6(net, &rt->rt6i_dst.addr, create);
310 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
313 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ip6_dst_ifdown - dst_ops->ifdown: the device behind this route is
 * going away, so repoint rt6i_idev at the netns loopback device
 * (the old idev reference drop is elided in this extract).
 */
316 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
319 struct rt6_info *rt = (struct rt6_info *)dst;
320 struct inet6_dev *idev = rt->rt6i_idev;
321 struct net_device *loopback_dev =
322 dev_net(dev)->loopback_dev;
324 if (dev != loopback_dev && idev && idev->dev == dev) {
325 struct inet6_dev *loopback_idev =
326 in6_dev_get(loopback_dev);
328 rt->rt6i_idev = loopback_idev;
/*
 * rt6_check_expired - true if the route's lifetime has elapsed.
 * A route either carries its own expiry (RTF_EXPIRES) or inherits it
 * from the parent route it was cloned from (dst.from).
 */
334 static bool rt6_check_expired(const struct rt6_info *rt)
336 struct rt6_info *ort = NULL;
338 if (rt->rt6i_flags & RTF_EXPIRES) {
339 if (time_after(jiffies, rt->dst.expires))
341 } else if (rt->dst.from) {
342 ort = (struct rt6_info *) rt->dst.from;
343 return (ort->rt6i_flags & RTF_EXPIRES) &&
344 time_after(jiffies, ort->dst.expires);
/*
 * rt6_need_strict - scoped destinations (multicast, link-local,
 * loopback) must be routed strictly on the given interface.
 */
349 static bool rt6_need_strict(const struct in6_addr *daddr)
351 return ipv6_addr_type(daddr) &
352 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
356 * Route lookup. Any table->tb6_lock is implied.
/*
 * rt6_device_match - walk the sibling list of a fib node leaf and pick
 * the route matching the requested outgoing interface (oif) and/or
 * source address.  Loopback routes count as local candidates.  If a
 * strict interface match (RT6_LOOKUP_F_IFACE) is required and nothing
 * matched, the netns null entry is returned.
 * Several branches/returns are elided in this extract.
 */
359 static inline struct rt6_info *rt6_device_match(struct net *net,
361 const struct in6_addr *saddr,
365 struct rt6_info *local = NULL;
366 struct rt6_info *sprt;
/* no constraints: the head route is fine (elided return) */
368 if (!oif && ipv6_addr_any(saddr))
371 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
372 struct net_device *dev = sprt->dst.dev;
375 if (dev->ifindex == oif)
377 if (dev->flags & IFF_LOOPBACK) {
378 if (!sprt->rt6i_idev ||
379 sprt->rt6i_idev->dev->ifindex != oif) {
380 if (flags & RT6_LOOKUP_F_IFACE && oif)
382 if (local && (!oif ||
383 local->rt6i_idev->dev->ifindex == oif))
/* saddr match: device must own the source address */
389 if (ipv6_chk_addr(net, saddr, dev,
390 flags & RT6_LOOKUP_F_IFACE))
399 if (flags & RT6_LOOKUP_F_IFACE)
400 return net->ipv6.ip6_null_entry;
406 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing (RFC 4191 §3.5-ish): if the
 * route's neighbour is not NUD_VALID and hasn't been probed within
 * rtr_probe_interval, send a unicast-solicit NS to the router.
 * Rate limiting relies on neigh->updated under neigh->lock.
 * Built only with CONFIG_IPV6_ROUTER_PREF.
 */
407 static void rt6_probe(struct rt6_info *rt)
409 struct neighbour *neigh;
411 * Okay, this does not seem to be appropriate
412 * for now, however, we need to check if it
413 * is really so; aka Router Reachability Probing.
415 * Router Reachability Probe MUST be rate-limited
416 * to no more than one per minute.
419 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
420 if (!neigh || (neigh->nud_state & NUD_VALID))
422 read_lock_bh(&neigh->lock);
423 if (!(neigh->nud_state & NUD_VALID) &&
424 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
425 struct in6_addr mcaddr;
426 struct in6_addr *target;
/* stamp first so concurrent probers are rate-limited */
428 neigh->updated = jiffies;
429 read_unlock_bh(&neigh->lock);
431 target = (struct in6_addr *)&neigh->primary_key;
432 addrconf_addr_solict_mult(target, &mcaddr);
433 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
435 read_unlock_bh(&neigh->lock);
441 static inline void rt6_probe(struct rt6_info *rt)
447 * Default Router Selection (RFC 2461 6.3.6)
449 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
451 struct net_device *dev = rt->dst.dev;
452 if (!oif || dev->ifindex == oif)
454 if ((dev->flags & IFF_LOOPBACK) &&
455 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * rt6_check_neigh - score the route's next hop reachability for
 * default router selection: non-gateway/NONEXTHOP routes need no
 * neighbour; otherwise NUD_VALID state scores best, NUD_FAILED worst
 * (the actual score returns are elided in this extract).
 */
460 static inline int rt6_check_neigh(struct rt6_info *rt)
462 struct neighbour *neigh;
466 neigh = dst_get_neighbour_noref(&rt->dst);
467 if (rt->rt6i_flags & RTF_NONEXTHOP ||
468 !(rt->rt6i_flags & RTF_GATEWAY))
471 read_lock_bh(&neigh->lock);
472 if (neigh->nud_state & NUD_VALID)
474 #ifdef CONFIG_IPV6_ROUTER_PREF
475 else if (neigh->nud_state & NUD_FAILED)
480 read_unlock_bh(&neigh->lock);
/*
 * rt6_score_route - combined route score for rt6_select(): device match
 * in the low bits, RFC 4191 router preference shifted in above, and the
 * reachability result gating via RT6_LOOKUP_F_REACHABLE.  A failed
 * mandatory check yields a negative score (elided returns).
 */
487 static int rt6_score_route(struct rt6_info *rt, int oif,
492 m = rt6_check_dev(rt, oif);
493 if (!m && (strict & RT6_LOOKUP_F_IFACE))
495 #ifdef CONFIG_IPV6_ROUTER_PREF
496 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
498 n = rt6_check_neigh(rt);
499 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
504 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
505 int *mpri, struct rt6_info *match)
509 if (rt6_check_expired(rt))
512 m = rt6_score_route(rt, oif, strict);
517 if (strict & RT6_LOOKUP_F_REACHABLE)
521 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * find_rr_leaf - scan all routes of equal metric in round-robin order:
 * first from rr_head to the end of the metric group, then wrap from
 * fn->leaf back to rr_head, keeping the best-scoring match.
 */
529 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
530 struct rt6_info *rr_head,
531 u32 metric, int oif, int strict)
533 struct rt6_info *rt, *match;
537 for (rt = rr_head; rt && rt->rt6i_metric == metric;
538 rt = rt->dst.rt6_next)
539 match = find_match(rt, oif, strict, &mpri, match);
540 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
541 rt = rt->dst.rt6_next)
542 match = find_match(rt, oif, strict, &mpri, match);
/*
 * rt6_select - default router selection (RFC 4861 6.3.6): pick the
 * best route at this fib node starting from the saved round-robin
 * pointer (fn->rr_ptr); if nothing reachable matched, advance the
 * round-robin pointer.  Falls back to the netns null entry.
 */
547 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
549 struct rt6_info *match, *rt0;
554 fn->rr_ptr = rt0 = fn->leaf;
556 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
559 (strict & RT6_LOOKUP_F_REACHABLE)) {
560 struct rt6_info *next = rt0->dst.rt6_next;
562 /* no entries matched; do round-robin */
563 if (!next || next->rt6i_metric != rt0->rt6i_metric)
570 net = dev_net(rt0->dst.dev);
571 return match ? match : net->ipv6.ip6_null_entry;
574 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a Route Information Option from a Router
 * Advertisement (RFC 4191).  Validates option length vs prefix_len,
 * normalizes the preference, then adds/updates/removes the
 * corresponding RTF_ROUTEINFO route with the advertised lifetime
 * (0 lifetime deletes; infinite lifetime clears the expiry).
 * Error returns and some branches are elided in this extract.
 */
575 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
576 const struct in6_addr *gwaddr)
578 struct net *net = dev_net(dev);
579 struct route_info *rinfo = (struct route_info *) opt;
580 struct in6_addr prefix_buf, *prefix;
582 unsigned long lifetime;
585 if (len < sizeof(struct route_info)) {
589 /* Sanity check for prefix_len and length */
590 if (rinfo->length > 3) {
592 } else if (rinfo->prefix_len > 128) {
594 } else if (rinfo->prefix_len > 64) {
595 if (rinfo->length < 2) {
598 } else if (rinfo->prefix_len > 0) {
599 if (rinfo->length < 1) {
604 pref = rinfo->route_pref;
605 if (pref == ICMPV6_ROUTER_PREF_INVALID)
608 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length==3 carries the full 128-bit prefix in the option */
610 if (rinfo->length == 3)
611 prefix = (struct in6_addr *)rinfo->prefix;
613 /* this function is safe */
614 ipv6_addr_prefix(&prefix_buf,
615 (struct in6_addr *)rinfo->prefix,
617 prefix = &prefix_buf;
620 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* zero lifetime: the advertised route must be withdrawn */
623 if (rt && !lifetime) {
629 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
632 rt->rt6i_flags = RTF_ROUTEINFO |
633 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
636 if (!addrconf_finite_timeout(lifetime))
637 rt6_clean_expires(rt);
639 rt6_set_expires(rt, jiffies + HZ * lifetime);
641 dst_release(&rt->dst);
647 #define BACKTRACK(__net, saddr) \
649 if (rt == __net->ipv6.ip6_null_entry) { \
650 struct fib6_node *pn; \
652 if (fn->fn_flags & RTN_TL_ROOT) \
655 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
656 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
659 if (fn->fn_flags & RTN_RTINFO) \
/*
 * ip6_pol_route_lookup - simple (non-cloning) table lookup used by
 * rt6_lookup()/ip6_route_lookup(): fib6_lookup the node, match device
 * and saddr, backtrack on null-entry results, take a use reference.
 */
665 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
666 struct fib6_table *table,
667 struct flowi6 *fl6, int flags)
669 struct fib6_node *fn;
672 read_lock_bh(&table->tb6_lock);
673 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
676 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
677 BACKTRACK(net, &fl6->saddr);
/* dst_use() bumps refcnt + __use and stamps lastuse */
679 dst_use(&rt->dst, jiffies);
680 read_unlock_bh(&table->tb6_lock);
685 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
688 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
690 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/*
 * rt6_lookup - convenience route lookup by daddr/saddr/oif.
 * strict maps to RT6_LOOKUP_F_IFACE; a non-NULL saddr adds
 * RT6_LOOKUP_F_HAS_SADDR.  Returns the rt6_info (error filtering of
 * the dst is elided in this extract).
 */
692 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
693 const struct in6_addr *saddr, int oif, int strict)
695 struct flowi6 fl6 = {
699 struct dst_entry *dst;
700 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
703 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
704 flags |= RT6_LOOKUP_F_HAS_SADDR;
707 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
709 return (struct rt6_info *) dst;
716 EXPORT_SYMBOL(rt6_lookup);
718 /* ip6_ins_rt is called with FREE table->tb6_lock.
719 It takes new route entry, the addition fails by any reason the
720 route is freed. In any case, if caller does not hold it, it may
/*
 * __ip6_ins_rt - insert a route into its fib6 table under the table
 * write lock.  See the comment above: called with tb6_lock free; the
 * route is freed on failure.
 */
724 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
727 struct fib6_table *table;
729 table = rt->rt6i_table;
730 write_lock_bh(&table->tb6_lock);
731 err = fib6_add(&table->tb6_root, rt, info);
732 write_unlock_bh(&table->tb6_lock);
737 int ip6_ins_rt(struct rt6_info *rt)
739 struct nl_info info = {
740 .nl_net = dev_net(rt->dst.dev),
742 return __ip6_ins_rt(rt, &info);
/*
 * rt6_alloc_cow - clone a route into a host (/128) RTF_CACHE entry for
 * a specific daddr/saddr pair and bind its neighbour.  On neighbour
 * table overflow, temporarily relaxes the GC sysctls, forces a dst GC
 * pass, and retries (attempts only allowed outside softirq context).
 * The retry goto and failure cleanup are elided in this extract.
 */
745 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
746 const struct in6_addr *daddr,
747 const struct in6_addr *saddr)
755 rt = ip6_rt_copy(ort, daddr);
758 int attempts = !in_softirq();
760 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
/* cloning a non-/128 route to its own prefix address => anycast */
761 if (ort->rt6i_dst.plen != 128 &&
762 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
763 rt->rt6i_flags |= RTF_ANYCAST;
764 rt->rt6i_gateway = *daddr;
767 rt->rt6i_flags |= RTF_CACHE;
769 #ifdef CONFIG_IPV6_SUBTREES
770 if (rt->rt6i_src.plen && saddr) {
771 rt->rt6i_src.addr = *saddr;
772 rt->rt6i_src.plen = 128;
777 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
778 struct net *net = dev_net(rt->dst.dev);
779 int saved_rt_min_interval =
780 net->ipv6.sysctl.ip6_rt_gc_min_interval;
781 int saved_rt_elasticity =
782 net->ipv6.sysctl.ip6_rt_gc_elasticity;
/* aggressive one-shot GC: elasticity 1, no min interval */
784 if (attempts-- > 0) {
785 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
786 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
788 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
790 net->ipv6.sysctl.ip6_rt_gc_elasticity =
792 net->ipv6.sysctl.ip6_rt_gc_min_interval =
793 saved_rt_min_interval;
797 net_warn_ratelimited("Neighbour table overflow\n");
806 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
807 const struct in6_addr *daddr)
809 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
812 rt->rt6i_flags |= RTF_CACHE;
813 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
/*
 * ip6_pol_route - main input/output route resolver.  Select the best
 * route under the table read lock; if it is neither the null entry nor
 * already an RTF_CACHE clone, drop the lock and create a per-destination
 * clone (COW when a neighbour must be bound, plain clone otherwise),
 * insert it with ip6_ins_rt(), and relookup on insert races.
 * Reachability scoring is skipped when forwarding is enabled.
 * Labels/gotos between the visible lines are elided in this extract.
 */
818 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
819 struct flowi6 *fl6, int flags)
821 struct fib6_node *fn;
822 struct rt6_info *rt, *nrt;
826 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
828 strict |= flags & RT6_LOOKUP_F_IFACE;
831 read_lock_bh(&table->tb6_lock);
834 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
837 rt = rt6_select(fn, oif, strict | reachable);
839 BACKTRACK(net, &fl6->saddr);
840 if (rt == net->ipv6.ip6_null_entry ||
841 rt->rt6i_flags & RTF_CACHE)
845 read_unlock_bh(&table->tb6_lock);
/* no neighbour bound and a next hop is needed => COW clone */
847 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
848 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
849 else if (!(rt->dst.flags & DST_HOST))
850 nrt = rt6_alloc_clone(rt, &fl6->daddr);
854 dst_release(&rt->dst);
855 rt = nrt ? : net->ipv6.ip6_null_entry;
859 err = ip6_ins_rt(nrt);
868 * Race condition! In the gap, when table->tb6_lock was
869 * released someone could insert this route. Relookup.
871 dst_release(&rt->dst);
880 read_unlock_bh(&table->tb6_lock);
882 rt->dst.lastuse = jiffies;
888 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
889 struct flowi6 *fl6, int flags)
891 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
894 static struct dst_entry *ip6_route_input_lookup(struct net *net,
895 struct net_device *dev,
896 struct flowi6 *fl6, int flags)
898 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
899 flags |= RT6_LOOKUP_F_IFACE;
901 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/*
 * ip6_route_input - route an incoming skb: build the flowi6 from the
 * IPv6 header (iif, flow label, mark, nexthdr) and attach the looked-up
 * dst to the skb.  daddr/saddr assignments are elided in this extract.
 */
904 void ip6_route_input(struct sk_buff *skb)
906 const struct ipv6hdr *iph = ipv6_hdr(skb);
907 struct net *net = dev_net(skb->dev);
908 int flags = RT6_LOOKUP_F_HAS_SADDR;
909 struct flowi6 fl6 = {
910 .flowi6_iif = skb->dev->ifindex,
/* first 4 bytes of the IPv6 header hold traffic class + flow label */
913 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
914 .flowi6_mark = skb->mark,
915 .flowi6_proto = iph->nexthdr,
918 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
921 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
922 struct flowi6 *fl6, int flags)
924 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/*
 * ip6_route_output - route an outgoing flow.  Bound sockets and
 * strictly-scoped destinations force an interface match; a known
 * source address enables saddr matching, and socket srcprefs are
 * folded into the lookup flags.
 */
927 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
932 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
933 flags |= RT6_LOOKUP_F_IFACE;
935 if (!ipv6_addr_any(&fl6->saddr))
936 flags |= RT6_LOOKUP_F_HAS_SADDR;
938 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
940 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
943 EXPORT_SYMBOL(ip6_route_output);
/*
 * ip6_blackhole_route - build a blackhole copy of dst_orig (used by
 * xfrm when a flow must be held without a real route): clone the
 * rt6_info onto ip6_dst_blackhole_ops, discard in both directions,
 * copy metrics/idev/gateway/flags/keys.  Always releases dst_orig;
 * returns ERR_PTR(-ENOMEM) if the allocation failed.
 */
945 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
947 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
948 struct dst_entry *new = NULL;
950 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
952 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
957 new->input = dst_discard;
958 new->output = dst_discard;
/* read-only metrics can be shared by pointer; writable ones copied */
960 if (dst_metrics_read_only(&ort->dst))
961 new->_metrics = ort->dst._metrics;
963 dst_copy_metrics(new, &ort->dst);
964 rt->rt6i_idev = ort->rt6i_idev;
966 in6_dev_hold(rt->rt6i_idev);
968 rt->rt6i_gateway = ort->rt6i_gateway;
969 rt->rt6i_flags = ort->rt6i_flags;
970 rt6_clean_expires(rt);
973 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
974 #ifdef CONFIG_IPV6_SUBTREES
975 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
981 dst_release(dst_orig);
982 return new ? new : ERR_PTR(-ENOMEM);
986 * Destination cache support functions
989 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
993 rt = (struct rt6_info *) dst;
995 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
996 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
998 rt6_bind_peer(rt, 0);
999 rt->rt6i_peer_genid = rt6_peer_genid();
1006 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1008 struct rt6_info *rt = (struct rt6_info *) dst;
1011 if (rt->rt6i_flags & RTF_CACHE) {
1012 if (rt6_check_expired(rt)) {
1024 static void ip6_link_failure(struct sk_buff *skb)
1026 struct rt6_info *rt;
1028 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1030 rt = (struct rt6_info *) skb_dst(skb);
1032 if (rt->rt6i_flags & RTF_CACHE)
1033 rt6_update_expires(rt, 0);
1034 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1035 rt->rt6i_node->fn_sernum = -1;
/*
 * ip6_rt_update_pmtu - record a learned path MTU on a host route.
 * MTUs below IPV6_MIN_MTU (1280) cannot be honoured directly, so
 * RTAX_FEATURE_ALLFRAG is set instead to force fragmentation headers.
 * Expiry setup for the updated route is elided in this extract.
 */
1039 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1041 struct rt6_info *rt6 = (struct rt6_info*)dst;
1043 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1044 rt6->rt6i_flags |= RTF_MODIFIED;
1045 if (mtu < IPV6_MIN_MTU) {
1046 u32 features = dst_metric(dst, RTAX_FEATURES);
1048 features |= RTAX_FEATURE_ALLFRAG;
1049 dst_metric_set(dst, RTAX_FEATURES, features);
1051 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * ip6_default_advmss - default advertised MSS for this route: path MTU
 * minus IPv6 + TCP headers, clamped below by the ip6_rt_min_advmss
 * sysctl and above by the non-jumbogram maximum (see comment below).
 */
1055 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1057 struct net_device *dev = dst->dev;
1058 unsigned int mtu = dst_mtu(dst);
1059 struct net *net = dev_net(dev);
1061 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1063 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1064 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1067 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1068 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1069 * IPV6_MAXPLEN is also valid and means: "any MSS,
1070 * rely only on pmtu discovery"
1072 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1077 static unsigned int ip6_mtu(const struct dst_entry *dst)
1079 struct inet6_dev *idev;
1080 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1088 idev = __in6_dev_get(dst->dev);
1090 mtu = idev->cnf.mtu6;
1096 static struct dst_entry *icmp6_dst_gc_list;
1097 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * icmp6_dst_alloc - allocate an uncached host dst for sending an ICMPv6
 * packet (e.g. NDISC): not inserted in any fib table, instead chained
 * onto icmp6_dst_gc_list (under icmp6_dst_lock) so icmp6_dst_gc() can
 * reap it once the refcount drops.  Ends with an xfrm_lookup() so
 * policy applies to the ICMP traffic too.
 * Error-path cleanup between visible lines is elided in this extract.
 */
1099 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1100 struct neighbour *neigh,
1103 struct dst_entry *dst;
1104 struct rt6_info *rt;
1105 struct inet6_dev *idev = in6_dev_get(dev);
1106 struct net *net = dev_net(dev);
1108 if (unlikely(!idev))
1109 return ERR_PTR(-ENODEV);
1111 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1112 if (unlikely(!rt)) {
1114 dst = ERR_PTR(-ENOMEM);
1121 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1122 if (IS_ERR(neigh)) {
1125 return ERR_CAST(neigh);
1129 rt->dst.flags |= DST_HOST;
1130 rt->dst.output = ip6_output;
1131 dst_set_neighbour(&rt->dst, neigh);
1132 atomic_set(&rt->dst.__refcnt, 1);
1133 rt->rt6i_dst.addr = fl6->daddr;
1134 rt->rt6i_dst.plen = 128;
1135 rt->rt6i_idev = idev;
1136 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
/* chain onto the private GC list instead of a fib table */
1138 spin_lock_bh(&icmp6_dst_lock);
1139 rt->dst.next = icmp6_dst_gc_list;
1140 icmp6_dst_gc_list = &rt->dst;
1141 spin_unlock_bh(&icmp6_dst_lock);
1143 fib6_force_start_gc(net);
1145 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1151 int icmp6_dst_gc(void)
1153 struct dst_entry *dst, **pprev;
1156 spin_lock_bh(&icmp6_dst_lock);
1157 pprev = &icmp6_dst_gc_list;
1159 while ((dst = *pprev) != NULL) {
1160 if (!atomic_read(&dst->__refcnt)) {
1169 spin_unlock_bh(&icmp6_dst_lock);
1174 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1177 struct dst_entry *dst, **pprev;
1179 spin_lock_bh(&icmp6_dst_lock);
1180 pprev = &icmp6_dst_gc_list;
1181 while ((dst = *pprev) != NULL) {
1182 struct rt6_info *rt = (struct rt6_info *) dst;
1183 if (func(rt, arg)) {
1190 spin_unlock_bh(&icmp6_dst_lock);
/*
 * ip6_dst_gc - dst_ops->gc for IPv6: skip GC when under rt_max_size
 * and within the min interval; otherwise run fib6 GC with a ramping
 * "expire" aggressiveness (grows each call, decays by the elasticity
 * shift once entry count drops below gc_thresh).  Returns non-zero
 * (allocation should fail) while still over rt_max_size.
 */
1193 static int ip6_dst_gc(struct dst_ops *ops)
1195 unsigned long now = jiffies;
1196 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1197 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1198 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1199 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1200 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1201 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1204 entries = dst_entries_get_fast(ops);
1205 if (time_after(rt_last_gc + rt_min_interval, now) &&
1206 entries <= rt_max_size)
1209 net->ipv6.ip6_rt_gc_expire++;
1210 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1211 net->ipv6.ip6_rt_last_gc = now;
1212 entries = dst_entries_get_slow(ops);
1213 if (entries < ops->gc_thresh)
1214 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1216 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1217 return entries > rt_max_size;
1220 /* Clean host part of a prefix. Not necessary in radix tree,
1221 but results in cleaner routing tables.
1223 Remove it only when all the things will work!
/*
 * ip6_dst_hoplimit - effective hop limit for this dst: the per-route
 * RTAX_HOPLIMIT metric if set, else the device idev's hop_limit,
 * else the netns devconf_all default.
 */
1226 int ip6_dst_hoplimit(struct dst_entry *dst)
1228 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1229 if (hoplimit == 0) {
1230 struct net_device *dev = dst->dev;
1231 struct inet6_dev *idev;
1234 idev = __in6_dev_get(dev);
1236 hoplimit = idev->cnf.hop_limit;
1238 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1243 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * ip6_route_add - add a route from a netlink/ioctl fib6_config.
 *
 * Validates prefix lengths, resolves the device/idev from fc_ifindex,
 * picks or creates the fib table, allocates the rt6_info and fills it
 * in: expiry, protocol, input/output handlers by address type,
 * destination/source keys, metrics, gateway validation (next hop must
 * normally be link-local unicast; non-link-local gateways are resolved
 * via rt6_lookup()), prefsrc, neighbour binding, flags — and finally
 * inserts via __ip6_ins_rt().  Loopback "true" routes are promoted to
 * reject routes (see comment below).
 *
 * NOTE(review): this extract elides many intermediate lines (error
 * labels, gotos and some statements), so the visible body is partial.
 */
1249 int ip6_route_add(struct fib6_config *cfg)
1252 struct net *net = cfg->fc_nlinfo.nl_net;
1253 struct rt6_info *rt = NULL;
1254 struct net_device *dev = NULL;
1255 struct inet6_dev *idev = NULL;
1256 struct fib6_table *table;
1259 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1261 #ifndef CONFIG_IPV6_SUBTREES
/* source-routed entries need subtree support compiled in */
1262 if (cfg->fc_src_len)
1265 if (cfg->fc_ifindex) {
1267 dev = dev_get_by_index(net, cfg->fc_ifindex);
1270 idev = in6_dev_get(dev);
1275 if (cfg->fc_metric == 0)
1276 cfg->fc_metric = IP6_RT_PRIO_USER;
1279 if (cfg->fc_nlinfo.nlh &&
1280 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1281 table = fib6_get_table(net, cfg->fc_table);
1283 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1284 table = fib6_new_table(net, cfg->fc_table)
1287 table = fib6_new_table(net, cfg->fc_table);
1293 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1300 rt->dst.obsolete = -1;
1302 if (cfg->fc_flags & RTF_EXPIRES)
1303 rt6_set_expires(rt, jiffies +
1304 clock_t_to_jiffies(cfg->fc_expires));
1306 rt6_clean_expires(rt);
1308 if (cfg->fc_protocol == RTPROT_UNSPEC)
1309 cfg->fc_protocol = RTPROT_BOOT;
1310 rt->rt6i_protocol = cfg->fc_protocol;
1312 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* pick the input handler by destination class */
1314 if (addr_type & IPV6_ADDR_MULTICAST)
1315 rt->dst.input = ip6_mc_input;
1316 else if (cfg->fc_flags & RTF_LOCAL)
1317 rt->dst.input = ip6_input;
1319 rt->dst.input = ip6_forward;
1321 rt->dst.output = ip6_output;
1323 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1324 rt->rt6i_dst.plen = cfg->fc_dst_len;
1325 if (rt->rt6i_dst.plen == 128)
1326 rt->dst.flags |= DST_HOST;
/* non-host routes with metrics need their own writable array */
1328 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1329 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1334 dst_init_metrics(&rt->dst, metrics, 0);
1336 #ifdef CONFIG_IPV6_SUBTREES
1337 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1338 rt->rt6i_src.plen = cfg->fc_src_len;
1341 rt->rt6i_metric = cfg->fc_metric;
1343 /* We cannot add true routes via loopback here,
1344 they would result in kernel looping; promote them to reject routes
1346 if ((cfg->fc_flags & RTF_REJECT) ||
1347 (dev && (dev->flags & IFF_LOOPBACK) &&
1348 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1349 !(cfg->fc_flags & RTF_LOCAL))) {
1350 /* hold loopback dev/idev if we haven't done so. */
1351 if (dev != net->loopback_dev) {
1356 dev = net->loopback_dev;
1358 idev = in6_dev_get(dev);
1364 rt->dst.output = ip6_pkt_discard_out;
1365 rt->dst.input = ip6_pkt_discard;
1366 rt->dst.error = -ENETUNREACH;
1367 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1371 if (cfg->fc_flags & RTF_GATEWAY) {
1372 const struct in6_addr *gw_addr;
1375 gw_addr = &cfg->fc_gateway;
1376 rt->rt6i_gateway = *gw_addr;
1377 gwa_type = ipv6_addr_type(gw_addr);
1379 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1380 struct rt6_info *grt;
1382 /* IPv6 strictly inhibits using not link-local
1383 addresses as nexthop address.
1384 Otherwise, router will not able to send redirects.
1385 It is very good, but in some (rare!) circumstances
1386 (SIT, PtP, NBMA NOARP links) it is handy to allow
1387 some exceptions. --ANK
1390 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* the gateway itself must be reachable via an existing route */
1393 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1395 err = -EHOSTUNREACH;
1399 if (dev != grt->dst.dev) {
1400 dst_release(&grt->dst);
1405 idev = grt->rt6i_idev;
1407 in6_dev_hold(grt->rt6i_idev);
/* refuse gateways that sit behind another gateway */
1409 if (!(grt->rt6i_flags & RTF_GATEWAY))
1411 dst_release(&grt->dst);
1417 if (!dev || (dev->flags & IFF_LOOPBACK))
1425 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1426 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1430 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1431 rt->rt6i_prefsrc.plen = 128;
1433 rt->rt6i_prefsrc.plen = 0;
1435 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1436 err = rt6_bind_neighbour(rt, dev);
1441 rt->rt6i_flags = cfg->fc_flags;
1448 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1449 int type = nla_type(nla);
1452 if (type > RTAX_MAX) {
1457 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1463 rt->rt6i_idev = idev;
1464 rt->rt6i_table = table;
1466 cfg->fc_nlinfo.nl_net = dev_net(dev);
1468 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1480 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1483 struct fib6_table *table;
1484 struct net *net = dev_net(rt->dst.dev);
1486 if (rt == net->ipv6.ip6_null_entry)
1489 table = rt->rt6i_table;
1490 write_lock_bh(&table->tb6_lock);
1492 err = fib6_del(rt, info);
1493 dst_release(&rt->dst);
1495 write_unlock_bh(&table->tb6_lock);
1500 int ip6_del_rt(struct rt6_info *rt)
1502 struct nl_info info = {
1503 .nl_net = dev_net(rt->dst.dev),
1505 return __ip6_del_rt(rt, &info);
/*
 * ip6_route_del - delete the route matching a fib6_config: locate the
 * fib node for the dst/src prefixes, then scan its leaf chain for a
 * route matching the optional ifindex, gateway and metric filters.
 * On a match, the dst ref is taken before dropping the read lock and
 * __ip6_del_rt() reacquires the write lock (ref hold elided here).
 */
1508 static int ip6_route_del(struct fib6_config *cfg)
1510 struct fib6_table *table;
1511 struct fib6_node *fn;
1512 struct rt6_info *rt;
1515 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1519 read_lock_bh(&table->tb6_lock);
1521 fn = fib6_locate(&table->tb6_root,
1522 &cfg->fc_dst, cfg->fc_dst_len,
1523 &cfg->fc_src, cfg->fc_src_len);
1526 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1527 if (cfg->fc_ifindex &&
1529 rt->dst.dev->ifindex != cfg->fc_ifindex))
1531 if (cfg->fc_flags & RTF_GATEWAY &&
1532 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1534 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1537 read_unlock_bh(&table->tb6_lock);
1539 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1542 read_unlock_bh(&table->tb6_lock);
1542 read_unlock_bh(&table->tb6_lock);
1550 struct ip6rd_flowi {
1552 struct in6_addr gateway;
/*
 * __ip6_route_redirect - find the route an ICMPv6 redirect applies to.
 * Per RFC 4861, a redirect is only valid if it came from the current
 * next hop toward the target, so scan the matching fib node for a
 * non-expired gateway route out of the receiving interface whose
 * gateway equals the redirect's source.  Falls back through the
 * BACKTRACK macro, then takes a ref on the result (ref elided here).
 */
1555 static struct rt6_info *__ip6_route_redirect(struct net *net,
1556 struct fib6_table *table,
1560 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1561 struct rt6_info *rt;
1562 struct fib6_node *fn;
1565 * Get the "current" route for this destination and
1566 * check if the redirect has come from approriate router.
1568 * RFC 2461 specifies that redirects should only be
1569 * accepted if they come from the nexthop to the target.
1570 * Due to the way the routes are chosen, this notion
1571 * is a bit fuzzy and one might need to check all possible
1575 read_lock_bh(&table->tb6_lock);
1576 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1578 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1580 * Current route is on-link; redirect is always invalid.
1582 * Seems, previous statement is not true. It could
1583 * be node, which looks for us as on-link (f.e. proxy ndisc)
1584 * But then router serving it might decide, that we should
1585 * know truth 8)8) --ANK (980726).
1587 if (rt6_check_expired(rt))
1589 if (!(rt->rt6i_flags & RTF_GATEWAY))
1591 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1593 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1599 rt = net->ipv6.ip6_null_entry;
1600 BACKTRACK(net, &fl6->saddr);
1604 read_unlock_bh(&table->tb6_lock);
/*
 * Build an ip6rd_flowi for a received redirect (dest/src/gateway, bound
 * to @dev) and run the policy-routing lookup with __ip6_route_redirect
 * as the per-table resolver.
 */
1609 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1610 const struct in6_addr *src,
1611 const struct in6_addr *gateway,
1612 struct net_device *dev)
1614 int flags = RT6_LOOKUP_F_HAS_SADDR;
1615 struct net *net = dev_net(dev);
1616 struct ip6rd_flowi rdfl = {
1618 .flowi6_oif = dev->ifindex,
1624 rdfl.gateway = *gateway;
1626 if (rt6_need_strict(dest))
1627 flags |= RT6_LOOKUP_F_IFACE;
1629 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1630 flags, __ip6_route_redirect);
/*
 * Process a received NDISC redirect: validate the sender against the
 * current route, refresh the neighbour entry, then clone the route with
 * the new gateway and insert it as an RTF_DYNAMIC|RTF_CACHE entry,
 * notifying netevent listeners of the dst change.
 */
1633 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1634 const struct in6_addr *saddr,
1635 struct neighbour *neigh, u8 *lladdr, int on_link)
1637 struct rt6_info *rt, *nrt = NULL;
1638 struct netevent_redirect netevent;
1639 struct net *net = dev_net(neigh->dev);
1641 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1643 if (rt == net->ipv6.ip6_null_entry) {
1644 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1649 * We have finally decided to accept it.
1652 neigh_update(neigh, lladdr, NUD_STALE,
1653 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1654 NEIGH_UPDATE_F_OVERRIDE|
1655 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1656 NEIGH_UPDATE_F_ISROUTER))
1660 * Redirect received -> path was valid.
1661 * Look, redirects are sent only in response to data packets,
1662 * so that this nexthop apparently is reachable. --ANK
1664 dst_confirm(&rt->dst);
1666 /* Duplicate redirect: silently ignore. */
1667 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1670 nrt = ip6_rt_copy(rt, dest);
1674 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1676 nrt->rt6i_flags &= ~RTF_GATEWAY;
1678 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1679 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1681 if (ip6_ins_rt(nrt))
1684 netevent.old = &rt->dst;
1685 netevent.new = &nrt->dst;
1686 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1688 if (rt->rt6i_flags & RTF_CACHE) {
1694 dst_release(&rt->dst);
1698 * Handle ICMP "packet too big" messages
1699 * i.e. Path MTU discovery
/*
 * Apply a Packet Too Big report to the route toward @daddr (looked up on
 * @ifindex, or any interface when 0).  A cached host route gets its MTU
 * metric updated in place; otherwise a COW/clone of the route is created
 * carrying the reduced MTU, expiring after ip6_rt_mtu_expires so that a
 * later PMTU increase can be rediscovered.  PMTU below IPV6_MIN_MTU is
 * clamped per RFC 2460 with RTAX_FEATURE_ALLFRAG set.
 */
1702 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1703 struct net *net, u32 pmtu, int ifindex)
1705 struct rt6_info *rt, *nrt;
1708 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1712 if (rt6_check_expired(rt)) {
1717 if (pmtu >= dst_mtu(&rt->dst))
1720 if (pmtu < IPV6_MIN_MTU) {
1722 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1723 * MTU (1280) and a fragment header should always be included
1724 * after a node receiving Too Big message reporting PMTU is
1725 * less than the IPv6 Minimum Link MTU.
1727 pmtu = IPV6_MIN_MTU;
1731 /* New mtu received -> path was valid.
1732 They are sent only in response to data packets,
1733 so that this nexthop apparently is reachable. --ANK
1735 dst_confirm(&rt->dst);
1737 /* Host route. If it is static, it would be better
1738 not to override it, but add new one, so that
1739 when cache entry will expire old pmtu
1740 would return automatically.
1742 if (rt->rt6i_flags & RTF_CACHE) {
1743 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1745 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1746 features |= RTAX_FEATURE_ALLFRAG;
1747 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1749 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1750 rt->rt6i_flags |= RTF_MODIFIED;
1755 Two cases are possible:
1756 1. It is connected route. Action: COW
1757 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1759 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1760 nrt = rt6_alloc_cow(rt, daddr, saddr);
1762 nrt = rt6_alloc_clone(rt, daddr);
1765 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1767 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1768 features |= RTAX_FEATURE_ALLFRAG;
1769 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1772 /* According to RFC 1981, detecting PMTU increase shouldn't be
1773 * happened within 5 mins, the recommended timer is 10 mins.
1774 * Here this route expiration time is set to ip6_rt_mtu_expires
1775 * which is 10 mins. After 10 mins the decreased pmtu is expired
1776 * and detecting PMTU increase will be automatically happened.
1778 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1779 nrt->rt6i_flags |= RTF_DYNAMIC;
1783 dst_release(&rt->dst);
/*
 * Public PMTU entry point: apply the new @pmtu both for "any interface"
 * (ifindex 0) and for the interface that received the Too Big message,
 * for the reasons described in the comment below.
 */
1786 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1787 struct net_device *dev, u32 pmtu)
1789 struct net *net = dev_net(dev);
1792 * RFC 1981 states that a node "MUST reduce the size of the packets it
1793 * is sending along the path" that caused the Packet Too Big message.
1794 * Since it's not possible in the general case to determine which
1795 * interface was used to send the original packet, we update the MTU
1796 * on the interface that will be used to send future packets. We also
1797 * update the MTU on the interface that received the Packet Too Big in
1798 * case the original packet was forced out that interface with
1799 * SO_BINDTODEVICE or similar. This is the next best thing to the
1800 * correct behaviour, which would be to update the MTU on all
1803 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1804 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1808 * Misc support functions
/*
 * Allocate a new rt6_info that duplicates @ort but is pinned to the
 * single host @dest (plen 128): copies dst ops/metrics, idev reference,
 * gateway and flags, and records the origin via rt6_set_from() for
 * RA-learned (RTF_DEFAULT|RTF_ADDRCONF) routes.  Returns NULL-checked
 * result — allocation-failure path not visible in this excerpt.
 */
1811 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1812 const struct in6_addr *dest)
1814 struct net *net = dev_net(ort->dst.dev);
1815 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1819 rt->dst.input = ort->dst.input;
1820 rt->dst.output = ort->dst.output;
1821 rt->dst.flags |= DST_HOST;
1823 rt->rt6i_dst.addr = *dest;
1824 rt->rt6i_dst.plen = 128;
1825 dst_copy_metrics(&rt->dst, &ort->dst);
1826 rt->dst.error = ort->dst.error;
1827 rt->rt6i_idev = ort->rt6i_idev;
1829 in6_dev_hold(rt->rt6i_idev);
1830 rt->dst.lastuse = jiffies;
1832 rt->rt6i_gateway = ort->rt6i_gateway;
1833 rt->rt6i_flags = ort->rt6i_flags;
1834 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1835 (RTF_DEFAULT | RTF_ADDRCONF))
1836 rt6_set_from(rt, ort);
1838 rt6_clean_expires(rt);
1839 rt->rt6i_metric = 0;
1841 #ifdef CONFIG_IPV6_SUBTREES
1842 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1844 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1845 rt->rt6i_table = ort->rt6i_table;
1850 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Look up an RA Route Information option route (RTF_ROUTEINFO|RTF_GATEWAY)
 * in RT6_TABLE_INFO matching prefix, gateway and interface.  Reference
 * handling on the matched route is not visible in this excerpt.
 */
1851 static struct rt6_info *rt6_get_route_info(struct net *net,
1852 const struct in6_addr *prefix, int prefixlen,
1853 const struct in6_addr *gwaddr, int ifindex)
1855 struct fib6_node *fn;
1856 struct rt6_info *rt = NULL;
1857 struct fib6_table *table;
1859 table = fib6_get_table(net, RT6_TABLE_INFO);
1863 write_lock_bh(&table->tb6_lock);
1864 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1868 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1869 if (rt->dst.dev->ifindex != ifindex)
1871 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1873 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1879 write_unlock_bh(&table->tb6_lock);
/*
 * Install a route learned from an RA Route Information option into
 * RT6_TABLE_INFO, then return the freshly inserted entry via
 * rt6_get_route_info().  Prefix length 0 is treated as a default route.
 */
1883 static struct rt6_info *rt6_add_route_info(struct net *net,
1884 const struct in6_addr *prefix, int prefixlen,
1885 const struct in6_addr *gwaddr, int ifindex,
1888 struct fib6_config cfg = {
1889 .fc_table = RT6_TABLE_INFO,
1890 .fc_metric = IP6_RT_PRIO_USER,
1891 .fc_ifindex = ifindex,
1892 .fc_dst_len = prefixlen,
1893 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1894 RTF_UP | RTF_PREF(pref),
1896 .fc_nlinfo.nlh = NULL,
1897 .fc_nlinfo.nl_net = net,
1900 cfg.fc_dst = *prefix;
1901 cfg.fc_gateway = *gwaddr;
1903 /* We should treat it as a default route if prefix length is 0. */
1905 cfg.fc_flags |= RTF_DEFAULT;
1907 ip6_route_add(&cfg);
1909 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-installed default router entry (RTF_ADDRCONF|RTF_DEFAULT)
 * for gateway @addr on @dev in RT6_TABLE_DFLT.
 */
1913 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1915 struct rt6_info *rt;
1916 struct fib6_table *table;
1918 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1922 write_lock_bh(&table->tb6_lock);
1923 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1924 if (dev == rt->dst.dev &&
1925 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1926 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1931 write_unlock_bh(&table->tb6_lock);
/*
 * Install a default router learned from an RA (with router preference
 * @pref) into RT6_TABLE_DFLT and return the inserted entry.
 */
1935 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1936 struct net_device *dev,
1939 struct fib6_config cfg = {
1940 .fc_table = RT6_TABLE_DFLT,
1941 .fc_metric = IP6_RT_PRIO_USER,
1942 .fc_ifindex = dev->ifindex,
1943 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1944 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1946 .fc_nlinfo.nlh = NULL,
1947 .fc_nlinfo.nl_net = dev_net(dev),
1950 cfg.fc_gateway = *gwaddr;
1952 ip6_route_add(&cfg);
1954 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Remove every RA-learned default route from RT6_TABLE_DFLT.  The read
 * lock is dropped before each deletion (the restart/ip6_del_rt lines are
 * not visible in this excerpt).
 */
1957 void rt6_purge_dflt_routers(struct net *net)
1959 struct rt6_info *rt;
1960 struct fib6_table *table;
1962 /* NOTE: Keep consistent with rt6_get_dflt_router */
1963 table = fib6_get_table(net, RT6_TABLE_DFLT);
1968 read_lock_bh(&table->tb6_lock);
1969 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1970 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1972 read_unlock_bh(&table->tb6_lock);
1977 read_unlock_bh(&table->tb6_lock);
/*
 * Translate the legacy ioctl in6_rtmsg structure into a fib6_config,
 * targeting the main table and carrying @net in the netlink info.
 */
1980 static void rtmsg_to_fib6_config(struct net *net,
1981 struct in6_rtmsg *rtmsg,
1982 struct fib6_config *cfg)
1984 memset(cfg, 0, sizeof(*cfg));
1986 cfg->fc_table = RT6_TABLE_MAIN;
1987 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1988 cfg->fc_metric = rtmsg->rtmsg_metric;
1989 cfg->fc_expires = rtmsg->rtmsg_info;
1990 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1991 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1992 cfg->fc_flags = rtmsg->rtmsg_flags;
1994 cfg->fc_nlinfo.nl_net = net;
1996 cfg->fc_dst = rtmsg->rtmsg_dst;
1997 cfg->fc_src = rtmsg->rtmsg_src;
1998 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * Legacy SIOCADDRT/SIOCDELRT ioctl handler: requires CAP_NET_ADMIN,
 * copies the in6_rtmsg from userspace and dispatches to route add/del.
 * RTNL locking and the default/-EINVAL paths are not visible here.
 */
2001 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2003 struct fib6_config cfg;
2004 struct in6_rtmsg rtmsg;
2008 case SIOCADDRT: /* Add a route */
2009 case SIOCDELRT: /* Delete a route */
2010 if (!capable(CAP_NET_ADMIN))
2012 err = copy_from_user(&rtmsg, arg,
2013 sizeof(struct in6_rtmsg));
2017 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2022 err = ip6_route_add(&cfg);
2025 err = ip6_route_del(&cfg);
2039 * Drop the packet on the floor
/*
 * Common handler for the null/prohibit dst entries: bump the proper
 * SNMP no-route counter (INADDRERRORS for an unspecified destination on
 * input), send an ICMPv6 Destination Unreachable with @code, and drop
 * the skb (kfree_skb/return not visible in this excerpt).
 */
2042 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2045 struct dst_entry *dst = skb_dst(skb);
2046 switch (ipstats_mib_noroutes) {
2047 case IPSTATS_MIB_INNOROUTES:
2048 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2049 if (type == IPV6_ADDR_ANY) {
2050 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2051 IPSTATS_MIB_INADDRERRORS);
2055 case IPSTATS_MIB_OUTNOROUTES:
2056 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2057 ipstats_mib_noroutes);
2060 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for the null entry: no route on input. */
2065 static int ip6_pkt_discard(struct sk_buff *skb)
2067 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the null entry: no route on output. */
2070 static int ip6_pkt_discard_out(struct sk_buff *skb)
2072 skb->dev = skb_dst(skb)->dev;
2073 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2076 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* dst input handler for the prohibit entry (policy routing). */
2078 static int ip6_pkt_prohibit(struct sk_buff *skb)
2080 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the prohibit entry (policy routing). */
2083 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2085 skb->dev = skb_dst(skb)->dev;
2086 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2092 * Allocate a dst for local (unicast / anycast) address.
/*
 * Build a host route (plen 128) on the loopback device for a local
 * unicast or anycast address, flagged RTF_UP|RTF_NONEXTHOP plus
 * RTF_ANYCAST or RTF_LOCAL, bound to the local table.  Returns
 * ERR_PTR(-ENOMEM) on allocation failure or the neighbour-binding error.
 */
2095 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2096 const struct in6_addr *addr,
2099 struct net *net = dev_net(idev->dev);
2100 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2101 net->loopback_dev, 0);
2105 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2106 return ERR_PTR(-ENOMEM);
2111 rt->dst.flags |= DST_HOST;
2112 rt->dst.input = ip6_input;
2113 rt->dst.output = ip6_output;
2114 rt->rt6i_idev = idev;
2115 rt->dst.obsolete = -1;
2117 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2119 rt->rt6i_flags |= RTF_ANYCAST;
2121 rt->rt6i_flags |= RTF_LOCAL;
2122 err = rt6_bind_neighbour(rt, rt->dst.dev);
2125 return ERR_PTR(err);
2128 rt->rt6i_dst.addr = *addr;
2129 rt->rt6i_dst.plen = 128;
2130 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2132 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Select a source address for @daddr: use the route's preferred source
 * (rt6i_prefsrc) when set, otherwise fall back to normal source-address
 * selection on the route's interface.
 */
2137 int ip6_route_get_saddr(struct net *net,
2138 struct rt6_info *rt,
2139 const struct in6_addr *daddr,
2141 struct in6_addr *saddr)
2143 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2145 if (rt->rt6i_prefsrc.plen)
2146 *saddr = rt->rt6i_prefsrc.addr;
2148 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2149 daddr, prefs, saddr);
/* remove deleted ip from prefsrc entries */
/* Walker argument: match routes by device and/or preferred source addr. */
2154 struct arg_dev_net_ip {
2155 struct net_device *dev;
2157 struct in6_addr *addr;
/*
 * fib6_clean_all callback: clear the preferred-source setting on every
 * route whose prefsrc equals the address being removed (optionally
 * restricted to one device).  Returning 0 keeps walking.
 */
2160 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2162 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2163 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2164 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2166 if (((void *)rt->dst.dev == dev || !dev) &&
2167 rt != net->ipv6.ip6_null_entry &&
2168 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2169 /* remove prefsrc entry */
2170 rt->rt6i_prefsrc.plen = 0;
/*
 * Called when address @ifp is deleted: scrub it from all routes'
 * preferred-source fields in its netns.
 */
2175 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2177 struct net *net = dev_net(ifp->idev->dev);
2178 struct arg_dev_net_ip adni = {
2179 .dev = ifp->idev->dev,
2183 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Walker argument for fib6_ifdown: device going down plus its netns. */
2186 struct arg_dev_net {
2187 struct net_device *dev;
/*
 * fib6_clean_all callback: select routes on the downed device (or all
 * devices when dev == NULL), excluding the null entry, for deletion by
 * the tree cleaner (the deletion return value is not visible here).
 */
2191 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2193 const struct arg_dev_net *adn = arg;
2194 const struct net_device *dev = adn->dev;
2196 if ((rt->dst.dev == dev || !dev) &&
2197 rt != adn->net->ipv6.ip6_null_entry)
/*
 * Device-down handler: purge matching routes from the FIB trees and the
 * ICMPv6 socket dst caches.
 */
2203 void rt6_ifdown(struct net *net, struct net_device *dev)
2205 struct arg_dev_net adn = {
2210 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2211 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument for rt6_mtu_change_route: device and its new MTU. */
2214 struct rt6_mtu_change_arg {
2215 struct net_device *dev;
/*
 * fib6_clean_all callback: propagate an administrative device MTU change
 * into route MTU metrics, honouring a locked RTAX_MTU.  See the inline
 * rationale for why both increases and decreases must be handled.
 */
2219 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2221 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2222 struct inet6_dev *idev;
2224 /* In IPv6 pmtu discovery is not optional,
2225 so that RTAX_MTU lock cannot disable it.
2226 We still use this lock to block changes
2227 caused by addrconf/ndisc.
2230 idev = __in6_dev_get(arg->dev);
2234 /* For administrative MTU increase, there is no way to discover
2235 IPv6 PMTU increase, so PMTU increase should be updated here.
2236 Since RFC 1981 doesn't include administrative MTU increase
2237 update PMTU increase is a MUST. (i.e. jumbo frame)
2240 If new MTU is less than route PMTU, this new MTU will be the
2241 lowest MTU in the path, update the route PMTU to reflect PMTU
2242 decreases; if new MTU is greater than route PMTU, and the
2243 old MTU is the lowest MTU in the path, update the route PMTU
2244 to reflect the increase. In this case if the other nodes' MTU
2245 also have the lowest MTU, a Too Big message will lead to
2248 if (rt->dst.dev == arg->dev &&
2249 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2250 (dst_mtu(&rt->dst) >= arg->mtu ||
2251 (dst_mtu(&rt->dst) < arg->mtu &&
2252 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2253 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Apply a device MTU change to all routes in @dev's netns. */
2258 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2260 struct rt6_mtu_change_arg arg = {
2265 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* route messages. */
2268 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2269 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2270 [RTA_OIF] = { .type = NLA_U32 },
2271 [RTA_IIF] = { .type = NLA_U32 },
2272 [RTA_PRIORITY] = { .type = NLA_U32 },
2273 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config:
 * validates attributes against rtm_ipv6_policy, maps rtm_type to flags
 * (RTN_UNREACHABLE -> RTF_REJECT, RTN_LOCAL -> RTF_LOCAL) and copies the
 * optional address, interface, priority, metrics and table attributes.
 */
2276 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2277 struct fib6_config *cfg)
2280 struct nlattr *tb[RTA_MAX+1];
2283 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2288 rtm = nlmsg_data(nlh);
2289 memset(cfg, 0, sizeof(*cfg));
2291 cfg->fc_table = rtm->rtm_table;
2292 cfg->fc_dst_len = rtm->rtm_dst_len;
2293 cfg->fc_src_len = rtm->rtm_src_len;
2294 cfg->fc_flags = RTF_UP;
2295 cfg->fc_protocol = rtm->rtm_protocol;
2297 if (rtm->rtm_type == RTN_UNREACHABLE)
2298 cfg->fc_flags |= RTF_REJECT;
2300 if (rtm->rtm_type == RTN_LOCAL)
2301 cfg->fc_flags |= RTF_LOCAL;
2303 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2304 cfg->fc_nlinfo.nlh = nlh;
2305 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2307 if (tb[RTA_GATEWAY]) {
2308 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2309 cfg->fc_flags |= RTF_GATEWAY;
2313 int plen = (rtm->rtm_dst_len + 7) >> 3;
2315 if (nla_len(tb[RTA_DST]) < plen)
2318 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2322 int plen = (rtm->rtm_src_len + 7) >> 3;
2324 if (nla_len(tb[RTA_SRC]) < plen)
2327 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2330 if (tb[RTA_PREFSRC])
2331 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2334 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2336 if (tb[RTA_PRIORITY])
2337 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2339 if (tb[RTA_METRICS]) {
2340 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2341 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2345 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the message and delete the route. */
2352 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2354 struct fib6_config cfg;
2357 err = rtm_to_fib6_config(skb, nlh, &cfg);
2361 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the message and add the route. */
2364 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2366 struct fib6_config cfg;
2369 err = rtm_to_fib6_config(skb, nlh, &cfg);
2373 return ip6_route_add(&cfg);
/*
 * Worst-case netlink message size for one route, used to size the skb
 * in inet6_rt_notify(); must cover every attribute rt6_fill_node() may
 * emit (a mismatch triggers the -EMSGSIZE WARN there).
 */
2376 static inline size_t rt6_nlmsg_size(void)
2378 return NLMSG_ALIGN(sizeof(struct rtmsg))
2379 + nla_total_size(16) /* RTA_SRC */
2380 + nla_total_size(16) /* RTA_DST */
2381 + nla_total_size(16) /* RTA_GATEWAY */
2382 + nla_total_size(16) /* RTA_PREFSRC */
2383 + nla_total_size(4) /* RTA_TABLE */
2384 + nla_total_size(4) /* RTA_IIF */
2385 + nla_total_size(4) /* RTA_OIF */
2386 + nla_total_size(4) /* RTA_PRIORITY */
2387 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2388 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialize @rt into an RTM netlink message on @skb.
 * @dst/@src: when non-NULL, report these exact addresses (plen forced to
 *            128) instead of the route's own prefixes — used by getroute.
 * @iif:      input interface to report (getroute on input path).
 * @prefix:   when set, emit only RTF_PREFIX_RT routes (dump filtering);
 *            other routes are skipped successfully.
 * @nowait:   passed through to ip6mr_get_route() for multicast routes.
 * Returns nlmsg_end() length on success, negative on failure (the
 * -EMSGSIZE/goto labels are partially missing from this excerpt).
 */
2391 static int rt6_fill_node(struct net *net,
2392 struct sk_buff *skb, struct rt6_info *rt,
2393 struct in6_addr *dst, struct in6_addr *src,
2394 int iif, int type, u32 pid, u32 seq,
2395 int prefix, int nowait, unsigned int flags)
2397 const struct inet_peer *peer;
2399 struct nlmsghdr *nlh;
2402 struct neighbour *n;
2405 if (prefix) { /* user wants prefix routes only */
2406 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2407 /* success since this is not a prefix route */
2412 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2416 rtm = nlmsg_data(nlh);
2417 rtm->rtm_family = AF_INET6;
2418 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2419 rtm->rtm_src_len = rt->rt6i_src.plen;
2422 table = rt->rt6i_table->tb6_id;
2424 table = RT6_TABLE_UNSPEC;
2425 rtm->rtm_table = table;
2426 if (nla_put_u32(skb, RTA_TABLE, table))
2427 goto nla_put_failure;
2428 if (rt->rt6i_flags & RTF_REJECT)
2429 rtm->rtm_type = RTN_UNREACHABLE;
2430 else if (rt->rt6i_flags & RTF_LOCAL)
2431 rtm->rtm_type = RTN_LOCAL;
2432 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2433 rtm->rtm_type = RTN_LOCAL;
2435 rtm->rtm_type = RTN_UNICAST;
2437 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2438 rtm->rtm_protocol = rt->rt6i_protocol;
2439 if (rt->rt6i_flags & RTF_DYNAMIC)
2440 rtm->rtm_protocol = RTPROT_REDIRECT;
2441 else if (rt->rt6i_flags & RTF_ADDRCONF)
2442 rtm->rtm_protocol = RTPROT_KERNEL;
2443 else if (rt->rt6i_flags & RTF_DEFAULT)
2444 rtm->rtm_protocol = RTPROT_RA;
2446 if (rt->rt6i_flags & RTF_CACHE)
2447 rtm->rtm_flags |= RTM_F_CLONED;
2450 if (nla_put(skb, RTA_DST, 16, dst))
2451 goto nla_put_failure;
2452 rtm->rtm_dst_len = 128;
2453 } else if (rtm->rtm_dst_len)
2454 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2455 goto nla_put_failure;
2456 #ifdef CONFIG_IPV6_SUBTREES
2458 if (nla_put(skb, RTA_SRC, 16, src))
2459 goto nla_put_failure;
2460 rtm->rtm_src_len = 128;
2461 } else if (rtm->rtm_src_len &&
2462 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2463 goto nla_put_failure;
2466 #ifdef CONFIG_IPV6_MROUTE
2467 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2468 int err = ip6mr_get_route(net, skb, rtm, nowait);
2473 goto nla_put_failure;
2475 if (err == -EMSGSIZE)
2476 goto nla_put_failure;
2481 if (nla_put_u32(skb, RTA_IIF, iif))
2482 goto nla_put_failure;
2484 struct in6_addr saddr_buf;
2485 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2486 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2487 goto nla_put_failure;
2490 if (rt->rt6i_prefsrc.plen) {
2491 struct in6_addr saddr_buf;
2492 saddr_buf = rt->rt6i_prefsrc.addr;
2493 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2494 goto nla_put_failure;
2497 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2498 goto nla_put_failure;
2501 n = dst_get_neighbour_noref(&rt->dst);
2503 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2505 goto nla_put_failure;
2511 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2512 goto nla_put_failure;
2513 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2514 goto nla_put_failure;
2515 if (!(rt->rt6i_flags & RTF_EXPIRES))
2517 else if (rt->dst.expires - jiffies < INT_MAX)
2518 expires = rt->dst.expires - jiffies;
2522 peer = rt->rt6i_peer;
2524 if (peer && peer->tcp_ts_stamp) {
2526 tsage = get_seconds() - peer->tcp_ts_stamp;
2529 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2530 expires, rt->dst.error) < 0)
2531 goto nla_put_failure;
2533 return nlmsg_end(skb, nlh);
2536 nlmsg_cancel(skb, nlh);
/*
 * Per-route callback for RTM_GETROUTE dumps: honour the RTM_F_PREFIX
 * filter from the request header and emit the route as a multipart
 * RTM_NEWROUTE message.
 */
2540 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2542 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2545 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2546 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2547 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2551 return rt6_fill_node(arg->net,
2552 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2553 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2554 prefix, 0, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: build a flowi6 from the request's src/dst/iif/oif
 * attributes, resolve it through the input or output path as appropriate,
 * render the result with rt6_fill_node() and unicast it back to the
 * requester.  Several error-path lines are not visible in this excerpt.
 */
2557 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2559 struct net *net = sock_net(in_skb->sk);
2560 struct nlattr *tb[RTA_MAX+1];
2561 struct rt6_info *rt;
2562 struct sk_buff *skb;
2565 int err, iif = 0, oif = 0;
2567 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2572 memset(&fl6, 0, sizeof(fl6));
2575 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2578 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2582 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2585 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2589 iif = nla_get_u32(tb[RTA_IIF]);
2592 oif = nla_get_u32(tb[RTA_OIF]);
2595 struct net_device *dev;
2598 dev = __dev_get_by_index(net, iif);
2604 fl6.flowi6_iif = iif;
2606 if (!ipv6_addr_any(&fl6.saddr))
2607 flags |= RT6_LOOKUP_F_HAS_SADDR;
2609 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2612 fl6.flowi6_oif = oif;
2614 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2617 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2619 dst_release(&rt->dst);
2624 /* Reserve room for dummy headers, this skb can pass
2625 through good chunk of routing engine.
2627 skb_reset_mac_header(skb);
2628 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2630 skb_dst_set(skb, &rt->dst);
2632 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2633 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2634 nlh->nlmsg_seq, 0, 0, 0);
2640 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group; on failure, record the error on
 * listening sockets via rtnl_set_sk_err().
 */
2645 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2647 struct sk_buff *skb;
2648 struct net *net = info->nl_net;
2653 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2655 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2659 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2660 event, info->pid, seq, 0, 0, 0);
2662 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2663 WARN_ON(err == -EMSGSIZE);
2667 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2668 info->nlh, gfp_any());
2672 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * netdev notifier: when the per-netns loopback device registers, attach
 * it (device pointer + inet6_dev reference) to the special null,
 * prohibit and blackhole dst templates so they have a valid device.
 */
2675 static int ip6_route_dev_notify(struct notifier_block *this,
2676 unsigned long event, void *data)
2678 struct net_device *dev = (struct net_device *)data;
2679 struct net *net = dev_net(dev);
2681 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2682 net->ipv6.ip6_null_entry->dst.dev = dev;
2683 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2684 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2685 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2686 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2687 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2688 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2699 #ifdef CONFIG_PROC_FS
/*
 * /proc/net/ipv6_route formatter: one line per route — dst/plen,
 * src/plen (zeros without subtrees), next hop (zeros when no neighbour),
 * then metric, refcnt, use count, flags and device name.
 */
2710 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2712 struct seq_file *m = p_arg;
2713 struct neighbour *n;
2715 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2717 #ifdef CONFIG_IPV6_SUBTREES
2718 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2720 seq_puts(m, "00000000000000000000000000000000 00 ");
2723 n = dst_get_neighbour_noref(&rt->dst);
2725 seq_printf(m, "%pi6", n->primary_key);
2727 seq_puts(m, "00000000000000000000000000000000");
2730 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2731 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2732 rt->dst.__use, rt->rt6i_flags,
2733 rt->dst.dev ? rt->dst.dev->name : "");
/* seq_file show: walk all FIB entries read-only, printing each route. */
2737 static int ipv6_route_show(struct seq_file *m, void *v)
2739 struct net *net = (struct net *)m->private;
2740 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/* open() for /proc/net/ipv6_route (netns-aware single_open). */
2744 static int ipv6_route_open(struct inode *inode, struct file *file)
2746 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route (.read line not visible). */
2749 static const struct file_operations ipv6_route_proc_fops = {
2750 .owner = THIS_MODULE,
2751 .open = ipv6_route_open,
2753 .llseek = seq_lseek,
2754 .release = single_release_net,
/* /proc/net/rt6_stats: dump the per-netns fib6 statistics counters. */
2757 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2759 struct net *net = (struct net *)seq->private;
2760 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2761 net->ipv6.rt6_stats->fib_nodes,
2762 net->ipv6.rt6_stats->fib_route_nodes,
2763 net->ipv6.rt6_stats->fib_rt_alloc,
2764 net->ipv6.rt6_stats->fib_rt_entries,
2765 net->ipv6.rt6_stats->fib_rt_cache,
2766 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2767 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats (netns-aware single_open). */
2772 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2774 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats (.read line not visible). */
2777 static const struct file_operations rt6_stats_seq_fops = {
2778 .owner = THIS_MODULE,
2779 .open = rt6_stats_seq_open,
2781 .llseek = seq_lseek,
2782 .release = single_release_net,
2784 #endif /* CONFIG_PROC_FS */
2786 #ifdef CONFIG_SYSCTL
/*
 * Handler for net.ipv6.route.flush: reading/writing the sysctl triggers
 * a garbage-collection pass over the routing cache after flush_delay
 * (non-positive delay means flush everything immediately).
 */
2789 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2790 void __user *buffer, size_t *lenp, loff_t *ppos)
2797 net = (struct net *)ctl->extra1;
2798 delay = net->ipv6.sysctl.flush_delay;
2799 proc_dointvec(ctl, write, buffer, lenp, ppos);
2800 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-netns net.ipv6.route.* sysctl table; cloned and
 * re-pointed at netns-local data in ipv6_route_sysctl_init().  The .mode
 * lines are not visible in this excerpt.
 */
2804 ctl_table ipv6_route_table_template[] = {
2806 .procname = "flush",
2807 .data = &init_net.ipv6.sysctl.flush_delay,
2808 .maxlen = sizeof(int),
2810 .proc_handler = ipv6_sysctl_rtcache_flush
2813 .procname = "gc_thresh",
2814 .data = &ip6_dst_ops_template.gc_thresh,
2815 .maxlen = sizeof(int),
2817 .proc_handler = proc_dointvec,
2820 .procname = "max_size",
2821 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2822 .maxlen = sizeof(int),
2824 .proc_handler = proc_dointvec,
2827 .procname = "gc_min_interval",
2828 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2829 .maxlen = sizeof(int),
2831 .proc_handler = proc_dointvec_jiffies,
2834 .procname = "gc_timeout",
2835 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2836 .maxlen = sizeof(int),
2838 .proc_handler = proc_dointvec_jiffies,
2841 .procname = "gc_interval",
2842 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2843 .maxlen = sizeof(int),
2845 .proc_handler = proc_dointvec_jiffies,
2848 .procname = "gc_elasticity",
2849 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2850 .maxlen = sizeof(int),
2852 .proc_handler = proc_dointvec,
2855 .procname = "mtu_expires",
2856 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2857 .maxlen = sizeof(int),
2859 .proc_handler = proc_dointvec_jiffies,
2862 .procname = "min_adv_mss",
2863 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2864 .maxlen = sizeof(int),
2866 .proc_handler = proc_dointvec,
2869 .procname = "gc_min_interval_ms",
2870 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2871 .maxlen = sizeof(int),
2873 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Clone the sysctl template for a new netns and retarget each entry's
 * .data at the netns-local copy of the variable.  The index->field
 * mapping must stay in sync with ipv6_route_table_template ordering.
 */
2878 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2880 struct ctl_table *table;
2882 table = kmemdup(ipv6_route_table_template,
2883 sizeof(ipv6_route_table_template),
2887 table[0].data = &net->ipv6.sysctl.flush_delay;
2888 table[0].extra1 = net;
2889 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2890 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2891 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2892 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2893 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2894 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2895 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2896 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2897 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-netns init: clone the dst_ops template, allocate the null entry
 * (and, with multiple tables, the prohibit and blackhole entries), seed
 * the routing sysctl defaults, and register the proc files.  Unwinds
 * each allocation via the goto labels on failure.
 */
2904 static int __net_init ip6_route_net_init(struct net *net)
2908 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2909 sizeof(net->ipv6.ip6_dst_ops));
2911 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2912 goto out_ip6_dst_ops;
2914 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2915 sizeof(*net->ipv6.ip6_null_entry),
2917 if (!net->ipv6.ip6_null_entry)
2918 goto out_ip6_dst_entries;
2919 net->ipv6.ip6_null_entry->dst.path =
2920 (struct dst_entry *)net->ipv6.ip6_null_entry;
2921 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2922 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2923 ip6_template_metrics, true);
2925 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2926 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2927 sizeof(*net->ipv6.ip6_prohibit_entry),
2929 if (!net->ipv6.ip6_prohibit_entry)
2930 goto out_ip6_null_entry;
2931 net->ipv6.ip6_prohibit_entry->dst.path =
2932 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2933 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2934 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2935 ip6_template_metrics, true);
2937 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2938 sizeof(*net->ipv6.ip6_blk_hole_entry),
2940 if (!net->ipv6.ip6_blk_hole_entry)
2941 goto out_ip6_prohibit_entry;
2942 net->ipv6.ip6_blk_hole_entry->dst.path =
2943 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2944 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2945 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2946 ip6_template_metrics, true);
2949 net->ipv6.sysctl.flush_delay = 0;
2950 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2951 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2952 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2953 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2954 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2955 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2956 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2958 #ifdef CONFIG_PROC_FS
2959 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2960 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2962 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2968 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2969 out_ip6_prohibit_entry:
2970 kfree(net->ipv6.ip6_prohibit_entry);
2972 kfree(net->ipv6.ip6_null_entry);
2974 out_ip6_dst_entries:
2975 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-network-namespace teardown: remove the /proc entries and free the
 * template routes allocated by ip6_route_net_init(), then release the
 * dst entry counter.  Order mirrors init in reverse.
 *
 * NOTE(review): the #endif lines closing the CONFIG_PROC_FS and
 * CONFIG_IPV6_MULTIPLE_TABLES sections are elided in this excerpt.
 */
2980 static void __net_exit ip6_route_net_exit(struct net *net)
2982 #ifdef CONFIG_PROC_FS
2983 proc_net_remove(net, "ipv6_route");
2984 proc_net_remove(net, "rt6_stats")
2986 kfree(net->ipv6.ip6_null_entry);
2987 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2988 kfree(net->ipv6.ip6_prohibit_entry);
2989 kfree(net->ipv6.ip6_blk_hole_entry);
2991 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns lifecycle hooks for the IPv6 routing core. */
2994 static struct pernet_operations ip6_route_net_ops = {
2995 .init = ip6_route_net_init,
2996 .exit = ip6_route_net_exit,
/*
 * Allocate and install the per-netns inet_peer base used for IPv6
 * peer bookkeeping (net->ipv6.peers).
 *
 * NOTE(review): the NULL check on kmalloc() and the return statements
 * are elided in this excerpt — confirm against the full source that
 * allocation failure returns -ENOMEM before bp is used.
 */
2999 static int __net_init ipv6_inetpeer_init(struct net *net)
3001 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3005 inet_peer_base_init(bp);
3006 net->ipv6.peers = bp;
/*
 * Tear down the per-netns inet_peer base: detach it from the netns
 * first (so it is no longer reachable), then invalidate the peer tree.
 *
 * NOTE(review): the kfree(bp) presumably following the invalidation is
 * elided in this excerpt.
 */
3010 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3012 struct inet_peer_base *bp = net->ipv6.peers;
3014 net->ipv6.peers = NULL;
3015 __inetpeer_invalidate_tree(bp);
/* Per-netns lifecycle hooks for the IPv6 inet_peer base. */
3019 static __net_initdata struct pernet_operations ipv6_inetpeer_ops = {
3020 .init = ipv6_inetpeer_init,
3021 .exit = ipv6_inetpeer_exit,
/*
 * Netdevice event notifier; dispatches to ip6_route_dev_notify().
 * NOTE(review): any .priority initializer is elided in this excerpt.
 */
3024 static struct notifier_block ip6_route_dev_notifier = {
3025 .notifier_call = ip6_route_dev_notify,
/*
 * Module/boot-time initialisation of the IPv6 routing subsystem:
 * creates the rt6_info slab cache, registers the per-netns operations,
 * wires the init_net template routes to the loopback device, installs
 * the rtnetlink route handlers and the netdevice notifier.  Returns 0
 * on success; on failure unwinds the completed steps via goto labels.
 *
 * NOTE(review): this excerpt elides several original lines (the error
 * checks after each call, fib6/xfrm6 init steps, and some labels/
 * returns) — comments below describe only what is visible here.
 */
3029 int __init ip6_route_init(void)
/* Slab cache for rt6_info entries; shared by all namespaces. */
3034 ip6_dst_ops_template.kmem_cachep =
3035 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3036 SLAB_HWCACHE_ALIGN, NULL);
3037 if (!ip6_dst_ops_template.kmem_cachep)
3040 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3042 goto out_kmem_cache;
3044 ret = register_pernet_subsys(&ip6_route_net_ops);
3046 goto out_dst_entries;
3048 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3050 goto out_register_subsys;
/* Blackhole dsts come from the same slab as regular routes. */
3052 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3054 /* Registering of the loopback is done before this portion of code,
3055 * the loopback reference in rt6_info will not be taken, do it
3056 * manually for init_net */
3057 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3058 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3059 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3060 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3061 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3062 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3063 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3067 goto out_register_inetpeer;
3073 ret = fib6_rules_init();
/* Route add/del/get netlink message handlers. */
3078 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3079 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3080 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3081 goto fib6_rules_init;
3083 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3085 goto fib6_rules_init;
/* Error unwind: undo the completed registrations in reverse order. */
3091 fib6_rules_cleanup();
3096 out_register_inetpeer:
3097 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3098 out_register_subsys:
3099 unregister_pernet_subsys(&ip6_route_net_ops);
3101 dst_entries_destroy(&ip6_dst_blackhole_ops);
3103 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3107 void ip6_route_cleanup(void)
3109 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3110 fib6_rules_cleanup();
3113 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3114 unregister_pernet_subsys(&ip6_route_net_ops);
3115 dst_entries_destroy(&ip6_dst_blackhole_ops);
3116 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);