2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
112 #include <net/secure_seq.h>
113 #include <net/ip_tunnels.h>
114 #include <net/l3mdev.h>
116 #include "fib_lookup.h"
/* Routing-cache tunables and helper macros.
 * NOTE(review): this dump is line-sampled and number-prefixed; intermediate
 * lines are missing, so code below is kept byte-identical.
 */
/* Mask a flow's TOS down to the routable bits plus the ONLINK flag. */
118 #define RT_FL_TOS(oldflp4) \
119 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
/* Default garbage-collection timeout for cached exceptions: 5 minutes. */
121 #define RT_GC_TIMEOUT (300*HZ)
123 static int ip_rt_max_size;
/* Redirect rate limiting: at most 9 redirects, exponential backoff starting
 * at HZ/50, then silence for (HZ/50) << 10 before the counter resets. */
124 static int ip_rt_redirect_number __read_mostly = 9;
125 static int ip_rt_redirect_load __read_mostly = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
/* ICMP error token-bucket parameters (cost per error, burst capacity). */
127 static int ip_rt_error_cost __read_mostly = HZ;
128 static int ip_rt_error_burst __read_mostly = 5 * HZ;
/* Learned PMTU entries expire after 10 minutes. */
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
/* Floor for learned PMTU: 512 payload + 20 IP + 20 TCP header bytes. */
130 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly = 256;
133 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
136 * Interface to generic destination cache.
/* Forward declarations for the dst_ops callbacks wired up below. */
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void ipv4_link_failure(struct sk_buff *skb);
144 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb, u32 mtu);
146 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
/* The IPv4 implementation of the generic destination-cache operations.
 * NOTE(review): some member initializers (e.g. .mtu) are missing from this
 * sampled dump — do not treat the table as exhaustive. */
161 static struct dst_ops ipv4_dst_ops = {
163 .check = ipv4_dst_check,
164 .default_advmss = ipv4_default_advmss,
166 .cow_metrics = ipv4_cow_metrics,
167 .destroy = ipv4_dst_destroy,
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
171 .redirect = ip_do_redirect,
172 .local_out = __ip_local_out,
173 .neigh_lookup = ipv4_neigh_lookup,
174 .confirm_neigh = ipv4_confirm_neigh,
/* Map the 4 TOS bits of an IP header to a traffic-control priority class.
 * ECN_OR_COST picks the TC_PRIO_* class for entries whose low (ECN) bit is
 * set.  NOTE(review): several table entries are missing from this sampled
 * dump; the full table has 16 entries indexed by TOS >> 1. */
177 #define ECN_OR_COST(class) TC_PRIO_##class
179 const __u8 ip_tos2prio[16] = {
181 ECN_OR_COST(BESTEFFORT),
183 ECN_OR_COST(BESTEFFORT),
189 ECN_OR_COST(INTERACTIVE),
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
197 EXPORT_SYMBOL(ip_tos2prio);
/* Per-cpu routing-cache statistics, bumped lock-free via raw_cpu_inc(). */
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
202 #ifdef CONFIG_PROC_FS
/* seq_file handlers for /proc/net/rt_cache.  Since the per-dst route cache
 * was removed, only the header token is ever produced — the file is kept
 * for ABI compatibility and shows just the column header line. */
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 return SEQ_START_TOKEN;
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
222 if (v == SEQ_START_TOKEN)
223 seq_printf(seq, "%-127s\n",
224 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
230 static const struct seq_operations rt_cache_seq_ops = {
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
239 return seq_open(file, &rt_cache_seq_ops);
242 static const struct file_operations rt_cache_seq_fops = {
243 .open = rt_cache_seq_open,
246 .release = seq_release,
/* seq_file handlers for /proc/net/stat/rt_cache: one row of per-cpu
 * rt_cache_stat counters per possible CPU, preceded by a header line.
 * Iteration skips CPU ids that are not possible; *pos-1 in start vs *pos
 * in next accounts for the SEQ_START_TOKEN occupying position 0. */
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 return SEQ_START_TOKEN;
257 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 if (!cpu_possible(cpu))
261 return &per_cpu(rt_cache_stat, cpu);
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 if (!cpu_possible(cpu))
274 return &per_cpu(rt_cache_stat, cpu);
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 struct rt_cache_stat *st = v;
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
/* GC-era counters are gone; zeros are printed to preserve the format. */
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 return seq_open(file, &rt_cpu_seq_ops);
332 static const struct file_operations rt_cpu_seq_fops = {
333 .open = rt_cpu_seq_open,
336 .release = seq_release,
339 #ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: sum the per-cpu ip_rt_acct byte/packet counters for
 * all 256 realms into a temporary array and emit it as raw binary. */
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 struct ip_rt_acct *dst, *src;
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
/* Per-namespace creation of the three proc entries above; error paths
 * (elided in this dump) unwind the entries created so far. */
365 static int __net_init ip_rt_do_proc_init(struct net *net)
367 struct proc_dir_entry *pde;
369 pde = proc_create("rt_cache", 0444, net->proc_net,
374 pde = proc_create("rt_cache", 0444,
375 net->proc_net_stat, &rt_cpu_seq_fops);
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
387 #ifdef CONFIG_IP_ROUTE_CLASSID
389 remove_proc_entry("rt_cache", net->proc_net_stat);
392 remove_proc_entry("rt_cache", net->proc_net);
/* Mirror of the init: remove everything ip_rt_do_proc_init created. */
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 remove_proc_entry("rt_acct", net->proc_net);
406 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
407 .init = ip_rt_do_proc_init,
408 .exit = ip_rt_do_proc_exit,
411 static int __init ip_rt_proc_init(void)
413 return register_pernet_subsys(&ip_rt_proc_ops);
/* !CONFIG_PROC_FS stub: nothing to register. */
417 static inline int ip_rt_proc_init(void)
421 #endif /* CONFIG_PROC_FS */
/* A cached route is stale once its generation id no longer matches the
 * namespace-wide IPv4 genid; rt_cache_flush invalidates everything at
 * once simply by bumping that genid. */
423 static inline bool rt_is_expired(const struct rtable *rth)
425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
428 void rt_cache_flush(struct net *net)
430 rt_genid_bump_ipv4(net);
/* dst_ops->neigh_lookup for IPv4: resolve the next hop's neighbour entry.
 * Key selection (partially elided here): the route's gateway when one is
 * used, else the packet's destination address; falls back to creating a
 * new ARP entry when no cached neighbour exists. */
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
437 struct net_device *dev = dst->dev;
438 const __be32 *pkey = daddr;
439 const struct rtable *rt;
442 rt = (const struct rtable *) dst;
444 pkey = (const __be32 *) &rt->rt_gateway;
446 pkey = &ip_hdr(skb)->daddr;
448 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
451 return neigh_create(&arp_tbl, pkey, dev);
/* dst_ops->confirm_neigh: refresh reachability confirmation for the next
 * hop, skipped for multicast/broadcast/local routes (visible in the mask
 * test below). */
454 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
456 struct net_device *dev = dst->dev;
457 const __be32 *pkey = daddr;
458 const struct rtable *rt;
460 rt = (const struct rtable *)dst;
462 pkey = (const __be32 *)&rt->rt_gateway;
465 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
468 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
/* IP-ID generator: a hash-indexed array of 2048 atomic counters plus a
 * parallel array of last-use timestamps. */
471 #define IP_IDENTS_SZ 2048u
473 static atomic_t *ip_idents __read_mostly;
474 static u32 *ip_tstamps __read_mostly;
476 /* In order to protect privacy, we add a perturbation to identifiers
477 * if one generator is seldom used. This makes hard for an attacker
478 * to infer how many packets were sent between two points in time.
/* Reserve 'segs' consecutive IDs from the bucket selected by 'hash' and
 * return the first.  When the bucket was idle (timestamp changed), a
 * random delta bounded by the idle time is mixed in (privacy, per the
 * comment above).  The cmpxchg on the timestamp ensures only one CPU
 * applies the perturbation; the atomic_cmpxchg loop replaces
 * atomic_add_return to keep UBSAN quiet about intentional wraparound. */
480 u32 ip_idents_reserve(u32 hash, int segs)
482 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
483 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
484 u32 old = READ_ONCE(*p_tstamp);
485 u32 now = (u32)jiffies;
488 if (old != now && cmpxchg(p_tstamp, old, now) == old)
489 delta = prandom_u32_max(now - old);
491 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
493 old = (u32)atomic_read(p_id);
494 new = old + delta + segs;
495 } while (atomic_cmpxchg(p_id, old, new) != old);
499 EXPORT_SYMBOL(ip_idents_reserve);
/* Pick the IP-ID for an outgoing header: hash (daddr, saddr, protocol
 * mixed with the netns) with a boot-time random key, then reserve IDs for
 * all 'segs' segments of the packet. */
501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
503 static u32 ip_idents_hashrnd __read_mostly;
506 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
508 hash = jhash_3words((__force u32)iph->daddr,
509 (__force u32)iph->saddr,
510 iph->protocol ^ net_hash_mix(net),
512 id = ip_idents_reserve(hash, segs);
515 EXPORT_SYMBOL(__ip_select_ident);
/* Fill a flowi4 key from an IP header, optionally overriding oif/tos/prot
 * from the socket when one is supplied (sk-derived branch partially
 * elided in this dump).  For raw sockets with hdrincl the protocol is
 * forced to IPPROTO_RAW. */
517 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
518 const struct sock *sk,
519 const struct iphdr *iph,
521 u8 prot, u32 mark, int flow_flags)
524 const struct inet_sock *inet = inet_sk(sk);
526 oif = sk->sk_bound_dev_if;
528 tos = RT_CONN_FLAGS(sk);
529 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
531 flowi4_init_output(fl4, oif, mark, tos,
532 RT_SCOPE_UNIVERSE, prot,
534 iph->daddr, iph->saddr, 0, 0,
535 sock_net_uid(net, sk));
/* Derive the flow key from a received skb (iif, tos, proto, mark taken
 * from the packet/device). */
538 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
539 const struct sock *sk)
541 const struct net *net = dev_net(skb->dev);
542 const struct iphdr *iph = ip_hdr(skb);
543 int oif = skb->dev->ifindex;
544 u8 tos = RT_TOS(iph->tos);
545 u8 prot = iph->protocol;
546 u32 mark = skb->mark;
548 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
/* Derive the flow key purely from a connected socket; with a strict
 * source-route option the first hop (faddr) replaces the destination. */
551 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
553 const struct inet_sock *inet = inet_sk(sk);
554 const struct ip_options_rcu *inet_opt;
555 __be32 daddr = inet->inet_daddr;
558 inet_opt = rcu_dereference(inet->inet_opt);
559 if (inet_opt && inet_opt->opt.srr)
560 daddr = inet_opt->opt.faddr;
561 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
562 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
563 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
564 inet_sk_flowi_flags(sk),
565 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
/* Dispatch: prefer the skb-based key when a packet is available,
 * otherwise fall back to the socket-based key. */
569 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
570 const struct sk_buff *skb)
573 build_skb_flow_key(fl4, skb, sk);
575 build_sk_flow_key(fl4, sk);
/* Single global lock serializing all next-hop-exception (fnhe) updates. */
578 static DEFINE_SPINLOCK(fnhe_lock);
/* Drop both cached routes (input and output) hanging off an exception:
 * clear the RCU pointer, then release the dst's device ref and refcount. */
580 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
584 rt = rcu_dereference(fnhe->fnhe_rth_input);
586 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
587 dst_dev_put(&rt->dst);
588 dst_release(&rt->dst);
590 rt = rcu_dereference(fnhe->fnhe_rth_output);
592 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
593 dst_dev_put(&rt->dst);
594 dst_release(&rt->dst);
/* Walk one hash chain and return the entry with the oldest fnhe_stamp,
 * flushing its cached routes so it can be reused for a new exception. */
598 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
600 struct fib_nh_exception *fnhe, *oldest;
602 oldest = rcu_dereference(hash->chain);
603 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
604 fnhe = rcu_dereference(fnhe->fnhe_next)) {
605 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
608 fnhe_flush_routes(oldest);
/* Hash a destination address into the fnhe table using a boot-time
 * random key (keyed jhash, then folded to FNHE_HASH_SHIFT bits). */
612 static inline u32 fnhe_hashfun(__be32 daddr)
614 static u32 fnhe_hashrnd __read_mostly;
617 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
618 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
619 return hash_32(hval, FNHE_HASH_SHIFT);
/* Copy exception state (PMTU, lock flag, expiry, and — when present —
 * the redirect gateway) into a cached route. */
622 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
624 rt->rt_pmtu = fnhe->fnhe_pmtu;
625 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
626 rt->dst.expires = fnhe->fnhe_expires;
629 rt->rt_flags |= RTCF_REDIRECTED;
630 rt->rt_gateway = fnhe->fnhe_gw;
631 rt->rt_uses_gateway = 1;
/* Record a PMTU/redirect exception for (nh, daddr) under fnhe_lock.
 * Allocates the hash table lazily; on a hit updates the existing entry
 * (and any dsts cached on it), otherwise inserts a new one — recycling
 * the oldest entry when the chain exceeds FNHE_RECLAIM_DEPTH — and
 * marks the nexthop's cached input/per-cpu output routes OBSOLETE_KILL
 * so users re-lookup and pick up the exception. */
635 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
636 u32 pmtu, bool lock, unsigned long expires)
638 struct fnhe_hash_bucket *hash;
639 struct fib_nh_exception *fnhe;
645 genid = fnhe_genid(dev_net(nh->nh_dev));
646 hval = fnhe_hashfun(daddr);
648 spin_lock_bh(&fnhe_lock);
650 hash = rcu_dereference(nh->nh_exceptions);
652 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
655 rcu_assign_pointer(nh->nh_exceptions, hash);
661 for (fnhe = rcu_dereference(hash->chain); fnhe;
662 fnhe = rcu_dereference(fnhe->fnhe_next)) {
663 if (fnhe->fnhe_daddr == daddr)
669 if (fnhe->fnhe_genid != genid)
670 fnhe->fnhe_genid = genid;
674 fnhe->fnhe_pmtu = pmtu;
675 fnhe->fnhe_mtu_locked = lock;
/* max(1UL, ...) keeps a nonzero expiry so "0" stays "no expiry". */
677 fnhe->fnhe_expires = max(1UL, expires);
678 /* Update all cached dsts too */
679 rt = rcu_dereference(fnhe->fnhe_rth_input);
681 fill_route_from_fnhe(rt, fnhe);
682 rt = rcu_dereference(fnhe->fnhe_rth_output);
684 fill_route_from_fnhe(rt, fnhe);
686 if (depth > FNHE_RECLAIM_DEPTH)
687 fnhe = fnhe_oldest(hash);
689 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
693 fnhe->fnhe_next = hash->chain;
694 rcu_assign_pointer(hash->chain, fnhe);
696 fnhe->fnhe_genid = genid;
697 fnhe->fnhe_daddr = daddr;
699 fnhe->fnhe_pmtu = pmtu;
700 fnhe->fnhe_mtu_locked = lock;
701 fnhe->fnhe_expires = max(1UL, expires);
703 /* Exception created; mark the cached routes for the nexthop
704 * stale, so anyone caching it rechecks if this exception
707 rt = rcu_dereference(nh->nh_rth_input);
709 rt->dst.obsolete = DST_OBSOLETE_KILL;
711 for_each_possible_cpu(i) {
712 struct rtable __rcu **prt;
713 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
714 rt = rcu_dereference(*prt);
716 rt->dst.obsolete = DST_OBSOLETE_KILL;
720 fnhe->fnhe_stamp = jiffies;
723 spin_unlock_bh(&fnhe_lock);
/* Process an ICMP redirect: validate the advertised gateway (must come
 * from the current gateway, be a sane unicast/on-link address, pass the
 * device's redirect policy), resolve it via ARP, and when the neighbour
 * is valid record it as a next-hop exception and kill the stale route.
 * Rejected redirects are rate-limit logged when route verbosity is on. */
726 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
729 __be32 new_gw = icmp_hdr(skb)->un.gateway;
730 __be32 old_gw = ip_hdr(skb)->saddr;
731 struct net_device *dev = skb->dev;
732 struct in_device *in_dev;
733 struct fib_result res;
/* Only the four network/host (+TOS) redirect codes are acted upon. */
737 switch (icmp_hdr(skb)->code & 7) {
739 case ICMP_REDIR_NETTOS:
740 case ICMP_REDIR_HOST:
741 case ICMP_REDIR_HOSTTOS:
/* A redirect is only trusted if sent by our current gateway. */
748 if (rt->rt_gateway != old_gw)
751 in_dev = __in_dev_get_rcu(dev);
756 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
757 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
758 ipv4_is_zeronet(new_gw))
759 goto reject_redirect;
761 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
762 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
763 goto reject_redirect;
764 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
765 goto reject_redirect;
767 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
768 goto reject_redirect;
771 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
773 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
/* Gateway not yet resolved: kick ARP instead of installing it now. */
775 if (!(n->nud_state & NUD_VALID)) {
776 neigh_event_send(n, NULL);
778 if (fib_lookup(net, fl4, &res, 0) == 0) {
779 struct fib_nh *nh = &FIB_RES_NH(res);
781 update_or_create_fnhe(nh, fl4->daddr, new_gw,
783 jiffies + ip_rt_gc_timeout);
786 rt->dst.obsolete = DST_OBSOLETE_KILL;
787 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
794 #ifdef CONFIG_IP_ROUTE_VERBOSE
795 if (IN_DEV_LOG_MARTIANS(in_dev)) {
796 const struct iphdr *iph = (const struct iphdr *) skb->data;
797 __be32 daddr = iph->daddr;
798 __be32 saddr = iph->saddr;
800 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
801 " Advised path = %pI4 -> %pI4\n",
802 &old_gw, dev->name, &new_gw,
/* dst_ops->redirect: rebuild the flow key from the offending packet and
 * delegate to __ip_do_redirect (with kill-route semantics). */
809 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
813 const struct iphdr *iph = (const struct iphdr *) skb->data;
814 struct net *net = dev_net(skb->dev);
815 int oif = skb->dev->ifindex;
816 u8 tos = RT_TOS(iph->tos);
817 u8 prot = iph->protocol;
818 u32 mark = skb->mark;
820 rt = (struct rtable *) dst;
822 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
823 __ip_do_redirect(rt, skb, &fl4, true);
/* dst_ops->negative_advice: a socket reports trouble with this route;
 * drop it when it is obsolete or was installed by a redirect (full
 * decision logic partially elided in this dump). */
826 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
828 struct rtable *rt = (struct rtable *)dst;
829 struct dst_entry *ret = dst;
832 if (dst->obsolete > 0) {
835 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
846 * 1. The first ip_rt_redirect_number redirects are sent
847 * with exponential backoff, then we stop sending them at all,
848 * assuming that the host ignores our redirects.
849 * 2. If we did not see packets requiring redirects
850 * during ip_rt_redirect_silence, we assume that the host
851 * forgot redirected route and start to send redirects again.
853 * This algorithm is much cheaper and more intelligent than dumb load limiting
856 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
857 * and "frag. need" (breaks PMTU discovery) in icmp.c.
/* Send an ICMP redirect for a forwarded packet, rate limited per source
 * peer using the token scheme described in the comment above.  Without a
 * peer entry a single unthrottled redirect is sent. */
860 void ip_rt_send_redirect(struct sk_buff *skb)
862 struct rtable *rt = skb_rtable(skb);
863 struct in_device *in_dev;
864 struct inet_peer *peer;
870 in_dev = __in_dev_get_rcu(rt->dst.dev);
871 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
875 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
876 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
879 net = dev_net(rt->dst.dev);
880 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
882 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
883 rt_nexthop(rt, ip_hdr(skb)->daddr));
887 /* No redirected packets during ip_rt_redirect_silence;
888 * reset the algorithm.
890 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
891 peer->rate_tokens = 0;
893 /* Too many ignored redirects; do not send anything
894 * set dst.rate_last to the last seen redirected packet.
896 if (peer->rate_tokens >= ip_rt_redirect_number) {
897 peer->rate_last = jiffies;
901 /* Check for load limit; set rate_last to the latest sent
904 if (peer->rate_tokens == 0 ||
907 (ip_rt_redirect_load << peer->rate_tokens)))) {
908 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
910 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
911 peer->rate_last = jiffies;
913 #ifdef CONFIG_IP_ROUTE_VERBOSE
915 peer->rate_tokens == ip_rt_redirect_number)
916 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
917 &ip_hdr(skb)->saddr, inet_iif(skb),
918 &ip_hdr(skb)->daddr, &gw);
/* Input-path error handler: translate rt->dst.error into an ICMP
 * destination-unreachable code, count the matching SNMP stat, and emit
 * the ICMP error under a per-peer token bucket (ip_rt_error_cost /
 * ip_rt_error_burst).  For L3-master (VRF) devices the original ingress
 * device is recovered from the control block. */
925 static int ip_error(struct sk_buff *skb)
927 struct rtable *rt = skb_rtable(skb);
928 struct net_device *dev = skb->dev;
929 struct in_device *in_dev;
930 struct inet_peer *peer;
936 if (netif_is_l3_master(skb->dev)) {
937 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
942 in_dev = __in_dev_get_rcu(dev);
944 /* IP on this device is disabled. */
948 net = dev_net(rt->dst.dev);
949 if (!IN_DEV_FORWARD(in_dev)) {
950 switch (rt->dst.error) {
952 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
956 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
962 switch (rt->dst.error) {
967 code = ICMP_HOST_UNREACH;
970 code = ICMP_NET_UNREACH;
971 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
974 code = ICMP_PKT_FILTERED;
978 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
979 l3mdev_master_ifindex(skb->dev), 1);
/* Token bucket: accrue tokens with elapsed jiffies, cap at the burst,
 * and spend ip_rt_error_cost per ICMP error actually sent. */
984 peer->rate_tokens += now - peer->rate_last;
985 if (peer->rate_tokens > ip_rt_error_burst)
986 peer->rate_tokens = ip_rt_error_burst;
987 peer->rate_last = now;
988 if (peer->rate_tokens >= ip_rt_error_cost)
989 peer->rate_tokens -= ip_rt_error_cost;
995 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
/* Core PMTU update: ignore locked-MTU routes and non-shrinking updates,
 * clamp to ip_rt_min_pmtu, skip a refresh when the same PMTU is already
 * recorded with plenty of lifetime left, then store the new value as a
 * next-hop exception expiring after ip_rt_mtu_expires. */
1001 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1003 struct dst_entry *dst = &rt->dst;
1004 struct fib_result res;
1007 if (ip_mtu_locked(dst))
1010 if (ipv4_mtu(dst) < mtu)
1013 if (mtu < ip_rt_min_pmtu) {
1015 mtu = ip_rt_min_pmtu;
1018 if (rt->rt_pmtu == mtu &&
1019 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1023 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1024 struct fib_nh *nh = &FIB_RES_NH(res);
1026 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1027 jiffies + ip_rt_mtu_expires);
/* dst_ops->update_pmtu: build the flow key from sk/skb and delegate. */
1032 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1033 struct sk_buff *skb, u32 mtu)
1035 struct rtable *rt = (struct rtable *) dst;
1038 ip_rt_build_flow_key(&fl4, sk, skb);
1039 __ip_rt_update_pmtu(rt, &fl4, mtu);
/* Socket-less PMTU update entry point (used by tunnel/ICMP handlers):
 * route the packet's flow, update PMTU on the resulting route. */
1042 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1043 int oif, u8 protocol)
1045 const struct iphdr *iph = (const struct iphdr *) skb->data;
1048 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1050 __build_flow_key(net, &fl4, NULL, iph, oif,
1051 RT_TOS(iph->tos), protocol, mark, 0);
1052 rt = __ip_route_output_key(net, &fl4);
1054 __ip_rt_update_pmtu(rt, &fl4, mtu);
1058 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
/* Slow path for socket PMTU update: fresh route lookup keyed from the
 * socket (reply mark substituted when the flow has none). */
1060 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1062 const struct iphdr *iph = (const struct iphdr *) skb->data;
1066 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1068 if (!fl4.flowi4_mark)
1069 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1071 rt = __ip_route_output_key(sock_net(sk), &fl4);
1073 __ip_rt_update_pmtu(rt, &fl4, mtu);
/* Update PMTU on the route cached in the socket itself, re-looking the
 * route up when the cached dst is stale (before and after the update),
 * and re-caching the fresh dst unless the socket is owned by user
 * context (then the slow path above is used instead). */
1078 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1080 const struct iphdr *iph = (const struct iphdr *) skb->data;
1083 struct dst_entry *odst = NULL;
1085 struct net *net = sock_net(sk);
1089 if (!ip_sk_accept_pmtu(sk))
1092 odst = sk_dst_get(sk);
1094 if (sock_owned_by_user(sk) || !odst) {
1095 __ipv4_sk_update_pmtu(skb, sk, mtu);
1099 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1101 rt = (struct rtable *)odst;
1102 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1103 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
/* xfrm_dst_path() unwraps any IPsec bundle to the inner route. */
1110 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1112 if (!dst_check(&rt->dst, 0)) {
1114 dst_release(&rt->dst);
1116 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1124 sk_dst_set(sk, &rt->dst);
1130 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
/* Socket-less redirect entry point: route the flow, then apply the
 * redirect without the kill-route flag. */
1132 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1133 int oif, u8 protocol)
1135 const struct iphdr *iph = (const struct iphdr *) skb->data;
1139 __build_flow_key(net, &fl4, NULL, iph, oif,
1140 RT_TOS(iph->tos), protocol, 0, 0);
1141 rt = __ip_route_output_key(net, &fl4);
1143 __ip_do_redirect(rt, skb, &fl4, false);
1147 EXPORT_SYMBOL_GPL(ipv4_redirect);
/* Socket-keyed variant of ipv4_redirect(). */
1149 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1151 const struct iphdr *iph = (const struct iphdr *) skb->data;
1154 struct net *net = sock_net(sk);
1156 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1157 rt = __ip_route_output_key(net, &fl4);
1159 __ip_do_redirect(rt, skb, &fl4, false);
1163 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
/* dst_ops->check: a route is valid only while obsolete stays at
 * DST_OBSOLETE_FORCE_CHK and its genid matches the namespace (see the
 * original comment below); otherwise callers must re-resolve. */
1165 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1167 struct rtable *rt = (struct rtable *) dst;
1169 /* All IPV4 dsts are created with ->obsolete set to the value
1170 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1171 * into this function always.
1173 * When a PMTU/redirect information update invalidates a route,
1174 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1175 * DST_OBSOLETE_DEAD by dst_free().
1177 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
/* dst_ops->link_failure: report host-unreachable to the sender and
 * expire the route immediately. */
1182 static void ipv4_link_failure(struct sk_buff *skb)
1186 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1188 rt = skb_rtable(skb);
1190 dst_set_expires(&rt->dst, 0);
/* Should-never-happen output handler: log the flow and drop. */
1193 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1195 pr_debug("%s: %pI4 -> %pI4, %s\n",
1196 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1197 skb->dev ? skb->dev->name : "?");
1204 We do not cache source address of outgoing interface,
1205 because it is used only by IP RR, TS and SRR options,
1206 so that it out of fast path.
1208 BTW remember: "addr" is allowed to be not aligned
/* Determine the source address our host would use on this route (for IP
 * RR/TS/SRR option processing): the packet's own source for output
 * routes; for input routes, the FIB-preferred source for the reversed
 * flow, falling back to inet_select_addr() toward the next hop.  Result
 * is memcpy'd because 'addr' may be unaligned (see comment above). */
1212 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1216 if (rt_is_output_route(rt))
1217 src = ip_hdr(skb)->saddr;
1219 struct fib_result res;
1225 memset(&fl4, 0, sizeof(fl4));
1226 fl4.daddr = iph->daddr;
1227 fl4.saddr = iph->saddr;
1228 fl4.flowi4_tos = RT_TOS(iph->tos);
1229 fl4.flowi4_oif = rt->dst.dev->ifindex;
1230 fl4.flowi4_iif = skb->dev->ifindex;
1231 fl4.flowi4_mark = skb->mark;
1234 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1235 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1237 src = inet_select_addr(rt->dst.dev,
1238 rt_nexthop(rt, iph->daddr),
1242 memcpy(addr, &src, 4);
/* Fill whichever 16-bit half of the classid tag is still unset; the
 * low half holds the route class, the high half the realm. */
1245 #ifdef CONFIG_IP_ROUTE_CLASSID
1246 static void set_class_tag(struct rtable *rt, u32 tag)
1248 if (!(rt->dst.tclassid & 0xFFFF))
1249 rt->dst.tclassid |= tag & 0xFFFF;
1250 if (!(rt->dst.tclassid & 0xFFFF0000))
1251 rt->dst.tclassid |= tag & 0xFFFF0000;
/* dst_ops->default_advmss: advertised MSS = path MTU minus IP+TCP
 * headers, bounded above by IPV4_MAX_PMTU minus headers. */
1255 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1257 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1258 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1261 return min(advmss, IPV4_MAX_PMTU - header_size);
/* dst_ops->mtu: prefer an unexpired learned PMTU, else the metric, else
 * the device MTU; locked-MTU gateway routes are capped at 576 (the
 * classical IPv4 minimum reassembly size), and lwtunnel encapsulation
 * headroom is subtracted from the result. */
1264 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1266 const struct rtable *rt = (const struct rtable *) dst;
1267 unsigned int mtu = rt->rt_pmtu;
1269 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1270 mtu = dst_metric_raw(dst, RTAX_MTU);
1275 mtu = READ_ONCE(dst->dev->mtu);
1277 if (unlikely(ip_mtu_locked(dst))) {
1278 if (rt->rt_uses_gateway && mtu > 576)
1282 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1284 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Unlink and free the exception for daddr (if any) under fnhe_lock,
 * flushing its cached routes; freed via kfree_rcu so concurrent RCU
 * readers remain safe. */
1287 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1289 struct fnhe_hash_bucket *hash;
1290 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1291 u32 hval = fnhe_hashfun(daddr);
1293 spin_lock_bh(&fnhe_lock);
1295 hash = rcu_dereference_protected(nh->nh_exceptions,
1296 lockdep_is_held(&fnhe_lock));
1299 fnhe_p = &hash->chain;
1300 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1302 if (fnhe->fnhe_daddr == daddr) {
1303 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1304 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1305 fnhe_flush_routes(fnhe);
1306 kfree_rcu(fnhe, rcu);
1309 fnhe_p = &fnhe->fnhe_next;
1310 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1311 lockdep_is_held(&fnhe_lock));
1314 spin_unlock_bh(&fnhe_lock);
/* RCU lookup of the exception for daddr on this nexthop; an entry whose
 * expiry has passed is deleted on the spot via ip_del_fnhe(). */
1317 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1319 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1320 struct fib_nh_exception *fnhe;
1326 hval = fnhe_hashfun(daddr);
1328 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1329 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1330 if (fnhe->fnhe_daddr == daddr) {
1331 if (fnhe->fnhe_expires &&
1332 time_after(jiffies, fnhe->fnhe_expires)) {
1333 ip_del_fnhe(nh, daddr);
1343 * 1. mtu on route is locked - use it
1344 * 2. mtu from nexthop exception
1345 * 3. mtu from egress device
/* MTU for a forwarded packet straight from a FIB result, following the
 * precedence listed above; the lwtunnel headroom of the nexthop is
 * subtracted before returning. */
1348 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1350 struct fib_info *fi = res->fi;
1351 struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1352 struct net_device *dev = nh->nh_dev;
/* The RTAX_LOCK metric bit for MTU means the route's MTU is pinned. */
1355 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1356 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1360 struct fib_nh_exception *fnhe;
1362 fnhe = find_exception(nh, daddr);
1363 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1364 mtu = fnhe->fnhe_pmtu;
1368 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1370 return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
/* Attach a freshly-built route to its matching exception (input or
 * output slot) under fnhe_lock.  A stale genid first wipes the
 * exception's PMTU/expiry state and cached routes; the exception's
 * recorded PMTU/gateway are then copied into the new route before it is
 * published, and the previously cached route (if any) is released. */
1373 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1374 __be32 daddr, const bool do_cache)
1378 spin_lock_bh(&fnhe_lock);
1380 if (daddr == fnhe->fnhe_daddr) {
1381 struct rtable __rcu **porig;
1382 struct rtable *orig;
1383 int genid = fnhe_genid(dev_net(rt->dst.dev));
1385 if (rt_is_input_route(rt))
1386 porig = &fnhe->fnhe_rth_input;
1388 porig = &fnhe->fnhe_rth_output;
1389 orig = rcu_dereference(*porig);
1391 if (fnhe->fnhe_genid != genid) {
1392 fnhe->fnhe_genid = genid;
1394 fnhe->fnhe_pmtu = 0;
1395 fnhe->fnhe_expires = 0;
1396 fnhe->fnhe_mtu_locked = false;
1397 fnhe_flush_routes(fnhe);
1400 fill_route_from_fnhe(rt, fnhe);
1401 if (!rt->rt_gateway)
1402 rt->rt_gateway = daddr;
1406 rcu_assign_pointer(*porig, rt);
1408 dst_dev_put(&orig->dst);
1409 dst_release(&orig->dst);
1414 fnhe->fnhe_stamp = jiffies;
1416 spin_unlock_bh(&fnhe_lock);
/* Cache a route on the nexthop: input routes in nh_rth_input, output
 * routes in the per-cpu nh_pcpu_rth_output slot.  cmpxchg publishes the
 * route; on success the displaced route is released, on failure (another
 * CPU won the race) the candidate's extra ref is dropped. */
1421 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1423 struct rtable *orig, *prev, **p;
1426 if (rt_is_input_route(rt)) {
1427 p = (struct rtable **)&nh->nh_rth_input;
1429 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1433 /* hold dst before doing cmpxchg() to avoid race condition
1437 prev = cmpxchg(p, orig, rt);
1440 dst_dev_put(&orig->dst);
1441 dst_release(&orig->dst);
1444 dst_release(&rt->dst);
/* Per-cpu list of rtables that are not stored in the FIB nexthop cache;
 * rt_flush_dev() walks these on device teardown.  (The spinlock member
 * is elided in this extract.)
 */
1451 struct uncached_list {
1453 struct list_head head;
1456 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
/* Add @rt to this cpu's uncached list, remembering which list it joined
 * so rt_del_uncached_list() can find the right lock later.
 */
1458 void rt_add_uncached_list(struct rtable *rt)
1460 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1462 rt->rt_uncached_list = ul;
1464 spin_lock_bh(&ul->lock);
1465 list_add_tail(&rt->rt_uncached, &ul->head);
1466 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the uncached list it was added to, if any.
 * list_empty() makes this safe for routes that were never added.
 */
1469 void rt_del_uncached_list(struct rtable *rt)
1471 if (!list_empty(&rt->rt_uncached)) {
1472 struct uncached_list *ul = rt->rt_uncached_list;
1474 spin_lock_bh(&ul->lock);
1475 list_del(&rt->rt_uncached);
1476 spin_unlock_bh(&ul->lock);
/* dst_ops->destroy callback: drop the refcounted metrics block (unless
 * it is the shared default) and remove the route from the uncached list.
 */
1480 static void ipv4_dst_destroy(struct dst_entry *dst)
1482 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1483 struct rtable *rt = (struct rtable *)dst;
/* Last reference to a private metrics block frees it (free is elided). */
1485 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1488 rt_del_uncached_list(rt);
/* On device unregister, retarget every uncached route still pointing at
 * @dev to the namespace loopback device so the device can go away.
 * Walks the per-cpu uncached lists under their locks.
 */
1491 void rt_flush_dev(struct net_device *dev)
1493 struct net *net = dev_net(dev);
1497 for_each_possible_cpu(cpu) {
1498 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1500 spin_lock_bh(&ul->lock);
1501 list_for_each_entry(rt, &ul->head, rt_uncached) {
1502 if (rt->dst.dev != dev)
/* Swap in loopback and take a reference on it (old dev put is elided). */
1504 rt->dst.dev = net->loopback_dev;
1505 dev_hold(rt->dst.dev);
1508 spin_unlock_bh(&ul->lock);
/* A cached route is usable only while it carries DST_OBSOLETE_FORCE_CHK
 * (further validity conditions are elided in this extract).
 */
1512 static bool rt_cache_valid(const struct rtable *rt)
1515 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
/* Fill in nexthop-derived fields of a freshly built route (@rt): gateway,
 * metrics, classid, lwtunnel state — then try to cache it in the nexthop
 * exception (if @fnhe) or the FIB nexthop.  Routes that cannot be cached
 * go onto the uncached list so device teardown can still find them.
 */
1519 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1520 const struct fib_result *res,
1521 struct fib_nh_exception *fnhe,
1522 struct fib_info *fi, u16 type, u32 itag,
1523 const bool do_cache)
1525 bool cached = false;
1528 struct fib_nh *nh = &FIB_RES_NH(*res);
/* Only a link-scope nexthop with a gateway address makes this route
 * gatewayed.
 */
1530 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1531 rt->rt_gateway = nh->nh_gw;
1532 rt->rt_uses_gateway = 1;
/* Share the fib_info's metrics; take a reference when they are not the
 * global defaults so ipv4_dst_destroy() can drop it.
 */
1534 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1535 if (fi->fib_metrics != &dst_default_metrics) {
1536 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1537 refcount_inc(&fi->fib_metrics->refcnt);
1539 #ifdef CONFIG_IP_ROUTE_CLASSID
1540 rt->dst.tclassid = nh->nh_tclassid;
1542 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1544 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1546 cached = rt_cache_route(nh, rt);
1547 if (unlikely(!cached)) {
1548 /* Routes we intend to cache in nexthop exception or
1549 * FIB nexthop have the DST_NOCACHE bit clear.
1550 * However, if we are unsuccessful at storing this
1551 * route into the cache we really need to set it.
1553 if (!rt->rt_gateway)
1554 rt->rt_gateway = daddr;
1555 rt_add_uncached_list(rt);
/* No fib_info / caching disabled path (condition elided): still track
 * the route on the uncached list.
 */
1558 rt_add_uncached_list(rt);
1560 #ifdef CONFIG_IP_ROUTE_CLASSID
1561 #ifdef CONFIG_IP_MULTIPLE_TABLES
1562 set_class_tag(rt, res->tclassid)
1564 set_class_tag(rt, itag);
/* Allocate and minimally initialize an IPv4 rtable.  Flags translate the
 * caller's policy/xfrm/caching choices into DST_* bits; local routes get
 * ip_local_deliver as input handler, everything defaults to ip_output.
 */
1568 struct rtable *rt_dst_alloc(struct net_device *dev,
1569 unsigned int flags, u16 type,
1570 bool nopolicy, bool noxfrm, bool will_cache)
1574 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1575 (will_cache ? 0 : DST_HOST) |
1576 (nopolicy ? DST_NOPOLICY : 0) |
1577 (noxfrm ? DST_NOXFRM : 0));
/* Success path (NULL check elided): stamp generation and clear state. */
1580 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1581 rt->rt_flags = flags;
1583 rt->rt_is_input = 0;
1586 rt->rt_mtu_locked = 0;
1588 rt->rt_uses_gateway = 0;
1589 INIT_LIST_HEAD(&rt->rt_uncached);
1591 rt->dst.output = ip_output;
1592 if (flags & RTCF_LOCAL)
1593 rt->dst.input = ip_local_deliver;
1598 EXPORT_SYMBOL(rt_dst_alloc);
1600 /* called in rcu_read_lock() section */
/* Sanity-check the source address of a multicast packet received on
 * @dev; rejects martian sources before fib_validate_source() does the
 * reverse-path check.  Returns 0 on success (error returns are elided).
 */
1601 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1602 u8 tos, struct net_device *dev,
1603 struct in_device *in_dev, u32 *itag)
1607 /* Primary sanity checks. */
/* Multicast/broadcast sources and non-IP frames are never valid here. */
1611 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1612 skb->protocol != htons(ETH_P_IP))
/* Loopback source only allowed when route_localnet is enabled. */
1615 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
/* Zero source is acceptable only for local multicast (e.g. IGMP). */
1618 if (ipv4_is_zeronet(saddr)) {
1619 if (!ipv4_is_local_multicast(daddr))
1622 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1630 /* called in rcu_read_lock() section */
/* Build the input route for a multicast packet: validate the source,
 * allocate an RTN_MULTICAST rtable on the loopback device, and attach it
 * to the skb.  @our selects local delivery (RTCF_LOCAL).
 */
1631 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1632 u8 tos, struct net_device *dev, int our)
1634 struct in_device *in_dev = __in_dev_get_rcu(dev);
1635 unsigned int flags = RTCF_MULTICAST;
1640 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1645 flags |= RTCF_LOCAL;
1647 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1648 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1652 #ifdef CONFIG_IP_ROUTE_CLASSID
1653 rth->dst.tclassid = itag;
/* Multicast routes must never be transmitted through dst.output. */
1655 rth->dst.output = ip_rt_bug;
1656 rth->rt_is_input= 1;
1658 #ifdef CONFIG_IP_MROUTE
/* Non-local multicast with multicast forwarding enabled goes to mroute. */
1659 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1660 rth->dst.input = ip_mr_input;
1662 RT_CACHE_STAT_INC(in_slow_mc);
1664 skb_dst_set(skb, &rth->dst);
/* Account and (rate-limited, if log_martians is on) log a packet whose
 * source address failed validation, dumping the link-layer header as the
 * only forensic hint per RFC 1812.
 */
1669 static void ip_handle_martian_source(struct net_device *dev,
1670 struct in_device *in_dev,
1671 struct sk_buff *skb,
1675 RT_CACHE_STAT_INC(in_martian_src);
1676 #ifdef CONFIG_IP_ROUTE_VERBOSE
1677 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1679 * RFC1812 recommendation, if source is martian,
1680 * the only hint is MAC header.
1682 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1683 &daddr, &saddr, dev->name);
1684 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1685 print_hex_dump(KERN_WARNING, "ll header: ",
1686 DUMP_PREFIX_OFFSET, 16, 1,
1687 skb_mac_header(skb),
1688 dev->hard_header_len, true);
1694 /* called in rcu_read_lock() section */
/* Build (or reuse from cache) the forwarding route for an input packet
 * once the FIB lookup (@res) has selected a unicast nexthop.  Validates
 * the source against the chosen output interface, decides whether an
 * ICMP redirect should be sent, and installs ip_forward as input handler.
 */
1695 static int __mkroute_input(struct sk_buff *skb,
1696 const struct fib_result *res,
1697 struct in_device *in_dev,
1698 __be32 daddr, __be32 saddr, u32 tos)
1700 struct fib_nh_exception *fnhe;
1703 struct in_device *out_dev;
1707 /* get a working reference to the output device */
1708 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1710 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1714 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1715 in_dev->dev, in_dev, &itag);
1717 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
/* Only cache when the FIB gave us a fib_info and no class tag. */
1723 do_cache = res->fi && !itag;
/* Packet would leave on the interface it arrived on: candidate for an
 * ICMP redirect (shared media, or gateway on-link with the sender).
 */
1724 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1725 skb->protocol == htons(ETH_P_IP) &&
1726 (IN_DEV_SHARED_MEDIA(out_dev) ||
1727 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1728 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1730 if (skb->protocol != htons(ETH_P_IP)) {
1731 /* Not IP (i.e. ARP). Do not create route, if it is
1732 * invalid for proxy arp. DNAT routes are always valid.
1734 * Proxy arp feature has been extended to allow ARP
1735 * replies back to the same interface, to support
1736 * Private VLAN switch technologies. See arp.c.
1738 if (out_dev == in_dev &&
1739 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
/* Try a cached route first: nexthop exception slot, else per-nexthop. */
1745 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1748 rth = rcu_dereference(fnhe->fnhe_rth_input);
1750 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1751 if (rt_cache_valid(rth)) {
1752 skb_dst_set_noref(skb, &rth->dst);
/* Cache miss: build a fresh route. */
1757 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1758 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1759 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1765 rth->rt_is_input = 1;
1766 RT_CACHE_STAT_INC(in_slow_tot);
1768 rth->dst.input = ip_forward;
1770 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1772 lwtunnel_set_redirect(&rth->dst);
1773 skb_dst_set(skb, &rth->dst);
1780 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1781 /* To make ICMP packets follow the right flow, the multipath hash is
1782 * calculated from the inner IP addresses.
/* Extract the L3 (saddr/daddr) hash keys from @skb.  For relevant ICMP
 * error messages the addresses of the embedded (inner) header are used
 * instead of the outer ones, so the error follows the original flow.
 */
1784 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1785 struct flow_keys *hash_keys)
1787 const struct iphdr *outer_iph = ip_hdr(skb);
1788 const struct iphdr *key_iph = outer_iph;
1789 const struct iphdr *inner_iph;
1790 const struct icmphdr *icmph;
1791 struct iphdr _inner_iph;
1792 struct icmphdr _icmph;
/* Non-ICMP, or a non-first fragment (no ICMP header present): use the
 * outer header as-is.
 */
1794 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1797 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1800 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
/* Only ICMP errors embed the offending packet's header. */
1805 if (icmph->type != ICMP_DEST_UNREACH &&
1806 icmph->type != ICMP_REDIRECT &&
1807 icmph->type != ICMP_TIME_EXCEEDED &&
1808 icmph->type != ICMP_PARAMETERPROB)
1811 inner_iph = skb_header_pointer(skb,
1812 outer_iph->ihl * 4 + sizeof(_icmph),
1813 sizeof(_inner_iph), &_inner_iph);
1817 key_iph = inner_iph;
1819 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1820 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1823 /* if skb is set it will be used and fl4 can be NULL */
/* Compute the multipath hash according to the per-netns hash policy:
 * L3 (addresses only) or L4 (5-tuple), sourced from the skb when
 * forwarding or from @fl4 for locally generated traffic.  Switch-case
 * labels for the policy values are elided in this extract.
 */
1824 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1825 const struct sk_buff *skb, struct flow_keys *flkeys)
1827 struct flow_keys hash_keys;
1830 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
/* L3 policy: addresses only. */
1832 memset(&hash_keys, 0, sizeof(hash_keys));
1833 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1835 ip_multipath_l3_keys(skb, &hash_keys);
1837 hash_keys.addrs.v4addrs.src = fl4->saddr;
1838 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1842 /* skb is currently provided only when forwarding */
1844 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1845 struct flow_keys keys;
1847 /* short-circuit if we already have L4 hash present */
1849 return skb_get_hash_raw(skb) >> 1;
1851 memset(&hash_keys, 0, sizeof(hash_keys));
/* Dissect the packet ourselves when no precomputed keys were passed. */
1854 skb_flow_dissect_flow_keys(skb, &keys, flag);
1858 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1859 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1860 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1861 hash_keys.ports.src = flkeys->ports.src;
1862 hash_keys.ports.dst = flkeys->ports.dst;
1863 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
/* Locally generated: take the 5-tuple straight from the flow. */
1865 memset(&hash_keys, 0, sizeof(hash_keys));
1866 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1867 hash_keys.addrs.v4addrs.src = fl4->saddr;
1868 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1869 hash_keys.ports.src = fl4->fl4_sport;
1870 hash_keys.ports.dst = fl4->fl4_dport;
1871 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1875 mhash = flow_hash_from_keys(&hash_keys);
1879 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
/* Select the nexthop (hash-based when the route is multipath) and then
 * build the input route via __mkroute_input().
 */
1881 static int ip_mkroute_input(struct sk_buff *skb,
1882 struct fib_result *res,
1883 struct in_device *in_dev,
1884 __be32 daddr, __be32 saddr, u32 tos,
1885 struct flow_keys *hkeys)
1887 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1888 if (res->fi && res->fi->fib_nhs > 1) {
1889 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1891 fib_select_multipath(res, h);
1895 /* create a routing cache entry */
1896 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1900 * NOTE. We drop all the packets that have local source
1901 * addresses, because every properly looped back packet
1902 * must have correct destination already attached by output routine.
1904 * Such approach solves two big problems:
1905 * 1. Non-simplex devices are handled properly.
1906 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1907 * called with rcu_read_lock()
/* Slow path for input route resolution: reject martians, run the FIB
 * lookup, and dispatch to forwarding (ip_mkroute_input), broadcast or
 * local delivery.  Several labels and branches are elided in this
 * extract, so the visible flow is partial.
 */
1910 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1911 u8 tos, struct net_device *dev,
1912 struct fib_result *res)
1914 struct in_device *in_dev = __in_dev_get_rcu(dev);
1915 struct flow_keys *flkeys = NULL, _flkeys;
1916 struct net *net = dev_net(dev);
1917 struct ip_tunnel_info *tun_info;
1919 unsigned int flags = 0;
1925 /* IP on this device is disabled. */
1930 /* Check for the most weird martians, which can be not detected
/* Propagate a collect_md tunnel id into the flow key, if present. */
1934 tun_info = skb_tunnel_info(skb);
1935 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1936 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1938 fl4.flowi4_tun_key.tun_id = 0;
1941 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1942 goto martian_source;
1946 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1949 /* Accept zero addresses only to limited broadcast;
1950 * I even do not know to fix it or not. Waiting for complaints :-)
1952 if (ipv4_is_zeronet(saddr))
1953 goto martian_source;
1955 if (ipv4_is_zeronet(daddr))
1956 goto martian_destination;
1958 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1959 * and call it once if daddr or/and saddr are loopback addresses
1961 if (ipv4_is_loopback(daddr)) {
1962 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1963 goto martian_destination;
1964 } else if (ipv4_is_loopback(saddr)) {
1965 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1966 goto martian_source;
1970 * Now we are ready to route packet.
/* Build the flow key for the FIB lookup. */
1973 fl4.flowi4_iif = dev->ifindex;
1974 fl4.flowi4_mark = skb->mark;
1975 fl4.flowi4_tos = tos;
1976 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1977 fl4.flowi4_flags = 0;
1980 fl4.flowi4_uid = sock_net_uid(net, NULL);
/* Dissect once up front when FIB rules want L4 keys (also reused for
 * the multipath hash later).
 */
1982 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1985 fl4.flowi4_proto = 0;
1990 err = fib_lookup(net, &fl4, res, 0);
1992 if (!IN_DEV_FORWARD(in_dev))
1993 err = -EHOSTUNREACH;
1997 if (res->type == RTN_BROADCAST) {
1998 if (IN_DEV_BFORWARD(in_dev))
2003 if (res->type == RTN_LOCAL) {
2004 err = fib_validate_source(skb, saddr, daddr, tos,
2005 0, dev, in_dev, &itag);
2007 goto martian_source;
2011 if (!IN_DEV_FORWARD(in_dev)) {
2012 err = -EHOSTUNREACH;
2015 if (res->type != RTN_UNICAST)
2016 goto martian_destination;
/* Unicast forwarding path. */
2019 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
/* Broadcast handling (label elided). */
2023 if (skb->protocol != htons(ETH_P_IP))
2026 if (!ipv4_is_zeronet(saddr)) {
2027 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2030 goto martian_source;
2032 flags |= RTCF_BROADCAST;
2033 res->type = RTN_BROADCAST;
2034 RT_CACHE_STAT_INC(in_brd);
/* Local delivery (label elided): try the nexthop input cache first. */
2040 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2041 if (rt_cache_valid(rth)) {
2042 skb_dst_set_noref(skb, &rth->dst);
2050 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2051 flags | RTCF_LOCAL, res->type,
2052 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2056 rth->dst.output= ip_rt_bug;
2057 #ifdef CONFIG_IP_ROUTE_CLASSID
2058 rth->dst.tclassid = itag;
2060 rth->rt_is_input = 1;
2062 RT_CACHE_STAT_INC(in_slow_tot);
2063 if (res->type == RTN_UNREACHABLE) {
2064 rth->dst.input= ip_error;
2065 rth->dst.error= -err;
2066 rth->rt_flags &= ~RTCF_LOCAL;
2070 struct fib_nh *nh = &FIB_RES_NH(*res);
2072 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2073 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2074 WARN_ON(rth->dst.input == lwtunnel_input);
2075 rth->dst.lwtstate->orig_input = rth->dst.input;
2076 rth->dst.input = lwtunnel_input;
2079 if (unlikely(!rt_cache_route(nh, rth)))
2080 rt_add_uncached_list(rth);
2082 skb_dst_set(skb, &rth->dst);
/* No-route / martian exit paths (labels elided). */
2087 RT_CACHE_STAT_INC(in_no_route);
2088 res->type = RTN_UNREACHABLE;
2094 * Do not cache martian addresses: they should be logged (RFC1812)
2096 martian_destination:
2097 RT_CACHE_STAT_INC(in_martian_dst);
2098 #ifdef CONFIG_IP_ROUTE_VERBOSE
2099 if (IN_DEV_LOG_MARTIANS(in_dev))
2100 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2101 &daddr, &saddr, dev->name);
2113 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
/* Public entry point for input routing without taking a dst reference;
 * masks tos to routing-relevant bits and delegates to ip_route_input_rcu()
 * (the RCU bracketing is elided in this extract).
 */
2117 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2118 u8 tos, struct net_device *dev)
2120 struct fib_result res;
2123 tos &= IPTOS_RT_MASK;
2125 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2130 EXPORT_SYMBOL(ip_route_input_noref);
2132 /* called with rcu_read_lock held */
/* Route an input packet: handle multicast membership checks here (see
 * historical note below), otherwise fall through to the unicast slow
 * path.
 */
2133 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2134 u8 tos, struct net_device *dev, struct fib_result *res)
2136 /* Multicast recognition logic is moved from route cache to here.
2137 The problem was that too many Ethernet cards have broken/missing
2138 hardware multicast filters :-( As result the host on multicasting
2139 network acquires a lot of useless route cache entries, sort of
2140 SDR messages from all the world. Now we try to get rid of them.
2141 Really, provided software IP multicast filter is organized
2142 reasonably (at least, hashed), it does not result in a slowdown
2143 comparing with route cache reject entries.
2144 Note, that multicast routers are not affected, because
2145 route cache entry is created eventually.
2147 if (ipv4_is_multicast(daddr)) {
2148 struct in_device *in_dev = __in_dev_get_rcu(dev);
/* Are we a member of this group on the arriving device? */
2153 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2154 ip_hdr(skb)->protocol);
2156 /* check l3 master if no match yet */
2157 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2158 struct in_device *l3_in_dev;
2160 l3_in_dev = __in_dev_get_rcu(skb->dev);
2162 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2163 ip_hdr(skb)->protocol);
2167 #ifdef CONFIG_IP_MROUTE
/* Also accept non-local multicast when multicast forwarding is on. */
2169 (!ipv4_is_local_multicast(daddr) &&
2170 IN_DEV_MFORWARD(in_dev))
2173 err = ip_route_input_mc(skb, daddr, saddr,
2179 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2182 /* called with rcu_read_lock() */
/* Build (or fetch from cache) an output route for flow @fl4 leaving via
 * @dev_out, after FIB resolution.  Classifies the destination (broadcast/
 * multicast/local), decides cacheability, and finishes via rt_set_nexthop().
 * Several branches and returns are elided in this extract.
 */
2183 static struct rtable *__mkroute_output(const struct fib_result *res,
2184 const struct flowi4 *fl4, int orig_oif,
2185 struct net_device *dev_out,
2188 struct fib_info *fi = res->fi;
2189 struct fib_nh_exception *fnhe;
2190 struct in_device *in_dev;
2191 u16 type = res->type;
2195 in_dev = __in_dev_get_rcu(dev_out);
2197 return ERR_PTR(-EINVAL);
/* Loopback source out a non-loopback device is only valid with
 * route_localnet enabled.
 */
2199 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2200 if (ipv4_is_loopback(fl4->saddr) &&
2201 !(dev_out->flags & IFF_LOOPBACK) &&
2202 !netif_is_l3_master(dev_out))
2203 return ERR_PTR(-EINVAL);
2205 if (ipv4_is_lbcast(fl4->daddr))
2206 type = RTN_BROADCAST;
2207 else if (ipv4_is_multicast(fl4->daddr))
2208 type = RTN_MULTICAST;
2209 else if (ipv4_is_zeronet(fl4->daddr))
2210 return ERR_PTR(-EINVAL);
2212 if (dev_out->flags & IFF_LOOPBACK)
2213 flags |= RTCF_LOCAL;
2216 if (type == RTN_BROADCAST) {
2217 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2219 } else if (type == RTN_MULTICAST) {
2220 flags |= RTCF_MULTICAST | RTCF_LOCAL;
/* Not a member of the group: no local delivery. */
2221 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2223 flags &= ~RTCF_LOCAL;
2226 /* If a multicast route does not exist, use
2227 * default one, but do not gateway in this case.
2230 if (fi && res->prefixlen < 4)
2232 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2233 (orig_oif != dev_out->ifindex)) {
2234 /* For local routes that require a particular output interface
2235 * we do not want to cache the result. Caching the result
2236 * causes incorrect behaviour when there are multiple source
2237 * addresses on the interface, the end result being that if the
2238 * intended recipient is waiting on that interface for the
2239 * packet he won't receive it because it will be delivered on
2240 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2241 * be set to the loopback interface as well.
2247 do_cache &= fi != NULL;
2249 struct rtable __rcu **prth;
2250 struct fib_nh *nh = &FIB_RES_NH(*res);
/* Pick the cache slot: exception entry, or per-cpu nexthop slot. */
2252 fnhe = find_exception(nh, fl4->daddr);
2256 prth = &fnhe->fnhe_rth_output;
2258 if (unlikely(fl4->flowi4_flags &
2259 FLOWI_FLAG_KNOWN_NH &&
2261 nh->nh_scope == RT_SCOPE_LINK))) {
2265 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2267 rth = rcu_dereference(*prth);
2268 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
/* Cache miss: allocate a fresh route. */
2273 rth = rt_dst_alloc(dev_out, flags, type,
2274 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2275 IN_DEV_CONF_GET(in_dev, NOXFRM),
2278 return ERR_PTR(-ENOBUFS);
2280 rth->rt_iif = orig_oif;
2282 RT_CACHE_STAT_INC(out_slow_tot);
2284 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2285 if (flags & RTCF_LOCAL &&
2286 !(dev_out->flags & IFF_LOOPBACK)) {
2287 rth->dst.output = ip_mc_output;
2288 RT_CACHE_STAT_INC(out_slow_mc);
2290 #ifdef CONFIG_IP_MROUTE
2291 if (type == RTN_MULTICAST) {
2292 if (IN_DEV_MFORWARD(in_dev) &&
2293 !ipv4_is_local_multicast(fl4->daddr)) {
2294 rth->dst.input = ip_mr_input;
2295 rth->dst.output = ip_mc_output;
2301 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2302 lwtunnel_set_redirect(&rth->dst);
2308 * Major route resolver routine.
/* Normalize the flow's tos/scope/iif and delegate to the RCU variant
 * (the rcu_read_lock bracketing is elided in this extract).
 */
2311 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2312 const struct sk_buff *skb)
2314 __u8 tos = RT_FL_TOS(fl4);
2315 struct fib_result res = {
/* Output lookups always originate on loopback ifindex. */
2323 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2324 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
/* RTO_ONLINK forces link scope (no gateway). */
2325 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2326 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2329 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2334 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
/* Core output route resolution under RCU: validate the source, resolve
 * the output device and source address from the flow and FIB, then build
 * the route via __mkroute_output().  Several labels and branches are
 * elided in this extract.
 */
2336 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2337 struct fib_result *res,
2338 const struct sk_buff *skb)
2340 struct net_device *dev_out = NULL;
2341 int orig_oif = fl4->flowi4_oif;
2342 unsigned int flags = 0;
2344 int err = -ENETUNREACH;
/* Multicast, broadcast or zero source addresses are never valid. */
2347 rth = ERR_PTR(-EINVAL);
2348 if (ipv4_is_multicast(fl4->saddr) ||
2349 ipv4_is_lbcast(fl4->saddr) ||
2350 ipv4_is_zeronet(fl4->saddr))
2353 /* I removed check for oif == dev_out->oif here.
2354 It was wrong for two reasons:
2355 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2356 is assigned to multiple interfaces.
2357 2. Moreover, we are allowed to send packets with saddr
2358 of another iface. --ANK
2361 if (fl4->flowi4_oif == 0 &&
2362 (ipv4_is_multicast(fl4->daddr) ||
2363 ipv4_is_lbcast(fl4->daddr))) {
2364 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2365 dev_out = __ip_dev_find(net, fl4->saddr, false);
2369 /* Special hack: user can direct multicasts
2370 and limited broadcast via necessary interface
2371 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2372 This hack is not just for fun, it allows
2373 vic,vat and friends to work.
2374 They bind socket to loopback, set ttl to zero
2375 and expect that it will work.
2376 From the viewpoint of routing cache they are broken,
2377 because we are not allowed to build multicast path
2378 with loopback source addr (look, routing cache
2379 cannot know, that ttl is zero, so that packet
2380 will not leave this host and route is valid).
2381 Luckily, this hack is good workaround.
2384 fl4->flowi4_oif = dev_out->ifindex;
2388 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2389 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2390 if (!__ip_dev_find(net, fl4->saddr, false))
2396 if (fl4->flowi4_oif) {
2397 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2398 rth = ERR_PTR(-ENODEV);
2402 /* RACE: Check return value of inet_select_addr instead. */
2403 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2404 rth = ERR_PTR(-ENETUNREACH);
/* Local multicast / broadcast / IGMP can short-circuit the FIB. */
2407 if (ipv4_is_local_multicast(fl4->daddr) ||
2408 ipv4_is_lbcast(fl4->daddr) ||
2409 fl4->flowi4_proto == IPPROTO_IGMP) {
2411 fl4->saddr = inet_select_addr(dev_out, 0,
2416 if (ipv4_is_multicast(fl4->daddr))
2417 fl4->saddr = inet_select_addr(dev_out, 0,
2419 else if (!fl4->daddr)
2420 fl4->saddr = inet_select_addr(dev_out, 0,
/* No destination at all: loop the packet to ourselves. */
2426 fl4->daddr = fl4->saddr;
2428 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2429 dev_out = net->loopback_dev;
2430 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2431 res->type = RTN_LOCAL;
2432 flags |= RTCF_LOCAL;
2436 err = fib_lookup(net, fl4, res, 0);
/* Lookup failed but an oif was given: assume destination is on-link. */
2440 if (fl4->flowi4_oif &&
2441 (ipv4_is_multicast(fl4->daddr) ||
2442 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2443 /* Apparently, routing tables are wrong. Assume,
2444 that the destination is on link.
2447 Because we are allowed to send to iface
2448 even if it has NO routes and NO assigned
2449 addresses. When oif is specified, routing
2450 tables are looked up with only one purpose:
2451 to catch if destination is gatewayed, rather than
2452 direct. Moreover, if MSG_DONTROUTE is set,
2453 we send packet, ignoring both routing tables
2454 and ifaddr state. --ANK
2457 We could make it even if oif is unknown,
2458 likely IPv6, but we do not.
2461 if (fl4->saddr == 0)
2462 fl4->saddr = inet_select_addr(dev_out, 0,
2464 res->type = RTN_UNICAST;
2471 if (res->type == RTN_LOCAL) {
/* Prefer the route's preferred source for local routes. */
2473 if (res->fi->fib_prefsrc)
2474 fl4->saddr = res->fi->fib_prefsrc;
2476 fl4->saddr = fl4->daddr;
2479 /* L3 master device is the loopback for that domain */
2480 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2483 /* make sure orig_oif points to fib result device even
2484 * though packet rx/tx happens over loopback or l3mdev
2486 orig_oif = FIB_RES_OIF(*res);
2488 fl4->flowi4_oif = dev_out->ifindex;
2489 flags |= RTCF_LOCAL;
2493 fib_select_path(net, res, fl4, skb);
2495 dev_out = FIB_RES_DEV(*res);
2496 fl4->flowi4_oif = dev_out->ifindex;
2500 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2506 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
/* Blackhole dst_ops->mtu: use the raw MTU metric if one was copied from
 * the original route, otherwise fall back to the device MTU.
 */
2511 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2513 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2515 return mtu ? : dst->dev->mtu;
/* Blackhole dst_ops->update_pmtu: intentionally a no-op (body elided). */
2518 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2519 struct sk_buff *skb, u32 mtu)
/* Blackhole dst_ops->redirect: intentionally a no-op (body elided). */
2523 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2524 struct sk_buff *skb)
2528 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops for blackhole routes (see ipv4_blackhole_route()): mostly
 * no-op callbacks so the dst can outlive the original route safely.
 */
2534 static struct dst_ops ipv4_dst_blackhole_ops = {
2536 .check = ipv4_blackhole_dst_check,
2537 .mtu = ipv4_blackhole_mtu,
2538 .default_advmss = ipv4_default_advmss,
2539 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2540 .redirect = ipv4_rt_blackhole_redirect,
2541 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2542 .neigh_lookup = ipv4_neigh_lookup,
/* Clone @dst_orig into a "blackhole" route that discards all traffic but
 * preserves the original's routing attributes; used e.g. by xfrm while a
 * policy is being resolved.  Always releases @dst_orig.
 */
2545 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2547 struct rtable *ort = (struct rtable *) dst_orig;
2550 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2552 struct dst_entry *new = &rt->dst;
/* All I/O through this dst is silently dropped. */
2555 new->input = dst_discard;
2556 new->output = dst_discard_out;
2558 new->dev = net->loopback_dev;
/* Copy routing attributes from the original route. */
2562 rt->rt_is_input = ort->rt_is_input;
2563 rt->rt_iif = ort->rt_iif;
2564 rt->rt_pmtu = ort->rt_pmtu;
2565 rt->rt_mtu_locked = ort->rt_mtu_locked;
2567 rt->rt_genid = rt_genid_ipv4(net);
2568 rt->rt_flags = ort->rt_flags;
2569 rt->rt_type = ort->rt_type;
2570 rt->rt_gateway = ort->rt_gateway;
2571 rt->rt_uses_gateway = ort->rt_uses_gateway;
2573 INIT_LIST_HEAD(&rt->rt_uncached);
2576 dst_release(dst_orig);
2578 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
/* Resolve an output route for @flp4 and, when a protocol is set, pass it
 * through the xfrm (IPsec) lookup so policy transformations apply.
 */
2581 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2582 const struct sock *sk)
2584 struct rtable *rt = __ip_route_output_key(net, flp4);
2589 if (flp4->flowi4_proto)
2590 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2591 flowi4_to_flowi(flp4),
2596 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2598 /* called with rcu_read_lock held */
/* Serialize @rt as an RTM_NEWROUTE netlink message into @skb for an
 * RTM_GETROUTE reply.  Emits rtmsg header plus RTA_* attributes (table,
 * dst, src, oif, classid, prefsrc, gateway, metrics, mark, uid, iif) and
 * cache info.  Returns via nlmsg_end / nla_put_failure (partially elided).
 */
2599 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2600 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2601 struct sk_buff *skb, u32 portid, u32 seq)
2604 struct nlmsghdr *nlh;
2605 unsigned long expires = 0;
2607 u32 metrics[RTAX_MAX];
2609 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2613 r = nlmsg_data(nlh);
2614 r->rtm_family = AF_INET;
2615 r->rtm_dst_len = 32;
2617 r->rtm_tos = fl4->flowi4_tos;
/* Table ids above the legacy 8-bit space are reported via RTA_TABLE. */
2618 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2619 if (nla_put_u32(skb, RTA_TABLE, table_id))
2620 goto nla_put_failure;
2621 r->rtm_type = rt->rt_type;
2622 r->rtm_scope = RT_SCOPE_UNIVERSE;
2623 r->rtm_protocol = RTPROT_UNSPEC;
2624 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2625 if (rt->rt_flags & RTCF_NOTIFY)
2626 r->rtm_flags |= RTM_F_NOTIFY;
2627 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2628 r->rtm_flags |= RTCF_DOREDIRECT;
2630 if (nla_put_in_addr(skb, RTA_DST, dst))
2631 goto nla_put_failure;
2633 r->rtm_src_len = 32;
2634 if (nla_put_in_addr(skb, RTA_SRC, src))
2635 goto nla_put_failure;
2638 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2639 goto nla_put_failure;
2640 #ifdef CONFIG_IP_ROUTE_CLASSID
2641 if (rt->dst.tclassid &&
2642 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2643 goto nla_put_failure;
2645 if (!rt_is_input_route(rt) &&
2646 fl4->saddr != src) {
2647 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2648 goto nla_put_failure;
2650 if (rt->rt_uses_gateway &&
2651 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2652 goto nla_put_failure;
2654 expires = rt->dst.expires;
/* Report expiry as a relative time from now. */
2656 unsigned long now = jiffies;
2658 if (time_before(now, expires))
2664 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
/* Unexpired PMTU overrides the MTU metric, and locks it. */
2665 if (rt->rt_pmtu && expires)
2666 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2667 if (rt->rt_mtu_locked && expires)
2668 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2669 if (rtnetlink_put_metrics(skb, metrics) < 0)
2670 goto nla_put_failure;
2672 if (fl4->flowi4_mark &&
2673 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2674 goto nla_put_failure;
2676 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2677 nla_put_u32(skb, RTA_UID,
2678 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2679 goto nla_put_failure;
2681 error = rt->dst.error;
2683 if (rt_is_input_route(rt)) {
2684 #ifdef CONFIG_IP_MROUTE
/* Input multicast routes may need the mroute daemon's answer. */
2685 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2686 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2687 int err = ipmr_get_route(net, skb,
2688 fl4->saddr, fl4->daddr,
2694 goto nla_put_failure;
2698 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2699 goto nla_put_failure;
2702 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2703 goto nla_put_failure;
2705 nlmsg_end(skb, nlh);
2709 nlmsg_cancel(skb, nlh);
/* Build a synthetic skb carrying dummy IP + transport headers for an
 * RTM_GETROUTE query, so the routing code can be exercised as if a real
 * packet of @ip_proto with the given addresses/ports had arrived.
 */
2713 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2714 u8 ip_proto, __be16 sport,
2717 struct sk_buff *skb;
2720 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2724 /* Reserve room for dummy headers, this skb can pass
2725 * through good chunk of routing engine.
2727 skb_reset_mac_header(skb);
2728 skb_reset_network_header(skb);
2729 skb->protocol = htons(ETH_P_IP);
2730 iph = skb_put(skb, sizeof(struct iphdr));
2731 iph->protocol = ip_proto;
2737 skb_set_transport_header(skb, skb->len);
/* Append a minimal transport header matching the protocol. */
2739 switch (iph->protocol) {
2741 struct udphdr *udph;
2743 udph = skb_put_zero(skb, sizeof(struct udphdr));
2744 udph->source = sport;
2746 udph->len = sizeof(struct udphdr);
2751 struct tcphdr *tcph;
2753 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2754 tcph->source = sport;
2756 tcph->doff = sizeof(struct tcphdr) / 4;
2758 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2762 case IPPROTO_ICMP: {
2763 struct icmphdr *icmph;
2765 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2766 icmph->type = ICMP_ECHO;
/* RTM_GETROUTE handler: parse the request attributes, simulate the
 * lookup (input path when an iif is given, output path otherwise) on a
 * synthetic skb, and reply with either FIB match info (RTM_F_FIB_MATCH)
 * or the resolved route via rt_fill_info().  Error/cleanup labels are
 * elided in this extract.
 */
2774 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2775 struct netlink_ext_ack *extack)
2777 struct net *net = sock_net(in_skb->sk);
2778 struct nlattr *tb[RTA_MAX+1];
2779 u32 table_id = RT_TABLE_MAIN;
2780 __be16 sport = 0, dport = 0;
2781 struct fib_result res = {};
2782 u8 ip_proto = IPPROTO_UDP;
2783 struct rtable *rt = NULL;
2784 struct sk_buff *skb;
2794 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2799 rtm = nlmsg_data(nlh);
2800 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2801 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2802 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2803 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2805 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2807 uid = (iif ? INVALID_UID : current_uid());
2809 if (tb[RTA_IP_PROTO]) {
2810 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2817 sport = nla_get_be16(tb[RTA_SPORT]);
2820 dport = nla_get_be16(tb[RTA_DPORT]);
/* Fabricate a packet so route resolution sees realistic headers. */
2822 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2826 memset(&fl4, 0, sizeof(fl4));
2829 fl4.flowi4_tos = rtm->rtm_tos;
2830 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2831 fl4.flowi4_mark = mark;
2832 fl4.flowi4_uid = uid;
2834 fl4.fl4_sport = sport;
2836 fl4.fl4_dport = dport;
2837 fl4.flowi4_proto = ip_proto;
/* Input-path simulation when an incoming interface was specified. */
2842 struct net_device *dev;
2844 dev = dev_get_by_index_rcu(net, iif);
2850 fl4.flowi4_iif = iif; /* for rt_fill_info */
2853 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2856 rt = skb_rtable(skb);
2857 if (err == 0 && rt->dst.error)
2858 err = -rt->dst.error;
2860 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2861 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2866 skb_dst_set(skb, &rt->dst);
2872 if (rtm->rtm_flags & RTM_F_NOTIFY)
2873 rt->rt_flags |= RTCF_NOTIFY;
2875 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2876 table_id = res.table ? res.table->tb_id : 0;
2878 /* reset skb for netlink reply msg */
2880 skb_reset_network_header(skb);
2881 skb_reset_transport_header(skb);
2882 skb_reset_mac_header(skb);
2884 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2886 err = fib_props[res.type].error;
2888 err = -EHOSTUNREACH;
2891 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2892 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2893 rt->rt_type, res.prefix, res.prefixlen,
2894 fl4.flowi4_tos, res.fi, 0);
2896 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2897 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2904 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2914 void ip_rt_multicast_event(struct in_device *in_dev)
2916 rt_cache_flush(dev_net(in_dev->dev));
2919 #ifdef CONFIG_SYSCTL
/* Tunables exposed through the ipv4_route_table sysctl entries below.
 * gc_interval/gc_min_interval are kept in jiffies; gc_min_interval is
 * also exported in milliseconds via gc_min_interval_ms. */
2920 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2921 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2922 static int ip_rt_gc_elasticity __read_mostly = 8;
/* Lower bound for the min_pmtu sysctl (wired up via .extra1 below). */
2923 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
/*
 * proc handler for net.ipv4.route.flush: writing to the file flushes the
 * routing cache and bumps the fnhe genid for the owning namespace.
 * __ctl->extra1 carries the struct net (stored by sysctl_route_net_init()).
 * NOTE(review): the `if (write)` guard and the return statements appear
 * elided from this excerpt — confirm against upstream before editing.
 */
2925 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2926 void __user *buffer,
2927 size_t *lenp, loff_t *ppos)
2929 struct net *net = (struct net *)__ctl->extra1;
2932 rt_cache_flush(net);
2933 fnhe_genid_bump(net);
/*
 * Global (not per-netns) /proc/sys/net/ipv4/route/ tunables.
 * NOTE(review): the per-entry braces and .mode fields appear elided from
 * this excerpt.
 */
2940 static struct ctl_table ipv4_route_table[] = {
2942 .procname = "gc_thresh",
2943 .data = &ipv4_dst_ops.gc_thresh,
2944 .maxlen = sizeof(int),
2946 .proc_handler = proc_dointvec,
2949 .procname = "max_size",
2950 .data = &ip_rt_max_size,
2951 .maxlen = sizeof(int),
2953 .proc_handler = proc_dointvec,
2956 /* Deprecated. Use gc_min_interval_ms */
2958 .procname = "gc_min_interval",
2959 .data = &ip_rt_gc_min_interval,
2960 .maxlen = sizeof(int),
2962 .proc_handler = proc_dointvec_jiffies,
/* Same variable as gc_min_interval, exported in milliseconds. */
2965 .procname = "gc_min_interval_ms",
2966 .data = &ip_rt_gc_min_interval,
2967 .maxlen = sizeof(int),
2969 .proc_handler = proc_dointvec_ms_jiffies,
2972 .procname = "gc_timeout",
2973 .data = &ip_rt_gc_timeout,
2974 .maxlen = sizeof(int),
2976 .proc_handler = proc_dointvec_jiffies,
2979 .procname = "gc_interval",
2980 .data = &ip_rt_gc_interval,
2981 .maxlen = sizeof(int),
2983 .proc_handler = proc_dointvec_jiffies,
/* ICMP-redirect rate limiting knobs. */
2986 .procname = "redirect_load",
2987 .data = &ip_rt_redirect_load,
2988 .maxlen = sizeof(int),
2990 .proc_handler = proc_dointvec,
2993 .procname = "redirect_number",
2994 .data = &ip_rt_redirect_number,
2995 .maxlen = sizeof(int),
2997 .proc_handler = proc_dointvec,
3000 .procname = "redirect_silence",
3001 .data = &ip_rt_redirect_silence,
3002 .maxlen = sizeof(int),
3004 .proc_handler = proc_dointvec,
/* ICMP-error token-bucket parameters. */
3007 .procname = "error_cost",
3008 .data = &ip_rt_error_cost,
3009 .maxlen = sizeof(int),
3011 .proc_handler = proc_dointvec,
3014 .procname = "error_burst",
3015 .data = &ip_rt_error_burst,
3016 .maxlen = sizeof(int),
3018 .proc_handler = proc_dointvec,
3021 .procname = "gc_elasticity",
3022 .data = &ip_rt_gc_elasticity,
3023 .maxlen = sizeof(int),
3025 .proc_handler = proc_dointvec,
3028 .procname = "mtu_expires",
3029 .data = &ip_rt_mtu_expires,
3030 .maxlen = sizeof(int),
3032 .proc_handler = proc_dointvec_jiffies,
/* min_pmtu is clamped from below by ip_min_valid_pmtu via extra1. */
3035 .procname = "min_pmtu",
3036 .data = &ip_rt_min_pmtu,
3037 .maxlen = sizeof(int),
3039 .proc_handler = proc_dointvec_minmax,
3040 .extra1 = &ip_min_valid_pmtu,
3043 .procname = "min_adv_mss",
3044 .data = &ip_rt_min_advmss,
3045 .maxlen = sizeof(int),
3047 .proc_handler = proc_dointvec,
/*
 * Per-netns "flush" entry; it is the template duplicated by
 * sysctl_route_net_init().  No .data: the handler reads the owning
 * struct net from .extra1 instead.
 */
3052 static struct ctl_table ipv4_route_flush_table[] = {
3054 .procname = "flush",
3055 .maxlen = sizeof(int),
3057 .proc_handler = ipv4_sysctl_rtcache_flush,
/*
 * Per-netns setup of /proc/sys/net/ipv4/route: register the "flush"
 * entry for @net.  Non-initial namespaces get a kmemdup'ed copy of the
 * template so extra1 can point at their own struct net.
 * NOTE(review): the allocation-failure branch, return statements and the
 * error-path kfree() of the duplicate appear elided from this excerpt.
 */
3062 static __net_init int sysctl_route_net_init(struct net *net)
3064 struct ctl_table *tbl;
3066 tbl = ipv4_route_flush_table;
3067 if (!net_eq(net, &init_net)) {
3068 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3072 /* Don't export sysctls to unprivileged users */
3073 if (net->user_ns != &init_user_ns)
3074 tbl[0].procname = NULL;
/* Let ipv4_sysctl_rtcache_flush() find the owning namespace. */
3076 tbl[0].extra1 = net;
3078 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3079 if (!net->ipv4.route_hdr)
3084 if (tbl != ipv4_route_flush_table)
3090 static __net_exit void sysctl_route_net_exit(struct net *net)
3092 struct ctl_table *tbl;
3094 tbl = net->ipv4.route_hdr->ctl_table_arg;
3095 unregister_net_sysctl_table(net->ipv4.route_hdr);
3096 BUG_ON(tbl == ipv4_route_flush_table);
/* Pernet hooks for the per-netns route sysctl entry. */
3100 static __net_initdata struct pernet_operations sysctl_route_ops = {
3101 .init = sysctl_route_net_init,
3102 .exit = sysctl_route_net_exit,
3106 static __net_init int rt_genid_init(struct net *net)
3108 atomic_set(&net->ipv4.rt_genid, 0);
3109 atomic_set(&net->fnhe_genid, 0);
3110 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
/* Pernet hook for generation-counter setup; no teardown needed. */
3114 static __net_initdata struct pernet_operations rt_genid_ops = {
3115 .init = rt_genid_init,
3118 static int __net_init ipv4_inetpeer_init(struct net *net)
3120 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3124 inet_peer_base_init(bp);
3125 net->ipv4.peers = bp;
3129 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3131 struct inet_peer_base *bp = net->ipv4.peers;
3133 net->ipv4.peers = NULL;
3134 inetpeer_invalidate_tree(bp);
/* Pernet hooks for the per-netns inet_peer base. */
3138 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3139 .init = ipv4_inetpeer_init,
3140 .exit = ipv4_inetpeer_exit,
3143 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU route classid accounting; allocated in ip_rt_init(). */
3144 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3145 #endif /* CONFIG_IP_ROUTE_CLASSID */
/*
 * Boot-time initialisation of the IPv4 routing subsystem: IP-ID state,
 * per-CPU uncached-route lists, the dst slab cache and counters, proc
 * files, the RTM_GETROUTE handler and the pernet subsystems.  Allocation
 * failures here are fatal (panic) — routing cannot work without them.
 * NOTE(review): several NULL checks, #endif lines, register-call return
 * checks and the final return appear elided from this excerpt.
 */
3147 int __init ip_rt_init(void)
/* IP identification generator state, randomly seeded. */
3151 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3154 panic("IP: failed to allocate ip_idents\n");
3156 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3158 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3160 panic("IP: failed to allocate ip_tstamps\n");
/* Per-CPU lists of routes not in the cache, each with its own lock. */
3162 for_each_possible_cpu(cpu) {
3163 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3165 INIT_LIST_HEAD(&ul->head);
3166 spin_lock_init(&ul->lock);
3168 #ifdef CONFIG_IP_ROUTE_CLASSID
/* 256 classid accounting slots per CPU. */
3169 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3171 panic("IP: failed to allocate ip_rt_acct\n");
/* Slab cache for rtable entries, shared with the blackhole ops. */
3174 ipv4_dst_ops.kmem_cachep =
3175 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3176 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3178 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3180 if (dst_entries_init(&ipv4_dst_ops) < 0)
3181 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3183 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3184 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
/* Effectively disable dst GC thresholds for IPv4. */
3186 ipv4_dst_ops.gc_thresh = ~0;
3187 ip_rt_max_size = INT_MAX;
/* Non-fatal: routing works without the proc files. */
3192 if (ip_rt_proc_init())
3193 pr_err("Unable to create route proc files\n");
/* RTM_GETROUTE can run without the rtnl lock. */
3198 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3199 RTNL_FLAG_DOIT_UNLOCKED);
3201 #ifdef CONFIG_SYSCTL
3202 register_pernet_subsys(&sysctl_route_ops);
3204 register_pernet_subsys(&rt_genid_ops);
3205 register_pernet_subsys(&ipv4_inetpeer_ops);
3209 #ifdef CONFIG_SYSCTL
/* Registers the global route sysctl table for init_net early in boot;
 * definition continues past this excerpt (closing brace / #endif not
 * visible here). */
3211 * We really need to sanitize the damn ipv4 init order, then all
3212 * this nonsense will go away.
3214 void __init ip_static_sysctl_init(void)
3216 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);