/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *	Alan Cox, <gw4pts@gw4pts.ampr.org>
 *	Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window clamping.
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD;
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
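
/*
 * A quick reading of the defaults above (our arithmetic gloss, not part of
 * the original source): ip_rt_redirect_load is HZ/50, i.e. one redirect
 * "slot" per 20 ms, and ip_rt_redirect_silence is (HZ / 50) << (9 + 1) =
 * HZ * 1024 / 50, roughly 20 seconds whatever HZ is configured to.
 * ip_rt_min_pmtu spells 552 bytes out as 512 bytes of payload plus a
 * 20 byte IP header plus 20 more bytes (presumably a TCP header), and
 * ip_rt_mtu_expires makes learned PMTU values decay after 10 minutes.
 */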
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
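
/*
 * For reference, a sketch of how this table is consumed. The in-tree
 * lookup helper lives in include/net/route.h; the one-liner below is our
 * paraphrase of it, shown only to make the indexing explicit: the four
 * TOS bits (masked by IPTOS_TOS()) are shifted down by one, yielding an
 * index in 0..15.
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 */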
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
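
/*
 * Orientation note (ours, not original text): with CONFIG_PROC_FS enabled
 * the entries registered above surface per network namespace as
 *
 *	/proc/net/rt_cache	- header-only legacy dump (rt_cache_seq_show)
 *	/proc/net/stat/rt_cache	- the per-cpu rt_cache_stat counters,
 *				  e.g. "cat /proc/net/stat/rt_cache"
 *	/proc/net/rt_acct	- ip_rt_acct table (CONFIG_IP_ROUTE_CLASSID only)
 */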
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
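
/*
 * Orientation note (our paraphrase, not original text; details vary by
 * kernel version): most code does not call __ip_select_ident() directly
 * but goes through the ip_select_ident()/ip_select_ident_segs() inlines
 * in include/net/ip.h, which look roughly like
 *
 *	if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df)
 *		iph->id = 0;	// or a per-socket counter for connected TCP
 *	else
 *		__ip_select_ident(net, iph, segs);
 *
 * so the hashed generators above are only consulted for packets that may
 * actually be fragmented.
 */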
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		rt_free(rt);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		rt_free(rt);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}
static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	return min_t(unsigned int, mtu, IP_MAX_MTU);
}
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (!(rt->dst.flags & DST_NOCACHE)) {
			rcu_assign_pointer(*porig, rt);
			if (orig)
				rt_free(orig);
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_table_id = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input= 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_output = rth->dst.output;
		rth->dst.output = lwtunnel_output;
	}
	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_input = rth->dst.input;
		rth->dst.input = lwtunnel_input;
	}
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses in reverse order.
 */
static int ip_multipath_icmp_hash(struct sk_buff *skb)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	struct icmphdr _icmph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	const struct iphdr *inner_iph;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto standard_hash;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto standard_hash;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB) {
		goto standard_hash;
	}

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto standard_hash;

	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);

standard_hash:
	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
}

#endif /* CONFIG_IP_ROUTE_MULTIPATH */
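
/*
 * Why the reversal (our gloss, not original text): an ICMP error about an
 * A -> B flow travels back toward A, i.e. in the same direction as B -> A
 * traffic. Hashing the embedded header's addresses swapped, i.e.
 * fib_multipath_hash(inner daddr = B, inner saddr = A), yields the same
 * value as ordinary B -> A packets get from fib_multipath_hash(saddr,
 * daddr) above, so the error follows the same multipath leg as the
 * reverse flow instead of being scattered by the outer (router, A)
 * address pair.
 */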
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h;

		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
			h = ip_multipath_icmp_hash(skb);
		else
			h = fib_multipath_hash(saddr, daddr);
		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output
 *	routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct ip_tunnel_info *tun_info;
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	res.table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it once if daddr or/and saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;
	if (res.table)
		rth->rt_table_id = res.table->tb_id;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache) {
		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	res.fi = NULL;
	res.table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
			rth = rcu_dereference(*prth);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(nh, fl4->daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		if (unlikely(fl4->flowi4_flags &
			     FLOWI_FLAG_KNOWN_NH &&
			     !(nh->nh_gw &&
			       nh->nh_scope == RT_SCOPE_LINK))) {
			do_cache = false;
			goto add;
		}
		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);

rt_cache:
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif	= orig_oif ? : 0;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
	if (lwtunnel_output_redirect(rth->dst.lwtstate))
		rth->dst.output = lwtunnel_output;

	return rth;
}
/*
 * Major route resolver routine.
 */

struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					  int mp_hash)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int master_idx;
	int orig_oif;
	int err = -ENETUNREACH;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	orig_oif = fl4->flowi4_oif;

	master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
	if (master_idx)
		fl4->flowi4_oif = master_idx;
	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (look, the routing cache
			   cannot know that ttl is zero, so the packet
			   will not leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}

		rth = l3mdev_get_rtable(dev_out, fl4);
		if (rth)
			goto out;
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, &res, 0);
	if (err) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif &&
		    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, &res, fl4, mp_hash);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
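
/*
 * Illustrative usage (ours, not part of the original file): a typical
 * in-kernel caller resolves an output route through the
 * ip_route_output_key() wrapper (include/net/route.h) and drops the
 * reference with ip_rt_put() when done:
 *
 *	struct flowi4 fl4 = { .daddr = some_daddr };	// some_daddr: hypothetical destination
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// ... use rt->dst.dev, rt_nexthop(rt, fl4.daddr), ...
 *	ip_rt_put(rt);
 */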
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= table_id;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait);

			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	u32 table_id = RT_TABLE_MAIN;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (netif_index_is_l3_master(net, fl4.flowi4_oif))
		fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = rt->rt_table_id;

	err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err < 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
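/* Multicast configuration on a device changed: drop all cached routes. */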
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
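/*
 * The "flush" entry is write-only: any write drops all cached routes
 * and bumps the next-hop exception (fnhe) generation id for the
 * writer's network namespace, e.g. from a shell:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Non-write access is rejected with -EINVAL.
 */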
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
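/*
 * Unlike the table above, the flush control below is registered per
 * network namespace (see sysctl_route_net_init()), so each netns gets
 * its own /proc/sys/net/ipv4/route/flush.
 */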
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
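/*
 * Per-netns generation ids: bumping rt_genid (or fnhe_genid) lazily
 * invalidates every cached dst in the namespace on its next lookup,
 * instead of walking and freeing them eagerly.
 */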
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};
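/*
 * Per-cpu accounting buffer used by the route classid statistics
 * (/proc/net/rt_acct); allocated at boot in ip_rt_init() below.
 */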
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
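/*
 * Boot-time initialization: allocate the IP identifier arrays and the
 * dst slab cache, register the RTM_GETROUTE handler, and hook up the
 * per-netns init/exit operations defined above.
 */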
int __init ip_rt_init(void)
{
	int rc = 0;
	int cpu;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_IP_MROUTE
	ip_mr_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif