net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #define RT_FL_TOS(oldflp4) \
 118         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 119
 120 #define RT_GC_TIMEOUT (300*HZ)
 121
 122 static int ip_rt_max_size;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131
 132 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 133 /*
 134  *      Interface to generic destination cache.
 135  */
 136
 137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141 static void              ipv4_link_failure(struct sk_buff *skb);
 142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                            struct sk_buff *skb, u32 mtu);
 144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                         struct sk_buff *skb);
 146 static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149 {
 150         WARN_ON(1);
 151         return NULL;
 152 }
 153
 154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155                                            struct sk_buff *skb,
 156                                            const void *daddr);
 157
 158 static struct dst_ops ipv4_dst_ops = {
 159         .family =               AF_INET,
 160         .check =                ipv4_dst_check,
 161         .default_advmss =       ipv4_default_advmss,
 162         .mtu =                  ipv4_mtu,
 163         .cow_metrics =          ipv4_cow_metrics,
 164         .destroy =              ipv4_dst_destroy,
 165         .negative_advice =      ipv4_negative_advice,
 166         .link_failure =         ipv4_link_failure,
 167         .update_pmtu =          ip_rt_update_pmtu,
 168         .redirect =             ip_do_redirect,
 169         .local_out =            __ip_local_out,
 170         .neigh_lookup =         ipv4_neigh_lookup,
 171 };
 172
 173 #define ECN_OR_COST(class)      TC_PRIO_##class
 174
 175 const __u8 ip_tos2prio[16] = {
 176         TC_PRIO_BESTEFFORT,
 177         ECN_OR_COST(BESTEFFORT),
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BULK,
 181         ECN_OR_COST(BULK),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_INTERACTIVE,
 185         ECN_OR_COST(INTERACTIVE),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE_BULK,
 189         ECN_OR_COST(INTERACTIVE_BULK),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK)
 192 };
 193 EXPORT_SYMBOL(ip_tos2prio);
 194
 195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 197
 198 #ifdef CONFIG_PROC_FS
 199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 200 {
 201         if (*pos)
 202                 return NULL;
 203         return SEQ_START_TOKEN;
 204 }
 205
 206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 207 {
 208         ++*pos;
 209         return NULL;
 210 }
 211
 212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 213 {
 214 }
 215
 216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 217 {
 218         if (v == SEQ_START_TOKEN)
 219                 seq_printf(seq, "%-127s\n",
 220                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 221                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 222                            "HHUptod\tSpecDst");
 223         return 0;
 224 }
 225
 226 static const struct seq_operations rt_cache_seq_ops = {
 227         .start  = rt_cache_seq_start,
 228         .next   = rt_cache_seq_next,
 229         .stop   = rt_cache_seq_stop,
 230         .show   = rt_cache_seq_show,
 231 };
 232
 233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 234 {
 235         return seq_open(file, &rt_cache_seq_ops);
 236 }
 237
 238 static const struct file_operations rt_cache_seq_fops = {
 239         .owner   = THIS_MODULE,
 240         .open    = rt_cache_seq_open,
 241         .read    = seq_read,
 242         .llseek  = seq_lseek,
 243         .release = seq_release,
 244 };
 245
 246
 247 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 248 {
 249         int cpu;
 250
 251         if (*pos == 0)
 252                 return SEQ_START_TOKEN;
 253
 254         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 255                 if (!cpu_possible(cpu))
 256                         continue;
 257                 *pos = cpu+1;
 258                 return &per_cpu(rt_cache_stat, cpu);
 259         }
 260         return NULL;
 261 }
 262
 263 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 264 {
 265         int cpu;
 266
 267         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 268                 if (!cpu_possible(cpu))
 269                         continue;
 270                 *pos = cpu+1;
 271                 return &per_cpu(rt_cache_stat, cpu);
 272         }
 273         return NULL;
 274
 275 }
 276
 277 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 278 {
 279
 280 }
 281
 282 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 283 {
 284         struct rt_cache_stat *st = v;
 285
 286         if (v == SEQ_START_TOKEN) {
 287                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 288                 return 0;
 289         }
 290
 291         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 292                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 293                    dst_entries_get_slow(&ipv4_dst_ops),
 294                    0, /* st->in_hit */
 295                    st->in_slow_tot,
 296                    st->in_slow_mc,
 297                    st->in_no_route,
 298                    st->in_brd,
 299                    st->in_martian_dst,
 300                    st->in_martian_src,
 301
 302                    0, /* st->out_hit */
 303                    st->out_slow_tot,
 304                    st->out_slow_mc,
 305
 306                    0, /* st->gc_total */
 307                    0, /* st->gc_ignored */
 308                    0, /* st->gc_goal_miss */
 309                    0, /* st->gc_dst_overflow */
 310                    0, /* st->in_hlist_search */
 311                    0  /* st->out_hlist_search */
 312                 );
 313         return 0;
 314 }
 315
 316 static const struct seq_operations rt_cpu_seq_ops = {
 317         .start  = rt_cpu_seq_start,
 318         .next   = rt_cpu_seq_next,
 319         .stop   = rt_cpu_seq_stop,
 320         .show   = rt_cpu_seq_show,
 321 };
 322
 323
 324 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 325 {
 326         return seq_open(file, &rt_cpu_seq_ops);
 327 }
 328
 329 static const struct file_operations rt_cpu_seq_fops = {
 330         .owner   = THIS_MODULE,
 331         .open    = rt_cpu_seq_open,
 332         .read    = seq_read,
 333         .llseek  = seq_lseek,
 334         .release = seq_release,
 335 };
 336
 337 #ifdef CONFIG_IP_ROUTE_CLASSID
 338 static int rt_acct_proc_show(struct seq_file *m, void *v)
 339 {
 340         struct ip_rt_acct *dst, *src;
 341         unsigned int i, j;
 342
 343         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 344         if (!dst)
 345                 return -ENOMEM;
 346
 347         for_each_possible_cpu(i) {
 348                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 349                 for (j = 0; j < 256; j++) {
 350                         dst[j].o_bytes   += src[j].o_bytes;
 351                         dst[j].o_packets += src[j].o_packets;
 352                         dst[j].i_bytes   += src[j].i_bytes;
 353                         dst[j].i_packets += src[j].i_packets;
 354                 }
 355         }
 356
 357         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 358         kfree(dst);
 359         return 0;
 360 }
 361
 362 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 363 {
 364         return single_open(file, rt_acct_proc_show, NULL);
 365 }
 366
 367 static const struct file_operations rt_acct_proc_fops = {
 368         .owner          = THIS_MODULE,
 369         .open           = rt_acct_proc_open,
 370         .read           = seq_read,
 371         .llseek         = seq_lseek,
 372         .release        = single_release,
 373 };
 374 #endif
 375
 376 static int __net_init ip_rt_do_proc_init(struct net *net)
 377 {
 378         struct proc_dir_entry *pde;
 379
 380         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 381                           &rt_cache_seq_fops);
 382         if (!pde)
 383                 goto err1;
 384
 385         pde = proc_create("rt_cache", S_IRUGO,
 386                           net->proc_net_stat, &rt_cpu_seq_fops);
 387         if (!pde)
 388                 goto err2;
 389
 390 #ifdef CONFIG_IP_ROUTE_CLASSID
 391         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 392         if (!pde)
 393                 goto err3;
 394 #endif
 395         return 0;
 396
 397 #ifdef CONFIG_IP_ROUTE_CLASSID
 398 err3:
 399         remove_proc_entry("rt_cache", net->proc_net_stat);
 400 #endif
 401 err2:
 402         remove_proc_entry("rt_cache", net->proc_net);
 403 err1:
 404         return -ENOMEM;
 405 }
 406
 407 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 408 {
 409         remove_proc_entry("rt_cache", net->proc_net_stat);
 410         remove_proc_entry("rt_cache", net->proc_net);
 411 #ifdef CONFIG_IP_ROUTE_CLASSID
 412         remove_proc_entry("rt_acct", net->proc_net);
 413 #endif
 414 }
 415
 416 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 417         .init = ip_rt_do_proc_init,
 418         .exit = ip_rt_do_proc_exit,
 419 };
 420
 421 static int __init ip_rt_proc_init(void)
 422 {
 423         return register_pernet_subsys(&ip_rt_proc_ops);
 424 }
 425
 426 #else
 427 static inline int ip_rt_proc_init(void)
 428 {
 429         return 0;
 430 }
 431 #endif /* CONFIG_PROC_FS */
 432
 433 static inline bool rt_is_expired(const struct rtable *rth)
 434 {
 435         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 436 }
 437
 438 void rt_cache_flush(struct net *net)
 439 {
 440         rt_genid_bump_ipv4(net);
 441 }
 442
 443 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 444                                            struct sk_buff *skb,
 445                                            const void *daddr)
 446 {
 447         struct net_device *dev = dst->dev;
 448         const __be32 *pkey = daddr;
 449         const struct rtable *rt;
 450         struct neighbour *n;
 451
 452         rt = (const struct rtable *) dst;
 453         if (rt->rt_gateway)
 454                 pkey = (const __be32 *) &rt->rt_gateway;
 455         else if (skb)
 456                 pkey = &ip_hdr(skb)->daddr;
 457
 458         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 459         if (n)
 460                 return n;
 461         return neigh_create(&arp_tbl, pkey, dev);
 462 }
 463
 464 #define IP_IDENTS_SZ 2048u
 465
 466 static atomic_t *ip_idents __read_mostly;
 467 static u32 *ip_tstamps __read_mostly;
 468
 469 /* In order to protect privacy, we add a perturbation to identifiers
 470  * if one generator is seldom used. This makes hard for an attacker
 471  * to infer how many packets were sent between two points in time.
 472  */
 473 u32 ip_idents_reserve(u32 hash, int segs)
 474 {
 475         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 476         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 477         u32 old = ACCESS_ONCE(*p_tstamp);
 478         u32 now = (u32)jiffies;
 479         u32 new, delta = 0;
 480
 481         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 482                 delta = prandom_u32_max(now - old);
 483
 484         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 485         do {
 486                 old = (u32)atomic_read(p_id);
 487                 new = old + delta + segs;
 488         } while (atomic_cmpxchg(p_id, old, new) != old);
 489
 490         return new - segs;
 491 }
 492 EXPORT_SYMBOL(ip_idents_reserve);
 493
 494 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 495 {
 496         static u32 ip_idents_hashrnd __read_mostly;
 497         u32 hash, id;
 498
 499         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 500
 501         hash = jhash_3words((__force u32)iph->daddr,
 502                             (__force u32)iph->saddr,
 503                             iph->protocol ^ net_hash_mix(net),
 504                             ip_idents_hashrnd);
 505         id = ip_idents_reserve(hash, segs);
 506         iph->id = htons(id);
 507 }
 508 EXPORT_SYMBOL(__ip_select_ident);
 509
 510 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 511                              const struct iphdr *iph,
 512                              int oif, u8 tos,
 513                              u8 prot, u32 mark, int flow_flags)
 514 {
 515         if (sk) {
 516                 const struct inet_sock *inet = inet_sk(sk);
 517
 518                 oif = sk->sk_bound_dev_if;
 519                 mark = sk->sk_mark;
 520                 tos = RT_CONN_FLAGS(sk);
 521                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 522         }
 523         flowi4_init_output(fl4, oif, mark, tos,
 524                            RT_SCOPE_UNIVERSE, prot,
 525                            flow_flags,
 526                            iph->daddr, iph->saddr, 0, 0);
 527 }
 528
 529 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 530                                const struct sock *sk)
 531 {
 532         const struct iphdr *iph = ip_hdr(skb);
 533         int oif = skb->dev->ifindex;
 534         u8 tos = RT_TOS(iph->tos);
 535         u8 prot = iph->protocol;
 536         u32 mark = skb->mark;
 537
 538         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 539 }
 540
 541 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 542 {
 543         const struct inet_sock *inet = inet_sk(sk);
 544         const struct ip_options_rcu *inet_opt;
 545         __be32 daddr = inet->inet_daddr;
 546
 547         rcu_read_lock();
 548         inet_opt = rcu_dereference(inet->inet_opt);
 549         if (inet_opt && inet_opt->opt.srr)
 550                 daddr = inet_opt->opt.faddr;
 551         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 552                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 553                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 554                            inet_sk_flowi_flags(sk),
 555                            daddr, inet->inet_saddr, 0, 0);
 556         rcu_read_unlock();
 557 }
 558
 559 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 560                                  const struct sk_buff *skb)
 561 {
 562         if (skb)
 563                 build_skb_flow_key(fl4, skb, sk);
 564         else
 565                 build_sk_flow_key(fl4, sk);
 566 }
 567
 568 static inline void rt_free(struct rtable *rt)
 569 {
 570         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 571 }
 572
 573 static DEFINE_SPINLOCK(fnhe_lock);
 574
 575 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 576 {
 577         struct rtable *rt;
 578
 579         rt = rcu_dereference(fnhe->fnhe_rth_input);
 580         if (rt) {
 581                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 582                 rt_free(rt);
 583         }
 584         rt = rcu_dereference(fnhe->fnhe_rth_output);
 585         if (rt) {
 586                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 587                 rt_free(rt);
 588         }
 589 }
 590
 591 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 592 {
 593         struct fib_nh_exception *fnhe, *oldest;
 594
 595         oldest = rcu_dereference(hash->chain);
 596         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 597              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 598                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 599                         oldest = fnhe;
 600         }
 601         fnhe_flush_routes(oldest);
 602         return oldest;
 603 }
 604
 605 static inline u32 fnhe_hashfun(__be32 daddr)
 606 {
 607         static u32 fnhe_hashrnd __read_mostly;
 608         u32 hval;
 609
 610         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 611         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 612         return hash_32(hval, FNHE_HASH_SHIFT);
 613 }
 614
 615 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 616 {
 617         rt->rt_pmtu = fnhe->fnhe_pmtu;
 618         rt->dst.expires = fnhe->fnhe_expires;
 619
 620         if (fnhe->fnhe_gw) {
 621                 rt->rt_flags |= RTCF_REDIRECTED;
 622                 rt->rt_gateway = fnhe->fnhe_gw;
 623                 rt->rt_uses_gateway = 1;
 624         }
 625 }
 626
 627 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 628                                   u32 pmtu, unsigned long expires)
 629 {
 630         struct fnhe_hash_bucket *hash;
 631         struct fib_nh_exception *fnhe;
 632         struct rtable *rt;
 633         unsigned int i;
 634         int depth;
 635         u32 hval = fnhe_hashfun(daddr);
 636
 637         spin_lock_bh(&fnhe_lock);
 638
 639         hash = rcu_dereference(nh->nh_exceptions);
 640         if (!hash) {
 641                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 642                 if (!hash)
 643                         goto out_unlock;
 644                 rcu_assign_pointer(nh->nh_exceptions, hash);
 645         }
 646
 647         hash += hval;
 648
 649         depth = 0;
 650         for (fnhe = rcu_dereference(hash->chain); fnhe;
 651              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 652                 if (fnhe->fnhe_daddr == daddr)
 653                         break;
 654                 depth++;
 655         }
 656
 657         if (fnhe) {
 658                 if (gw)
 659                         fnhe->fnhe_gw = gw;
 660                 if (pmtu) {
 661                         fnhe->fnhe_pmtu = pmtu;
 662                         fnhe->fnhe_expires = max(1UL, expires);
 663                 }
 664                 /* Update all cached dsts too */
 665                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 666                 if (rt)
 667                         fill_route_from_fnhe(rt, fnhe);
 668                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 669                 if (rt)
 670                         fill_route_from_fnhe(rt, fnhe);
 671         } else {
 672                 if (depth > FNHE_RECLAIM_DEPTH)
 673                         fnhe = fnhe_oldest(hash);
 674                 else {
 675                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 676                         if (!fnhe)
 677                                 goto out_unlock;
 678
 679                         fnhe->fnhe_next = hash->chain;
 680                         rcu_assign_pointer(hash->chain, fnhe);
 681                 }
 682                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 683                 fnhe->fnhe_daddr = daddr;
 684                 fnhe->fnhe_gw = gw;
 685                 fnhe->fnhe_pmtu = pmtu;
 686                 fnhe->fnhe_expires = expires;
 687
 688                 /* Exception created; mark the cached routes for the nexthop
 689                  * stale, so anyone caching it rechecks if this exception
 690                  * applies to them.
 691                  */
 692                 rt = rcu_dereference(nh->nh_rth_input);
 693                 if (rt)
 694                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 695
 696                 for_each_possible_cpu(i) {
 697                         struct rtable __rcu **prt;
 698                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 699                         rt = rcu_dereference(*prt);
 700                         if (rt)
 701                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 702                 }
 703         }
 704
 705         fnhe->fnhe_stamp = jiffies;
 706
 707 out_unlock:
 708         spin_unlock_bh(&fnhe_lock);
 709 }
 710
 711 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 712                              bool kill_route)
 713 {
 714         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 715         __be32 old_gw = ip_hdr(skb)->saddr;
 716         struct net_device *dev = skb->dev;
 717         struct in_device *in_dev;
 718         struct fib_result res;
 719         struct neighbour *n;
 720         struct net *net;
 721
 722         switch (icmp_hdr(skb)->code & 7) {
 723         case ICMP_REDIR_NET:
 724         case ICMP_REDIR_NETTOS:
 725         case ICMP_REDIR_HOST:
 726         case ICMP_REDIR_HOSTTOS:
 727                 break;
 728
 729         default:
 730                 return;
 731         }
 732
 733         if (rt->rt_gateway != old_gw)
 734                 return;
 735
 736         in_dev = __in_dev_get_rcu(dev);
 737         if (!in_dev)
 738                 return;
 739
 740         net = dev_net(dev);
 741         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 742             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 743             ipv4_is_zeronet(new_gw))
 744                 goto reject_redirect;
 745
 746         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 747                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 748                         goto reject_redirect;
 749                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 750                         goto reject_redirect;
 751         } else {
 752                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 753                         goto reject_redirect;
 754         }
 755
 756         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 757         if (!IS_ERR(n)) {
 758                 if (!(n->nud_state & NUD_VALID)) {
 759                         neigh_event_send(n, NULL);
 760                 } else {
 761                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 762                                 struct fib_nh *nh = &FIB_RES_NH(res);
 763
 764                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 765                                                 0, jiffies + ip_rt_gc_timeout);
 766                         }
 767                         if (kill_route)
 768                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 769                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 770                 }
 771                 neigh_release(n);
 772         }
 773         return;
 774
 775 reject_redirect:
 776 #ifdef CONFIG_IP_ROUTE_VERBOSE
 777         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 778                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 779                 __be32 daddr = iph->daddr;
 780                 __be32 saddr = iph->saddr;
 781
 782                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 783                                      "  Advised path = %pI4 -> %pI4\n",
 784                                      &old_gw, dev->name, &new_gw,
 785                                      &saddr, &daddr);
 786         }
 787 #endif
 788         ;
 789 }
 790
 791 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 792 {
 793         struct rtable *rt;
 794         struct flowi4 fl4;
 795         const struct iphdr *iph = (const struct iphdr *) skb->data;
 796         int oif = skb->dev->ifindex;
 797         u8 tos = RT_TOS(iph->tos);
 798         u8 prot = iph->protocol;
 799         u32 mark = skb->mark;
 800
 801         rt = (struct rtable *) dst;
 802
 803         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 804         __ip_do_redirect(rt, skb, &fl4, true);
 805 }
 806
 807 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 808 {
 809         struct rtable *rt = (struct rtable *)dst;
 810         struct dst_entry *ret = dst;
 811
 812         if (rt) {
 813                 if (dst->obsolete > 0) {
 814                         ip_rt_put(rt);
 815                         ret = NULL;
 816                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 817                            rt->dst.expires) {
 818                         ip_rt_put(rt);
 819                         ret = NULL;
 820                 }
 821         }
 822         return ret;
 823 }
 824
 825 /*
 826  * Algorithm:
 827  *      1. The first ip_rt_redirect_number redirects are sent
 828  *         with exponential backoff, then we stop sending them at all,
 829  *         assuming that the host ignores our redirects.
 830  *      2. If we did not see packets requiring redirects
 831  *         during ip_rt_redirect_silence, we assume that the host
 832  *         forgot redirected route and start to send redirects again.
 833  *
 834  * This algorithm is much cheaper and more intelligent than dumb load limiting
 835  * in icmp.c.
 836  *
 837  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 838  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 839  */
 840
 841 void ip_rt_send_redirect(struct sk_buff *skb)
 842 {
 843         struct rtable *rt = skb_rtable(skb);
 844         struct in_device *in_dev;
 845         struct inet_peer *peer;
 846         struct net *net;
 847         int log_martians;
 848         int vif;
 849
 850         rcu_read_lock();
 851         in_dev = __in_dev_get_rcu(rt->dst.dev);
 852         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 853                 rcu_read_unlock();
 854                 return;
 855         }
 856         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 857         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 858         rcu_read_unlock();
 859
 860         net = dev_net(rt->dst.dev);
 861         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 862         if (!peer) {
 863                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 864                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 865                 return;
 866         }
 867
 868         /* No redirected packets during ip_rt_redirect_silence;
 869          * reset the algorithm.
 870          */
 871         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 872                 peer->rate_tokens = 0;
 873
 874         /* Too many ignored redirects; do not send anything
 875          * set dst.rate_last to the last seen redirected packet.
 876          */
 877         if (peer->rate_tokens >= ip_rt_redirect_number) {
 878                 peer->rate_last = jiffies;
 879                 goto out_put_peer;
 880         }
 881
 882         /* Check for load limit; set rate_last to the latest sent
 883          * redirect.
 884          */
 885         if (peer->rate_tokens == 0 ||
 886             time_after(jiffies,
 887                        (peer->rate_last +
 888                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 889                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 890
 891                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 892                 peer->rate_last = jiffies;
 893                 ++peer->rate_tokens;
 894 #ifdef CONFIG_IP_ROUTE_VERBOSE
 895                 if (log_martians &&
 896                     peer->rate_tokens == ip_rt_redirect_number)
 897                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 898                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 899                                              &ip_hdr(skb)->daddr, &gw);
 900 #endif
 901         }
 902 out_put_peer:
 903         inet_putpeer(peer);
 904 }
 905
 906 static int ip_error(struct sk_buff *skb)
 907 {
 908         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 909         struct rtable *rt = skb_rtable(skb);
 910         struct inet_peer *peer;
 911         unsigned long now;
 912         struct net *net;
 913         bool send;
 914         int code;
 915
 916         /* IP on this device is disabled. */
 917         if (!in_dev)
 918                 goto out;
 919
 920         net = dev_net(rt->dst.dev);
 921         if (!IN_DEV_FORWARD(in_dev)) {
 922                 switch (rt->dst.error) {
 923                 case EHOSTUNREACH:
 924                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 925                         break;
 926
 927                 case ENETUNREACH:
 928                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 929                         break;
 930                 }
 931                 goto out;
 932         }
 933
 934         switch (rt->dst.error) {
 935         case EINVAL:
 936         default:
 937                 goto out;
 938         case EHOSTUNREACH:
 939                 code = ICMP_HOST_UNREACH;
 940                 break;
 941         case ENETUNREACH:
 942                 code = ICMP_NET_UNREACH;
 943                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 944                 break;
 945         case EACCES:
 946                 code = ICMP_PKT_FILTERED;
 947                 break;
 948         }
 949
 950         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 951                                l3mdev_master_ifindex(skb->dev), 1);
 952
 953         send = true;
 954         if (peer) {
 955                 now = jiffies;
 956                 peer->rate_tokens += now - peer->rate_last;
 957                 if (peer->rate_tokens > ip_rt_error_burst)
 958                         peer->rate_tokens = ip_rt_error_burst;
 959                 peer->rate_last = now;
 960                 if (peer->rate_tokens >= ip_rt_error_cost)
 961                         peer->rate_tokens -= ip_rt_error_cost;
 962                 else
 963                         send = false;
 964                 inet_putpeer(peer);
 965         }
 966         if (send)
 967                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 968
 969 out:    kfree_skb(skb);
 970         return 0;
 971 }
 972
 973 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 974 {
 975         struct dst_entry *dst = &rt->dst;
 976         struct fib_result res;
 977
 978         if (dst_metric_locked(dst, RTAX_MTU))
 979                 return;
 980
 981         if (ipv4_mtu(dst) < mtu)
 982                 return;
 983
 984         if (mtu < ip_rt_min_pmtu)
 985                 mtu = ip_rt_min_pmtu;
 986
 987         if (rt->rt_pmtu == mtu &&
 988             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 989                 return;
 990
 991         rcu_read_lock();
 992         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 993                 struct fib_nh *nh = &FIB_RES_NH(res);
 994
 995                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 996                                       jiffies + ip_rt_mtu_expires);
 997         }
 998         rcu_read_unlock();
 999 }
1000
1001 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1002                               struct sk_buff *skb, u32 mtu)
1003 {
1004         struct rtable *rt = (struct rtable *) dst;
1005         struct flowi4 fl4;
1006
1007         ip_rt_build_flow_key(&fl4, sk, skb);
1008         __ip_rt_update_pmtu(rt, &fl4, mtu);
1009 }
1010
1011 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1012                       int oif, u32 mark, u8 protocol, int flow_flags)
1013 {
1014         const struct iphdr *iph = (const struct iphdr *) skb->data;
1015         struct flowi4 fl4;
1016         struct rtable *rt;
1017
1018         if (!mark)
1019                 mark = IP4_REPLY_MARK(net, skb->mark);
1020
1021         __build_flow_key(&fl4, NULL, iph, oif,
1022                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1023         rt = __ip_route_output_key(net, &fl4);
1024         if (!IS_ERR(rt)) {
1025                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1026                 ip_rt_put(rt);
1027         }
1028 }
1029 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1030
1031 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1032 {
1033         const struct iphdr *iph = (const struct iphdr *) skb->data;
1034         struct flowi4 fl4;
1035         struct rtable *rt;
1036
1037         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1038
1039         if (!fl4.flowi4_mark)
1040                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1041
1042         rt = __ip_route_output_key(sock_net(sk), &fl4);
1043         if (!IS_ERR(rt)) {
1044                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1045                 ip_rt_put(rt);
1046         }
1047 }
1048
1049 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1050 {
1051         const struct iphdr *iph = (const struct iphdr *) skb->data;
1052         struct flowi4 fl4;
1053         struct rtable *rt;
1054         struct dst_entry *odst = NULL;
1055         bool new = false;
1056
1057         bh_lock_sock(sk);
1058
1059         if (!ip_sk_accept_pmtu(sk))
1060                 goto out;
1061
1062         odst = sk_dst_get(sk);
1063
1064         if (sock_owned_by_user(sk) || !odst) {
1065                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1066                 goto out;
1067         }
1068
1069         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1070
1071         rt = (struct rtable *)odst;
1072         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1073                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1074                 if (IS_ERR(rt))
1075                         goto out;
1076
1077                 new = true;
1078         }
1079
1080         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1081
1082         if (!dst_check(&rt->dst, 0)) {
1083                 if (new)
1084                         dst_release(&rt->dst);
1085
1086                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1087                 if (IS_ERR(rt))
1088                         goto out;
1089
1090                 new = true;
1091         }
1092
1093         if (new)
1094                 sk_dst_set(sk, &rt->dst);
1095
1096 out:
1097         bh_unlock_sock(sk);
1098         dst_release(odst);
1099 }
1100 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1101
1102 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1103                    int oif, u32 mark, u8 protocol, int flow_flags)
1104 {
1105         const struct iphdr *iph = (const struct iphdr *) skb->data;
1106         struct flowi4 fl4;
1107         struct rtable *rt;
1108
1109         __build_flow_key(&fl4, NULL, iph, oif,
1110                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1111         rt = __ip_route_output_key(net, &fl4);
1112         if (!IS_ERR(rt)) {
1113                 __ip_do_redirect(rt, skb, &fl4, false);
1114                 ip_rt_put(rt);
1115         }
1116 }
1117 EXPORT_SYMBOL_GPL(ipv4_redirect);
1118
1119 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1120 {
1121         const struct iphdr *iph = (const struct iphdr *) skb->data;
1122         struct flowi4 fl4;
1123         struct rtable *rt;
1124
1125         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1126         rt = __ip_route_output_key(sock_net(sk), &fl4);
1127         if (!IS_ERR(rt)) {
1128                 __ip_do_redirect(rt, skb, &fl4, false);
1129                 ip_rt_put(rt);
1130         }
1131 }
1132 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1133
1134 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1135 {
1136         struct rtable *rt = (struct rtable *) dst;
1137
1138         /* All IPV4 dsts are created with ->obsolete set to the value
1139          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1140          * into this function always.
1141          *
1142          * When a PMTU/redirect information update invalidates a route,
1143          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1144          * DST_OBSOLETE_DEAD by dst_free().
1145          */
1146         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1147                 return NULL;
1148         return dst;
1149 }
1150
1151 static void ipv4_link_failure(struct sk_buff *skb)
1152 {
1153         struct rtable *rt;
1154
1155         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1156
1157         rt = skb_rtable(skb);
1158         if (rt)
1159                 dst_set_expires(&rt->dst, 0);
1160 }
1161
1162 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1163 {
1164         pr_debug("%s: %pI4 -> %pI4, %s\n",
1165                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1166                  skb->dev ? skb->dev->name : "?");
1167         kfree_skb(skb);
1168         WARN_ON(1);
1169         return 0;
1170 }
1171
1172 /*
1173    We do not cache source address of outgoing interface,
1174    because it is used only by IP RR, TS and SRR options,
1175    so that it out of fast path.
1176
1177    BTW remember: "addr" is allowed to be not aligned
1178    in IP options!
1179  */
1180
1181 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1182 {
1183         __be32 src;
1184
1185         if (rt_is_output_route(rt))
1186                 src = ip_hdr(skb)->saddr;
1187         else {
1188                 struct fib_result res;
1189                 struct flowi4 fl4;
1190                 struct iphdr *iph;
1191
1192                 iph = ip_hdr(skb);
1193
1194                 memset(&fl4, 0, sizeof(fl4));
1195                 fl4.daddr = iph->daddr;
1196                 fl4.saddr = iph->saddr;
1197                 fl4.flowi4_tos = RT_TOS(iph->tos);
1198                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1199                 fl4.flowi4_iif = skb->dev->ifindex;
1200                 fl4.flowi4_mark = skb->mark;
1201
1202                 rcu_read_lock();
1203                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1204                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1205                 else
1206                         src = inet_select_addr(rt->dst.dev,
1207                                                rt_nexthop(rt, iph->daddr),
1208                                                RT_SCOPE_UNIVERSE);
1209                 rcu_read_unlock();
1210         }
1211         memcpy(addr, &src, 4);
1212 }
1213
1214 #ifdef CONFIG_IP_ROUTE_CLASSID
1215 static void set_class_tag(struct rtable *rt, u32 tag)
1216 {
1217         if (!(rt->dst.tclassid & 0xFFFF))
1218                 rt->dst.tclassid |= tag & 0xFFFF;
1219         if (!(rt->dst.tclassid & 0xFFFF0000))
1220                 rt->dst.tclassid |= tag & 0xFFFF0000;
1221 }
1222 #endif
1223
1224 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1225 {
1226         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1227
1228         if (advmss == 0) {
1229                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1230                                ip_rt_min_advmss);
1231                 if (advmss > 65535 - 40)
1232                         advmss = 65535 - 40;
1233         }
1234         return advmss;
1235 }
1236
1237 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1238 {
1239         const struct rtable *rt = (const struct rtable *) dst;
1240         unsigned int mtu = rt->rt_pmtu;
1241
1242         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1243                 mtu = dst_metric_raw(dst, RTAX_MTU);
1244
1245         if (mtu)
1246                 return mtu;
1247
1248         mtu = dst->dev->mtu;
1249
1250         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1251                 if (rt->rt_uses_gateway && mtu > 576)
1252                         mtu = 576;
1253         }
1254
1255         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1256
1257         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1258 }
1259
1260 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1261 {
1262         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1263         struct fib_nh_exception *fnhe;
1264         u32 hval;
1265
1266         if (!hash)
1267                 return NULL;
1268
1269         hval = fnhe_hashfun(daddr);
1270
1271         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1272              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1273                 if (fnhe->fnhe_daddr == daddr)
1274                         return fnhe;
1275         }
1276         return NULL;
1277 }
1278
1279 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1280                               __be32 daddr)
1281 {
1282         bool ret = false;
1283
1284         spin_lock_bh(&fnhe_lock);
1285
1286         if (daddr == fnhe->fnhe_daddr) {
1287                 struct rtable __rcu **porig;
1288                 struct rtable *orig;
1289                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1290
1291                 if (rt_is_input_route(rt))
1292                         porig = &fnhe->fnhe_rth_input;
1293                 else
1294                         porig = &fnhe->fnhe_rth_output;
1295                 orig = rcu_dereference(*porig);
1296
1297                 if (fnhe->fnhe_genid != genid) {
1298                         fnhe->fnhe_genid = genid;
1299                         fnhe->fnhe_gw = 0;
1300                         fnhe->fnhe_pmtu = 0;
1301                         fnhe->fnhe_expires = 0;
1302                         fnhe_flush_routes(fnhe);
1303                         orig = NULL;
1304                 }
1305                 fill_route_from_fnhe(rt, fnhe);
1306                 if (!rt->rt_gateway)
1307                         rt->rt_gateway = daddr;
1308
1309                 if (!(rt->dst.flags & DST_NOCACHE)) {
1310                         rcu_assign_pointer(*porig, rt);
1311                         if (orig)
1312                                 rt_free(orig);
1313                         ret = true;
1314                 }
1315
1316                 fnhe->fnhe_stamp = jiffies;
1317         }
1318         spin_unlock_bh(&fnhe_lock);
1319
1320         return ret;
1321 }
1322
1323 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1324 {
1325         struct rtable *orig, *prev, **p;
1326         bool ret = true;
1327
1328         if (rt_is_input_route(rt)) {
1329                 p = (struct rtable **)&nh->nh_rth_input;
1330         } else {
1331                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1332         }
1333         orig = *p;
1334
1335         prev = cmpxchg(p, orig, rt);
1336         if (prev == orig) {
1337                 if (orig)
1338                         rt_free(orig);
1339         } else
1340                 ret = false;
1341
1342         return ret;
1343 }
1344
1345 struct uncached_list {
1346         spinlock_t              lock;
1347         struct list_head        head;
1348 };
1349
1350 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1351
1352 static void rt_add_uncached_list(struct rtable *rt)
1353 {
1354         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1355
1356         rt->rt_uncached_list = ul;
1357
1358         spin_lock_bh(&ul->lock);
1359         list_add_tail(&rt->rt_uncached, &ul->head);
1360         spin_unlock_bh(&ul->lock);
1361 }
1362
1363 static void ipv4_dst_destroy(struct dst_entry *dst)
1364 {
1365         struct rtable *rt = (struct rtable *) dst;
1366
1367         if (!list_empty(&rt->rt_uncached)) {
1368                 struct uncached_list *ul = rt->rt_uncached_list;
1369
1370                 spin_lock_bh(&ul->lock);
1371                 list_del(&rt->rt_uncached);
1372                 spin_unlock_bh(&ul->lock);
1373         }
1374 }
1375
1376 void rt_flush_dev(struct net_device *dev)
1377 {
1378         struct net *net = dev_net(dev);
1379         struct rtable *rt;
1380         int cpu;
1381
1382         for_each_possible_cpu(cpu) {
1383                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1384
1385                 spin_lock_bh(&ul->lock);
1386                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1387                         if (rt->dst.dev != dev)
1388                                 continue;
1389                         rt->dst.dev = net->loopback_dev;
1390                         dev_hold(rt->dst.dev);
1391                         dev_put(dev);
1392                 }
1393                 spin_unlock_bh(&ul->lock);
1394         }
1395 }
1396
1397 static bool rt_cache_valid(const struct rtable *rt)
1398 {
1399         return  rt &&
1400                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1401                 !rt_is_expired(rt);
1402 }
1403
1404 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1405                            const struct fib_result *res,
1406                            struct fib_nh_exception *fnhe,
1407                            struct fib_info *fi, u16 type, u32 itag)
1408 {
1409         bool cached = false;
1410
1411         if (fi) {
1412                 struct fib_nh *nh = &FIB_RES_NH(*res);
1413
1414                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1415                         rt->rt_gateway = nh->nh_gw;
1416                         rt->rt_uses_gateway = 1;
1417                 }
1418                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1419 #ifdef CONFIG_IP_ROUTE_CLASSID
1420                 rt->dst.tclassid = nh->nh_tclassid;
1421 #endif
1422                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1423                 if (unlikely(fnhe))
1424                         cached = rt_bind_exception(rt, fnhe, daddr);
1425                 else if (!(rt->dst.flags & DST_NOCACHE))
1426                         cached = rt_cache_route(nh, rt);
1427                 if (unlikely(!cached)) {
1428                         /* Routes we intend to cache in nexthop exception or
1429                          * FIB nexthop have the DST_NOCACHE bit clear.
1430                          * However, if we are unsuccessful at storing this
1431                          * route into the cache we really need to set it.
1432                          */
1433                         rt->dst.flags |= DST_NOCACHE;
1434                         if (!rt->rt_gateway)
1435                                 rt->rt_gateway = daddr;
1436                         rt_add_uncached_list(rt);
1437                 }
1438         } else
1439                 rt_add_uncached_list(rt);
1440
1441 #ifdef CONFIG_IP_ROUTE_CLASSID
1442 #ifdef CONFIG_IP_MULTIPLE_TABLES
1443         set_class_tag(rt, res->tclassid);
1444 #endif
1445         set_class_tag(rt, itag);
1446 #endif
1447 }
1448
1449 struct rtable *rt_dst_alloc(struct net_device *dev,
1450                             unsigned int flags, u16 type,
1451                             bool nopolicy, bool noxfrm, bool will_cache)
1452 {
1453         struct rtable *rt;
1454
1455         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1456                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1457                        (nopolicy ? DST_NOPOLICY : 0) |
1458                        (noxfrm ? DST_NOXFRM : 0));
1459
1460         if (rt) {
1461                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1462                 rt->rt_flags = flags;
1463                 rt->rt_type = type;
1464                 rt->rt_is_input = 0;
1465                 rt->rt_iif = 0;
1466                 rt->rt_pmtu = 0;
1467                 rt->rt_gateway = 0;
1468                 rt->rt_uses_gateway = 0;
1469                 rt->rt_table_id = 0;
1470                 INIT_LIST_HEAD(&rt->rt_uncached);
1471
1472                 rt->dst.output = ip_output;
1473                 if (flags & RTCF_LOCAL)
1474                         rt->dst.input = ip_local_deliver;
1475         }
1476
1477         return rt;
1478 }
1479 EXPORT_SYMBOL(rt_dst_alloc);
1480
1481 /* called in rcu_read_lock() section */
1482 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1483                                 u8 tos, struct net_device *dev, int our)
1484 {
1485         struct rtable *rth;
1486         struct in_device *in_dev = __in_dev_get_rcu(dev);
1487         unsigned int flags = RTCF_MULTICAST;
1488         u32 itag = 0;
1489         int err;
1490
1491         /* Primary sanity checks. */
1492
1493         if (!in_dev)
1494                 return -EINVAL;
1495
1496         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1497             skb->protocol != htons(ETH_P_IP))
1498                 goto e_inval;
1499
1500         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1501                 goto e_inval;
1502
1503         if (ipv4_is_zeronet(saddr)) {
1504                 if (!ipv4_is_local_multicast(daddr))
1505                         goto e_inval;
1506         } else {
1507                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1508                                           in_dev, &itag);
1509                 if (err < 0)
1510                         goto e_err;
1511         }
1512         if (our)
1513                 flags |= RTCF_LOCAL;
1514
1515         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1516                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1517         if (!rth)
1518                 goto e_nobufs;
1519
1520 #ifdef CONFIG_IP_ROUTE_CLASSID
1521         rth->dst.tclassid = itag;
1522 #endif
1523         rth->dst.output = ip_rt_bug;
1524         rth->rt_is_input= 1;
1525
1526 #ifdef CONFIG_IP_MROUTE
1527         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1528                 rth->dst.input = ip_mr_input;
1529 #endif
1530         RT_CACHE_STAT_INC(in_slow_mc);
1531
1532         skb_dst_set(skb, &rth->dst);
1533         return 0;
1534
1535 e_nobufs:
1536         return -ENOBUFS;
1537 e_inval:
1538         return -EINVAL;
1539 e_err:
1540         return err;
1541 }
1542
1543
1544 static void ip_handle_martian_source(struct net_device *dev,
1545                                      struct in_device *in_dev,
1546                                      struct sk_buff *skb,
1547                                      __be32 daddr,
1548                                      __be32 saddr)
1549 {
1550         RT_CACHE_STAT_INC(in_martian_src);
1551 #ifdef CONFIG_IP_ROUTE_VERBOSE
1552         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1553                 /*
1554                  *      RFC1812 recommendation, if source is martian,
1555                  *      the only hint is MAC header.
1556                  */
1557                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1558                         &daddr, &saddr, dev->name);
1559                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1560                         print_hex_dump(KERN_WARNING, "ll header: ",
1561                                        DUMP_PREFIX_OFFSET, 16, 1,
1562                                        skb_mac_header(skb),
1563                                        dev->hard_header_len, true);
1564                 }
1565         }
1566 #endif
1567 }
1568
1569 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1570 {
1571         struct fnhe_hash_bucket *hash;
1572         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1573         u32 hval = fnhe_hashfun(daddr);
1574
1575         spin_lock_bh(&fnhe_lock);
1576
1577         hash = rcu_dereference_protected(nh->nh_exceptions,
1578                                          lockdep_is_held(&fnhe_lock));
1579         hash += hval;
1580
1581         fnhe_p = &hash->chain;
1582         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1583         while (fnhe) {
1584                 if (fnhe->fnhe_daddr == daddr) {
1585                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1586                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1587                         fnhe_flush_routes(fnhe);
1588                         kfree_rcu(fnhe, rcu);
1589                         break;
1590                 }
1591                 fnhe_p = &fnhe->fnhe_next;
1592                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1593                                                  lockdep_is_held(&fnhe_lock));
1594         }
1595
1596         spin_unlock_bh(&fnhe_lock);
1597 }
1598
1599 /* called in rcu_read_lock() section */
1600 static int __mkroute_input(struct sk_buff *skb,
1601                            const struct fib_result *res,
1602                            struct in_device *in_dev,
1603                            __be32 daddr, __be32 saddr, u32 tos)
1604 {
1605         struct fib_nh_exception *fnhe;
1606         struct rtable *rth;
1607         int err;
1608         struct in_device *out_dev;
1609         bool do_cache;
1610         u32 itag = 0;
1611
1612         /* get a working reference to the output device */
1613         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1614         if (!out_dev) {
1615                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1616                 return -EINVAL;
1617         }
1618
1619         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1620                                   in_dev->dev, in_dev, &itag);
1621         if (err < 0) {
1622                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1623                                          saddr);
1624
1625                 goto cleanup;
1626         }
1627
1628         do_cache = res->fi && !itag;
1629         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1630             skb->protocol == htons(ETH_P_IP) &&
1631             (IN_DEV_SHARED_MEDIA(out_dev) ||
1632              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1633                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1634
1635         if (skb->protocol != htons(ETH_P_IP)) {
1636                 /* Not IP (i.e. ARP). Do not create route, if it is
1637                  * invalid for proxy arp. DNAT routes are always valid.
1638                  *
1639                  * Proxy arp feature have been extended to allow, ARP
1640                  * replies back to the same interface, to support
1641                  * Private VLAN switch technologies. See arp.c.
1642                  */
1643                 if (out_dev == in_dev &&
1644                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1645                         err = -EINVAL;
1646                         goto cleanup;
1647                 }
1648         }
1649
1650         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1651         if (do_cache) {
1652                 if (fnhe) {
1653                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1654                         if (rth && rth->dst.expires &&
1655                             time_after(jiffies, rth->dst.expires)) {
1656                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1657                                 fnhe = NULL;
1658                         } else {
1659                                 goto rt_cache;
1660                         }
1661                 }
1662
1663                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1664
1665 rt_cache:
1666                 if (rt_cache_valid(rth)) {
1667                         skb_dst_set_noref(skb, &rth->dst);
1668                         goto out;
1669                 }
1670         }
1671
1672         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1673                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1674                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1675         if (!rth) {
1676                 err = -ENOBUFS;
1677                 goto cleanup;
1678         }
1679
1680         rth->rt_is_input = 1;
1681         if (res->table)
1682                 rth->rt_table_id = res->table->tb_id;
1683         RT_CACHE_STAT_INC(in_slow_tot);
1684
1685         rth->dst.input = ip_forward;
1686
1687         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1688         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1689                 rth->dst.lwtstate->orig_output = rth->dst.output;
1690                 rth->dst.output = lwtunnel_output;
1691         }
1692         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1693                 rth->dst.lwtstate->orig_input = rth->dst.input;
1694                 rth->dst.input = lwtunnel_input;
1695         }
1696         skb_dst_set(skb, &rth->dst);
1697 out:
1698         err = 0;
1699  cleanup:
1700         return err;
1701 }
1702
1703 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1704
1705 /* To make ICMP packets follow the right flow, the multipath hash is
1706  * calculated from the inner IP addresses in reverse order.
1707  */
1708 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1709 {
1710         const struct iphdr *outer_iph = ip_hdr(skb);
1711         struct icmphdr _icmph;
1712         const struct icmphdr *icmph;
1713         struct iphdr _inner_iph;
1714         const struct iphdr *inner_iph;
1715
1716         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1717                 goto standard_hash;
1718
1719         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1720                                    &_icmph);
1721         if (!icmph)
1722                 goto standard_hash;
1723
1724         if (icmph->type != ICMP_DEST_UNREACH &&
1725             icmph->type != ICMP_REDIRECT &&
1726             icmph->type != ICMP_TIME_EXCEEDED &&
1727             icmph->type != ICMP_PARAMETERPROB) {
1728                 goto standard_hash;
1729         }
1730
1731         inner_iph = skb_header_pointer(skb,
1732                                        outer_iph->ihl * 4 + sizeof(_icmph),
1733                                        sizeof(_inner_iph), &_inner_iph);
1734         if (!inner_iph)
1735                 goto standard_hash;
1736
1737         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1738
1739 standard_hash:
1740         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1741 }
1742
1743 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1744
1745 static int ip_mkroute_input(struct sk_buff *skb,
1746                             struct fib_result *res,
1747                             const struct flowi4 *fl4,
1748                             struct in_device *in_dev,
1749                             __be32 daddr, __be32 saddr, u32 tos)
1750 {
1751 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1752         if (res->fi && res->fi->fib_nhs > 1) {
1753                 int h;
1754
1755                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1756                         h = ip_multipath_icmp_hash(skb);
1757                 else
1758                         h = fib_multipath_hash(saddr, daddr);
1759                 fib_select_multipath(res, h);
1760         }
1761 #endif
1762
1763         /* create a routing cache entry */
1764         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1765 }
1766
1767 /*
1768  *      NOTE. We drop all the packets that has local source
1769  *      addresses, because every properly looped back packet
1770  *      must have correct destination already attached by output routine.
1771  *
1772  *      Such approach solves two big problems:
1773  *      1. Not simplex devices are handled properly.
1774  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1775  *      called with rcu_read_lock()
1776  */
1777
1778 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1779                                u8 tos, struct net_device *dev)
1780 {
1781         struct fib_result res;
1782         struct in_device *in_dev = __in_dev_get_rcu(dev);
1783         struct ip_tunnel_info *tun_info;
1784         struct flowi4   fl4;
1785         unsigned int    flags = 0;
1786         u32             itag = 0;
1787         struct rtable   *rth;
1788         int             err = -EINVAL;
1789         struct net    *net = dev_net(dev);
1790         bool do_cache;
1791
1792         /* IP on this device is disabled. */
1793
1794         if (!in_dev)
1795                 goto out;
1796
1797         /* Check for the most weird martians, which can be not detected
1798            by fib_lookup.
1799          */
1800
1801         tun_info = skb_tunnel_info(skb);
1802         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1803                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1804         else
1805                 fl4.flowi4_tun_key.tun_id = 0;
1806         skb_dst_drop(skb);
1807
1808         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1809                 goto martian_source;
1810
1811         res.fi = NULL;
1812         res.table = NULL;
1813         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1814                 goto brd_input;
1815
1816         /* Accept zero addresses only to limited broadcast;
1817          * I even do not know to fix it or not. Waiting for complains :-)
1818          */
1819         if (ipv4_is_zeronet(saddr))
1820                 goto martian_source;
1821
1822         if (ipv4_is_zeronet(daddr))
1823                 goto martian_destination;
1824
1825         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1826          * and call it once if daddr or/and saddr are loopback addresses
1827          */
1828         if (ipv4_is_loopback(daddr)) {
1829                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1830                         goto martian_destination;
1831         } else if (ipv4_is_loopback(saddr)) {
1832                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1833                         goto martian_source;
1834         }
1835
1836         /*
1837          *      Now we are ready to route packet.
1838          */
1839         fl4.flowi4_oif = 0;
1840         fl4.flowi4_iif = dev->ifindex;
1841         fl4.flowi4_mark = skb->mark;
1842         fl4.flowi4_tos = tos;
1843         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1844         fl4.flowi4_flags = 0;
1845         fl4.daddr = daddr;
1846         fl4.saddr = saddr;
1847         err = fib_lookup(net, &fl4, &res, 0);
1848         if (err != 0) {
1849                 if (!IN_DEV_FORWARD(in_dev))
1850                         err = -EHOSTUNREACH;
1851                 goto no_route;
1852         }
1853
1854         if (res.type == RTN_BROADCAST)
1855                 goto brd_input;
1856
1857         if (res.type == RTN_LOCAL) {
1858                 err = fib_validate_source(skb, saddr, daddr, tos,
1859                                           0, dev, in_dev, &itag);
1860                 if (err < 0)
1861                         goto martian_source;
1862                 goto local_input;
1863         }
1864
1865         if (!IN_DEV_FORWARD(in_dev)) {
1866                 err = -EHOSTUNREACH;
1867                 goto no_route;
1868         }
1869         if (res.type != RTN_UNICAST)
1870                 goto martian_destination;
1871
1872         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1873 out:    return err;
1874
1875 brd_input:
1876         if (skb->protocol != htons(ETH_P_IP))
1877                 goto e_inval;
1878
1879         if (!ipv4_is_zeronet(saddr)) {
1880                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1881                                           in_dev, &itag);
1882                 if (err < 0)
1883                         goto martian_source;
1884         }
1885         flags |= RTCF_BROADCAST;
1886         res.type = RTN_BROADCAST;
1887         RT_CACHE_STAT_INC(in_brd);
1888
1889 local_input:
1890         do_cache = false;
1891         if (res.fi) {
1892                 if (!itag) {
1893                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1894                         if (rt_cache_valid(rth)) {
1895                                 skb_dst_set_noref(skb, &rth->dst);
1896                                 err = 0;
1897                                 goto out;
1898                         }
1899                         do_cache = true;
1900                 }
1901         }
1902
1903         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1904                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1905         if (!rth)
1906                 goto e_nobufs;
1907
1908         rth->dst.output= ip_rt_bug;
1909 #ifdef CONFIG_IP_ROUTE_CLASSID
1910         rth->dst.tclassid = itag;
1911 #endif
1912         rth->rt_is_input = 1;
1913         if (res.table)
1914                 rth->rt_table_id = res.table->tb_id;
1915
1916         RT_CACHE_STAT_INC(in_slow_tot);
1917         if (res.type == RTN_UNREACHABLE) {
1918                 rth->dst.input= ip_error;
1919                 rth->dst.error= -err;
1920                 rth->rt_flags   &= ~RTCF_LOCAL;
1921         }
1922         if (do_cache) {
1923                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1924                         rth->dst.flags |= DST_NOCACHE;
1925                         rt_add_uncached_list(rth);
1926                 }
1927         }
1928         skb_dst_set(skb, &rth->dst);
1929         err = 0;
1930         goto out;
1931
1932 no_route:
1933         RT_CACHE_STAT_INC(in_no_route);
1934         res.type = RTN_UNREACHABLE;
1935         res.fi = NULL;
1936         res.table = NULL;
1937         goto local_input;
1938
1939         /*
1940          *      Do not cache martian addresses: they should be logged (RFC1812)
1941          */
1942 martian_destination:
1943         RT_CACHE_STAT_INC(in_martian_dst);
1944 #ifdef CONFIG_IP_ROUTE_VERBOSE
1945         if (IN_DEV_LOG_MARTIANS(in_dev))
1946                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1947                                      &daddr, &saddr, dev->name);
1948 #endif
1949
1950 e_inval:
1951         err = -EINVAL;
1952         goto out;
1953
1954 e_nobufs:
1955         err = -ENOBUFS;
1956         goto out;
1957
1958 martian_source:
1959         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1960         goto out;
1961 }
1962
1963 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1964                          u8 tos, struct net_device *dev)
1965 {
1966         int res;
1967
1968         rcu_read_lock();
1969
1970         /* Multicast recognition logic is moved from route cache to here.
1971            The problem was that too many Ethernet cards have broken/missing
1972            hardware multicast filters :-( As result the host on multicasting
1973            network acquires a lot of useless route cache entries, sort of
1974            SDR messages from all the world. Now we try to get rid of them.
1975            Really, provided software IP multicast filter is organized
1976            reasonably (at least, hashed), it does not result in a slowdown
1977            comparing with route cache reject entries.
1978            Note, that multicast routers are not affected, because
1979            route cache entry is created eventually.
1980          */
1981         if (ipv4_is_multicast(daddr)) {
1982                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1983
1984                 if (in_dev) {
1985                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1986                                                   ip_hdr(skb)->protocol);
1987                         if (our
1988 #ifdef CONFIG_IP_MROUTE
1989                                 ||
1990                             (!ipv4_is_local_multicast(daddr) &&
1991                              IN_DEV_MFORWARD(in_dev))
1992 #endif
1993                            ) {
1994                                 int res = ip_route_input_mc(skb, daddr, saddr,
1995                                                             tos, dev, our);
1996                                 rcu_read_unlock();
1997                                 return res;
1998                         }
1999                 }
2000                 rcu_read_unlock();
2001                 return -EINVAL;
2002         }
2003         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2004         rcu_read_unlock();
2005         return res;
2006 }
2007 EXPORT_SYMBOL(ip_route_input_noref);
2008
2009 /* called with rcu_read_lock() */
2010 static struct rtable *__mkroute_output(const struct fib_result *res,
2011                                        const struct flowi4 *fl4, int orig_oif,
2012                                        struct net_device *dev_out,
2013                                        unsigned int flags)
2014 {
2015         struct fib_info *fi = res->fi;
2016         struct fib_nh_exception *fnhe;
2017         struct in_device *in_dev;
2018         u16 type = res->type;
2019         struct rtable *rth;
2020         bool do_cache;
2021
2022         in_dev = __in_dev_get_rcu(dev_out);
2023         if (!in_dev)
2024                 return ERR_PTR(-EINVAL);
2025
2026         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2027                 if (ipv4_is_loopback(fl4->saddr) &&
2028                     !(dev_out->flags & IFF_LOOPBACK) &&
2029                     !netif_is_l3_master(dev_out))
2030                         return ERR_PTR(-EINVAL);
2031
2032         if (ipv4_is_lbcast(fl4->daddr))
2033                 type = RTN_BROADCAST;
2034         else if (ipv4_is_multicast(fl4->daddr))
2035                 type = RTN_MULTICAST;
2036         else if (ipv4_is_zeronet(fl4->daddr))
2037                 return ERR_PTR(-EINVAL);
2038
2039         if (dev_out->flags & IFF_LOOPBACK)
2040                 flags |= RTCF_LOCAL;
2041
2042         do_cache = true;
2043         if (type == RTN_BROADCAST) {
2044                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2045                 fi = NULL;
2046         } else if (type == RTN_MULTICAST) {
2047                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2048                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2049                                      fl4->flowi4_proto))
2050                         flags &= ~RTCF_LOCAL;
2051                 else
2052                         do_cache = false;
2053                 /* If multicast route do not exist use
2054                  * default one, but do not gateway in this case.
2055                  * Yes, it is hack.
2056                  */
2057                 if (fi && res->prefixlen < 4)
2058                         fi = NULL;
2059         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2060                    (orig_oif != dev_out->ifindex)) {
2061                 /* For local routes that require a particular output interface
2062                  * we do not want to cache the result.  Caching the result
2063                  * causes incorrect behaviour when there are multiple source
2064                  * addresses on the interface, the end result being that if the
2065                  * intended recipient is waiting on that interface for the
2066                  * packet he won't receive it because it will be delivered on
2067                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2068                  * be set to the loopback interface as well.
2069                  */
2070                 fi = NULL;
2071         }
2072
2073         fnhe = NULL;
2074         do_cache &= fi != NULL;
2075         if (do_cache) {
2076                 struct rtable __rcu **prth;
2077                 struct fib_nh *nh = &FIB_RES_NH(*res);
2078
2079                 fnhe = find_exception(nh, fl4->daddr);
2080                 if (fnhe) {
2081                         prth = &fnhe->fnhe_rth_output;
2082                         rth = rcu_dereference(*prth);
2083                         if (rth && rth->dst.expires &&
2084                             time_after(jiffies, rth->dst.expires)) {
2085                                 ip_del_fnhe(nh, fl4->daddr);
2086                                 fnhe = NULL;
2087                         } else {
2088                                 goto rt_cache;
2089                         }
2090                 }
2091
2092                 if (unlikely(fl4->flowi4_flags &
2093                              FLOWI_FLAG_KNOWN_NH &&
2094                              !(nh->nh_gw &&
2095                                nh->nh_scope == RT_SCOPE_LINK))) {
2096                         do_cache = false;
2097                         goto add;
2098                 }
2099                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2100                 rth = rcu_dereference(*prth);
2101
2102 rt_cache:
2103                 if (rt_cache_valid(rth)) {
2104                         dst_hold(&rth->dst);
2105                         return rth;
2106                 }
2107         }
2108
2109 add:
2110         rth = rt_dst_alloc(dev_out, flags, type,
2111                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2112                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2113                            do_cache);
2114         if (!rth)
2115                 return ERR_PTR(-ENOBUFS);
2116
2117         rth->rt_iif     = orig_oif ? : 0;
2118         if (res->table)
2119                 rth->rt_table_id = res->table->tb_id;
2120
2121         RT_CACHE_STAT_INC(out_slow_tot);
2122
2123         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2124                 if (flags & RTCF_LOCAL &&
2125                     !(dev_out->flags & IFF_LOOPBACK)) {
2126                         rth->dst.output = ip_mc_output;
2127                         RT_CACHE_STAT_INC(out_slow_mc);
2128                 }
2129 #ifdef CONFIG_IP_MROUTE
2130                 if (type == RTN_MULTICAST) {
2131                         if (IN_DEV_MFORWARD(in_dev) &&
2132                             !ipv4_is_local_multicast(fl4->daddr)) {
2133                                 rth->dst.input = ip_mr_input;
2134                                 rth->dst.output = ip_mc_output;
2135                         }
2136                 }
2137 #endif
2138         }
2139
2140         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2141         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2142                 rth->dst.output = lwtunnel_output;
2143
2144         return rth;
2145 }
2146
2147 /*
2148  * Major route resolver routine.
2149  */
2150
2151 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2152                                           int mp_hash)
2153 {
2154         struct net_device *dev_out = NULL;
2155         __u8 tos = RT_FL_TOS(fl4);
2156         unsigned int flags = 0;
2157         struct fib_result res;
2158         struct rtable *rth;
2159         int orig_oif;
2160         int err = -ENETUNREACH;
2161
2162         res.tclassid    = 0;
2163         res.fi          = NULL;
2164         res.table       = NULL;
2165
2166         orig_oif = fl4->flowi4_oif;
2167
2168         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2169         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2170         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2171                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2172
2173         rcu_read_lock();
2174         if (fl4->saddr) {
2175                 rth = ERR_PTR(-EINVAL);
2176                 if (ipv4_is_multicast(fl4->saddr) ||
2177                     ipv4_is_lbcast(fl4->saddr) ||
2178                     ipv4_is_zeronet(fl4->saddr))
2179                         goto out;
2180
2181                 /* I removed check for oif == dev_out->oif here.
2182                    It was wrong for two reasons:
2183                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2184                       is assigned to multiple interfaces.
2185                    2. Moreover, we are allowed to send packets with saddr
2186                       of another iface. --ANK
2187                  */
2188
2189                 if (fl4->flowi4_oif == 0 &&
2190                     (ipv4_is_multicast(fl4->daddr) ||
2191                      ipv4_is_lbcast(fl4->daddr))) {
2192                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2193                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2194                         if (!dev_out)
2195                                 goto out;
2196
2197                         /* Special hack: user can direct multicasts
2198                            and limited broadcast via necessary interface
2199                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2200                            This hack is not just for fun, it allows
2201                            vic,vat and friends to work.
2202                            They bind socket to loopback, set ttl to zero
2203                            and expect that it will work.
2204                            From the viewpoint of routing cache they are broken,
2205                            because we are not allowed to build multicast path
2206                            with loopback source addr (look, routing cache
2207                            cannot know, that ttl is zero, so that packet
2208                            will not leave this host and route is valid).
2209                            Luckily, this hack is good workaround.
2210                          */
2211
2212                         fl4->flowi4_oif = dev_out->ifindex;
2213                         goto make_route;
2214                 }
2215
2216                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2217                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2218                         if (!__ip_dev_find(net, fl4->saddr, false))
2219                                 goto out;
2220                 }
2221         }
2222
2223
2224         if (fl4->flowi4_oif) {
2225                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2226                 rth = ERR_PTR(-ENODEV);
2227                 if (!dev_out)
2228                         goto out;
2229
2230                 /* RACE: Check return value of inet_select_addr instead. */
2231                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2232                         rth = ERR_PTR(-ENETUNREACH);
2233                         goto out;
2234                 }
2235                 if (ipv4_is_local_multicast(fl4->daddr) ||
2236                     ipv4_is_lbcast(fl4->daddr) ||
2237                     fl4->flowi4_proto == IPPROTO_IGMP) {
2238                         if (!fl4->saddr)
2239                                 fl4->saddr = inet_select_addr(dev_out, 0,
2240                                                               RT_SCOPE_LINK);
2241                         goto make_route;
2242                 }
2243                 if (!fl4->saddr) {
2244                         if (ipv4_is_multicast(fl4->daddr))
2245                                 fl4->saddr = inet_select_addr(dev_out, 0,
2246                                                               fl4->flowi4_scope);
2247                         else if (!fl4->daddr)
2248                                 fl4->saddr = inet_select_addr(dev_out, 0,
2249                                                               RT_SCOPE_HOST);
2250                 }
2251         }
2252
2253         if (!fl4->daddr) {
2254                 fl4->daddr = fl4->saddr;
2255                 if (!fl4->daddr)
2256                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2257                 dev_out = net->loopback_dev;
2258                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2259                 res.type = RTN_LOCAL;
2260                 flags |= RTCF_LOCAL;
2261                 goto make_route;
2262         }
2263
2264         err = fib_lookup(net, fl4, &res, 0);
2265         if (err) {
2266                 res.fi = NULL;
2267                 res.table = NULL;
2268                 if (fl4->flowi4_oif) {
2269                         /* Apparently, routing tables are wrong. Assume,
2270                            that the destination is on link.
2271
2272                            WHY? DW.
2273                            Because we are allowed to send to iface
2274                            even if it has NO routes and NO assigned
2275                            addresses. When oif is specified, routing
2276                            tables are looked up with only one purpose:
2277                            to catch if destination is gatewayed, rather than
2278                            direct. Moreover, if MSG_DONTROUTE is set,
2279                            we send packet, ignoring both routing tables
2280                            and ifaddr state. --ANK
2281
2282
2283                            We could make it even if oif is unknown,
2284                            likely IPv6, but we do not.
2285                          */
2286
2287                         if (fl4->saddr == 0)
2288                                 fl4->saddr = inet_select_addr(dev_out, 0,
2289                                                               RT_SCOPE_LINK);
2290                         res.type = RTN_UNICAST;
2291                         goto make_route;
2292                 }
2293                 rth = ERR_PTR(err);
2294                 goto out;
2295         }
2296
2297         if (res.type == RTN_LOCAL) {
2298                 if (!fl4->saddr) {
2299                         if (res.fi->fib_prefsrc)
2300                                 fl4->saddr = res.fi->fib_prefsrc;
2301                         else
2302                                 fl4->saddr = fl4->daddr;
2303                 }
2304
2305                 /* L3 master device is the loopback for that domain */
2306                 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2307                 fl4->flowi4_oif = dev_out->ifindex;
2308                 flags |= RTCF_LOCAL;
2309                 goto make_route;
2310         }
2311
2312         fib_select_path(net, &res, fl4, mp_hash);
2313
2314         dev_out = FIB_RES_DEV(res);
2315         fl4->flowi4_oif = dev_out->ifindex;
2316
2317
2318 make_route:
2319         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2320
2321 out:
2322         rcu_read_unlock();
2323         return rth;
2324 }
2325 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2326
2327 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2328 {
2329         return NULL;
2330 }
2331
2332 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2333 {
2334         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2335
2336         return mtu ? : dst->dev->mtu;
2337 }
2338
2339 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2340                                           struct sk_buff *skb, u32 mtu)
2341 {
2342 }
2343
2344 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2345                                        struct sk_buff *skb)
2346 {
2347 }
2348
2349 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2350                                           unsigned long old)
2351 {
2352         return NULL;
2353 }
2354
2355 static struct dst_ops ipv4_dst_blackhole_ops = {
2356         .family                 =       AF_INET,
2357         .check                  =       ipv4_blackhole_dst_check,
2358         .mtu                    =       ipv4_blackhole_mtu,
2359         .default_advmss         =       ipv4_default_advmss,
2360         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2361         .redirect               =       ipv4_rt_blackhole_redirect,
2362         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2363         .neigh_lookup           =       ipv4_neigh_lookup,
2364 };
2365
2366 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2367 {
2368         struct rtable *ort = (struct rtable *) dst_orig;
2369         struct rtable *rt;
2370
2371         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2372         if (rt) {
2373                 struct dst_entry *new = &rt->dst;
2374
2375                 new->__use = 1;
2376                 new->input = dst_discard;
2377                 new->output = dst_discard_out;
2378
2379                 new->dev = ort->dst.dev;
2380                 if (new->dev)
2381                         dev_hold(new->dev);
2382
2383                 rt->rt_is_input = ort->rt_is_input;
2384                 rt->rt_iif = ort->rt_iif;
2385                 rt->rt_pmtu = ort->rt_pmtu;
2386
2387                 rt->rt_genid = rt_genid_ipv4(net);
2388                 rt->rt_flags = ort->rt_flags;
2389                 rt->rt_type = ort->rt_type;
2390                 rt->rt_gateway = ort->rt_gateway;
2391                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2392
2393                 INIT_LIST_HEAD(&rt->rt_uncached);
2394                 dst_free(new);
2395         }
2396
2397         dst_release(dst_orig);
2398
2399         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2400 }
2401
2402 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2403                                     const struct sock *sk)
2404 {
2405         struct rtable *rt = __ip_route_output_key(net, flp4);
2406
2407         if (IS_ERR(rt))
2408                 return rt;
2409
2410         if (flp4->flowi4_proto)
2411                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2412                                                         flowi4_to_flowi(flp4),
2413                                                         sk, 0);
2414
2415         return rt;
2416 }
2417 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2418
2419 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2420                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2421                         u32 seq, int event, int nowait, unsigned int flags)
2422 {
2423         struct rtable *rt = skb_rtable(skb);
2424         struct rtmsg *r;
2425         struct nlmsghdr *nlh;
2426         unsigned long expires = 0;
2427         u32 error;
2428         u32 metrics[RTAX_MAX];
2429
2430         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2431         if (!nlh)
2432                 return -EMSGSIZE;
2433
2434         r = nlmsg_data(nlh);
2435         r->rtm_family    = AF_INET;
2436         r->rtm_dst_len  = 32;
2437         r->rtm_src_len  = 0;
2438         r->rtm_tos      = fl4->flowi4_tos;
2439         r->rtm_table    = table_id;
2440         if (nla_put_u32(skb, RTA_TABLE, table_id))
2441                 goto nla_put_failure;
2442         r->rtm_type     = rt->rt_type;
2443         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2444         r->rtm_protocol = RTPROT_UNSPEC;
2445         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2446         if (rt->rt_flags & RTCF_NOTIFY)
2447                 r->rtm_flags |= RTM_F_NOTIFY;
2448         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2449                 r->rtm_flags |= RTCF_DOREDIRECT;
2450
2451         if (nla_put_in_addr(skb, RTA_DST, dst))
2452                 goto nla_put_failure;
2453         if (src) {
2454                 r->rtm_src_len = 32;
2455                 if (nla_put_in_addr(skb, RTA_SRC, src))
2456                         goto nla_put_failure;
2457         }
2458         if (rt->dst.dev &&
2459             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2460                 goto nla_put_failure;
2461 #ifdef CONFIG_IP_ROUTE_CLASSID
2462         if (rt->dst.tclassid &&
2463             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2464                 goto nla_put_failure;
2465 #endif
2466         if (!rt_is_input_route(rt) &&
2467             fl4->saddr != src) {
2468                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2469                         goto nla_put_failure;
2470         }
2471         if (rt->rt_uses_gateway &&
2472             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2473                 goto nla_put_failure;
2474
2475         expires = rt->dst.expires;
2476         if (expires) {
2477                 unsigned long now = jiffies;
2478
2479                 if (time_before(now, expires))
2480                         expires -= now;
2481                 else
2482                         expires = 0;
2483         }
2484
2485         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2486         if (rt->rt_pmtu && expires)
2487                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2488         if (rtnetlink_put_metrics(skb, metrics) < 0)
2489                 goto nla_put_failure;
2490
2491         if (fl4->flowi4_mark &&
2492             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2493                 goto nla_put_failure;
2494
2495         error = rt->dst.error;
2496
2497         if (rt_is_input_route(rt)) {
2498 #ifdef CONFIG_IP_MROUTE
2499                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2500                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2501                         int err = ipmr_get_route(net, skb,
2502                                                  fl4->saddr, fl4->daddr,
2503                                                  r, nowait, portid);
2504
2505                         if (err <= 0) {
2506                                 if (!nowait) {
2507                                         if (err == 0)
2508                                                 return 0;
2509                                         goto nla_put_failure;
2510                                 } else {
2511                                         if (err == -EMSGSIZE)
2512                                                 goto nla_put_failure;
2513                                         error = err;
2514                                 }
2515                         }
2516                 } else
2517 #endif
2518                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2519                                 goto nla_put_failure;
2520         }
2521
2522         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2523                 goto nla_put_failure;
2524
2525         nlmsg_end(skb, nlh);
2526         return 0;
2527
2528 nla_put_failure:
2529         nlmsg_cancel(skb, nlh);
2530         return -EMSGSIZE;
2531 }
2532
2533 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2534 {
2535         struct net *net = sock_net(in_skb->sk);
2536         struct rtmsg *rtm;
2537         struct nlattr *tb[RTA_MAX+1];
2538         struct rtable *rt = NULL;
2539         struct flowi4 fl4;
2540         __be32 dst = 0;
2541         __be32 src = 0;
2542         u32 iif;
2543         int err;
2544         int mark;
2545         struct sk_buff *skb;
2546         u32 table_id = RT_TABLE_MAIN;
2547
2548         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2549         if (err < 0)
2550                 goto errout;
2551
2552         rtm = nlmsg_data(nlh);
2553
2554         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2555         if (!skb) {
2556                 err = -ENOBUFS;
2557                 goto errout;
2558         }
2559
2560         /* Reserve room for dummy headers, this skb can pass
2561            through good chunk of routing engine.
2562          */
2563         skb_reset_mac_header(skb);
2564         skb_reset_network_header(skb);
2565
2566         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2567         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2568         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2569
2570         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2571         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2572         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2573         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2574
2575         memset(&fl4, 0, sizeof(fl4));
2576         fl4.daddr = dst;
2577         fl4.saddr = src;
2578         fl4.flowi4_tos = rtm->rtm_tos;
2579         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2580         fl4.flowi4_mark = mark;
2581
2582         if (iif) {
2583                 struct net_device *dev;
2584
2585                 dev = __dev_get_by_index(net, iif);
2586                 if (!dev) {
2587                         err = -ENODEV;
2588                         goto errout_free;
2589                 }
2590
2591                 skb->protocol   = htons(ETH_P_IP);
2592                 skb->dev        = dev;
2593                 skb->mark       = mark;
2594                 local_bh_disable();
2595                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2596                 local_bh_enable();
2597
2598                 rt = skb_rtable(skb);
2599                 if (err == 0 && rt->dst.error)
2600                         err = -rt->dst.error;
2601         } else {
2602                 rt = ip_route_output_key(net, &fl4);
2603
2604                 err = 0;
2605                 if (IS_ERR(rt))
2606                         err = PTR_ERR(rt);
2607         }
2608
2609         if (err)
2610                 goto errout_free;
2611
2612         skb_dst_set(skb, &rt->dst);
2613         if (rtm->rtm_flags & RTM_F_NOTIFY)
2614                 rt->rt_flags |= RTCF_NOTIFY;
2615
2616         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2617                 table_id = rt->rt_table_id;
2618
2619         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2620                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2621                            RTM_NEWROUTE, 0, 0);
2622         if (err < 0)
2623                 goto errout_free;
2624
2625         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2626 errout:
2627         return err;
2628
2629 errout_free:
2630         kfree_skb(skb);
2631         goto errout;
2632 }
2633
2634 void ip_rt_multicast_event(struct in_device *in_dev)
2635 {
2636         rt_cache_flush(dev_net(in_dev->dev));
2637 }
2638
2639 #ifdef CONFIG_SYSCTL
2640 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2641 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2642 static int ip_rt_gc_elasticity __read_mostly    = 8;
2643
2644 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2645                                         void __user *buffer,
2646                                         size_t *lenp, loff_t *ppos)
2647 {
2648         struct net *net = (struct net *)__ctl->extra1;
2649
2650         if (write) {
2651                 rt_cache_flush(net);
2652                 fnhe_genid_bump(net);
2653                 return 0;
2654         }
2655
2656         return -EINVAL;
2657 }
2658
2659 static struct ctl_table ipv4_route_table[] = {
2660         {
2661                 .procname       = "gc_thresh",
2662                 .data           = &ipv4_dst_ops.gc_thresh,
2663                 .maxlen         = sizeof(int),
2664                 .mode           = 0644,
2665                 .proc_handler   = proc_dointvec,
2666         },
2667         {
2668                 .procname       = "max_size",
2669                 .data           = &ip_rt_max_size,
2670                 .maxlen         = sizeof(int),
2671                 .mode           = 0644,
2672                 .proc_handler   = proc_dointvec,
2673         },
2674         {
2675                 /*  Deprecated. Use gc_min_interval_ms */
2676
2677                 .procname       = "gc_min_interval",
2678                 .data           = &ip_rt_gc_min_interval,
2679                 .maxlen         = sizeof(int),
2680                 .mode           = 0644,
2681                 .proc_handler   = proc_dointvec_jiffies,
2682         },
2683         {
2684                 .procname       = "gc_min_interval_ms",
2685                 .data           = &ip_rt_gc_min_interval,
2686                 .maxlen         = sizeof(int),
2687                 .mode           = 0644,
2688                 .proc_handler   = proc_dointvec_ms_jiffies,
2689         },
2690         {
2691                 .procname       = "gc_timeout",
2692                 .data           = &ip_rt_gc_timeout,
2693                 .maxlen         = sizeof(int),
2694                 .mode           = 0644,
2695                 .proc_handler   = proc_dointvec_jiffies,
2696         },
2697         {
2698                 .procname       = "gc_interval",
2699                 .data           = &ip_rt_gc_interval,
2700                 .maxlen         = sizeof(int),
2701                 .mode           = 0644,
2702                 .proc_handler   = proc_dointvec_jiffies,
2703         },
2704         {
2705                 .procname       = "redirect_load",
2706                 .data           = &ip_rt_redirect_load,
2707                 .maxlen         = sizeof(int),
2708                 .mode           = 0644,
2709                 .proc_handler   = proc_dointvec,
2710         },
2711         {
2712                 .procname       = "redirect_number",
2713                 .data           = &ip_rt_redirect_number,
2714                 .maxlen         = sizeof(int),
2715                 .mode           = 0644,
2716                 .proc_handler   = proc_dointvec,
2717         },
2718         {
2719                 .procname       = "redirect_silence",
2720                 .data           = &ip_rt_redirect_silence,
2721                 .maxlen         = sizeof(int),
2722                 .mode           = 0644,
2723                 .proc_handler   = proc_dointvec,
2724         },
2725         {
2726                 .procname       = "error_cost",
2727                 .data           = &ip_rt_error_cost,
2728                 .maxlen         = sizeof(int),
2729                 .mode           = 0644,
2730                 .proc_handler   = proc_dointvec,
2731         },
2732         {
2733                 .procname       = "error_burst",
2734                 .data           = &ip_rt_error_burst,
2735                 .maxlen         = sizeof(int),
2736                 .mode           = 0644,
2737                 .proc_handler   = proc_dointvec,
2738         },
2739         {
2740                 .procname       = "gc_elasticity",
2741                 .data           = &ip_rt_gc_elasticity,
2742                 .maxlen         = sizeof(int),
2743                 .mode           = 0644,
2744                 .proc_handler   = proc_dointvec,
2745         },
2746         {
2747                 .procname       = "mtu_expires",
2748                 .data           = &ip_rt_mtu_expires,
2749                 .maxlen         = sizeof(int),
2750                 .mode           = 0644,
2751                 .proc_handler   = proc_dointvec_jiffies,
2752         },
2753         {
2754                 .procname       = "min_pmtu",
2755                 .data           = &ip_rt_min_pmtu,
2756                 .maxlen         = sizeof(int),
2757                 .mode           = 0644,
2758                 .proc_handler   = proc_dointvec,
2759         },
2760         {
2761                 .procname       = "min_adv_mss",
2762                 .data           = &ip_rt_min_advmss,
2763                 .maxlen         = sizeof(int),
2764                 .mode           = 0644,
2765                 .proc_handler   = proc_dointvec,
2766         },
2767         { }
2768 };
2769
2770 static struct ctl_table ipv4_route_flush_table[] = {
2771         {
2772                 .procname       = "flush",
2773                 .maxlen         = sizeof(int),
2774                 .mode           = 0200,
2775                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2776         },
2777         { },
2778 };
2779
2780 static __net_init int sysctl_route_net_init(struct net *net)
2781 {
2782         struct ctl_table *tbl;
2783
2784         tbl = ipv4_route_flush_table;
2785         if (!net_eq(net, &init_net)) {
2786                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2787                 if (!tbl)
2788                         goto err_dup;
2789
2790                 /* Don't export sysctls to unprivileged users */
2791                 if (net->user_ns != &init_user_ns)
2792                         tbl[0].procname = NULL;
2793         }
2794         tbl[0].extra1 = net;
2795
2796         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2797         if (!net->ipv4.route_hdr)
2798                 goto err_reg;
2799         return 0;
2800
2801 err_reg:
2802         if (tbl != ipv4_route_flush_table)
2803                 kfree(tbl);
2804 err_dup:
2805         return -ENOMEM;
2806 }
2807
2808 static __net_exit void sysctl_route_net_exit(struct net *net)
2809 {
2810         struct ctl_table *tbl;
2811
2812         tbl = net->ipv4.route_hdr->ctl_table_arg;
2813         unregister_net_sysctl_table(net->ipv4.route_hdr);
2814         BUG_ON(tbl == ipv4_route_flush_table);
2815         kfree(tbl);
2816 }
2817
2818 static __net_initdata struct pernet_operations sysctl_route_ops = {
2819         .init = sysctl_route_net_init,
2820         .exit = sysctl_route_net_exit,
2821 };
2822 #endif
2823
2824 static __net_init int rt_genid_init(struct net *net)
2825 {
2826         atomic_set(&net->ipv4.rt_genid, 0);
2827         atomic_set(&net->fnhe_genid, 0);
2828         get_random_bytes(&net->ipv4.dev_addr_genid,
2829                          sizeof(net->ipv4.dev_addr_genid));
2830         return 0;
2831 }
2832
2833 static __net_initdata struct pernet_operations rt_genid_ops = {
2834         .init = rt_genid_init,
2835 };
2836
2837 static int __net_init ipv4_inetpeer_init(struct net *net)
2838 {
2839         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2840
2841         if (!bp)
2842                 return -ENOMEM;
2843         inet_peer_base_init(bp);
2844         net->ipv4.peers = bp;
2845         return 0;
2846 }
2847
2848 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2849 {
2850         struct inet_peer_base *bp = net->ipv4.peers;
2851
2852         net->ipv4.peers = NULL;
2853         inetpeer_invalidate_tree(bp);
2854         kfree(bp);
2855 }
2856
2857 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2858         .init   =       ipv4_inetpeer_init,
2859         .exit   =       ipv4_inetpeer_exit,
2860 };
2861
2862 #ifdef CONFIG_IP_ROUTE_CLASSID
2863 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2864 #endif /* CONFIG_IP_ROUTE_CLASSID */
2865
2866 int __init ip_rt_init(void)
2867 {
2868         int rc = 0;
2869         int cpu;
2870
2871         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2872         if (!ip_idents)
2873                 panic("IP: failed to allocate ip_idents\n");
2874
2875         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2876
2877         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2878         if (!ip_tstamps)
2879                 panic("IP: failed to allocate ip_tstamps\n");
2880
2881         for_each_possible_cpu(cpu) {
2882                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2883
2884                 INIT_LIST_HEAD(&ul->head);
2885                 spin_lock_init(&ul->lock);
2886         }
2887 #ifdef CONFIG_IP_ROUTE_CLASSID
2888         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2889         if (!ip_rt_acct)
2890                 panic("IP: failed to allocate ip_rt_acct\n");
2891 #endif
2892
2893         ipv4_dst_ops.kmem_cachep =
2894                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2895                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2896
2897         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2898
2899         if (dst_entries_init(&ipv4_dst_ops) < 0)
2900                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2901
2902         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2903                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2904
2905         ipv4_dst_ops.gc_thresh = ~0;
2906         ip_rt_max_size = INT_MAX;
2907
2908         devinet_init();
2909         ip_fib_init();
2910
2911         if (ip_rt_proc_init())
2912                 pr_err("Unable to create route proc files\n");
2913 #ifdef CONFIG_XFRM
2914         xfrm_init();
2915         xfrm4_init();
2916 #endif
2917         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2918
2919 #ifdef CONFIG_SYSCTL
2920         register_pernet_subsys(&sysctl_route_ops);
2921 #endif
2922         register_pernet_subsys(&rt_genid_ops);
2923         register_pernet_subsys(&ipv4_inetpeer_ops);
2924         return rc;
2925 }
2926
2927 #ifdef CONFIG_SYSCTL
2928 /*
2929  * We really need to sanitize the damn ipv4 init order, then all
2930  * this nonsense will go away.
2931  */
2932 void __init ip_static_sysctl_init(void)
2933 {
2934         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2935 }
2936 #endif