net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static struct delayed_work expires_work;
 135 static unsigned long expires_ljiffies;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149 static void rt_emergency_hash_rebuild(struct net *net);
 150
 151
 152 static struct dst_ops ipv4_dst_ops = {
 153         .family =               AF_INET,
 154         .protocol =             cpu_to_be16(ETH_P_IP),
 155         .gc =                   rt_garbage_collect,
 156         .check =                ipv4_dst_check,
 157         .destroy =              ipv4_dst_destroy,
 158         .ifdown =               ipv4_dst_ifdown,
 159         .negative_advice =      ipv4_negative_advice,
 160         .link_failure =         ipv4_link_failure,
 161         .update_pmtu =          ip_rt_update_pmtu,
 162         .local_out =            __ip_local_out,
 163         .entries =              ATOMIC_INIT(0),
 164 };
 165
 166 #define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168 const __u8 ip_tos2prio[16] = {
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(FILLER),
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(BESTEFFORT),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK)
 185 };
 186
 187
 188 /*
 189  * Route cache.
 190  */
 191
 192 /* The locking scheme is rather straight forward:
 193  *
 194  * 1) Read-Copy Update protects the buckets of the central route hash.
 195  * 2) Only writers remove entries, and they hold the lock
 196  *    as they look at rtable reference counts.
 197  * 3) Only readers acquire references to rtable entries,
 198  *    they do so with atomic increments and with the
 199  *    lock held.
 200  */
 201
 202 struct rt_hash_bucket {
 203         struct rtable   *chain;
 204 };
 205
 206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 207         defined(CONFIG_PROVE_LOCKING)
 208 /*
 209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 210  * The size of this table is a power of two and depends on the number of CPUS.
 211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 212  */
 213 #ifdef CONFIG_LOCKDEP
 214 # define RT_HASH_LOCK_SZ        256
 215 #else
 216 # if NR_CPUS >= 32
 217 #  define RT_HASH_LOCK_SZ       4096
 218 # elif NR_CPUS >= 16
 219 #  define RT_HASH_LOCK_SZ       2048
 220 # elif NR_CPUS >= 8
 221 #  define RT_HASH_LOCK_SZ       1024
 222 # elif NR_CPUS >= 4
 223 #  define RT_HASH_LOCK_SZ       512
 224 # else
 225 #  define RT_HASH_LOCK_SZ       256
 226 # endif
 227 #endif
 228
 229 static spinlock_t       *rt_hash_locks;
 230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 231
 232 static __init void rt_hash_lock_init(void)
 233 {
 234         int i;
 235
 236         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 237                         GFP_KERNEL);
 238         if (!rt_hash_locks)
 239                 panic("IP: failed to allocate rt_hash_locks\n");
 240
 241         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 242                 spin_lock_init(&rt_hash_locks[i]);
 243 }
 244 #else
 245 # define rt_hash_lock_addr(slot) NULL
 246
 247 static inline void rt_hash_lock_init(void)
 248 {
 249 }
 250 #endif
 251
 252 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 253 static unsigned                 rt_hash_mask __read_mostly;
 254 static unsigned int             rt_hash_log  __read_mostly;
 255
 256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 257 #define RT_CACHE_STAT_INC(field) \
 258         (__raw_get_cpu_var(rt_cache_stat).field++)
 259
 260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 261                 int genid)
 262 {
 263         return jhash_3words((__force u32)(__be32)(daddr),
 264                             (__force u32)(__be32)(saddr),
 265                             idx, genid)
 266                 & rt_hash_mask;
 267 }
 268
 269 static inline int rt_genid(struct net *net)
 270 {
 271         return atomic_read(&net->ipv4.rt_genid);
 272 }
 273
 274 #ifdef CONFIG_PROC_FS
 275 struct rt_cache_iter_state {
 276         struct seq_net_private p;
 277         int bucket;
 278         int genid;
 279 };
 280
 281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 282 {
 283         struct rt_cache_iter_state *st = seq->private;
 284         struct rtable *r = NULL;
 285
 286         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 287                 if (!rt_hash_table[st->bucket].chain)
 288                         continue;
 289                 rcu_read_lock_bh();
 290                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 291                 while (r) {
 292                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 293                             r->rt_genid == st->genid)
 294                                 return r;
 295                         r = rcu_dereference(r->u.dst.rt_next);
 296                 }
 297                 rcu_read_unlock_bh();
 298         }
 299         return r;
 300 }
 301
 302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 303                                           struct rtable *r)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306
 307         r = r->u.dst.rt_next;
 308         while (!r) {
 309                 rcu_read_unlock_bh();
 310                 do {
 311                         if (--st->bucket < 0)
 312                                 return NULL;
 313                 } while (!rt_hash_table[st->bucket].chain);
 314                 rcu_read_lock_bh();
 315                 r = rt_hash_table[st->bucket].chain;
 316         }
 317         return rcu_dereference(r);
 318 }
 319
 320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 321                                         struct rtable *r)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 325                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 326                         continue;
 327                 if (r->rt_genid == st->genid)
 328                         break;
 329         }
 330         return r;
 331 }
 332
 333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 334 {
 335         struct rtable *r = rt_cache_get_first(seq);
 336
 337         if (r)
 338                 while (pos && (r = rt_cache_get_next(seq, r)))
 339                         --pos;
 340         return pos ? NULL : r;
 341 }
 342
 343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         if (*pos)
 347                 return rt_cache_get_idx(seq, *pos - 1);
 348         st->genid = rt_genid(seq_file_net(seq));
 349         return SEQ_START_TOKEN;
 350 }
 351
 352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 353 {
 354         struct rtable *r;
 355
 356         if (v == SEQ_START_TOKEN)
 357                 r = rt_cache_get_first(seq);
 358         else
 359                 r = rt_cache_get_next(seq, v);
 360         ++*pos;
 361         return r;
 362 }
 363
 364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 365 {
 366         if (v && v != SEQ_START_TOKEN)
 367                 rcu_read_unlock_bh();
 368 }
 369
 370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 371 {
 372         if (v == SEQ_START_TOKEN)
 373                 seq_printf(seq, "%-127s\n",
 374                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 375                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 376                            "HHUptod\tSpecDst");
 377         else {
 378                 struct rtable *r = v;
 379                 int len;
 380
 381                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 382                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 383                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 384                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 385                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 386                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 387                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 388                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 389                         dst_metric(&r->u.dst, RTAX_WINDOW),
 390                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 391                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 392                         r->fl.fl4_tos,
 393                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 394                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 395                                        dev_queue_xmit) : 0,
 396                         r->rt_spec_dst, &len);
 397
 398                 seq_printf(seq, "%*s\n", 127 - len, "");
 399         }
 400         return 0;
 401 }
 402
 403 static const struct seq_operations rt_cache_seq_ops = {
 404         .start  = rt_cache_seq_start,
 405         .next   = rt_cache_seq_next,
 406         .stop   = rt_cache_seq_stop,
 407         .show   = rt_cache_seq_show,
 408 };
 409
 410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 411 {
 412         return seq_open_net(inode, file, &rt_cache_seq_ops,
 413                         sizeof(struct rt_cache_iter_state));
 414 }
 415
 416 static const struct file_operations rt_cache_seq_fops = {
 417         .owner   = THIS_MODULE,
 418         .open    = rt_cache_seq_open,
 419         .read    = seq_read,
 420         .llseek  = seq_lseek,
 421         .release = seq_release_net,
 422 };
 423
 424
 425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 426 {
 427         int cpu;
 428
 429         if (*pos == 0)
 430                 return SEQ_START_TOKEN;
 431
 432         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 433                 if (!cpu_possible(cpu))
 434                         continue;
 435                 *pos = cpu+1;
 436                 return &per_cpu(rt_cache_stat, cpu);
 437         }
 438         return NULL;
 439 }
 440
 441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 442 {
 443         int cpu;
 444
 445         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 446                 if (!cpu_possible(cpu))
 447                         continue;
 448                 *pos = cpu+1;
 449                 return &per_cpu(rt_cache_stat, cpu);
 450         }
 451         return NULL;
 452
 453 }
 454
 455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 456 {
 457
 458 }
 459
 460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 461 {
 462         struct rt_cache_stat *st = v;
 463
 464         if (v == SEQ_START_TOKEN) {
 465                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 466                 return 0;
 467         }
 468
 469         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 470                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 471                    atomic_read(&ipv4_dst_ops.entries),
 472                    st->in_hit,
 473                    st->in_slow_tot,
 474                    st->in_slow_mc,
 475                    st->in_no_route,
 476                    st->in_brd,
 477                    st->in_martian_dst,
 478                    st->in_martian_src,
 479
 480                    st->out_hit,
 481                    st->out_slow_tot,
 482                    st->out_slow_mc,
 483
 484                    st->gc_total,
 485                    st->gc_ignored,
 486                    st->gc_goal_miss,
 487                    st->gc_dst_overflow,
 488                    st->in_hlist_search,
 489                    st->out_hlist_search
 490                 );
 491         return 0;
 492 }
 493
 494 static const struct seq_operations rt_cpu_seq_ops = {
 495         .start  = rt_cpu_seq_start,
 496         .next   = rt_cpu_seq_next,
 497         .stop   = rt_cpu_seq_stop,
 498         .show   = rt_cpu_seq_show,
 499 };
 500
 501
 502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 503 {
 504         return seq_open(file, &rt_cpu_seq_ops);
 505 }
 506
 507 static const struct file_operations rt_cpu_seq_fops = {
 508         .owner   = THIS_MODULE,
 509         .open    = rt_cpu_seq_open,
 510         .read    = seq_read,
 511         .llseek  = seq_lseek,
 512         .release = seq_release,
 513 };
 514
 515 #ifdef CONFIG_NET_CLS_ROUTE
 516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 517                            int length, int *eof, void *data)
 518 {
 519         unsigned int i;
 520
 521         if ((offset & 3) || (length & 3))
 522                 return -EIO;
 523
 524         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 525                 *eof = 1;
 526                 return 0;
 527         }
 528
 529         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 530                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 531                 *eof = 1;
 532         }
 533
 534         offset /= sizeof(u32);
 535
 536         if (length > 0) {
 537                 u32 *dst = (u32 *) buffer;
 538
 539                 *start = buffer;
 540                 memset(dst, 0, length);
 541
 542                 for_each_possible_cpu(i) {
 543                         unsigned int j;
 544                         u32 *src;
 545
 546                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 547                         for (j = 0; j < length/4; j++)
 548                                 dst[j] += src[j];
 549                 }
 550         }
 551         return length;
 552 }
 553 #endif
 554
 555 static int __net_init ip_rt_do_proc_init(struct net *net)
 556 {
 557         struct proc_dir_entry *pde;
 558
 559         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 560                         &rt_cache_seq_fops);
 561         if (!pde)
 562                 goto err1;
 563
 564         pde = proc_create("rt_cache", S_IRUGO,
 565                           net->proc_net_stat, &rt_cpu_seq_fops);
 566         if (!pde)
 567                 goto err2;
 568
 569 #ifdef CONFIG_NET_CLS_ROUTE
 570         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 571                         ip_rt_acct_read, NULL);
 572         if (!pde)
 573                 goto err3;
 574 #endif
 575         return 0;
 576
 577 #ifdef CONFIG_NET_CLS_ROUTE
 578 err3:
 579         remove_proc_entry("rt_cache", net->proc_net_stat);
 580 #endif
 581 err2:
 582         remove_proc_entry("rt_cache", net->proc_net);
 583 err1:
 584         return -ENOMEM;
 585 }
 586
 587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 588 {
 589         remove_proc_entry("rt_cache", net->proc_net_stat);
 590         remove_proc_entry("rt_cache", net->proc_net);
 591         remove_proc_entry("rt_acct", net->proc_net);
 592 }
 593
 594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 595         .init = ip_rt_do_proc_init,
 596         .exit = ip_rt_do_proc_exit,
 597 };
 598
 599 static int __init ip_rt_proc_init(void)
 600 {
 601         return register_pernet_subsys(&ip_rt_proc_ops);
 602 }
 603
 604 #else
 605 static inline int ip_rt_proc_init(void)
 606 {
 607         return 0;
 608 }
 609 #endif /* CONFIG_PROC_FS */
 610
 611 static inline void rt_free(struct rtable *rt)
 612 {
 613         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 614 }
 615
 616 static inline void rt_drop(struct rtable *rt)
 617 {
 618         ip_rt_put(rt);
 619         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 620 }
 621
 622 static inline int rt_fast_clean(struct rtable *rth)
 623 {
 624         /* Kill broadcast/multicast entries very aggresively, if they
 625            collide in hash table with more useful entries */
 626         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 627                 rth->fl.iif && rth->u.dst.rt_next;
 628 }
 629
 630 static inline int rt_valuable(struct rtable *rth)
 631 {
 632         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 633                 rth->u.dst.expires;
 634 }
 635
 636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 637 {
 638         unsigned long age;
 639         int ret = 0;
 640
 641         if (atomic_read(&rth->u.dst.__refcnt))
 642                 goto out;
 643
 644         ret = 1;
 645         if (rth->u.dst.expires &&
 646             time_after_eq(jiffies, rth->u.dst.expires))
 647                 goto out;
 648
 649         age = jiffies - rth->u.dst.lastuse;
 650         ret = 0;
 651         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 652             (age <= tmo2 && rt_valuable(rth)))
 653                 goto out;
 654         ret = 1;
 655 out:    return ret;
 656 }
 657
 658 /* Bits of score are:
 659  * 31: very valuable
 660  * 30: not quite useless
 661  * 29..0: usage counter
 662  */
 663 static inline u32 rt_score(struct rtable *rt)
 664 {
 665         u32 score = jiffies - rt->u.dst.lastuse;
 666
 667         score = ~score & ~(3<<30);
 668
 669         if (rt_valuable(rt))
 670                 score |= (1<<31);
 671
 672         if (!rt->fl.iif ||
 673             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 674                 score |= (1<<30);
 675
 676         return score;
 677 }
 678
 679 static inline bool rt_caching(const struct net *net)
 680 {
 681         return net->ipv4.current_rt_cache_rebuild_count <=
 682                 net->ipv4.sysctl_rt_cache_rebuild_count;
 683 }
 684
 685 static inline bool compare_hash_inputs(const struct flowi *fl1,
 686                                         const struct flowi *fl2)
 687 {
 688         return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 689                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 690                 (fl1->iif ^ fl2->iif)) == 0);
 691 }
 692
 693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 694 {
 695         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 696                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 697                 (fl1->mark ^ fl2->mark) |
 698                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 699                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 700                 (fl1->oif ^ fl2->oif) |
 701                 (fl1->iif ^ fl2->iif)) == 0;
 702 }
 703
 704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 705 {
 706         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 707 }
 708
 709 static inline int rt_is_expired(struct rtable *rth)
 710 {
 711         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 712 }
 713
 714 /*
 715  * Perform a full scan of hash table and free all entries.
 716  * Can be called by a softirq or a process.
 717  * In the later case, we want to be reschedule if necessary
 718  */
 719 static void rt_do_flush(int process_context)
 720 {
 721         unsigned int i;
 722         struct rtable *rth, *next;
 723         struct rtable * tail;
 724
 725         for (i = 0; i <= rt_hash_mask; i++) {
 726                 if (process_context && need_resched())
 727                         cond_resched();
 728                 rth = rt_hash_table[i].chain;
 729                 if (!rth)
 730                         continue;
 731
 732                 spin_lock_bh(rt_hash_lock_addr(i));
 733 #ifdef CONFIG_NET_NS
 734                 {
 735                 struct rtable ** prev, * p;
 736
 737                 rth = rt_hash_table[i].chain;
 738
 739                 /* defer releasing the head of the list after spin_unlock */
 740                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 741                         if (!rt_is_expired(tail))
 742                                 break;
 743                 if (rth != tail)
 744                         rt_hash_table[i].chain = tail;
 745
 746                 /* call rt_free on entries after the tail requiring flush */
 747                 prev = &rt_hash_table[i].chain;
 748                 for (p = *prev; p; p = next) {
 749                         next = p->u.dst.rt_next;
 750                         if (!rt_is_expired(p)) {
 751                                 prev = &p->u.dst.rt_next;
 752                         } else {
 753                                 *prev = next;
 754                                 rt_free(p);
 755                         }
 756                 }
 757                 }
 758 #else
 759                 rth = rt_hash_table[i].chain;
 760                 rt_hash_table[i].chain = NULL;
 761                 tail = NULL;
 762 #endif
 763                 spin_unlock_bh(rt_hash_lock_addr(i));
 764
 765                 for (; rth != tail; rth = next) {
 766                         next = rth->u.dst.rt_next;
 767                         rt_free(rth);
 768                 }
 769         }
 770 }
 771
 772 /*
 773  * While freeing expired entries, we compute average chain length
 774  * and standard deviation, using fixed-point arithmetic.
 775  * This to have an estimation of rt_chain_length_max
 776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 777  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 778  */
 779
 780 #define FRACT_BITS 3
 781 #define ONE (1UL << FRACT_BITS)
 782
 783 static void rt_check_expire(void)
 784 {
 785         static unsigned int rover;
 786         unsigned int i = rover, goal;
 787         struct rtable *rth, *aux, **rthp;
 788         unsigned long samples = 0;
 789         unsigned long sum = 0, sum2 = 0;
 790         unsigned long delta;
 791         u64 mult;
 792
 793         delta = jiffies - expires_ljiffies;
 794         expires_ljiffies = jiffies;
 795         mult = ((u64)delta) << rt_hash_log;
 796         if (ip_rt_gc_timeout > 1)
 797                 do_div(mult, ip_rt_gc_timeout);
 798         goal = (unsigned int)mult;
 799         if (goal > rt_hash_mask)
 800                 goal = rt_hash_mask + 1;
 801         for (; goal > 0; goal--) {
 802                 unsigned long tmo = ip_rt_gc_timeout;
 803                 unsigned long length;
 804
 805                 i = (i + 1) & rt_hash_mask;
 806                 rthp = &rt_hash_table[i].chain;
 807
 808                 if (need_resched())
 809                         cond_resched();
 810
 811                 samples++;
 812
 813                 if (*rthp == NULL)
 814                         continue;
 815                 length = 0;
 816                 spin_lock_bh(rt_hash_lock_addr(i));
 817                 while ((rth = *rthp) != NULL) {
 818                         prefetch(rth->u.dst.rt_next);
 819                         if (rt_is_expired(rth)) {
 820                                 *rthp = rth->u.dst.rt_next;
 821                                 rt_free(rth);
 822                                 continue;
 823                         }
 824                         if (rth->u.dst.expires) {
 825                                 /* Entry is expired even if it is in use */
 826                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 827 nofree:
 828                                         tmo >>= 1;
 829                                         rthp = &rth->u.dst.rt_next;
 830                                         /*
 831                                          * We only count entries on
 832                                          * a chain with equal hash inputs once
 833                                          * so that entries for different QOS
 834                                          * levels, and other non-hash input
 835                                          * attributes don't unfairly skew
 836                                          * the length computation
 837                                          */
 838                                         for (aux = rt_hash_table[i].chain;;) {
 839                                                 if (aux == rth) {
 840                                                         length += ONE;
 841                                                         break;
 842                                                 }
 843                                                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 844                                                         break;
 845                                                 aux = aux->u.dst.rt_next;
 846                                         }
 847                                         continue;
 848                                 }
 849                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 850                                 goto nofree;
 851
 852                         /* Cleanup aged off entries. */
 853                         *rthp = rth->u.dst.rt_next;
 854                         rt_free(rth);
 855                 }
 856                 spin_unlock_bh(rt_hash_lock_addr(i));
 857                 sum += length;
 858                 sum2 += length*length;
 859         }
 860         if (samples) {
 861                 unsigned long avg = sum / samples;
 862                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 863                 rt_chain_length_max = max_t(unsigned long,
 864                                         ip_rt_gc_elasticity,
 865                                         (avg + 4*sd) >> FRACT_BITS);
 866         }
 867         rover = i;
 868 }
 869
 870 /*
 871  * rt_worker_func() is run in process context.
 872  * we call rt_check_expire() to scan part of the hash table
 873  */
 874 static void rt_worker_func(struct work_struct *work)
 875 {
 876         rt_check_expire();
 877         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 878 }
 879
 880 /*
 881  * Pertubation of rt_genid by a small quantity [1..256]
 882  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 883  * many times (2^24) without giving recent rt_genid.
 884  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 885  */
 886 static void rt_cache_invalidate(struct net *net)
 887 {
 888         unsigned char shuffle;
 889
 890         get_random_bytes(&shuffle, sizeof(shuffle));
 891         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 892 }
 893
 894 /*
 895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 896  * delay >= 0 : invalidate & flush cache (can be long)
 897  */
 898 void rt_cache_flush(struct net *net, int delay)
 899 {
 900         rt_cache_invalidate(net);
 901         if (delay >= 0)
 902                 rt_do_flush(!in_softirq());
 903 }
 904
 905 /*
 906  * We change rt_genid and let gc do the cleanup
 907  */
 908 static void rt_secret_rebuild(unsigned long __net)
 909 {
 910         struct net *net = (struct net *)__net;
 911         rt_cache_invalidate(net);
 912         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 913 }
 914
 915 static void rt_secret_rebuild_oneshot(struct net *net)
 916 {
 917         del_timer_sync(&net->ipv4.rt_secret_timer);
 918         rt_cache_invalidate(net);
 919         if (ip_rt_secret_interval) {
 920                 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
 921                 add_timer(&net->ipv4.rt_secret_timer);
 922         }
 923 }
 924
 925 static void rt_emergency_hash_rebuild(struct net *net)
 926 {
 927         if (net_ratelimit()) {
 928                 printk(KERN_WARNING "Route hash chain too long!\n");
 929                 printk(KERN_WARNING "Adjust your secret_interval!\n");
 930         }
 931
 932         rt_secret_rebuild_oneshot(net);
 933 }
 934
 935 /*
 936    Short description of GC goals.
 937
 938    We want to build algorithm, which will keep routing cache
 939    at some equilibrium point, when number of aged off entries
 940    is kept approximately equal to newly generated ones.
 941
 942    Current expiration strength is variable "expire".
 943    We try to adjust it dynamically, so that if networking
 944    is idle expires is large enough to keep enough of warm entries,
 945    and when load increases it reduces to limit cache size.
 946  */
 947
 948 static int rt_garbage_collect(struct dst_ops *ops)
 949 {
 950         static unsigned long expire = RT_GC_TIMEOUT;
 951         static unsigned long last_gc;
 952         static int rover;
 953         static int equilibrium;
 954         struct rtable *rth, **rthp;
 955         unsigned long now = jiffies;
 956         int goal;
 957
 958         /*
 959          * Garbage collection is pretty expensive,
 960          * do not make it too frequently.
 961          */
 962
 963         RT_CACHE_STAT_INC(gc_total);
 964
 965         if (now - last_gc < ip_rt_gc_min_interval &&
 966             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 967                 RT_CACHE_STAT_INC(gc_ignored);
 968                 goto out;
 969         }
 970
 971         /* Calculate number of entries, which we want to expire now. */
 972         goal = atomic_read(&ipv4_dst_ops.entries) -
 973                 (ip_rt_gc_elasticity << rt_hash_log);
 974         if (goal <= 0) {
 975                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 976                         equilibrium = ipv4_dst_ops.gc_thresh;
 977                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 978                 if (goal > 0) {
 979                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 980                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 981                 }
 982         } else {
 983                 /* We are in dangerous area. Try to reduce cache really
 984                  * aggressively.
 985                  */
 986                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 987                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 988         }
 989
 990         if (now - last_gc >= ip_rt_gc_min_interval)
 991                 last_gc = now;
 992
 993         if (goal <= 0) {
 994                 equilibrium += goal;
 995                 goto work_done;
 996         }
 997
 998         do {
 999                 int i, k;
1000
1001                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1002                         unsigned long tmo = expire;
1003
1004                         k = (k + 1) & rt_hash_mask;
1005                         rthp = &rt_hash_table[k].chain;
1006                         spin_lock_bh(rt_hash_lock_addr(k));
1007                         while ((rth = *rthp) != NULL) {
1008                                 if (!rt_is_expired(rth) &&
1009                                         !rt_may_expire(rth, tmo, expire)) {
1010                                         tmo >>= 1;
1011                                         rthp = &rth->u.dst.rt_next;
1012                                         continue;
1013                                 }
1014                                 *rthp = rth->u.dst.rt_next;
1015                                 rt_free(rth);
1016                                 goal--;
1017                         }
1018                         spin_unlock_bh(rt_hash_lock_addr(k));
1019                         if (goal <= 0)
1020                                 break;
1021                 }
1022                 rover = k;
1023
1024                 if (goal <= 0)
1025                         goto work_done;
1026
1027                 /* Goal is not achieved. We stop process if:
1028
1029                    - if expire reduced to zero. Otherwise, expire is halfed.
1030                    - if table is not full.
1031                    - if we are called from interrupt.
1032                    - jiffies check is just fallback/debug loop breaker.
1033                      We will not spin here for long time in any case.
1034                  */
1035
1036                 RT_CACHE_STAT_INC(gc_goal_miss);
1037
1038                 if (expire == 0)
1039                         break;
1040
1041                 expire >>= 1;
1042 #if RT_CACHE_DEBUG >= 2
1043                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1045 #endif
1046
1047                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1048                         goto out;
1049         } while (!in_softirq() && time_before_eq(jiffies, now));
1050
1051         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052                 goto out;
1053         if (net_ratelimit())
1054                 printk(KERN_WARNING "dst cache overflow\n");
1055         RT_CACHE_STAT_INC(gc_dst_overflow);
1056         return 1;
1057
1058 work_done:
1059         expire += ip_rt_gc_min_interval;
1060         if (expire > ip_rt_gc_timeout ||
1061             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1062                 expire = ip_rt_gc_timeout;
1063 #if RT_CACHE_DEBUG >= 2
1064         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1066 #endif
1067 out:    return 0;
1068 }
1069
1070 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071                           struct rtable **rp, struct sk_buff *skb)
1072 {
1073         struct rtable   *rth, **rthp;
1074         unsigned long   now;
1075         struct rtable *cand, **candp;
1076         u32             min_score;
1077         int             chain_length;
1078         int attempts = !in_softirq();
1079
1080 restart:
1081         chain_length = 0;
1082         min_score = ~(u32)0;
1083         cand = NULL;
1084         candp = NULL;
1085         now = jiffies;
1086
1087         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1088                 rt_drop(rt);
1089                 return 0;
1090         }
1091
1092         rthp = &rt_hash_table[hash].chain;
1093
1094         spin_lock_bh(rt_hash_lock_addr(hash));
1095         while ((rth = *rthp) != NULL) {
1096                 if (rt_is_expired(rth)) {
1097                         *rthp = rth->u.dst.rt_next;
1098                         rt_free(rth);
1099                         continue;
1100                 }
1101                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1102                         /* Put it first */
1103                         *rthp = rth->u.dst.rt_next;
1104                         /*
1105                          * Since lookup is lockfree, the deletion
1106                          * must be visible to another weakly ordered CPU before
1107                          * the insertion at the start of the hash chain.
1108                          */
1109                         rcu_assign_pointer(rth->u.dst.rt_next,
1110                                            rt_hash_table[hash].chain);
1111                         /*
1112                          * Since lookup is lockfree, the update writes
1113                          * must be ordered for consistency on SMP.
1114                          */
1115                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1116
1117                         dst_use(&rth->u.dst, now);
1118                         spin_unlock_bh(rt_hash_lock_addr(hash));
1119
1120                         rt_drop(rt);
1121                         if (rp)
1122                                 *rp = rth;
1123                         else
1124                                 skb_dst_set(skb, &rth->u.dst);
1125                         return 0;
1126                 }
1127
1128                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1129                         u32 score = rt_score(rth);
1130
1131                         if (score <= min_score) {
1132                                 cand = rth;
1133                                 candp = rthp;
1134                                 min_score = score;
1135                         }
1136                 }
1137
1138                 chain_length++;
1139
1140                 rthp = &rth->u.dst.rt_next;
1141         }
1142
1143         if (cand) {
1144                 /* ip_rt_gc_elasticity used to be average length of chain
1145                  * length, when exceeded gc becomes really aggressive.
1146                  *
1147                  * The second limit is less certain. At the moment it allows
1148                  * only 2 entries per bucket. We will see.
1149                  */
1150                 if (chain_length > ip_rt_gc_elasticity) {
1151                         *candp = cand->u.dst.rt_next;
1152                         rt_free(cand);
1153                 }
1154         } else {
1155                 if (chain_length > rt_chain_length_max) {
1156                         struct net *net = dev_net(rt->u.dst.dev);
1157                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1158                         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1159                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1160                                         rt->u.dst.dev->name, num);
1161                         }
1162                         rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1163                 }
1164         }
1165
1166         /* Try to bind route to arp only if it is output
1167            route or unicast forwarding path.
1168          */
1169         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1170                 int err = arp_bind_neighbour(&rt->u.dst);
1171                 if (err) {
1172                         spin_unlock_bh(rt_hash_lock_addr(hash));
1173
1174                         if (err != -ENOBUFS) {
1175                                 rt_drop(rt);
1176                                 return err;
1177                         }
1178
1179                         /* Neighbour tables are full and nothing
1180                            can be released. Try to shrink route cache,
1181                            it is most likely it holds some neighbour records.
1182                          */
1183                         if (attempts-- > 0) {
1184                                 int saved_elasticity = ip_rt_gc_elasticity;
1185                                 int saved_int = ip_rt_gc_min_interval;
1186                                 ip_rt_gc_elasticity     = 1;
1187                                 ip_rt_gc_min_interval   = 0;
1188                                 rt_garbage_collect(&ipv4_dst_ops);
1189                                 ip_rt_gc_min_interval   = saved_int;
1190                                 ip_rt_gc_elasticity     = saved_elasticity;
1191                                 goto restart;
1192                         }
1193
1194                         if (net_ratelimit())
1195                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1196                         rt_drop(rt);
1197                         return -ENOBUFS;
1198                 }
1199         }
1200
1201         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1202
1203 #if RT_CACHE_DEBUG >= 2
1204         if (rt->u.dst.rt_next) {
1205                 struct rtable *trt;
1206                 printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
1207                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1208                         printk(" . %pI4", &trt->rt_dst);
1209                 printk("\n");
1210         }
1211 #endif
1212         /*
1213          * Since lookup is lockfree, we must make sure
1214          * previous writes to rt are comitted to memory
1215          * before making rt visible to other CPUS.
1216          */
1217         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1218
1219         spin_unlock_bh(rt_hash_lock_addr(hash));
1220         if (rp)
1221                 *rp = rt;
1222         else
1223                 skb_dst_set(skb, &rt->u.dst);
1224         return 0;
1225 }
1226
1227 void rt_bind_peer(struct rtable *rt, int create)
1228 {
1229         static DEFINE_SPINLOCK(rt_peer_lock);
1230         struct inet_peer *peer;
1231
1232         peer = inet_getpeer(rt->rt_dst, create);
1233
1234         spin_lock_bh(&rt_peer_lock);
1235         if (rt->peer == NULL) {
1236                 rt->peer = peer;
1237                 peer = NULL;
1238         }
1239         spin_unlock_bh(&rt_peer_lock);
1240         if (peer)
1241                 inet_putpeer(peer);
1242 }
1243
1244 /*
1245  * Peer allocation may fail only in serious out-of-memory conditions.  However
1246  * we still can generate some output.
1247  * Random ID selection looks a bit dangerous because we have no chances to
1248  * select ID being unique in a reasonable period of time.
1249  * But broken packet identifier may be better than no packet at all.
1250  */
1251 static void ip_select_fb_ident(struct iphdr *iph)
1252 {
1253         static DEFINE_SPINLOCK(ip_fb_id_lock);
1254         static u32 ip_fallback_id;
1255         u32 salt;
1256
1257         spin_lock_bh(&ip_fb_id_lock);
1258         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1259         iph->id = htons(salt & 0xFFFF);
1260         ip_fallback_id = salt;
1261         spin_unlock_bh(&ip_fb_id_lock);
1262 }
1263
1264 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1265 {
1266         struct rtable *rt = (struct rtable *) dst;
1267
1268         if (rt) {
1269                 if (rt->peer == NULL)
1270                         rt_bind_peer(rt, 1);
1271
1272                 /* If peer is attached to destination, it is never detached,
1273                    so that we need not to grab a lock to dereference it.
1274                  */
1275                 if (rt->peer) {
1276                         iph->id = htons(inet_getid(rt->peer, more));
1277                         return;
1278                 }
1279         } else
1280                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1281                        __builtin_return_address(0));
1282
1283         ip_select_fb_ident(iph);
1284 }
1285
1286 static void rt_del(unsigned hash, struct rtable *rt)
1287 {
1288         struct rtable **rthp, *aux;
1289
1290         rthp = &rt_hash_table[hash].chain;
1291         spin_lock_bh(rt_hash_lock_addr(hash));
1292         ip_rt_put(rt);
1293         while ((aux = *rthp) != NULL) {
1294                 if (aux == rt || rt_is_expired(aux)) {
1295                         *rthp = aux->u.dst.rt_next;
1296                         rt_free(aux);
1297                         continue;
1298                 }
1299                 rthp = &aux->u.dst.rt_next;
1300         }
1301         spin_unlock_bh(rt_hash_lock_addr(hash));
1302 }
1303
1304 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1305                     __be32 saddr, struct net_device *dev)
1306 {
1307         int i, k;
1308         struct in_device *in_dev = in_dev_get(dev);
1309         struct rtable *rth, **rthp;
1310         __be32  skeys[2] = { saddr, 0 };
1311         int  ikeys[2] = { dev->ifindex, 0 };
1312         struct netevent_redirect netevent;
1313         struct net *net;
1314
1315         if (!in_dev)
1316                 return;
1317
1318         net = dev_net(dev);
1319         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1320             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1321             || ipv4_is_zeronet(new_gw))
1322                 goto reject_redirect;
1323
1324         if (!rt_caching(net))
1325                 goto reject_redirect;
1326
1327         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1328                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1329                         goto reject_redirect;
1330                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1331                         goto reject_redirect;
1332         } else {
1333                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1334                         goto reject_redirect;
1335         }
1336
1337         for (i = 0; i < 2; i++) {
1338                 for (k = 0; k < 2; k++) {
1339                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1340                                                 rt_genid(net));
1341
1342                         rthp=&rt_hash_table[hash].chain;
1343
1344                         rcu_read_lock();
1345                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1346                                 struct rtable *rt;
1347
1348                                 if (rth->fl.fl4_dst != daddr ||
1349                                     rth->fl.fl4_src != skeys[i] ||
1350                                     rth->fl.oif != ikeys[k] ||
1351                                     rth->fl.iif != 0 ||
1352                                     rt_is_expired(rth) ||
1353                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1354                                         rthp = &rth->u.dst.rt_next;
1355                                         continue;
1356                                 }
1357
1358                                 if (rth->rt_dst != daddr ||
1359                                     rth->rt_src != saddr ||
1360                                     rth->u.dst.error ||
1361                                     rth->rt_gateway != old_gw ||
1362                                     rth->u.dst.dev != dev)
1363                                         break;
1364
1365                                 dst_hold(&rth->u.dst);
1366                                 rcu_read_unlock();
1367
1368                                 rt = dst_alloc(&ipv4_dst_ops);
1369                                 if (rt == NULL) {
1370                                         ip_rt_put(rth);
1371                                         in_dev_put(in_dev);
1372                                         return;
1373                                 }
1374
1375                                 /* Copy all the information. */
1376                                 *rt = *rth;
1377                                 rt->u.dst.__use         = 1;
1378                                 atomic_set(&rt->u.dst.__refcnt, 1);
1379                                 rt->u.dst.child         = NULL;
1380                                 if (rt->u.dst.dev)
1381                                         dev_hold(rt->u.dst.dev);
1382                                 if (rt->idev)
1383                                         in_dev_hold(rt->idev);
1384                                 rt->u.dst.obsolete      = 0;
1385                                 rt->u.dst.lastuse       = jiffies;
1386                                 rt->u.dst.path          = &rt->u.dst;
1387                                 rt->u.dst.neighbour     = NULL;
1388                                 rt->u.dst.hh            = NULL;
1389 #ifdef CONFIG_XFRM
1390                                 rt->u.dst.xfrm          = NULL;
1391 #endif
1392                                 rt->rt_genid            = rt_genid(net);
1393                                 rt->rt_flags            |= RTCF_REDIRECTED;
1394
1395                                 /* Gateway is different ... */
1396                                 rt->rt_gateway          = new_gw;
1397
1398                                 /* Redirect received -> path was valid */
1399                                 dst_confirm(&rth->u.dst);
1400
1401                                 if (rt->peer)
1402                                         atomic_inc(&rt->peer->refcnt);
1403
1404                                 if (arp_bind_neighbour(&rt->u.dst) ||
1405                                     !(rt->u.dst.neighbour->nud_state &
1406                                             NUD_VALID)) {
1407                                         if (rt->u.dst.neighbour)
1408                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1409                                         ip_rt_put(rth);
1410                                         rt_drop(rt);
1411                                         goto do_next;
1412                                 }
1413
1414                                 netevent.old = &rth->u.dst;
1415                                 netevent.new = &rt->u.dst;
1416                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1417                                                         &netevent);
1418
1419                                 rt_del(hash, rth);
1420                                 if (!rt_intern_hash(hash, rt, &rt, NULL))
1421                                         ip_rt_put(rt);
1422                                 goto do_next;
1423                         }
1424                         rcu_read_unlock();
1425                 do_next:
1426                         ;
1427                 }
1428         }
1429         in_dev_put(in_dev);
1430         return;
1431
1432 reject_redirect:
1433 #ifdef CONFIG_IP_ROUTE_VERBOSE
1434         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1435                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1436                         "  Advised path = %pI4 -> %pI4\n",
1437                        &old_gw, dev->name, &new_gw,
1438                        &saddr, &daddr);
1439 #endif
1440         in_dev_put(in_dev);
1441 }
1442
1443 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1444 {
1445         struct rtable *rt = (struct rtable *)dst;
1446         struct dst_entry *ret = dst;
1447
1448         if (rt) {
1449                 if (dst->obsolete) {
1450                         ip_rt_put(rt);
1451                         ret = NULL;
1452                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1453                            rt->u.dst.expires) {
1454                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1455                                                 rt->fl.oif,
1456                                                 rt_genid(dev_net(dst->dev)));
1457 #if RT_CACHE_DEBUG >= 1
1458                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1459                                 &rt->rt_dst, rt->fl.fl4_tos);
1460 #endif
1461                         rt_del(hash, rt);
1462                         ret = NULL;
1463                 }
1464         }
1465         return ret;
1466 }
1467
1468 /*
1469  * Algorithm:
1470  *      1. The first ip_rt_redirect_number redirects are sent
1471  *         with exponential backoff, then we stop sending them at all,
1472  *         assuming that the host ignores our redirects.
1473  *      2. If we did not see packets requiring redirects
1474  *         during ip_rt_redirect_silence, we assume that the host
1475  *         forgot redirected route and start to send redirects again.
1476  *
1477  * This algorithm is much cheaper and more intelligent than dumb load limiting
1478  * in icmp.c.
1479  *
1480  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1481  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1482  */
1483
1484 void ip_rt_send_redirect(struct sk_buff *skb)
1485 {
1486         struct rtable *rt = skb_rtable(skb);
1487         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1488
1489         if (!in_dev)
1490                 return;
1491
1492         if (!IN_DEV_TX_REDIRECTS(in_dev))
1493                 goto out;
1494
1495         /* No redirected packets during ip_rt_redirect_silence;
1496          * reset the algorithm.
1497          */
1498         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1499                 rt->u.dst.rate_tokens = 0;
1500
1501         /* Too many ignored redirects; do not send anything
1502          * set u.dst.rate_last to the last seen redirected packet.
1503          */
1504         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1505                 rt->u.dst.rate_last = jiffies;
1506                 goto out;
1507         }
1508
1509         /* Check for load limit; set rate_last to the latest sent
1510          * redirect.
1511          */
1512         if (rt->u.dst.rate_tokens == 0 ||
1513             time_after(jiffies,
1514                        (rt->u.dst.rate_last +
1515                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1516                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1517                 rt->u.dst.rate_last = jiffies;
1518                 ++rt->u.dst.rate_tokens;
1519 #ifdef CONFIG_IP_ROUTE_VERBOSE
1520                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1521                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1522                     net_ratelimit())
1523                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1524                                 &rt->rt_src, rt->rt_iif,
1525                                 &rt->rt_dst, &rt->rt_gateway);
1526 #endif
1527         }
1528 out:
1529         in_dev_put(in_dev);
1530 }
1531
1532 static int ip_error(struct sk_buff *skb)
1533 {
1534         struct rtable *rt = skb_rtable(skb);
1535         unsigned long now;
1536         int code;
1537
1538         switch (rt->u.dst.error) {
1539                 case EINVAL:
1540                 default:
1541                         goto out;
1542                 case EHOSTUNREACH:
1543                         code = ICMP_HOST_UNREACH;
1544                         break;
1545                 case ENETUNREACH:
1546                         code = ICMP_NET_UNREACH;
1547                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1548                                         IPSTATS_MIB_INNOROUTES);
1549                         break;
1550                 case EACCES:
1551                         code = ICMP_PKT_FILTERED;
1552                         break;
1553         }
1554
1555         now = jiffies;
1556         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1557         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1558                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1559         rt->u.dst.rate_last = now;
1560         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1561                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1562                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1563         }
1564
1565 out:    kfree_skb(skb);
1566         return 0;
1567 }
1568
1569 /*
1570  *      The last two values are not from the RFC but
1571  *      are needed for AMPRnet AX.25 paths.
1572  */
1573
1574 static const unsigned short mtu_plateau[] =
1575 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1576
1577 static inline unsigned short guess_mtu(unsigned short old_mtu)
1578 {
1579         int i;
1580
1581         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1582                 if (old_mtu > mtu_plateau[i])
1583                         return mtu_plateau[i];
1584         return 68;
1585 }
1586
1587 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1588                                  unsigned short new_mtu,
1589                                  struct net_device *dev)
1590 {
1591         int i, k;
1592         unsigned short old_mtu = ntohs(iph->tot_len);
1593         struct rtable *rth;
1594         int  ikeys[2] = { dev->ifindex, 0 };
1595         __be32  skeys[2] = { iph->saddr, 0, };
1596         __be32  daddr = iph->daddr;
1597         unsigned short est_mtu = 0;
1598
1599         if (ipv4_config.no_pmtu_disc)
1600                 return 0;
1601
1602         for (k = 0; k < 2; k++) {
1603                 for (i = 0; i < 2; i++) {
1604                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1605                                                 rt_genid(net));
1606
1607                         rcu_read_lock();
1608                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1609                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1610                                 unsigned short mtu = new_mtu;
1611
1612                                 if (rth->fl.fl4_dst != daddr ||
1613                                     rth->fl.fl4_src != skeys[i] ||
1614                                     rth->rt_dst != daddr ||
1615                                     rth->rt_src != iph->saddr ||
1616                                     rth->fl.oif != ikeys[k] ||
1617                                     rth->fl.iif != 0 ||
1618                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1619                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1620                                     rt_is_expired(rth))
1621                                         continue;
1622
1623                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1624
1625                                         /* BSD 4.2 compatibility hack :-( */
1626                                         if (mtu == 0 &&
1627                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1628                                             old_mtu >= 68 + (iph->ihl << 2))
1629                                                 old_mtu -= iph->ihl << 2;
1630
1631                                         mtu = guess_mtu(old_mtu);
1632                                 }
1633                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1634                                         if (mtu < dst_mtu(&rth->u.dst)) {
1635                                                 dst_confirm(&rth->u.dst);
1636                                                 if (mtu < ip_rt_min_pmtu) {
1637                                                         mtu = ip_rt_min_pmtu;
1638                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1639                                                                 (1 << RTAX_MTU);
1640                                                 }
1641                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1642                                                 dst_set_expires(&rth->u.dst,
1643                                                         ip_rt_mtu_expires);
1644                                         }
1645                                         est_mtu = mtu;
1646                                 }
1647                         }
1648                         rcu_read_unlock();
1649                 }
1650         }
1651         return est_mtu ? : new_mtu;
1652 }
1653
1654 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1655 {
1656         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1657             !(dst_metric_locked(dst, RTAX_MTU))) {
1658                 if (mtu < ip_rt_min_pmtu) {
1659                         mtu = ip_rt_min_pmtu;
1660                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1661                 }
1662                 dst->metrics[RTAX_MTU-1] = mtu;
1663                 dst_set_expires(dst, ip_rt_mtu_expires);
1664                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1665         }
1666 }
1667
1668 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1669 {
1670         return NULL;
1671 }
1672
1673 static void ipv4_dst_destroy(struct dst_entry *dst)
1674 {
1675         struct rtable *rt = (struct rtable *) dst;
1676         struct inet_peer *peer = rt->peer;
1677         struct in_device *idev = rt->idev;
1678
1679         if (peer) {
1680                 rt->peer = NULL;
1681                 inet_putpeer(peer);
1682         }
1683
1684         if (idev) {
1685                 rt->idev = NULL;
1686                 in_dev_put(idev);
1687         }
1688 }
1689
1690 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1691                             int how)
1692 {
1693         struct rtable *rt = (struct rtable *) dst;
1694         struct in_device *idev = rt->idev;
1695         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1696                 struct in_device *loopback_idev =
1697                         in_dev_get(dev_net(dev)->loopback_dev);
1698                 if (loopback_idev) {
1699                         rt->idev = loopback_idev;
1700                         in_dev_put(idev);
1701                 }
1702         }
1703 }
1704
1705 static void ipv4_link_failure(struct sk_buff *skb)
1706 {
1707         struct rtable *rt;
1708
1709         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1710
1711         rt = skb_rtable(skb);
1712         if (rt)
1713                 dst_set_expires(&rt->u.dst, 0);
1714 }
1715
1716 static int ip_rt_bug(struct sk_buff *skb)
1717 {
1718         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1719                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1720                 skb->dev ? skb->dev->name : "?");
1721         kfree_skb(skb);
1722         return 0;
1723 }
1724
1725 /*
1726    We do not cache source address of outgoing interface,
1727    because it is used only by IP RR, TS and SRR options,
1728    so that it out of fast path.
1729
1730    BTW remember: "addr" is allowed to be not aligned
1731    in IP options!
1732  */
1733
1734 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1735 {
1736         __be32 src;
1737         struct fib_result res;
1738
1739         if (rt->fl.iif == 0)
1740                 src = rt->rt_src;
1741         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1742                 src = FIB_RES_PREFSRC(res);
1743                 fib_res_put(&res);
1744         } else
1745                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1746                                         RT_SCOPE_UNIVERSE);
1747         memcpy(addr, &src, 4);
1748 }
1749
1750 #ifdef CONFIG_NET_CLS_ROUTE
1751 static void set_class_tag(struct rtable *rt, u32 tag)
1752 {
1753         if (!(rt->u.dst.tclassid & 0xFFFF))
1754                 rt->u.dst.tclassid |= tag & 0xFFFF;
1755         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1756                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1757 }
1758 #endif
1759
1760 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1761 {
1762         struct fib_info *fi = res->fi;
1763
1764         if (fi) {
1765                 if (FIB_RES_GW(*res) &&
1766                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1767                         rt->rt_gateway = FIB_RES_GW(*res);
1768                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1769                        sizeof(rt->u.dst.metrics));
1770                 if (fi->fib_mtu == 0) {
1771                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1772                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1773                             rt->rt_gateway != rt->rt_dst &&
1774                             rt->u.dst.dev->mtu > 576)
1775                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1776                 }
1777 #ifdef CONFIG_NET_CLS_ROUTE
1778                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1779 #endif
1780         } else
1781                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1782
1783         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1784                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1785         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1786                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1787         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1788                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1789                                        ip_rt_min_advmss);
1790         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1791                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1792
1793 #ifdef CONFIG_NET_CLS_ROUTE
1794 #ifdef CONFIG_IP_MULTIPLE_TABLES
1795         set_class_tag(rt, fib_rules_tclass(res));
1796 #endif
1797         set_class_tag(rt, itag);
1798 #endif
1799         rt->rt_type = res->type;
1800 }
1801
1802 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1803                                 u8 tos, struct net_device *dev, int our)
1804 {
1805         unsigned hash;
1806         struct rtable *rth;
1807         __be32 spec_dst;
1808         struct in_device *in_dev = in_dev_get(dev);
1809         u32 itag = 0;
1810
1811         /* Primary sanity checks. */
1812
1813         if (in_dev == NULL)
1814                 return -EINVAL;
1815
1816         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1817             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1818                 goto e_inval;
1819
1820         if (ipv4_is_zeronet(saddr)) {
1821                 if (!ipv4_is_local_multicast(daddr))
1822                         goto e_inval;
1823                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1824         } else if (fib_validate_source(saddr, 0, tos, 0,
1825                                         dev, &spec_dst, &itag) < 0)
1826                 goto e_inval;
1827
1828         rth = dst_alloc(&ipv4_dst_ops);
1829         if (!rth)
1830                 goto e_nobufs;
1831
1832         rth->u.dst.output= ip_rt_bug;
1833
1834         atomic_set(&rth->u.dst.__refcnt, 1);
1835         rth->u.dst.flags= DST_HOST;
1836         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1837                 rth->u.dst.flags |= DST_NOPOLICY;
1838         rth->fl.fl4_dst = daddr;
1839         rth->rt_dst     = daddr;
1840         rth->fl.fl4_tos = tos;
1841         rth->fl.mark    = skb->mark;
1842         rth->fl.fl4_src = saddr;
1843         rth->rt_src     = saddr;
1844 #ifdef CONFIG_NET_CLS_ROUTE
1845         rth->u.dst.tclassid = itag;
1846 #endif
1847         rth->rt_iif     =
1848         rth->fl.iif     = dev->ifindex;
1849         rth->u.dst.dev  = init_net.loopback_dev;
1850         dev_hold(rth->u.dst.dev);
1851         rth->idev       = in_dev_get(rth->u.dst.dev);
1852         rth->fl.oif     = 0;
1853         rth->rt_gateway = daddr;
1854         rth->rt_spec_dst= spec_dst;
1855         rth->rt_genid   = rt_genid(dev_net(dev));
1856         rth->rt_flags   = RTCF_MULTICAST;
1857         rth->rt_type    = RTN_MULTICAST;
1858         if (our) {
1859                 rth->u.dst.input= ip_local_deliver;
1860                 rth->rt_flags |= RTCF_LOCAL;
1861         }
1862
1863 #ifdef CONFIG_IP_MROUTE
1864         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1865                 rth->u.dst.input = ip_mr_input;
1866 #endif
1867         RT_CACHE_STAT_INC(in_slow_mc);
1868
1869         in_dev_put(in_dev);
1870         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1871         return rt_intern_hash(hash, rth, NULL, skb);
1872
1873 e_nobufs:
1874         in_dev_put(in_dev);
1875         return -ENOBUFS;
1876
1877 e_inval:
1878         in_dev_put(in_dev);
1879         return -EINVAL;
1880 }
1881
1882
1883 static void ip_handle_martian_source(struct net_device *dev,
1884                                      struct in_device *in_dev,
1885                                      struct sk_buff *skb,
1886                                      __be32 daddr,
1887                                      __be32 saddr)
1888 {
1889         RT_CACHE_STAT_INC(in_martian_src);
1890 #ifdef CONFIG_IP_ROUTE_VERBOSE
1891         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1892                 /*
1893                  *      RFC1812 recommendation, if source is martian,
1894                  *      the only hint is MAC header.
1895                  */
1896                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1897                         &daddr, &saddr, dev->name);
1898                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1899                         int i;
1900                         const unsigned char *p = skb_mac_header(skb);
1901                         printk(KERN_WARNING "ll header: ");
1902                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1903                                 printk("%02x", *p);
1904                                 if (i < (dev->hard_header_len - 1))
1905                                         printk(":");
1906                         }
1907                         printk("\n");
1908                 }
1909         }
1910 #endif
1911 }
1912
1913 static int __mkroute_input(struct sk_buff *skb,
1914                            struct fib_result *res,
1915                            struct in_device *in_dev,
1916                            __be32 daddr, __be32 saddr, u32 tos,
1917                            struct rtable **result)
1918 {
1919
1920         struct rtable *rth;
1921         int err;
1922         struct in_device *out_dev;
1923         unsigned flags = 0;
1924         __be32 spec_dst;
1925         u32 itag;
1926
1927         /* get a working reference to the output device */
1928         out_dev = in_dev_get(FIB_RES_DEV(*res));
1929         if (out_dev == NULL) {
1930                 if (net_ratelimit())
1931                         printk(KERN_CRIT "Bug in ip_route_input" \
1932                                "_slow(). Please, report\n");
1933                 return -EINVAL;
1934         }
1935
1936
1937         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1938                                   in_dev->dev, &spec_dst, &itag);
1939         if (err < 0) {
1940                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1941                                          saddr);
1942
1943                 err = -EINVAL;
1944                 goto cleanup;
1945         }
1946
1947         if (err)
1948                 flags |= RTCF_DIRECTSRC;
1949
1950         if (out_dev == in_dev && err &&
1951             (IN_DEV_SHARED_MEDIA(out_dev) ||
1952              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1953                 flags |= RTCF_DOREDIRECT;
1954
1955         if (skb->protocol != htons(ETH_P_IP)) {
1956                 /* Not IP (i.e. ARP). Do not create route, if it is
1957                  * invalid for proxy arp. DNAT routes are always valid.
1958                  */
1959                 if (out_dev == in_dev) {
1960                         err = -EINVAL;
1961                         goto cleanup;
1962                 }
1963         }
1964
1965
1966         rth = dst_alloc(&ipv4_dst_ops);
1967         if (!rth) {
1968                 err = -ENOBUFS;
1969                 goto cleanup;
1970         }
1971
1972         atomic_set(&rth->u.dst.__refcnt, 1);
1973         rth->u.dst.flags= DST_HOST;
1974         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1975                 rth->u.dst.flags |= DST_NOPOLICY;
1976         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1977                 rth->u.dst.flags |= DST_NOXFRM;
1978         rth->fl.fl4_dst = daddr;
1979         rth->rt_dst     = daddr;
1980         rth->fl.fl4_tos = tos;
1981         rth->fl.mark    = skb->mark;
1982         rth->fl.fl4_src = saddr;
1983         rth->rt_src     = saddr;
1984         rth->rt_gateway = daddr;
1985         rth->rt_iif     =
1986                 rth->fl.iif     = in_dev->dev->ifindex;
1987         rth->u.dst.dev  = (out_dev)->dev;
1988         dev_hold(rth->u.dst.dev);
1989         rth->idev       = in_dev_get(rth->u.dst.dev);
1990         rth->fl.oif     = 0;
1991         rth->rt_spec_dst= spec_dst;
1992
1993         rth->u.dst.input = ip_forward;
1994         rth->u.dst.output = ip_output;
1995         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1996
1997         rt_set_nexthop(rth, res, itag);
1998
1999         rth->rt_flags = flags;
2000
2001         *result = rth;
2002         err = 0;
2003  cleanup:
2004         /* release the working reference to the output device */
2005         in_dev_put(out_dev);
2006         return err;
2007 }
2008
2009 static int ip_mkroute_input(struct sk_buff *skb,
2010                             struct fib_result *res,
2011                             const struct flowi *fl,
2012                             struct in_device *in_dev,
2013                             __be32 daddr, __be32 saddr, u32 tos)
2014 {
2015         struct rtable* rth = NULL;
2016         int err;
2017         unsigned hash;
2018
2019 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2020         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2021                 fib_select_multipath(fl, res);
2022 #endif
2023
2024         /* create a routing cache entry */
2025         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2026         if (err)
2027                 return err;
2028
2029         /* put it into the cache */
2030         hash = rt_hash(daddr, saddr, fl->iif,
2031                        rt_genid(dev_net(rth->u.dst.dev)));
2032         return rt_intern_hash(hash, rth, NULL, skb);
2033 }
2034
2035 /*
2036  *      NOTE. We drop all the packets that has local source
2037  *      addresses, because every properly looped back packet
2038  *      must have correct destination already attached by output routine.
2039  *
2040  *      Such approach solves two big problems:
2041  *      1. Not simplex devices are handled properly.
2042  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2043  */
2044
2045 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2046                                u8 tos, struct net_device *dev)
2047 {
2048         struct fib_result res;
2049         struct in_device *in_dev = in_dev_get(dev);
2050         struct flowi fl = { .nl_u = { .ip4_u =
2051                                       { .daddr = daddr,
2052                                         .saddr = saddr,
2053                                         .tos = tos,
2054                                         .scope = RT_SCOPE_UNIVERSE,
2055                                       } },
2056                             .mark = skb->mark,
2057                             .iif = dev->ifindex };
2058         unsigned        flags = 0;
2059         u32             itag = 0;
2060         struct rtable * rth;
2061         unsigned        hash;
2062         __be32          spec_dst;
2063         int             err = -EINVAL;
2064         int             free_res = 0;
2065         struct net    * net = dev_net(dev);
2066
2067         /* IP on this device is disabled. */
2068
2069         if (!in_dev)
2070                 goto out;
2071
2072         /* Check for the most weird martians, which can be not detected
2073            by fib_lookup.
2074          */
2075
2076         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2077             ipv4_is_loopback(saddr))
2078                 goto martian_source;
2079
2080         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2081                 goto brd_input;
2082
2083         /* Accept zero addresses only to limited broadcast;
2084          * I even do not know to fix it or not. Waiting for complains :-)
2085          */
2086         if (ipv4_is_zeronet(saddr))
2087                 goto martian_source;
2088
2089         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2090             ipv4_is_loopback(daddr))
2091                 goto martian_destination;
2092
2093         /*
2094          *      Now we are ready to route packet.
2095          */
2096         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2097                 if (!IN_DEV_FORWARD(in_dev))
2098                         goto e_hostunreach;
2099                 goto no_route;
2100         }
2101         free_res = 1;
2102
2103         RT_CACHE_STAT_INC(in_slow_tot);
2104
2105         if (res.type == RTN_BROADCAST)
2106                 goto brd_input;
2107
2108         if (res.type == RTN_LOCAL) {
2109                 int result;
2110                 result = fib_validate_source(saddr, daddr, tos,
2111                                              net->loopback_dev->ifindex,
2112                                              dev, &spec_dst, &itag);
2113                 if (result < 0)
2114                         goto martian_source;
2115                 if (result)
2116                         flags |= RTCF_DIRECTSRC;
2117                 spec_dst = daddr;
2118                 goto local_input;
2119         }
2120
2121         if (!IN_DEV_FORWARD(in_dev))
2122                 goto e_hostunreach;
2123         if (res.type != RTN_UNICAST)
2124                 goto martian_destination;
2125
2126         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2127 done:
2128         in_dev_put(in_dev);
2129         if (free_res)
2130                 fib_res_put(&res);
2131 out:    return err;
2132
2133 brd_input:
2134         if (skb->protocol != htons(ETH_P_IP))
2135                 goto e_inval;
2136
2137         if (ipv4_is_zeronet(saddr))
2138                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2139         else {
2140                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2141                                           &itag);
2142                 if (err < 0)
2143                         goto martian_source;
2144                 if (err)
2145                         flags |= RTCF_DIRECTSRC;
2146         }
2147         flags |= RTCF_BROADCAST;
2148         res.type = RTN_BROADCAST;
2149         RT_CACHE_STAT_INC(in_brd);
2150
2151 local_input:
2152         rth = dst_alloc(&ipv4_dst_ops);
2153         if (!rth)
2154                 goto e_nobufs;
2155
2156         rth->u.dst.output= ip_rt_bug;
2157         rth->rt_genid = rt_genid(net);
2158
2159         atomic_set(&rth->u.dst.__refcnt, 1);
2160         rth->u.dst.flags= DST_HOST;
2161         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2162                 rth->u.dst.flags |= DST_NOPOLICY;
2163         rth->fl.fl4_dst = daddr;
2164         rth->rt_dst     = daddr;
2165         rth->fl.fl4_tos = tos;
2166         rth->fl.mark    = skb->mark;
2167         rth->fl.fl4_src = saddr;
2168         rth->rt_src     = saddr;
2169 #ifdef CONFIG_NET_CLS_ROUTE
2170         rth->u.dst.tclassid = itag;
2171 #endif
2172         rth->rt_iif     =
2173         rth->fl.iif     = dev->ifindex;
2174         rth->u.dst.dev  = net->loopback_dev;
2175         dev_hold(rth->u.dst.dev);
2176         rth->idev       = in_dev_get(rth->u.dst.dev);
2177         rth->rt_gateway = daddr;
2178         rth->rt_spec_dst= spec_dst;
2179         rth->u.dst.input= ip_local_deliver;
2180         rth->rt_flags   = flags|RTCF_LOCAL;
2181         if (res.type == RTN_UNREACHABLE) {
2182                 rth->u.dst.input= ip_error;
2183                 rth->u.dst.error= -err;
2184                 rth->rt_flags   &= ~RTCF_LOCAL;
2185         }
2186         rth->rt_type    = res.type;
2187         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2188         err = rt_intern_hash(hash, rth, NULL, skb);
2189         goto done;
2190
2191 no_route:
2192         RT_CACHE_STAT_INC(in_no_route);
2193         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2194         res.type = RTN_UNREACHABLE;
2195         if (err == -ESRCH)
2196                 err = -ENETUNREACH;
2197         goto local_input;
2198
2199         /*
2200          *      Do not cache martian addresses: they should be logged (RFC1812)
2201          */
2202 martian_destination:
2203         RT_CACHE_STAT_INC(in_martian_dst);
2204 #ifdef CONFIG_IP_ROUTE_VERBOSE
2205         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2206                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2207                         &daddr, &saddr, dev->name);
2208 #endif
2209
2210 e_hostunreach:
2211         err = -EHOSTUNREACH;
2212         goto done;
2213
2214 e_inval:
2215         err = -EINVAL;
2216         goto done;
2217
2218 e_nobufs:
2219         err = -ENOBUFS;
2220         goto done;
2221
2222 martian_source:
2223         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2224         goto e_inval;
2225 }
2226
2227 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2228                    u8 tos, struct net_device *dev)
2229 {
2230         struct rtable * rth;
2231         unsigned        hash;
2232         int iif = dev->ifindex;
2233         struct net *net;
2234
2235         net = dev_net(dev);
2236
2237         if (!rt_caching(net))
2238                 goto skip_cache;
2239
2240         tos &= IPTOS_RT_MASK;
2241         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2242
2243         rcu_read_lock();
2244         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2245              rth = rcu_dereference(rth->u.dst.rt_next)) {
2246                 if (((rth->fl.fl4_dst ^ daddr) |
2247                      (rth->fl.fl4_src ^ saddr) |
2248                      (rth->fl.iif ^ iif) |
2249                      rth->fl.oif |
2250                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2251                     rth->fl.mark == skb->mark &&
2252                     net_eq(dev_net(rth->u.dst.dev), net) &&
2253                     !rt_is_expired(rth)) {
2254                         dst_use(&rth->u.dst, jiffies);
2255                         RT_CACHE_STAT_INC(in_hit);
2256                         rcu_read_unlock();
2257                         skb_dst_set(skb, &rth->u.dst);
2258                         return 0;
2259                 }
2260                 RT_CACHE_STAT_INC(in_hlist_search);
2261         }
2262         rcu_read_unlock();
2263
2264 skip_cache:
2265         /* Multicast recognition logic is moved from route cache to here.
2266            The problem was that too many Ethernet cards have broken/missing
2267            hardware multicast filters :-( As result the host on multicasting
2268            network acquires a lot of useless route cache entries, sort of
2269            SDR messages from all the world. Now we try to get rid of them.
2270            Really, provided software IP multicast filter is organized
2271            reasonably (at least, hashed), it does not result in a slowdown
2272            comparing with route cache reject entries.
2273            Note, that multicast routers are not affected, because
2274            route cache entry is created eventually.
2275          */
2276         if (ipv4_is_multicast(daddr)) {
2277                 struct in_device *in_dev;
2278
2279                 rcu_read_lock();
2280                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2281                         int our = ip_check_mc(in_dev, daddr, saddr,
2282                                 ip_hdr(skb)->protocol);
2283                         if (our
2284 #ifdef CONFIG_IP_MROUTE
2285                             || (!ipv4_is_local_multicast(daddr) &&
2286                                 IN_DEV_MFORWARD(in_dev))
2287 #endif
2288                             ) {
2289                                 rcu_read_unlock();
2290                                 return ip_route_input_mc(skb, daddr, saddr,
2291                                                          tos, dev, our);
2292                         }
2293                 }
2294                 rcu_read_unlock();
2295                 return -EINVAL;
2296         }
2297         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2298 }
2299
2300 static int __mkroute_output(struct rtable **result,
2301                             struct fib_result *res,
2302                             const struct flowi *fl,
2303                             const struct flowi *oldflp,
2304                             struct net_device *dev_out,
2305                             unsigned flags)
2306 {
2307         struct rtable *rth;
2308         struct in_device *in_dev;
2309         u32 tos = RT_FL_TOS(oldflp);
2310         int err = 0;
2311
2312         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2313                 return -EINVAL;
2314
2315         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2316                 res->type = RTN_BROADCAST;
2317         else if (ipv4_is_multicast(fl->fl4_dst))
2318                 res->type = RTN_MULTICAST;
2319         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2320                 return -EINVAL;
2321
2322         if (dev_out->flags & IFF_LOOPBACK)
2323                 flags |= RTCF_LOCAL;
2324
2325         /* get work reference to inet device */
2326         in_dev = in_dev_get(dev_out);
2327         if (!in_dev)
2328                 return -EINVAL;
2329
2330         if (res->type == RTN_BROADCAST) {
2331                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2332                 if (res->fi) {
2333                         fib_info_put(res->fi);
2334                         res->fi = NULL;
2335                 }
2336         } else if (res->type == RTN_MULTICAST) {
2337                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2338                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2339                                  oldflp->proto))
2340                         flags &= ~RTCF_LOCAL;
2341                 /* If multicast route do not exist use
2342                    default one, but do not gateway in this case.
2343                    Yes, it is hack.
2344                  */
2345                 if (res->fi && res->prefixlen < 4) {
2346                         fib_info_put(res->fi);
2347                         res->fi = NULL;
2348                 }
2349         }
2350
2351
2352         rth = dst_alloc(&ipv4_dst_ops);
2353         if (!rth) {
2354                 err = -ENOBUFS;
2355                 goto cleanup;
2356         }
2357
2358         atomic_set(&rth->u.dst.__refcnt, 1);
2359         rth->u.dst.flags= DST_HOST;
2360         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2361                 rth->u.dst.flags |= DST_NOXFRM;
2362         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2363                 rth->u.dst.flags |= DST_NOPOLICY;
2364
2365         rth->fl.fl4_dst = oldflp->fl4_dst;
2366         rth->fl.fl4_tos = tos;
2367         rth->fl.fl4_src = oldflp->fl4_src;
2368         rth->fl.oif     = oldflp->oif;
2369         rth->fl.mark    = oldflp->mark;
2370         rth->rt_dst     = fl->fl4_dst;
2371         rth->rt_src     = fl->fl4_src;
2372         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2373         /* get references to the devices that are to be hold by the routing
2374            cache entry */
2375         rth->u.dst.dev  = dev_out;
2376         dev_hold(dev_out);
2377         rth->idev       = in_dev_get(dev_out);
2378         rth->rt_gateway = fl->fl4_dst;
2379         rth->rt_spec_dst= fl->fl4_src;
2380
2381         rth->u.dst.output=ip_output;
2382         rth->rt_genid = rt_genid(dev_net(dev_out));
2383
2384         RT_CACHE_STAT_INC(out_slow_tot);
2385
2386         if (flags & RTCF_LOCAL) {
2387                 rth->u.dst.input = ip_local_deliver;
2388                 rth->rt_spec_dst = fl->fl4_dst;
2389         }
2390         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2391                 rth->rt_spec_dst = fl->fl4_src;
2392                 if (flags & RTCF_LOCAL &&
2393                     !(dev_out->flags & IFF_LOOPBACK)) {
2394                         rth->u.dst.output = ip_mc_output;
2395                         RT_CACHE_STAT_INC(out_slow_mc);
2396                 }
2397 #ifdef CONFIG_IP_MROUTE
2398                 if (res->type == RTN_MULTICAST) {
2399                         if (IN_DEV_MFORWARD(in_dev) &&
2400                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2401                                 rth->u.dst.input = ip_mr_input;
2402                                 rth->u.dst.output = ip_mc_output;
2403                         }
2404                 }
2405 #endif
2406         }
2407
2408         rt_set_nexthop(rth, res, 0);
2409
2410         rth->rt_flags = flags;
2411
2412         *result = rth;
2413  cleanup:
2414         /* release work reference to inet device */
2415         in_dev_put(in_dev);
2416
2417         return err;
2418 }
2419
2420 static int ip_mkroute_output(struct rtable **rp,
2421                              struct fib_result *res,
2422                              const struct flowi *fl,
2423                              const struct flowi *oldflp,
2424                              struct net_device *dev_out,
2425                              unsigned flags)
2426 {
2427         struct rtable *rth = NULL;
2428         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2429         unsigned hash;
2430         if (err == 0) {
2431                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2432                                rt_genid(dev_net(dev_out)));
2433                 err = rt_intern_hash(hash, rth, rp, NULL);
2434         }
2435
2436         return err;
2437 }
2438
2439 /*
2440  * Major route resolver routine.
2441  */
2442
2443 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2444                                 const struct flowi *oldflp)
2445 {
2446         u32 tos = RT_FL_TOS(oldflp);
2447         struct flowi fl = { .nl_u = { .ip4_u =
2448                                       { .daddr = oldflp->fl4_dst,
2449                                         .saddr = oldflp->fl4_src,
2450                                         .tos = tos & IPTOS_RT_MASK,
2451                                         .scope = ((tos & RTO_ONLINK) ?
2452                                                   RT_SCOPE_LINK :
2453                                                   RT_SCOPE_UNIVERSE),
2454                                       } },
2455                             .mark = oldflp->mark,
2456                             .iif = net->loopback_dev->ifindex,
2457                             .oif = oldflp->oif };
2458         struct fib_result res;
2459         unsigned flags = 0;
2460         struct net_device *dev_out = NULL;
2461         int free_res = 0;
2462         int err;
2463
2464
2465         res.fi          = NULL;
2466 #ifdef CONFIG_IP_MULTIPLE_TABLES
2467         res.r           = NULL;
2468 #endif
2469
2470         if (oldflp->fl4_src) {
2471                 err = -EINVAL;
2472                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2473                     ipv4_is_lbcast(oldflp->fl4_src) ||
2474                     ipv4_is_zeronet(oldflp->fl4_src))
2475                         goto out;
2476
2477                 /* I removed check for oif == dev_out->oif here.
2478                    It was wrong for two reasons:
2479                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2480                       is assigned to multiple interfaces.
2481                    2. Moreover, we are allowed to send packets with saddr
2482                       of another iface. --ANK
2483                  */
2484
2485                 if (oldflp->oif == 0
2486                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2487                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2488                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2489                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2490                         if (dev_out == NULL)
2491                                 goto out;
2492
2493                         /* Special hack: user can direct multicasts
2494                            and limited broadcast via necessary interface
2495                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2496                            This hack is not just for fun, it allows
2497                            vic,vat and friends to work.
2498                            They bind socket to loopback, set ttl to zero
2499                            and expect that it will work.
2500                            From the viewpoint of routing cache they are broken,
2501                            because we are not allowed to build multicast path
2502                            with loopback source addr (look, routing cache
2503                            cannot know, that ttl is zero, so that packet
2504                            will not leave this host and route is valid).
2505                            Luckily, this hack is good workaround.
2506                          */
2507
2508                         fl.oif = dev_out->ifindex;
2509                         goto make_route;
2510                 }
2511
2512                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2513                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2514                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2515                         if (dev_out == NULL)
2516                                 goto out;
2517                         dev_put(dev_out);
2518                         dev_out = NULL;
2519                 }
2520         }
2521
2522
2523         if (oldflp->oif) {
2524                 dev_out = dev_get_by_index(net, oldflp->oif);
2525                 err = -ENODEV;
2526                 if (dev_out == NULL)
2527                         goto out;
2528
2529                 /* RACE: Check return value of inet_select_addr instead. */
2530                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2531                         dev_put(dev_out);
2532                         goto out;       /* Wrong error code */
2533                 }
2534
2535                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2536                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2537                         if (!fl.fl4_src)
2538                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2539                                                               RT_SCOPE_LINK);
2540                         goto make_route;
2541                 }
2542                 if (!fl.fl4_src) {
2543                         if (ipv4_is_multicast(oldflp->fl4_dst))
2544                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2545                                                               fl.fl4_scope);
2546                         else if (!oldflp->fl4_dst)
2547                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2548                                                               RT_SCOPE_HOST);
2549                 }
2550         }
2551
2552         if (!fl.fl4_dst) {
2553                 fl.fl4_dst = fl.fl4_src;
2554                 if (!fl.fl4_dst)
2555                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2556                 if (dev_out)
2557                         dev_put(dev_out);
2558                 dev_out = net->loopback_dev;
2559                 dev_hold(dev_out);
2560                 fl.oif = net->loopback_dev->ifindex;
2561                 res.type = RTN_LOCAL;
2562                 flags |= RTCF_LOCAL;
2563                 goto make_route;
2564         }
2565
2566         if (fib_lookup(net, &fl, &res)) {
2567                 res.fi = NULL;
2568                 if (oldflp->oif) {
2569                         /* Apparently, routing tables are wrong. Assume,
2570                            that the destination is on link.
2571
2572                            WHY? DW.
2573                            Because we are allowed to send to iface
2574                            even if it has NO routes and NO assigned
2575                            addresses. When oif is specified, routing
2576                            tables are looked up with only one purpose:
2577                            to catch if destination is gatewayed, rather than
2578                            direct. Moreover, if MSG_DONTROUTE is set,
2579                            we send packet, ignoring both routing tables
2580                            and ifaddr state. --ANK
2581
2582
2583                            We could make it even if oif is unknown,
2584                            likely IPv6, but we do not.
2585                          */
2586
2587                         if (fl.fl4_src == 0)
2588                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2589                                                               RT_SCOPE_LINK);
2590                         res.type = RTN_UNICAST;
2591                         goto make_route;
2592                 }
2593                 if (dev_out)
2594                         dev_put(dev_out);
2595                 err = -ENETUNREACH;
2596                 goto out;
2597         }
2598         free_res = 1;
2599
2600         if (res.type == RTN_LOCAL) {
2601                 if (!fl.fl4_src)
2602                         fl.fl4_src = fl.fl4_dst;
2603                 if (dev_out)
2604                         dev_put(dev_out);
2605                 dev_out = net->loopback_dev;
2606                 dev_hold(dev_out);
2607                 fl.oif = dev_out->ifindex;
2608                 if (res.fi)
2609                         fib_info_put(res.fi);
2610                 res.fi = NULL;
2611                 flags |= RTCF_LOCAL;
2612                 goto make_route;
2613         }
2614
2615 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2616         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2617                 fib_select_multipath(&fl, &res);
2618         else
2619 #endif
2620         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2621                 fib_select_default(net, &fl, &res);
2622
2623         if (!fl.fl4_src)
2624                 fl.fl4_src = FIB_RES_PREFSRC(res);
2625
2626         if (dev_out)
2627                 dev_put(dev_out);
2628         dev_out = FIB_RES_DEV(res);
2629         dev_hold(dev_out);
2630         fl.oif = dev_out->ifindex;
2631
2632
2633 make_route:
2634         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2635
2636
2637         if (free_res)
2638                 fib_res_put(&res);
2639         if (dev_out)
2640                 dev_put(dev_out);
2641 out:    return err;
2642 }
2643
2644 int __ip_route_output_key(struct net *net, struct rtable **rp,
2645                           const struct flowi *flp)
2646 {
2647         unsigned hash;
2648         struct rtable *rth;
2649
2650         if (!rt_caching(net))
2651                 goto slow_output;
2652
2653         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2654
2655         rcu_read_lock_bh();
2656         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2657                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2658                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2659                     rth->fl.fl4_src == flp->fl4_src &&
2660                     rth->fl.iif == 0 &&
2661                     rth->fl.oif == flp->oif &&
2662                     rth->fl.mark == flp->mark &&
2663                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2664                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2665                     net_eq(dev_net(rth->u.dst.dev), net) &&
2666                     !rt_is_expired(rth)) {
2667                         dst_use(&rth->u.dst, jiffies);
2668                         RT_CACHE_STAT_INC(out_hit);
2669                         rcu_read_unlock_bh();
2670                         *rp = rth;
2671                         return 0;
2672                 }
2673                 RT_CACHE_STAT_INC(out_hlist_search);
2674         }
2675         rcu_read_unlock_bh();
2676
2677 slow_output:
2678         return ip_route_output_slow(net, rp, flp);
2679 }
2680
2681 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2682
2683 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2684 {
2685 }
2686
2687 static struct dst_ops ipv4_dst_blackhole_ops = {
2688         .family                 =       AF_INET,
2689         .protocol               =       cpu_to_be16(ETH_P_IP),
2690         .destroy                =       ipv4_dst_destroy,
2691         .check                  =       ipv4_dst_check,
2692         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2693         .entries                =       ATOMIC_INIT(0),
2694 };
2695
2696
2697 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2698 {
2699         struct rtable *ort = *rp;
2700         struct rtable *rt = (struct rtable *)
2701                 dst_alloc(&ipv4_dst_blackhole_ops);
2702
2703         if (rt) {
2704                 struct dst_entry *new = &rt->u.dst;
2705
2706                 atomic_set(&new->__refcnt, 1);
2707                 new->__use = 1;
2708                 new->input = dst_discard;
2709                 new->output = dst_discard;
2710                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2711
2712                 new->dev = ort->u.dst.dev;
2713                 if (new->dev)
2714                         dev_hold(new->dev);
2715
2716                 rt->fl = ort->fl;
2717
2718                 rt->idev = ort->idev;
2719                 if (rt->idev)
2720                         in_dev_hold(rt->idev);
2721                 rt->rt_genid = rt_genid(net);
2722                 rt->rt_flags = ort->rt_flags;
2723                 rt->rt_type = ort->rt_type;
2724                 rt->rt_dst = ort->rt_dst;
2725                 rt->rt_src = ort->rt_src;
2726                 rt->rt_iif = ort->rt_iif;
2727                 rt->rt_gateway = ort->rt_gateway;
2728                 rt->rt_spec_dst = ort->rt_spec_dst;
2729                 rt->peer = ort->peer;
2730                 if (rt->peer)
2731                         atomic_inc(&rt->peer->refcnt);
2732
2733                 dst_free(new);
2734         }
2735
2736         dst_release(&(*rp)->u.dst);
2737         *rp = rt;
2738         return (rt ? 0 : -ENOMEM);
2739 }
2740
2741 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2742                          struct sock *sk, int flags)
2743 {
2744         int err;
2745
2746         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2747                 return err;
2748
2749         if (flp->proto) {
2750                 if (!flp->fl4_src)
2751                         flp->fl4_src = (*rp)->rt_src;
2752                 if (!flp->fl4_dst)
2753                         flp->fl4_dst = (*rp)->rt_dst;
2754                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2755                                     flags ? XFRM_LOOKUP_WAIT : 0);
2756                 if (err == -EREMOTE)
2757                         err = ipv4_dst_blackhole(net, rp, flp);
2758
2759                 return err;
2760         }
2761
2762         return 0;
2763 }
2764
2765 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2766
2767 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2768 {
2769         return ip_route_output_flow(net, rp, flp, NULL, 0);
2770 }
2771
2772 static int rt_fill_info(struct net *net,
2773                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2774                         int nowait, unsigned int flags)
2775 {
2776         struct rtable *rt = skb_rtable(skb);
2777         struct rtmsg *r;
2778         struct nlmsghdr *nlh;
2779         long expires;
2780         u32 id = 0, ts = 0, tsage = 0, error;
2781
2782         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2783         if (nlh == NULL)
2784                 return -EMSGSIZE;
2785
2786         r = nlmsg_data(nlh);
2787         r->rtm_family    = AF_INET;
2788         r->rtm_dst_len  = 32;
2789         r->rtm_src_len  = 0;
2790         r->rtm_tos      = rt->fl.fl4_tos;
2791         r->rtm_table    = RT_TABLE_MAIN;
2792         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2793         r->rtm_type     = rt->rt_type;
2794         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2795         r->rtm_protocol = RTPROT_UNSPEC;
2796         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2797         if (rt->rt_flags & RTCF_NOTIFY)
2798                 r->rtm_flags |= RTM_F_NOTIFY;
2799
2800         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2801
2802         if (rt->fl.fl4_src) {
2803                 r->rtm_src_len = 32;
2804                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2805         }
2806         if (rt->u.dst.dev)
2807                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2808 #ifdef CONFIG_NET_CLS_ROUTE
2809         if (rt->u.dst.tclassid)
2810                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2811 #endif
2812         if (rt->fl.iif)
2813                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2814         else if (rt->rt_src != rt->fl.fl4_src)
2815                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2816
2817         if (rt->rt_dst != rt->rt_gateway)
2818                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2819
2820         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2821                 goto nla_put_failure;
2822
2823         error = rt->u.dst.error;
2824         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2825         if (rt->peer) {
2826                 id = rt->peer->ip_id_count;
2827                 if (rt->peer->tcp_ts_stamp) {
2828                         ts = rt->peer->tcp_ts;
2829                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2830                 }
2831         }
2832
2833         if (rt->fl.iif) {
2834 #ifdef CONFIG_IP_MROUTE
2835                 __be32 dst = rt->rt_dst;
2836
2837                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2838                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2839                         int err = ipmr_get_route(net, skb, r, nowait);
2840                         if (err <= 0) {
2841                                 if (!nowait) {
2842                                         if (err == 0)
2843                                                 return 0;
2844                                         goto nla_put_failure;
2845                                 } else {
2846                                         if (err == -EMSGSIZE)
2847                                                 goto nla_put_failure;
2848                                         error = err;
2849                                 }
2850                         }
2851                 } else
2852 #endif
2853                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2854         }
2855
2856         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2857                                expires, error) < 0)
2858                 goto nla_put_failure;
2859
2860         return nlmsg_end(skb, nlh);
2861
2862 nla_put_failure:
2863         nlmsg_cancel(skb, nlh);
2864         return -EMSGSIZE;
2865 }
2866
2867 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2868 {
2869         struct net *net = sock_net(in_skb->sk);
2870         struct rtmsg *rtm;
2871         struct nlattr *tb[RTA_MAX+1];
2872         struct rtable *rt = NULL;
2873         __be32 dst = 0;
2874         __be32 src = 0;
2875         u32 iif;
2876         int err;
2877         struct sk_buff *skb;
2878
2879         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2880         if (err < 0)
2881                 goto errout;
2882
2883         rtm = nlmsg_data(nlh);
2884
2885         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2886         if (skb == NULL) {
2887                 err = -ENOBUFS;
2888                 goto errout;
2889         }
2890
2891         /* Reserve room for dummy headers, this skb can pass
2892            through good chunk of routing engine.
2893          */
2894         skb_reset_mac_header(skb);
2895         skb_reset_network_header(skb);
2896
2897         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2898         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2899         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2900
2901         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2902         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2903         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2904
2905         if (iif) {
2906                 struct net_device *dev;
2907
2908                 dev = __dev_get_by_index(net, iif);
2909                 if (dev == NULL) {
2910                         err = -ENODEV;
2911                         goto errout_free;
2912                 }
2913
2914                 skb->protocol   = htons(ETH_P_IP);
2915                 skb->dev        = dev;
2916                 local_bh_disable();
2917                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2918                 local_bh_enable();
2919
2920                 rt = skb_rtable(skb);
2921                 if (err == 0 && rt->u.dst.error)
2922                         err = -rt->u.dst.error;
2923         } else {
2924                 struct flowi fl = {
2925                         .nl_u = {
2926                                 .ip4_u = {
2927                                         .daddr = dst,
2928                                         .saddr = src,
2929                                         .tos = rtm->rtm_tos,
2930                                 },
2931                         },
2932                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2933                 };
2934                 err = ip_route_output_key(net, &rt, &fl);
2935         }
2936
2937         if (err)
2938                 goto errout_free;
2939
2940         skb_dst_set(skb, &rt->u.dst);
2941         if (rtm->rtm_flags & RTM_F_NOTIFY)
2942                 rt->rt_flags |= RTCF_NOTIFY;
2943
2944         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2945                            RTM_NEWROUTE, 0, 0);
2946         if (err <= 0)
2947                 goto errout_free;
2948
2949         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2950 errout:
2951         return err;
2952
2953 errout_free:
2954         kfree_skb(skb);
2955         goto errout;
2956 }
2957
2958 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2959 {
2960         struct rtable *rt;
2961         int h, s_h;
2962         int idx, s_idx;
2963         struct net *net;
2964
2965         net = sock_net(skb->sk);
2966
2967         s_h = cb->args[0];
2968         if (s_h < 0)
2969                 s_h = 0;
2970         s_idx = idx = cb->args[1];
2971         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2972                 if (!rt_hash_table[h].chain)
2973                         continue;
2974                 rcu_read_lock_bh();
2975                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2976                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2977                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2978                                 continue;
2979                         if (rt_is_expired(rt))
2980                                 continue;
2981                         skb_dst_set(skb, dst_clone(&rt->u.dst));
2982                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2983                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2984                                          1, NLM_F_MULTI) <= 0) {
2985                                 skb_dst_drop(skb);
2986                                 rcu_read_unlock_bh();
2987                                 goto done;
2988                         }
2989                         skb_dst_drop(skb);
2990                 }
2991                 rcu_read_unlock_bh();
2992         }
2993
2994 done:
2995         cb->args[0] = h;
2996         cb->args[1] = idx;
2997         return skb->len;
2998 }
2999
3000 void ip_rt_multicast_event(struct in_device *in_dev)
3001 {
3002         rt_cache_flush(dev_net(in_dev->dev), 0);
3003 }
3004
3005 #ifdef CONFIG_SYSCTL
3006 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3007                                         struct file *filp, void __user *buffer,
3008                                         size_t *lenp, loff_t *ppos)
3009 {
3010         if (write) {
3011                 int flush_delay;
3012                 ctl_table ctl;
3013                 struct net *net;
3014
3015                 memcpy(&ctl, __ctl, sizeof(ctl));
3016                 ctl.data = &flush_delay;
3017                 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3018
3019                 net = (struct net *)__ctl->extra1;
3020                 rt_cache_flush(net, flush_delay);
3021                 return 0;
3022         }
3023
3024         return -EINVAL;
3025 }
3026
3027 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3028                                                 void __user *oldval,
3029                                                 size_t __user *oldlenp,
3030                                                 void __user *newval,
3031                                                 size_t newlen)
3032 {
3033         int delay;
3034         struct net *net;
3035         if (newlen != sizeof(int))
3036                 return -EINVAL;
3037         if (get_user(delay, (int __user *)newval))
3038                 return -EFAULT;
3039         net = (struct net *)table->extra1;
3040         rt_cache_flush(net, delay);
3041         return 0;
3042 }
3043
3044 static void rt_secret_reschedule(int old)
3045 {
3046         struct net *net;
3047         int new = ip_rt_secret_interval;
3048         int diff = new - old;
3049
3050         if (!diff)
3051                 return;
3052
3053         rtnl_lock();
3054         for_each_net(net) {
3055                 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3056
3057                 if (!new)
3058                         continue;
3059
3060                 if (deleted) {
3061                         long time = net->ipv4.rt_secret_timer.expires - jiffies;
3062
3063                         if (time <= 0 || (time += diff) <= 0)
3064                                 time = 0;
3065
3066                         net->ipv4.rt_secret_timer.expires = time;
3067                 } else
3068                         net->ipv4.rt_secret_timer.expires = new;
3069
3070                 net->ipv4.rt_secret_timer.expires += jiffies;
3071                 add_timer(&net->ipv4.rt_secret_timer);
3072         }
3073         rtnl_unlock();
3074 }
3075
3076 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3077                                           struct file *filp,
3078                                           void __user *buffer, size_t *lenp,
3079                                           loff_t *ppos)
3080 {
3081         int old = ip_rt_secret_interval;
3082         int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3083
3084         rt_secret_reschedule(old);
3085
3086         return ret;
3087 }
3088
3089 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3090                                                    void __user *oldval,
3091                                                    size_t __user *oldlenp,
3092                                                    void __user *newval,
3093                                                    size_t newlen)
3094 {
3095         int old = ip_rt_secret_interval;
3096         int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3097
3098         rt_secret_reschedule(old);
3099
3100         return ret;
3101 }
3102
3103 static ctl_table ipv4_route_table[] = {
3104         {
3105                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
3106                 .procname       = "gc_thresh",
3107                 .data           = &ipv4_dst_ops.gc_thresh,
3108                 .maxlen         = sizeof(int),
3109                 .mode           = 0644,
3110                 .proc_handler   = proc_dointvec,
3111         },
3112         {
3113                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3114                 .procname       = "max_size",
3115                 .data           = &ip_rt_max_size,
3116                 .maxlen         = sizeof(int),
3117                 .mode           = 0644,
3118                 .proc_handler   = proc_dointvec,
3119         },
3120         {
3121                 /*  Deprecated. Use gc_min_interval_ms */
3122
3123                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3124                 .procname       = "gc_min_interval",
3125                 .data           = &ip_rt_gc_min_interval,
3126                 .maxlen         = sizeof(int),
3127                 .mode           = 0644,
3128                 .proc_handler   = proc_dointvec_jiffies,
3129                 .strategy       = sysctl_jiffies,
3130         },
3131         {
3132                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3133                 .procname       = "gc_min_interval_ms",
3134                 .data           = &ip_rt_gc_min_interval,
3135                 .maxlen         = sizeof(int),
3136                 .mode           = 0644,
3137                 .proc_handler   = proc_dointvec_ms_jiffies,
3138                 .strategy       = sysctl_ms_jiffies,
3139         },
3140         {
3141                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3142                 .procname       = "gc_timeout",
3143                 .data           = &ip_rt_gc_timeout,
3144                 .maxlen         = sizeof(int),
3145                 .mode           = 0644,
3146                 .proc_handler   = proc_dointvec_jiffies,
3147                 .strategy       = sysctl_jiffies,
3148         },
3149         {
3150                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3151                 .procname       = "gc_interval",
3152                 .data           = &ip_rt_gc_interval,
3153                 .maxlen         = sizeof(int),
3154                 .mode           = 0644,
3155                 .proc_handler   = proc_dointvec_jiffies,
3156                 .strategy       = sysctl_jiffies,
3157         },
3158         {
3159                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3160                 .procname       = "redirect_load",
3161                 .data           = &ip_rt_redirect_load,
3162                 .maxlen         = sizeof(int),
3163                 .mode           = 0644,
3164                 .proc_handler   = proc_dointvec,
3165         },
3166         {
3167                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3168                 .procname       = "redirect_number",
3169                 .data           = &ip_rt_redirect_number,
3170                 .maxlen         = sizeof(int),
3171                 .mode           = 0644,
3172                 .proc_handler   = proc_dointvec,
3173         },
3174         {
3175                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3176                 .procname       = "redirect_silence",
3177                 .data           = &ip_rt_redirect_silence,
3178                 .maxlen         = sizeof(int),
3179                 .mode           = 0644,
3180                 .proc_handler   = proc_dointvec,
3181         },
3182         {
3183                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3184                 .procname       = "error_cost",
3185                 .data           = &ip_rt_error_cost,
3186                 .maxlen         = sizeof(int),
3187                 .mode           = 0644,
3188                 .proc_handler   = proc_dointvec,
3189         },
3190         {
3191                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3192                 .procname       = "error_burst",
3193                 .data           = &ip_rt_error_burst,
3194                 .maxlen         = sizeof(int),
3195                 .mode           = 0644,
3196                 .proc_handler   = proc_dointvec,
3197         },
3198         {
3199                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3200                 .procname       = "gc_elasticity",
3201                 .data           = &ip_rt_gc_elasticity,
3202                 .maxlen         = sizeof(int),
3203                 .mode           = 0644,
3204                 .proc_handler   = proc_dointvec,
3205         },
3206         {
3207                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3208                 .procname       = "mtu_expires",
3209                 .data           = &ip_rt_mtu_expires,
3210                 .maxlen         = sizeof(int),
3211                 .mode           = 0644,
3212                 .proc_handler   = proc_dointvec_jiffies,
3213                 .strategy       = sysctl_jiffies,
3214         },
3215         {
3216                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3217                 .procname       = "min_pmtu",
3218                 .data           = &ip_rt_min_pmtu,
3219                 .maxlen         = sizeof(int),
3220                 .mode           = 0644,
3221                 .proc_handler   = proc_dointvec,
3222         },
3223         {
3224                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3225                 .procname       = "min_adv_mss",
3226                 .data           = &ip_rt_min_advmss,
3227                 .maxlen         = sizeof(int),
3228                 .mode           = 0644,
3229                 .proc_handler   = proc_dointvec,
3230         },
3231         {
3232                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3233                 .procname       = "secret_interval",
3234                 .data           = &ip_rt_secret_interval,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = ipv4_sysctl_rt_secret_interval,
3238                 .strategy       = ipv4_sysctl_rt_secret_interval_strategy,
3239         },
3240         { .ctl_name = 0 }
3241 };
3242
3243 static struct ctl_table empty[1];
3244
3245 static struct ctl_table ipv4_skeleton[] =
3246 {
3247         { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3248           .mode = 0555, .child = ipv4_route_table},
3249         { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3250           .mode = 0555, .child = empty},
3251         { }
3252 };
3253
3254 static __net_initdata struct ctl_path ipv4_path[] = {
3255         { .procname = "net", .ctl_name = CTL_NET, },
3256         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3257         { },
3258 };
3259
3260 static struct ctl_table ipv4_route_flush_table[] = {
3261         {
3262                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3263                 .procname       = "flush",
3264                 .maxlen         = sizeof(int),
3265                 .mode           = 0200,
3266                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3267                 .strategy       = ipv4_sysctl_rtcache_flush_strategy,
3268         },
3269         { .ctl_name = 0 },
3270 };
3271
3272 static __net_initdata struct ctl_path ipv4_route_path[] = {
3273         { .procname = "net", .ctl_name = CTL_NET, },
3274         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3275         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3276         { },
3277 };
3278
3279 static __net_init int sysctl_route_net_init(struct net *net)
3280 {
3281         struct ctl_table *tbl;
3282
3283         tbl = ipv4_route_flush_table;
3284         if (net != &init_net) {
3285                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3286                 if (tbl == NULL)
3287                         goto err_dup;
3288         }
3289         tbl[0].extra1 = net;
3290
3291         net->ipv4.route_hdr =
3292                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3293         if (net->ipv4.route_hdr == NULL)
3294                 goto err_reg;
3295         return 0;
3296
3297 err_reg:
3298         if (tbl != ipv4_route_flush_table)
3299                 kfree(tbl);
3300 err_dup:
3301         return -ENOMEM;
3302 }
3303
3304 static __net_exit void sysctl_route_net_exit(struct net *net)
3305 {
3306         struct ctl_table *tbl;
3307
3308         tbl = net->ipv4.route_hdr->ctl_table_arg;
3309         unregister_net_sysctl_table(net->ipv4.route_hdr);
3310         BUG_ON(tbl == ipv4_route_flush_table);
3311         kfree(tbl);
3312 }
3313
3314 static __net_initdata struct pernet_operations sysctl_route_ops = {
3315         .init = sysctl_route_net_init,
3316         .exit = sysctl_route_net_exit,
3317 };
3318 #endif
3319
3320
3321 static __net_init int rt_secret_timer_init(struct net *net)
3322 {
3323         atomic_set(&net->ipv4.rt_genid,
3324                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3325                         (jiffies ^ (jiffies >> 7))));
3326
3327         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3328         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3329         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3330
3331         if (ip_rt_secret_interval) {
3332                 net->ipv4.rt_secret_timer.expires =
3333                         jiffies + net_random() % ip_rt_secret_interval +
3334                         ip_rt_secret_interval;
3335                 add_timer(&net->ipv4.rt_secret_timer);
3336         }
3337         return 0;
3338 }
3339
3340 static __net_exit void rt_secret_timer_exit(struct net *net)
3341 {
3342         del_timer_sync(&net->ipv4.rt_secret_timer);
3343 }
3344
3345 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3346         .init = rt_secret_timer_init,
3347         .exit = rt_secret_timer_exit,
3348 };
3349
3350
3351 #ifdef CONFIG_NET_CLS_ROUTE
3352 struct ip_rt_acct *ip_rt_acct __read_mostly;
3353 #endif /* CONFIG_NET_CLS_ROUTE */
3354
3355 static __initdata unsigned long rhash_entries;
3356 static int __init set_rhash_entries(char *str)
3357 {
3358         if (!str)
3359                 return 0;
3360         rhash_entries = simple_strtoul(str, &str, 0);
3361         return 1;
3362 }
3363 __setup("rhash_entries=", set_rhash_entries);
3364
3365 int __init ip_rt_init(void)
3366 {
3367         int rc = 0;
3368
3369 #ifdef CONFIG_NET_CLS_ROUTE
3370         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3371         if (!ip_rt_acct)
3372                 panic("IP: failed to allocate ip_rt_acct\n");
3373 #endif
3374
3375         ipv4_dst_ops.kmem_cachep =
3376                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3377                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3378
3379         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3380
3381         rt_hash_table = (struct rt_hash_bucket *)
3382                 alloc_large_system_hash("IP route cache",
3383                                         sizeof(struct rt_hash_bucket),
3384                                         rhash_entries,
3385                                         (num_physpages >= 128 * 1024) ?
3386                                         15 : 17,
3387                                         0,
3388                                         &rt_hash_log,
3389                                         &rt_hash_mask,
3390                                         rhash_entries ? 0 : 512 * 1024);
3391         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3392         rt_hash_lock_init();
3393
3394         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3395         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3396
3397         devinet_init();
3398         ip_fib_init();
3399
3400         /* All the timers, started at system startup tend
3401            to synchronize. Perturb it a bit.
3402          */
3403         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3404         expires_ljiffies = jiffies;
3405         schedule_delayed_work(&expires_work,
3406                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3407
3408         if (register_pernet_subsys(&rt_secret_timer_ops))
3409                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3410
3411         if (ip_rt_proc_init())
3412                 printk(KERN_ERR "Unable to create route proc files\n");
3413 #ifdef CONFIG_XFRM
3414         xfrm_init();
3415         xfrm4_init();
3416 #endif
3417         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3418
3419 #ifdef CONFIG_SYSCTL
3420         register_pernet_subsys(&sysctl_route_ops);
3421 #endif
3422         return rc;
3423 }
3424
3425 #ifdef CONFIG_SYSCTL
3426 /*
3427  * We really need to sanitize the damn ipv4 init order, then all
3428  * this nonsense will go away.
3429  */
3430 void __init ip_static_sysctl_init(void)
3431 {
3432         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3433 }
3434 #endif
3435
3436 EXPORT_SYMBOL(__ip_select_ident);
3437 EXPORT_SYMBOL(ip_route_input);
3438 EXPORT_SYMBOL(ip_route_output_key);