/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <linux/sysctl.h>
#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
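
/*
 * A note on RT_FL_TOS: it keeps only the TOS bits that matter for
 * routing (IPTOS_RT_MASK) plus RTO_ONLINK, which rides in an otherwise
 * unused low TOS bit to request a link-scope route.
 */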
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
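
/*
 * All of the above intervals are in jiffies; in a full tree they are
 * exposed for tuning through the net.ipv4.route sysctl table (not part
 * of this excerpt).  Note that ip_rt_min_pmtu is 512 bytes of payload
 * plus a 20-byte IP header plus a 20-byte TCP header = 552, the
 * classical minimum path MTU.
 */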
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};
#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
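
/*
 * ip_tos2prio is indexed by (tos & IPTOS_TOS_MASK) >> 1: the low bit of
 * the index is the old "minimise monetary cost" bit, reused by ECN,
 * hence the ECN_OR_COST() naming of every second entry.  For example,
 * IPTOS_LOWDELAY (0x10) indexes entry 8 and maps to TC_PRIO_INTERACTIVE.
 */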
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    RCU read-side lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
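
/*
 * A sketch of the resulting pattern (it matches the lookup loops later
 * in this file): readers walk a chain under rcu_read_lock_bh() using
 * rcu_dereference(), taking a reference with dst_hold() on a hit, while
 * writers take the per-chain spinlock below before unlinking:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.rt_next))
 *		...
 *	rcu_read_unlock_bh();
 */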
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ	4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ	2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ	1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ	512
#else
#define RT_HASH_LOCK_SZ	256
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
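
/*
 * Keying the hash with rt_hash_rnd, which is re-randomized periodically
 * by the secret timer below, makes the chain a given flow lands in
 * unpredictable, guarding the cache against remotely induced hash
 * collisions.
 */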
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}
static struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */
static __inline__ void rt_free(struct rtable *rt)
{
	multipath_remove(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	multipath_remove(rt);
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
	       fl1->oif == fl2->oif &&
	       fl1->iif == fl2->iif;
}
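
/*
 * Note: the memcmp covers the whole IPv4 flow key (addresses, TOS and,
 * when configured, the firewall mark), so two cache entries compare
 * equal only if every field of the flow agrees.
 */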
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	int passedexpired = 0;
	struct rtable **nextstep = NULL;
	struct rtable **rthp = chain_head;
	struct rtable *rth;

	if (removed_count)
		*removed_count = 0;

	while ((rth = *rthp) != NULL) {
		if (rth == expentry)
			passedexpired = 1;

		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
			if (*rthp == expentry) {
				*rthp = rth->u.rt_next;
				continue;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
				if (removed_count)
					++(*removed_count);
			}
		} else {
			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
			    passedexpired && !nextstep)
				nextstep = &rth->u.rt_next;

			rthp = &rth->u.rt_next;
		}
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static int rover;
	int i = rover, t;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
	     t -= ip_rt_gc_timeout) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		spin_lock(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Clean up aged-off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
			/* remove all related balanced entries if necessary */
			if (rth->u.dst.flags & DST_BALANCED) {
				rthp = rt_remove_balanced_route(
					&rt_hash_table[i].chain,
					rth, NULL);
				if (!rthp)
					break;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
			}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			*rthp = rth->u.rt_next;
			rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
		}
		spin_unlock(rt_hash_lock_addr(i));

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}
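
/*
 * Scan budget, worked through: each run visits roughly
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets, and
 * runs repeat every ip_rt_gc_interval, so the whole table is covered
 * about once per ip_rt_gc_timeout regardless of its size.
 */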
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);
void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	/* flush existing multipath state */
	multipath_flush();

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached yet, prolong the
		   timer to "delay", otherwise fire it at the deadline.
		 */

		if (user_mode && tmo < ip_rt_max_delay - ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now + delay);
	spin_unlock_bh(&rt_flush_lock);
}
static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
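
/*
 * In rt_garbage_collect() below this shows up as: "expire" is halved on
 * each pass that misses its eviction goal, and is credited back by
 * ip_rt_gc_min_interval (capped at ip_rt_gc_timeout) once the cache is
 * comfortably below ip_rt_max_size again.
 */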
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries that we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
				/* remove all related balanced entries
				 * if necessary
				 */
				if (rth->u.dst.flags & DST_BALANCED) {
					int r;

					rthp = rt_remove_balanced_route(
						&rt_hash_table[k].chain,
						rth,
						&r);
					goal -= r;
					if (!rthp)
						break;
				} else {
					*rthp = rth->u.rt_next;
					rt_free(rth);
					goal--;
				}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
				*rthp = rth->u.rt_next;
				rt_free(rth);
				goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero (otherwise it is halved).
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
		if (!(rth->u.dst.flags & DST_BALANCED) &&
		    compare_keys(&rth->fl, &rt->fl)) {
#else
		if (compare_keys(&rth->fl, &rt->fl)) {
#endif
			/* Put it first */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind the route to an arp entry only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* The neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it is most likely holding some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
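
/*
 * Note the eviction policy above: while walking the chain we remember
 * the unreferenced entry with the lowest rt_score(), and drop it once
 * the chain grows past ip_rt_gc_elasticity, so an insertion never lets
 * a bucket grow without bound.
 */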
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
						     rt->fl.fl4_src ^
							(rt->fl.oif << 5),
						     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
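
/*
 * Worked example, assuming HZ=1000: the k-th redirect is load-limited
 * by ip_rt_redirect_load << k, i.e. 20ms, 40ms, 80ms, ... for up to
 * ip_rt_redirect_number (9) redirects, after which we go silent until
 * ip_rt_redirect_silence ((HZ/50) << 10, about 20.5 seconds) passes
 * without further packets requiring a redirect.
 */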
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
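
/*
 * The above is a classic token bucket: rate_tokens accumulates elapsed
 * jiffies (capped at ip_rt_error_burst, five seconds' worth) and each
 * ICMP error sent spends ip_rt_error_cost (one second's worth), i.e. a
 * sustained rate of at most one ICMP per second with bursts of up to
 * five.
 */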
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
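
/*
 * The table is the RFC 1191 plateau list: e.g. a pre-RFC1191 "frag
 * needed" ICMP carrying no next-hop MTU for a 1500-byte datagram
 * guesses the next plateau down, 1492 (Ethernet with LLC/SNAP), while
 * 68 is the minimum valid IPv4 MTU.
 */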
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
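
/*
 * Clamping learned PMTUs below ip_rt_min_pmtu (552 by default) and then
 * locking the metric means a forged "frag needed" cannot wedge a path
 * MTU below the minimum: e.g. an advertised MTU of 100 is recorded as
 * 552, and RTAX_MTU is locked against further changes until the entry
 * expires.
 */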
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != &loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
						 ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     u32 daddr,
				     u32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  u32 daddr, u32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 spec_dst, itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
static inline int ip_mkroute_input_def(struct sk_buff *skb,
				       struct fib_result* res,
				       const struct flowi *fl,
				       struct in_device *in_dev,
				       u32 daddr, u32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;
	atomic_set(&rth->u.dst.__refcnt, 1);

	/* put it into the cache */
	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL;
	unsigned char hop, hopcount, lasthop;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	lasthop = hopcount - 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));

		/* only for the last hop is the reference count handled
		 * outside
		 */
		if (hop == lasthop)
			atomic_set(&(skb->dst->__refcnt), 1);
	}
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output
 *	routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic was moved from the route cache to
	   here.  The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-(  As a result, a
	   host on a multicast network may acquire a lot of useless route
	   cache entries (e.g. for SDR messages from all over the world).
	   Now we try to get rid of them.  Really, provided the software
	   IP multicast filter is organized reasonably (at least, hashed),
	   this does not result in a slowdown compared with route cache
	   reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == 0xFFFFFFFF)
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If no multicast route exists, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi) {
		rth->rt_multipath_alg = res->fi->fib_mp_alg;
		if (res->fi->fib_nhs > 1)
			rth->u.dst.flags |= DST_BALANCED;
	}
#endif
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	   routing cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);
	return err;
}
static inline int ip_mkroute_output_def(struct rtable **rp,
					struct fib_result* res,
					const struct flowi *fl,
					const struct flowi *oldflp,
					struct net_device *dev_out,
					unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		u32 tos = RT_FL_TOS(oldflp);

		atomic_set(&rth->u.dst.__refcnt, 1);

		hash = rt_hash_code(oldflp->fl4_dst,
				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		atomic_set(&(*rp)->u.dst.__refcnt, 1);
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = oldflp->fl4_fwmark
#endif
				      } },
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   that the packet will not leave this host and the
			   route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;
		if (__in_dev_get(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		if (fl.fl4_src)
			fl.fl4_dst = fl.fl4_src;
		else
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both the routing
			   tables and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {

			/* check for multipath routes and choose one if
			 * necessary
			 */
			if (multipath_select_route(flp, rth, rp)) {
				dst_hold(&(*rp)->u.dst);
				RT_CACHE_STAT_INC(out_hit);
				rcu_read_unlock_bh();
				return 0;
			}

			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
	}

	return 0;
}

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}
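
/*
 * Typical call sequence, for reference: a fast hash lookup through
 * __ip_route_output_key(), falling back to ip_route_output_slow() to
 * consult the FIB and insert a fresh cache entry on a miss;
 * ip_route_output_flow() additionally runs the result through
 * xfrm_lookup() when a transport protocol is set in the flow.
 */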
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
	r = NLMSG_DATA(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
		__u32 alg = rt->rt_multipath_alg;

		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
	}
#endif
	if (rt->fl.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	ci.rta_used	= rt->u.dst.__use;
	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_error	= rt->u.dst.error;
	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
	if (rt->peer) {
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
		}
	}
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
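/*
 * Added note (not in the original file): for a typical unicast reply
 * the attributes above are emitted in the order RTA_DST, RTA_SRC (when
 * the flow had a source), RTA_OIF, optionally RTA_FLOW / RTA_MP_ALGO,
 * RTA_PREFSRC, RTA_GATEWAY (only when it differs from the destination),
 * the metrics, RTA_CACHEINFO, and RTA_IIF for input routes. Parsers
 * must not rely on that order; netlink attributes are always walked
 * by type.
 */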
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err = -ENOBUFS;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC - 1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
	if (rta[RTA_DST - 1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
	if (rta[RTA_IIF - 1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

	if (iif) {
		/* Input route: replay the dummy packet via ip_route_input(). */
		struct net_device *dev = __dev_get_by_index(iif);
		err = -ENODEV;
		if (!dev)
			goto out_free;
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();
		rt = (struct rtable*)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
							 .saddr = src,
							 .tos = rtm->rtm_tos } } };
		int oif = 0;
		if (rta[RTA_OIF - 1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
		fl.oif = oif;
		err = ip_route_output_key(&rt, &fl);
	}
	if (err)
		goto out_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (!err)
		goto out_free;
	if (err < 0) {
		err = -EMSGSIZE;
		goto out_free;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err > 0)
		err = 0;
out:	return err;

out_free:
	kfree_skb(skb);
	goto out;
}
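/*
 * Added illustration (not in the original file): this handler is what
 * answers "ip route get <addr>". Conceptually the request carries just
 * a destination attribute (a real request must also honor the netlink
 * alignment macros):
 *
 *	struct {
 *		struct nlmsghdr	nlh;	// nlmsg_type = RTM_GETROUTE,
 *					// nlmsg_flags = NLM_F_REQUEST
 *		struct rtmsg	rtm;
 *		struct rtattr	rta;	// rta_type = RTA_DST
 *		__u32		dst;	// e.g. 10.0.0.1
 *	} req;
 *
 * sent on a NETLINK_ROUTE socket; the unicast reply is built by
 * rt_fill_info() above.
 */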
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h) continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
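/*
 * Added note (not in the original file): cb->args[0]/args[1] persist
 * the (hash bucket, chain index) position between dump callbacks, so a
 * large cache is delivered in several NLM_F_MULTI batches rather than
 * one oversized skb. This is the path exercised when userspace dumps
 * cloned routes, e.g. "ip route show cache".
 */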
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen,
						void **context)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}
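/*
 * Added illustration (not in the original file): both interfaces end
 * up in rt_cache_flush(). From userspace, for example:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * Writing 0 requests an immediate flush; a negative value falls back
 * to the min_delay setting.
 */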
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
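/*
 * Added illustration (not in the original file): every entry in
 * ipv4_route_table surfaces under /proc/sys/net/ipv4/route/, e.g.
 *
 *	sysctl -w net.ipv4.route.gc_timeout=300
 *	cat /proc/sys/net/ipv4/route/gc_thresh
 *
 * The *_jiffies handlers translate between seconds on the userspace
 * side and jiffies in the kernel variables.
 */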
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	/* Only word-aligned, word-sized reads are supported. */
	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time */
		for (i = 1; i < NR_CPUS; i++) {
			unsigned int j;

			if (!cpu_possible(i))
				continue;

			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */
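/*
 * Added note (not in the original file): because the handler rejects
 * unaligned reads, a consumer should read word-multiple chunks, e.g.
 * (hypothetical):
 *
 *	dd if=/proc/net/rt_acct bs=4096 count=1 | od -A d -t u4
 *
 * Each counter is summed across all possible CPUs before being
 * returned, so userspace sees a single aggregate table.
 */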
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
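/*
 * Added illustration (not in the original file): the hash table size
 * chosen in ip_rt_init() below can be overridden from the kernel
 * command line, e.g.
 *
 *	linux ... rhash_entries=16384
 *
 * The value is a bucket-count goal; ip_rt_init() still rounds the
 * final size to a power of two.
 */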
int __init ip_rt_init(void)
{
	int order, goal, rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
#endif

	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);

	if (!ipv4_dst_ops.kmem_cachep)
		panic("IP: failed to allocate ip_dst_cache\n");

	/* Size the hash table from available memory, unless overridden
	 * by the rhash_entries= boot parameter.
	 */
	goal = num_physpages >> (26 - PAGE_SHIFT);
	if (rhash_entries)
		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
	for (order = 0; (1UL << order) < goal; order++)
		/* NOTHING */;

	do {
		rt_hash_mask = (1UL << order) * PAGE_SIZE /
			sizeof(struct rt_hash_bucket);
		while (rt_hash_mask & (rt_hash_mask - 1))
			rt_hash_mask--;
		rt_hash_table = (struct rt_hash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (rt_hash_table == NULL && --order > 0);

	if (!rt_hash_table)
		panic("Failed to allocate IP route cache hash table\n");

	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
	       rt_hash_mask,
	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
		/* NOTHING */;

	rt_hash_mask--;
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
	if (!rt_cache_stat)
		return -ENOMEM;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
					ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     proc_net_stat))) {
		free_percpu(rt_cache_stat);
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	return rc;
}
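/*
 * Added worked example (not in the original file), assuming 4 KB pages
 * and an 8-byte rt_hash_bucket on a 128 MB machine:
 *
 *	num_physpages = 32768
 *	goal  = 32768 >> (26 - 12)	= 2 pages
 *	order = 1			(first order with 1UL << order >= 2)
 *	rt_hash_mask = 2 * 4096 / 8	= 1024 buckets (already a power of 2)
 *	rt_hash_log  = 10, then rt_hash_mask-- leaves the mask 0x3ff
 *	gc_thresh = 1024, ip_rt_max_size = 16384 cached entries
 */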
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);