/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
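/*
 * Editor's illustrative note, not part of the original file: callers
 * normally reach ip_tos2prio[] through the rt_tos2priority() inline in
 * include/net/route.h, which masks off the low ECN bit of the TOS byte
 * before indexing, e.g.:
 *
 *	skb->priority = rt_tos2priority(ip_hdr(skb)->tos);
 *
 * so a TOS of 0x10 (low delay) maps to TC_PRIO_INTERACTIVE via entry 8
 * of the table above.
 */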
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}
static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
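/*
 * Editor's illustrative sketch, not part of the original file: the
 * load-limit test above spaces successive redirects exponentially.
 * With the defaults (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number
 * = 9), the n-th unanswered redirect is only sent once (HZ/50) << n
 * jiffies have passed since the previous one; after nine of them we go
 * silent until ip_rt_redirect_silence expires.  The helper below is
 * hypothetical, restating that predicate in isolation.
 */
static inline bool rt_redirect_would_send(const struct inet_peer *peer)
{
	/* first redirect, or the exponentially growing quiet period has elapsed */
	return peer->rate_tokens == 0 ||
	       time_after(jiffies,
			  peer->rate_last +
			  (ip_rt_redirect_load << peer->rate_tokens));
}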
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
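/*
 * Editor's illustrative sketch, not part of the original file: the peer
 * accounting above is a token bucket.  Tokens accrue one per jiffy up
 * to ip_rt_error_burst (5*HZ) and each ICMP error sent costs
 * ip_rt_error_cost (HZ), i.e. roughly a burst of five errors and then
 * one per second.  The hypothetical helper below restates the
 * bookkeeping in isolation.
 */
static inline bool ip_error_bucket_take(u32 *tokens, unsigned long *last)
{
	unsigned long now = jiffies;

	/* refill one token per elapsed jiffy, capped at the burst size */
	*tokens = min_t(unsigned long, *tokens + (now - *last),
			ip_rt_error_burst);
	*last = now;
	if (*tokens >= ip_rt_error_cost) {
		*tokens -= ip_rt_error_cost;
		return true;	/* under the limit; send the error */
	}
	return false;		/* rate limited */
}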
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = ip_rt_min_pmtu;
	}

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so that it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		goto out;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

out:
	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_info *fi = res->fi;
	struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
	struct net_device *dev = nh->nh_dev;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nh, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
}
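/*
 * Editor's illustrative note, not part of the original file: a typical
 * caller resolves the route first and then derives the forwarding MTU
 * from the fib_result (sketch only, error handling elided):
 *
 *	struct fib_result res;
 *
 *	if (fib_lookup(net, fl4, &res, 0) == 0)
 *		mtu = ip_mtu_from_fib_result(&res, fl4->daddr);
 *
 * honouring the locked-metric, nexthop-exception and device-MTU
 * precedence documented above.
 */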
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
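/*
 * Editor's illustrative note, not part of the original file: the
 * hold-then-cmpxchg sequence above is the usual lock-free publish
 * pattern.  The reference is taken before cmpxchg() so that a
 * concurrent reader that observes the new pointer never sees a cached
 * route without a reference; if we lose the race, we simply drop the
 * reference we optimistically took and leave the winner's entry in
 * place.
 */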
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *)dst;

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
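/*
 * Editor's illustrative note, not part of the original file: policy 0
 * hashes only the IPv4 source/destination pair (L3), so all traffic
 * between two hosts sticks to one nexthop, while policy 1 also mixes
 * in ports and protocol (5-tuple), spreading individual connections
 * across nexthops.  The hash is consumed as in the sketch below,
 * mirroring the call in ip_mkroute_input():
 *
 *	int h = fib_multipath_hash(net, fl4, skb, NULL);
 *
 *	fib_select_multipath(res, h);
 */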
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * calling it only once if daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);
local_input:
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nh, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);
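/*
 * Editor's illustrative note, not part of the original file: a typical
 * receive-path caller validates the incoming packet against the FIB
 * like this (sketch only, error handling elided; compare the call in
 * ip_rcv_finish_core()):
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * On success skb_dst(skb) carries the input route used for forwarding
 * or local delivery.
 */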
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result the host on a multicast
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all over the world. Now we try to get rid of them.
	 * Really, provided software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (in_dev)
			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		 * Yet still do not fail.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet, he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}
/*
 * Major route resolver routine.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res = {
		.type		= RTN_UNSPEC,
		.fi		= NULL,
		.table		= NULL,
		.tclassid	= 0,
	};
	struct rtable *rth;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface,
		 *    if saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are broken,
			 * because we are not allowed to build a multicast path
			 * with a loopback source addr (look, the routing cache
			 * cannot know that ttl is zero, so that the packet
			 * will not leave this host and the route is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume
			 * that the destination is on link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if the destination is gatewayed, rather than
			 * direct. Moreover, if MSG_DONTROUTE is set,
			 * we send the packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

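/* Note on use (hedged sketch, not code from this file): a blackhole dst
 * stands in for a route that must be invalidated while sockets may still
 * hold a reference to it, e.g. when xfrm tears down a bundle. The pattern
 * a hypothetical caller would follow:
 *
 *	struct dst_entry *dst = sk_dst_get(sk);
 *	dst = ipv4_blackhole_route(net, dst);	// releases the old dst
 *	if (!IS_ERR(dst))
 *		sk_dst_set(sk, dst);		// output is now discarded
 */
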
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

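/* Usage sketch (hedged; a typical in-kernel caller, not code from this
 * file). Field values are illustrative:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);		// drop the route reference when done
 */
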
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, portid);

			if (err <= 0) {
				if (err == 0)
					return 0;
				goto nla_put_failure;
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

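/* For reference (hedged): the message built above is what userspace sees
 * from "ip route get", e.g. roughly:
 *
 *	$ ip route get 203.0.113.1
 *	203.0.113.1 via 192.0.2.1 dev eth0 src 192.0.2.10 uid 1000
 *
 * Addresses are documentation-range placeholders; the printed fields map
 * to RTA_DST, RTA_GATEWAY, RTA_OIF, RTA_PREFSRC and RTA_UID above.
 */
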
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers, this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = sizeof(struct udphdr);
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source = sport;
		tcph->dest = dport;
		tcph->doff = sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}

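/* The dummy skb built above only needs to look plausible to the code paths
 * exercised by inet_rtm_getroute(); it is never transmitted. Note the TCP
 * case sets rst and an inverted pseudo-header checksum so the header stays
 * internally consistent for any code that inspects it.
 */
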
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
			  extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev = dev;
		skb->mark = mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

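/* Example trigger (hedged): this handler services RTM_GETROUTE requests,
 * i.e. what "ip route get" sends, including reverse-path queries such as:
 *
 *	$ ip route get 198.51.100.7 from 192.0.2.10 iif eth0
 *
 * which takes the iif branch above and runs the dummy packet through
 * ip_route_input_rcu() as if it had arrived on eth0.
 */
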
/* Called from the IGMP code when a device's multicast state changes;
 * flushing the whole cache is blunt but rare.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

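/* These knobs appear under /proc/sys/net/ipv4/route/. Example (hedged,
 * values illustrative only):
 *
 *	# sysctl -w net.ipv4.route.min_pmtu=552
 *	# sysctl -w net.ipv4.route.mtu_expires=600
 *
 * min_pmtu is clamped below by ip_min_valid_pmtu via proc_dointvec_minmax.
 */
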
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

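/* The flush file is write-only (mode 0200). Writing any integer flushes
 * the cache by bumping the generation counters, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */
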
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* dst garbage collection is effectively disabled here; entries are
	 * reclaimed via refcounting and generation bumps instead.
	 */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif