net/ipv6/route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61
62 #include <asm/uaccess.h>
63
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67
68 enum rt6_nud_state {
69         RT6_NUD_FAIL_HARD = -3,
70         RT6_NUD_FAIL_PROBE = -2,
71         RT6_NUD_FAIL_DO_RR = -1,
72         RT6_NUD_SUCCEED = 1
73 };
74
75 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
87 static int              ip6_pkt_prohibit(struct sk_buff *skb);
88 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
91                                            struct sk_buff *skb, u32 mtu);
92 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
93                                         struct sk_buff *skb);
94 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
95 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct net *net,
99                                            const struct in6_addr *prefix, int prefixlen,
100                                            const struct in6_addr *gwaddr, int ifindex,
101                                            unsigned int pref);
102 static struct rt6_info *rt6_get_route_info(struct net *net,
103                                            const struct in6_addr *prefix, int prefixlen,
104                                            const struct in6_addr *gwaddr, int ifindex);
105 #endif
106
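/* Per-cpu list of routes that are not owned by the fib6 tree (DST_NOCACHE),
 * e.g. the RTF_CACHE clones created for FLOWI_FLAG_KNOWN_NH lookups.
 * Keeping them here lets rt6_uncached_list_flush_dev() re-point their
 * device and inet6_dev references at the loopback device when the
 * original device goes away.
 */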
107 struct uncached_list {
108         spinlock_t              lock;
109         struct list_head        head;
110 };
111
112 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117
118         rt->dst.flags |= DST_NOCACHE;
119         rt->rt6i_uncached_list = ul;
120
121         spin_lock_bh(&ul->lock);
122         list_add_tail(&rt->rt6i_uncached, &ul->head);
123         spin_unlock_bh(&ul->lock);
124 }
125
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128         if (!list_empty(&rt->rt6i_uncached)) {
129                 struct uncached_list *ul = rt->rt6i_uncached_list;
130
131                 spin_lock_bh(&ul->lock);
132                 list_del(&rt->rt6i_uncached);
133                 spin_unlock_bh(&ul->lock);
134         }
135 }
136
137 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
138 {
139         struct net_device *loopback_dev = net->loopback_dev;
140         int cpu;
141
142         for_each_possible_cpu(cpu) {
143                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
144                 struct rt6_info *rt;
145
146                 spin_lock_bh(&ul->lock);
147                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
148                         struct inet6_dev *rt_idev = rt->rt6i_idev;
149                         struct net_device *rt_dev = rt->dst.dev;
150
151                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
152                             rt_idev->dev != loopback_dev) {
153                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
154                                 in6_dev_put(rt_idev);
155                         }
156
157                         if (rt_dev && (rt_dev == dev || !dev) &&
158                             rt_dev != loopback_dev) {
159                                 rt->dst.dev = loopback_dev;
160                                 dev_hold(rt->dst.dev);
161                                 dev_put(rt_dev);
162                         }
163                 }
164                 spin_unlock_bh(&ul->lock);
165         }
166 }
167
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170         return dst_metrics_write_ptr(rt->dst.from);
171 }
172
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175         struct rt6_info *rt = (struct rt6_info *)dst;
176
177         if (rt->rt6i_flags & RTF_PCPU)
178                 return rt6_pcpu_cow_metrics(rt);
179         else if (rt->rt6i_flags & RTF_CACHE)
180                 return NULL;
181         else
182                 return dst_cow_metrics_generic(dst, old);
183 }
184
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         struct in6_addr *p = &rt->rt6i_gateway;
190
191         if (!ipv6_addr_any(p))
192                 return (const void *) p;
193         else if (skb)
194                 return &ipv6_hdr(skb)->daddr;
195         return daddr;
196 }
197
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199                                           struct sk_buff *skb,
200                                           const void *daddr)
201 {
202         struct rt6_info *rt = (struct rt6_info *) dst;
203         struct neighbour *n;
204
205         daddr = choose_neigh_daddr(rt, skb, daddr);
206         n = __ipv6_neigh_lookup(dst->dev, daddr);
207         if (n)
208                 return n;
209         return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211
212 static struct dst_ops ip6_dst_ops_template = {
213         .family                 =       AF_INET6,
214         .gc                     =       ip6_dst_gc,
215         .gc_thresh              =       1024,
216         .check                  =       ip6_dst_check,
217         .default_advmss         =       ip6_default_advmss,
218         .mtu                    =       ip6_mtu,
219         .cow_metrics            =       ipv6_cow_metrics,
220         .destroy                =       ip6_dst_destroy,
221         .ifdown                 =       ip6_dst_ifdown,
222         .negative_advice        =       ip6_negative_advice,
223         .link_failure           =       ip6_link_failure,
224         .update_pmtu            =       ip6_rt_update_pmtu,
225         .redirect               =       rt6_do_redirect,
226         .local_out              =       __ip6_local_out,
227         .neigh_lookup           =       ip6_neigh_lookup,
228 };
229
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233
234         return mtu ? : dst->dev->mtu;
235 }
236
237 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
238                                          struct sk_buff *skb, u32 mtu)
239 {
240 }
241
242 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
243                                       struct sk_buff *skb)
244 {
245 }
246
247 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
248                                          unsigned long old)
249 {
250         return NULL;
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_sk,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
318 /* allocate dst with ip6_dst_ops */
319 static struct rt6_info *__ip6_dst_alloc(struct net *net,
320                                         struct net_device *dev,
321                                         int flags,
322                                         struct fib6_table *table)
323 {
324         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
325                                         0, DST_OBSOLETE_FORCE_CHK, flags);
326
327         if (rt) {
328                 struct dst_entry *dst = &rt->dst;
329
330                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
331                 INIT_LIST_HEAD(&rt->rt6i_siblings);
332                 INIT_LIST_HEAD(&rt->rt6i_uncached);
333         }
334         return rt;
335 }
336
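/* Like __ip6_dst_alloc(), but also allocates the per-cpu route cache
 * (rt6i_pcpu).  Every CPU slot starts out NULL and is filled lazily by
 * rt6_get_pcpu_route().
 */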
337 static struct rt6_info *ip6_dst_alloc(struct net *net,
338                                       struct net_device *dev,
339                                       int flags,
340                                       struct fib6_table *table)
341 {
342         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
343
344         if (rt) {
345                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
346                 if (rt->rt6i_pcpu) {
347                         int cpu;
348
349                         for_each_possible_cpu(cpu) {
350                                 struct rt6_info **p;
351
352                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
353                                 /* no one shares rt */
354                                 *p =  NULL;
355                         }
356                 } else {
357                         dst_destroy((struct dst_entry *)rt);
358                         return NULL;
359                 }
360         }
361
362         return rt;
363 }
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct dst_entry *from = dst->from;
369         struct inet6_dev *idev;
370
371         dst_destroy_metrics_generic(dst);
372         free_percpu(rt->rt6i_pcpu);
373         rt6_uncached_list_del(rt);
374
375         idev = rt->rt6i_idev;
376         if (idev) {
377                 rt->rt6i_idev = NULL;
378                 in6_dev_put(idev);
379         }
380
381         dst->from = NULL;
382         dst_release(from);
383 }
384
385 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
386                            int how)
387 {
388         struct rt6_info *rt = (struct rt6_info *)dst;
389         struct inet6_dev *idev = rt->rt6i_idev;
390         struct net_device *loopback_dev =
391                 dev_net(dev)->loopback_dev;
392
393         if (dev != loopback_dev) {
394                 if (idev && idev->dev == dev) {
395                         struct inet6_dev *loopback_idev =
396                                 in6_dev_get(loopback_dev);
397                         if (loopback_idev) {
398                                 rt->rt6i_idev = loopback_idev;
399                                 in6_dev_put(idev);
400                         }
401                 }
402         }
403 }
404
405 static bool rt6_check_expired(const struct rt6_info *rt)
406 {
407         if (rt->rt6i_flags & RTF_EXPIRES) {
408                 if (time_after(jiffies, rt->dst.expires))
409                         return true;
410         } else if (rt->dst.from) {
411                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
412         }
413         return false;
414 }
415
416 /* Multipath route selection:
417  *   Hash-based function using packet header fields and the flow label.
418  * Adapted from fib_info_hashfn()
419  */
420 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
421                                const struct flowi6 *fl6)
422 {
423         unsigned int val = fl6->flowi6_proto;
424
425         val ^= ipv6_addr_hash(&fl6->daddr);
426         val ^= ipv6_addr_hash(&fl6->saddr);
427
428         /* Works only if this is not encapsulated */
429         switch (fl6->flowi6_proto) {
430         case IPPROTO_UDP:
431         case IPPROTO_TCP:
432         case IPPROTO_SCTP:
433                 val ^= (__force u16)fl6->fl6_sport;
434                 val ^= (__force u16)fl6->fl6_dport;
435                 break;
436
437         case IPPROTO_ICMPV6:
438                 val ^= (__force u16)fl6->fl6_icmp_type;
439                 val ^= (__force u16)fl6->fl6_icmp_code;
440                 break;
441         }
442         /* RFC 6438 recommends using the flow label */
443         val ^= (__force u32)fl6->flowlabel;
444
445         /* Perhaps we need to tune this function? */
446         val = val ^ (val >> 7) ^ (val >> 12);
447         return val % candidate_count;
448 }
449
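/* Pick one of the ECMP siblings of @match based on the hash above.
 * An index of 0 keeps @match itself; otherwise the sibling list is
 * walked to the selected entry, which is used unless it scores
 * negatively, in which case the original @match is kept.
 */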
450 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
451                                              struct flowi6 *fl6, int oif,
452                                              int strict)
453 {
454         struct rt6_info *sibling, *next_sibling;
455         int route_chosen;
456
457         route_chosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
458         /* Don't change the route if route_chosen == 0
459          * (the sibling list does not include ourselves)
460          */
461         if (route_chosen)
462                 list_for_each_entry_safe(sibling, next_sibling,
463                                 &match->rt6i_siblings, rt6i_siblings) {
464                         route_chosen--;
465                         if (route_chosen == 0) {
466                                 if (rt6_score_route(sibling, oif, strict) < 0)
467                                         break;
468                                 match = sibling;
469                                 break;
470                         }
471                 }
472         return match;
473 }
474
475 /*
476  *      Route lookup. The relevant table->tb6_lock is assumed to be held.
477  */
478
479 static inline struct rt6_info *rt6_device_match(struct net *net,
480                                                     struct rt6_info *rt,
481                                                     const struct in6_addr *saddr,
482                                                     int oif,
483                                                     int flags)
484 {
485         struct rt6_info *local = NULL;
486         struct rt6_info *sprt;
487
488         if (!oif && ipv6_addr_any(saddr))
489                 goto out;
490
491         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
492                 struct net_device *dev = sprt->dst.dev;
493
494                 if (oif) {
495                         if (dev->ifindex == oif)
496                                 return sprt;
497                         if (dev->flags & IFF_LOOPBACK) {
498                                 if (!sprt->rt6i_idev ||
499                                     sprt->rt6i_idev->dev->ifindex != oif) {
500                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
501                                                 continue;
502                                         if (local && (!oif ||
503                                                       local->rt6i_idev->dev->ifindex == oif))
504                                                 continue;
505                                 }
506                                 local = sprt;
507                         }
508                 } else {
509                         if (ipv6_chk_addr(net, saddr, dev,
510                                           flags & RT6_LOOKUP_F_IFACE))
511                                 return sprt;
512                 }
513         }
514
515         if (oif) {
516                 if (local)
517                         return local;
518
519                 if (flags & RT6_LOOKUP_F_IFACE)
520                         return net->ipv6.ip6_null_entry;
521         }
522 out:
523         return rt;
524 }
525
526 #ifdef CONFIG_IPV6_ROUTER_PREF
527 struct __rt6_probe_work {
528         struct work_struct work;
529         struct in6_addr target;
530         struct net_device *dev;
531 };
532
533 static void rt6_probe_deferred(struct work_struct *w)
534 {
535         struct in6_addr mcaddr;
536         struct __rt6_probe_work *work =
537                 container_of(w, struct __rt6_probe_work, work);
538
539         addrconf_addr_solict_mult(&work->target, &mcaddr);
540         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
541         dev_put(work->dev);
542         kfree(work);
543 }
544
545 static void rt6_probe(struct rt6_info *rt)
546 {
547         struct neighbour *neigh;
548         /*
549          * Okay, this does not seem to be appropriate
550          * for now; however, we need to check if it
551          * is really so - i.e., Router Reachability Probing.
552          *
553          * Router Reachability Probe MUST be rate-limited
554          * to no more than one per minute.
555          */
556         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
557                 return;
558         rcu_read_lock_bh();
559         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
560         if (neigh) {
561                 write_lock(&neigh->lock);
562                 if (neigh->nud_state & NUD_VALID)
563                         goto out;
564         }
565
566         if (!neigh ||
567             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
568                 struct __rt6_probe_work *work;
569
570                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
571
572                 if (neigh && work)
573                         __neigh_set_probe_once(neigh);
574
575                 if (neigh)
576                         write_unlock(&neigh->lock);
577
578                 if (work) {
579                         INIT_WORK(&work->work, rt6_probe_deferred);
580                         work->target = rt->rt6i_gateway;
581                         dev_hold(rt->dst.dev);
582                         work->dev = rt->dst.dev;
583                         schedule_work(&work->work);
584                 }
585         } else {
586 out:
587                 write_unlock(&neigh->lock);
588         }
589         rcu_read_unlock_bh();
590 }
591 #else
592 static inline void rt6_probe(struct rt6_info *rt)
593 {
594 }
595 #endif
596
597 /*
598  * Default Router Selection (RFC 2461 6.3.6)
599  */
600 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
601 {
602         struct net_device *dev = rt->dst.dev;
603         if (!oif || dev->ifindex == oif)
604                 return 2;
605         if ((dev->flags & IFF_LOOPBACK) &&
606             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
607                 return 1;
608         return 0;
609 }
610
611 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
612 {
613         struct neighbour *neigh;
614         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
615
616         if (rt->rt6i_flags & RTF_NONEXTHOP ||
617             !(rt->rt6i_flags & RTF_GATEWAY))
618                 return RT6_NUD_SUCCEED;
619
620         rcu_read_lock_bh();
621         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
622         if (neigh) {
623                 read_lock(&neigh->lock);
624                 if (neigh->nud_state & NUD_VALID)
625                         ret = RT6_NUD_SUCCEED;
626 #ifdef CONFIG_IPV6_ROUTER_PREF
627                 else if (!(neigh->nud_state & NUD_FAILED))
628                         ret = RT6_NUD_SUCCEED;
629                 else
630                         ret = RT6_NUD_FAIL_PROBE;
631 #endif
632                 read_unlock(&neigh->lock);
633         } else {
634                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
635                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
636         }
637         rcu_read_unlock_bh();
638
639         return ret;
640 }
641
642 static int rt6_score_route(struct rt6_info *rt, int oif,
643                            int strict)
644 {
645         int m;
646
647         m = rt6_check_dev(rt, oif);
648         if (!m && (strict & RT6_LOOKUP_F_IFACE))
649                 return RT6_NUD_FAIL_HARD;
650 #ifdef CONFIG_IPV6_ROUTER_PREF
651         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
652 #endif
653         if (strict & RT6_LOOKUP_F_REACHABLE) {
654                 int n = rt6_check_neigh(rt);
655                 if (n < 0)
656                         return n;
657         }
658         return m;
659 }
660
661 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
662                                    int *mpri, struct rt6_info *match,
663                                    bool *do_rr)
664 {
665         int m;
666         bool match_do_rr = false;
667
668         if (rt6_check_expired(rt))
669                 goto out;
670
671         m = rt6_score_route(rt, oif, strict);
672         if (m == RT6_NUD_FAIL_DO_RR) {
673                 match_do_rr = true;
674                 m = 0; /* lowest valid score */
675         } else if (m == RT6_NUD_FAIL_HARD) {
676                 goto out;
677         }
678
679         if (strict & RT6_LOOKUP_F_REACHABLE)
680                 rt6_probe(rt);
681
682         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
683         if (m > *mpri) {
684                 *do_rr = match_do_rr;
685                 *mpri = m;
686                 match = rt;
687         }
688 out:
689         return match;
690 }
691
692 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
693                                      struct rt6_info *rr_head,
694                                      u32 metric, int oif, int strict,
695                                      bool *do_rr)
696 {
697         struct rt6_info *rt, *match, *cont;
698         int mpri = -1;
699
700         match = NULL;
701         cont = NULL;
702         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
703                 if (rt->rt6i_metric != metric) {
704                         cont = rt;
705                         break;
706                 }
707
708                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
709         }
710
711         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
712                 if (rt->rt6i_metric != metric) {
713                         cont = rt;
714                         break;
715                 }
716
717                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
718         }
719
720         if (match || !cont)
721                 return match;
722
723         for (rt = cont; rt; rt = rt->dst.rt6_next)
724                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
725
726         return match;
727 }
728
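/* Round-robin default router selection: score the routes of equal metric
 * starting at fn->rr_ptr via find_rr_leaf(), and when the chosen route
 * looked unreachable (do_rr), advance fn->rr_ptr so the next lookup starts
 * from the following sibling.
 */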
729 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
730 {
731         struct rt6_info *match, *rt0;
732         struct net *net;
733         bool do_rr = false;
734
735         rt0 = fn->rr_ptr;
736         if (!rt0)
737                 fn->rr_ptr = rt0 = fn->leaf;
738
739         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
740                              &do_rr);
741
742         if (do_rr) {
743                 struct rt6_info *next = rt0->dst.rt6_next;
744
745                 /* no entries matched; do round-robin */
746                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
747                         next = fn->leaf;
748
749                 if (next != rt0)
750                         fn->rr_ptr = next;
751         }
752
753         net = dev_net(rt0->dst.dev);
754         return match ? match : net->ipv6.ip6_null_entry;
755 }
756
757 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
758 {
759         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
760 }
761
762 #ifdef CONFIG_IPV6_ROUTE_INFO
763 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
764                   const struct in6_addr *gwaddr)
765 {
766         struct net *net = dev_net(dev);
767         struct route_info *rinfo = (struct route_info *) opt;
768         struct in6_addr prefix_buf, *prefix;
769         unsigned int pref;
770         unsigned long lifetime;
771         struct rt6_info *rt;
772
773         if (len < sizeof(struct route_info)) {
774                 return -EINVAL;
775         }
776
777         /* Sanity check for prefix_len and length */
778         if (rinfo->length > 3) {
779                 return -EINVAL;
780         } else if (rinfo->prefix_len > 128) {
781                 return -EINVAL;
782         } else if (rinfo->prefix_len > 64) {
783                 if (rinfo->length < 2) {
784                         return -EINVAL;
785                 }
786         } else if (rinfo->prefix_len > 0) {
787                 if (rinfo->length < 1) {
788                         return -EINVAL;
789                 }
790         }
791
792         pref = rinfo->route_pref;
793         if (pref == ICMPV6_ROUTER_PREF_INVALID)
794                 return -EINVAL;
795
796         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
797
798         if (rinfo->length == 3)
799                 prefix = (struct in6_addr *)rinfo->prefix;
800         else {
801                 /* this function is safe */
802                 ipv6_addr_prefix(&prefix_buf,
803                                  (struct in6_addr *)rinfo->prefix,
804                                  rinfo->prefix_len);
805                 prefix = &prefix_buf;
806         }
807
808         if (rinfo->prefix_len == 0)
809                 rt = rt6_get_dflt_router(gwaddr, dev);
810         else
811                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
812                                         gwaddr, dev->ifindex);
813
814         if (rt && !lifetime) {
815                 ip6_del_rt(rt);
816                 rt = NULL;
817         }
818
819         if (!rt && lifetime)
820                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
821                                         pref);
822         else if (rt)
823                 rt->rt6i_flags = RTF_ROUTEINFO |
824                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
825
826         if (rt) {
827                 if (!addrconf_finite_timeout(lifetime))
828                         rt6_clean_expires(rt);
829                 else
830                         rt6_set_expires(rt, jiffies + HZ * lifetime);
831
832                 ip6_rt_put(rt);
833         }
834         return 0;
835 }
836 #endif
837
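/* Walk back up the fib6 tree from a node that yielded no usable route,
 * re-descending into a source-address subtree where one exists, until a
 * node carrying route info (RTN_RTINFO) is found, or NULL once the tree
 * root (RTN_TL_ROOT) is reached.
 */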
838 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
839                                         struct in6_addr *saddr)
840 {
841         struct fib6_node *pn;
842         while (1) {
843                 if (fn->fn_flags & RTN_TL_ROOT)
844                         return NULL;
845                 pn = fn->parent;
846                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
847                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
848                 else
849                         fn = pn;
850                 if (fn->fn_flags & RTN_RTINFO)
851                         return fn;
852         }
853 }
854
855 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
856                                              struct fib6_table *table,
857                                              struct flowi6 *fl6, int flags)
858 {
859         struct fib6_node *fn;
860         struct rt6_info *rt;
861
862         read_lock_bh(&table->tb6_lock);
863         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
864 restart:
865         rt = fn->leaf;
866         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
867         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
868                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
869         if (rt == net->ipv6.ip6_null_entry) {
870                 fn = fib6_backtrack(fn, &fl6->saddr);
871                 if (fn)
872                         goto restart;
873         }
874         dst_use(&rt->dst, jiffies);
875         read_unlock_bh(&table->tb6_lock);
876         return rt;
877
878 }
879
880 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
881                                     int flags)
882 {
883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 }
885 EXPORT_SYMBOL_GPL(ip6_route_lookup);
886
887 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
888                             const struct in6_addr *saddr, int oif, int strict)
889 {
890         struct flowi6 fl6 = {
891                 .flowi6_oif = oif,
892                 .daddr = *daddr,
893         };
894         struct dst_entry *dst;
895         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
896
897         if (saddr) {
898                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
899                 flags |= RT6_LOOKUP_F_HAS_SADDR;
900         }
901
902         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903         if (dst->error == 0)
904                 return (struct rt6_info *) dst;
905
906         dst_release(dst);
907
908         return NULL;
909 }
910 EXPORT_SYMBOL(rt6_lookup);
911
912 /* ip6_ins_rt is called with table->tb6_lock free (it takes the lock itself).
913    It takes a new route entry; if the addition fails for any reason, the
914    route is freed. In any case, if the caller does not hold a reference,
915    the route may be destroyed.
916  */
917
918 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
919                         struct mx6_config *mxc)
920 {
921         int err;
922         struct fib6_table *table;
923
924         table = rt->rt6i_table;
925         write_lock_bh(&table->tb6_lock);
926         err = fib6_add(&table->tb6_root, rt, info, mxc);
927         write_unlock_bh(&table->tb6_lock);
928
929         return err;
930 }
931
932 int ip6_ins_rt(struct rt6_info *rt)
933 {
934         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
935         struct mx6_config mxc = { .mx = NULL, };
936
937         return __ip6_ins_rt(rt, &info, &mxc);
938 }
939
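/* Create an RTF_CACHE clone of @ort: a host (/128) copy of the original
 * route that can carry per-destination state (e.g. a lowered PMTU)
 * without modifying the route stored in the fib6 tree.
 */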
940 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
941                                            const struct in6_addr *daddr,
942                                            const struct in6_addr *saddr)
943 {
944         struct rt6_info *rt;
945
946         /*
947          *      Clone the route.
948          */
949
950         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
951                 ort = (struct rt6_info *)ort->dst.from;
952
953         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
954                              0, ort->rt6i_table);
955
956         if (!rt)
957                 return NULL;
958
959         ip6_rt_copy_init(rt, ort);
960         rt->rt6i_flags |= RTF_CACHE;
961         rt->rt6i_metric = 0;
962         rt->dst.flags |= DST_HOST;
963         rt->rt6i_dst.addr = *daddr;
964         rt->rt6i_dst.plen = 128;
965
966         if (!rt6_is_gw_or_nonexthop(ort)) {
967                 if (ort->rt6i_dst.plen != 128 &&
968                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
969                         rt->rt6i_flags |= RTF_ANYCAST;
970 #ifdef CONFIG_IPV6_SUBTREES
971                 if (rt->rt6i_src.plen && saddr) {
972                         rt->rt6i_src.addr = *saddr;
973                         rt->rt6i_src.plen = 128;
974                 }
975 #endif
976         }
977
978         return rt;
979 }
980
981 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
982 {
983         struct rt6_info *pcpu_rt;
984
985         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
986                                   rt->dst.dev, rt->dst.flags,
987                                   rt->rt6i_table);
988
989         if (!pcpu_rt)
990                 return NULL;
991         ip6_rt_copy_init(pcpu_rt, rt);
992         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
993         pcpu_rt->rt6i_flags |= RTF_PCPU;
994         return pcpu_rt;
995 }
996
997 /* It should be called with read_lock_bh(&tb6_lock) acquired */
998 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
999 {
1000         struct rt6_info *pcpu_rt, *prev, **p;
1001
1002         p = this_cpu_ptr(rt->rt6i_pcpu);
1003         pcpu_rt = *p;
1004
1005         if (pcpu_rt)
1006                 goto done;
1007
1008         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1009         if (!pcpu_rt) {
1010                 struct net *net = dev_net(rt->dst.dev);
1011
1012                 pcpu_rt = net->ipv6.ip6_null_entry;
1013                 goto done;
1014         }
1015
1016         prev = cmpxchg(p, NULL, pcpu_rt);
1017         if (prev) {
1018                 /* If someone did it before us, return prev instead */
1019                 dst_destroy(&pcpu_rt->dst);
1020                 pcpu_rt = prev;
1021         }
1022
1023 done:
1024         dst_hold(&pcpu_rt->dst);
1025         rt6_dst_from_metrics_check(pcpu_rt);
1026         return pcpu_rt;
1027 }
1028
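/* Core policy lookup used for both the input and output paths.  Three
 * exit paths: the null entry and RTF_CACHE entries are returned as-is;
 * FLOWI_FLAG_KNOWN_NH lookups on non-gateway routes get a throw-away
 * RTF_CACHE clone kept on the uncached list; everything else gets a
 * per-cpu copy via rt6_get_pcpu_route().
 */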
1029 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1030                                       struct flowi6 *fl6, int flags)
1031 {
1032         struct fib6_node *fn, *saved_fn;
1033         struct rt6_info *rt;
1034         int strict = 0;
1035
1036         strict |= flags & RT6_LOOKUP_F_IFACE;
1037         if (net->ipv6.devconf_all->forwarding == 0)
1038                 strict |= RT6_LOOKUP_F_REACHABLE;
1039
1040         read_lock_bh(&table->tb6_lock);
1041
1042         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1043         saved_fn = fn;
1044
1045 redo_rt6_select:
1046         rt = rt6_select(fn, oif, strict);
1047         if (rt->rt6i_nsiblings)
1048                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1049         if (rt == net->ipv6.ip6_null_entry) {
1050                 fn = fib6_backtrack(fn, &fl6->saddr);
1051                 if (fn)
1052                         goto redo_rt6_select;
1053                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1054                         /* also consider unreachable route */
1055                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1056                         fn = saved_fn;
1057                         goto redo_rt6_select;
1058                 }
1059         }
1060
1061
1062         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1063                 dst_use(&rt->dst, jiffies);
1064                 read_unlock_bh(&table->tb6_lock);
1065
1066                 rt6_dst_from_metrics_check(rt);
1067                 return rt;
1068         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1069                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1070                 /* Create a RTF_CACHE clone which will not be
1071                  * owned by the fib6 tree.  It is for the special case where
1072                  * the daddr in the skb during the neighbor look-up is different
1073                  * from the fl6->daddr used to look up the route here.
1074                  */
1075
1076                 struct rt6_info *uncached_rt;
1077
1078                 dst_use(&rt->dst, jiffies);
1079                 read_unlock_bh(&table->tb6_lock);
1080
1081                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1082                 dst_release(&rt->dst);
1083
1084                 if (uncached_rt)
1085                         rt6_uncached_list_add(uncached_rt);
1086                 else
1087                         uncached_rt = net->ipv6.ip6_null_entry;
1088
1089                 dst_hold(&uncached_rt->dst);
1090                 return uncached_rt;
1091
1092         } else {
1093                 /* Get a percpu copy */
1094
1095                 struct rt6_info *pcpu_rt;
1096
1097                 rt->dst.lastuse = jiffies;
1098                 rt->dst.__use++;
1099                 pcpu_rt = rt6_get_pcpu_route(rt);
1100                 read_unlock_bh(&table->tb6_lock);
1101
1102                 return pcpu_rt;
1103         }
1104 }
1105
1106 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1107                                             struct flowi6 *fl6, int flags)
1108 {
1109         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1110 }
1111
1112 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1113                                                 struct net_device *dev,
1114                                                 struct flowi6 *fl6, int flags)
1115 {
1116         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1117                 flags |= RT6_LOOKUP_F_IFACE;
1118
1119         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1120 }
1121
1122 void ip6_route_input(struct sk_buff *skb)
1123 {
1124         const struct ipv6hdr *iph = ipv6_hdr(skb);
1125         struct net *net = dev_net(skb->dev);
1126         int flags = RT6_LOOKUP_F_HAS_SADDR;
1127         struct flowi6 fl6 = {
1128                 .flowi6_iif = skb->dev->ifindex,
1129                 .daddr = iph->daddr,
1130                 .saddr = iph->saddr,
1131                 .flowlabel = ip6_flowinfo(iph),
1132                 .flowi6_mark = skb->mark,
1133                 .flowi6_proto = iph->nexthdr,
1134         };
1135
1136         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1137 }
1138
1139 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1140                                              struct flowi6 *fl6, int flags)
1141 {
1142         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1143 }
1144
1145 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1146                                     struct flowi6 *fl6)
1147 {
1148         int flags = 0;
1149
1150         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1151
1152         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1153                 flags |= RT6_LOOKUP_F_IFACE;
1154
1155         if (!ipv6_addr_any(&fl6->saddr))
1156                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1157         else if (sk)
1158                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1159
1160         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1161 }
1162 EXPORT_SYMBOL(ip6_route_output);
1163
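/* Build a blackhole copy of @dst_orig using ip6_dst_blackhole_ops:
 * addressing and metrics are copied from the original route, but
 * ->input and ->output simply discard packets, so the caller keeps a
 * valid dst that cannot transmit.
 */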
1164 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1165 {
1166         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1167         struct dst_entry *new = NULL;
1168
1169         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1170         if (rt) {
1171                 new = &rt->dst;
1172
1173                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1174
1175                 new->__use = 1;
1176                 new->input = dst_discard;
1177                 new->output = dst_discard_sk;
1178
1179                 if (dst_metrics_read_only(&ort->dst))
1180                         new->_metrics = ort->dst._metrics;
1181                 else
1182                         dst_copy_metrics(new, &ort->dst);
1183                 rt->rt6i_idev = ort->rt6i_idev;
1184                 if (rt->rt6i_idev)
1185                         in6_dev_hold(rt->rt6i_idev);
1186
1187                 rt->rt6i_gateway = ort->rt6i_gateway;
1188                 rt->rt6i_flags = ort->rt6i_flags;
1189                 rt->rt6i_metric = 0;
1190
1191                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1192 #ifdef CONFIG_IPV6_SUBTREES
1193                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1194 #endif
1195
1196                 dst_free(new);
1197         }
1198
1199         dst_release(dst_orig);
1200         return new ? new : ERR_PTR(-ENOMEM);
1201 }
1202
1203 /*
1204  *      Destination cache support functions
1205  */
1206
1207 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1208 {
1209         if (rt->dst.from &&
1210             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1211                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1212 }
1213
1214 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1215 {
1216         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1217                 return NULL;
1218
1219         if (rt6_check_expired(rt))
1220                 return NULL;
1221
1222         return &rt->dst;
1223 }
1224
1225 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1226 {
1227         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1228             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1229                 return &rt->dst;
1230         else
1231                 return NULL;
1232 }
1233
1234 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1235 {
1236         struct rt6_info *rt;
1237
1238         rt = (struct rt6_info *) dst;
1239
1240         /* All IPv6 dsts are created with ->obsolete set to the value
1241          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1242          * into this function always.
1243          */
1244
1245         rt6_dst_from_metrics_check(rt);
1246
1247         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1248                 return rt6_dst_from_check(rt, cookie);
1249         else
1250                 return rt6_check(rt, cookie);
1251 }
1252
1253 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1254 {
1255         struct rt6_info *rt = (struct rt6_info *) dst;
1256
1257         if (rt) {
1258                 if (rt->rt6i_flags & RTF_CACHE) {
1259                         if (rt6_check_expired(rt)) {
1260                                 ip6_del_rt(rt);
1261                                 dst = NULL;
1262                         }
1263                 } else {
1264                         dst_release(dst);
1265                         dst = NULL;
1266                 }
1267         }
1268         return dst;
1269 }
1270
1271 static void ip6_link_failure(struct sk_buff *skb)
1272 {
1273         struct rt6_info *rt;
1274
1275         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1276
1277         rt = (struct rt6_info *) skb_dst(skb);
1278         if (rt) {
1279                 if (rt->rt6i_flags & RTF_CACHE) {
1280                         dst_hold(&rt->dst);
1281                         if (ip6_del_rt(rt))
1282                                 dst_free(&rt->dst);
1283                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1284                         rt->rt6i_node->fn_sernum = -1;
1285                 }
1286         }
1287 }
1288
1289 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1290 {
1291         struct net *net = dev_net(rt->dst.dev);
1292
1293         rt->rt6i_flags |= RTF_MODIFIED;
1294         rt->rt6i_pmtu = mtu;
1295         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1296 }
1297
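/* Apply a PMTU update: RTF_CACHE entries are updated in place; for other
 * routes an RTF_CACHE clone carrying the new MTU is created and inserted,
 * so the reduced MTU is scoped to this destination and expires after
 * ip6_rt_mtu_expires.
 */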
1298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1299                                  const struct ipv6hdr *iph, u32 mtu)
1300 {
1301         struct rt6_info *rt6 = (struct rt6_info *)dst;
1302
1303         if (rt6->rt6i_flags & RTF_LOCAL)
1304                 return;
1305
1306         dst_confirm(dst);
1307         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1308         if (mtu >= dst_mtu(dst))
1309                 return;
1310
1311         if (rt6->rt6i_flags & RTF_CACHE) {
1312                 rt6_do_update_pmtu(rt6, mtu);
1313         } else {
1314                 const struct in6_addr *daddr, *saddr;
1315                 struct rt6_info *nrt6;
1316
1317                 if (iph) {
1318                         daddr = &iph->daddr;
1319                         saddr = &iph->saddr;
1320                 } else if (sk) {
1321                         daddr = &sk->sk_v6_daddr;
1322                         saddr = &inet6_sk(sk)->saddr;
1323                 } else {
1324                         return;
1325                 }
1326                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1327                 if (nrt6) {
1328                         rt6_do_update_pmtu(nrt6, mtu);
1329
1330                         /* ip6_ins_rt(nrt6) will bump the
1331                          * rt6->rt6i_node->fn_sernum
1332                          * which will fail the next rt6_check() and
1333                          * invalidate the sk->sk_dst_cache.
1334                          */
1335                         ip6_ins_rt(nrt6);
1336                 }
1337         }
1338 }
1339
1340 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1341                                struct sk_buff *skb, u32 mtu)
1342 {
1343         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1344 }
1345
1346 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1347                      int oif, u32 mark)
1348 {
1349         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1350         struct dst_entry *dst;
1351         struct flowi6 fl6;
1352
1353         memset(&fl6, 0, sizeof(fl6));
1354         fl6.flowi6_oif = oif;
1355         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1356         fl6.daddr = iph->daddr;
1357         fl6.saddr = iph->saddr;
1358         fl6.flowlabel = ip6_flowinfo(iph);
1359
1360         dst = ip6_route_output(net, NULL, &fl6);
1361         if (!dst->error)
1362                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1363         dst_release(dst);
1364 }
1365 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1366
1367 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1368 {
1369         ip6_update_pmtu(skb, sock_net(sk), mtu,
1370                         sk->sk_bound_dev_if, sk->sk_mark);
1371 }
1372 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1373
1374 /* Handle redirects */
1375 struct ip6rd_flowi {
1376         struct flowi6 fl6;
1377         struct in6_addr gateway;
1378 };
1379
1380 static struct rt6_info *__ip6_route_redirect(struct net *net,
1381                                              struct fib6_table *table,
1382                                              struct flowi6 *fl6,
1383                                              int flags)
1384 {
1385         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1386         struct rt6_info *rt;
1387         struct fib6_node *fn;
1388
1389         /* Get the "current" route for this destination and
1390          * check if the redirect has come from an appropriate router.
1391          *
1392          * RFC 4861 specifies that redirects should only be
1393          * accepted if they come from the nexthop to the target.
1394          * Due to the way the routes are chosen, this notion
1395          * is a bit fuzzy and one might need to check all possible
1396          * routes.
1397          */
1398
1399         read_lock_bh(&table->tb6_lock);
1400         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1401 restart:
1402         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1403                 if (rt6_check_expired(rt))
1404                         continue;
1405                 if (rt->dst.error)
1406                         break;
1407                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1408                         continue;
1409                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1410                         continue;
1411                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1412                         continue;
1413                 break;
1414         }
1415
1416         if (!rt)
1417                 rt = net->ipv6.ip6_null_entry;
1418         else if (rt->dst.error) {
1419                 rt = net->ipv6.ip6_null_entry;
1420                 goto out;
1421         }
1422
1423         if (rt == net->ipv6.ip6_null_entry) {
1424                 fn = fib6_backtrack(fn, &fl6->saddr);
1425                 if (fn)
1426                         goto restart;
1427         }
1428
1429 out:
1430         dst_hold(&rt->dst);
1431
1432         read_unlock_bh(&table->tb6_lock);
1433
1434         return rt;
1435 }
1436
1437 static struct dst_entry *ip6_route_redirect(struct net *net,
1438                                         const struct flowi6 *fl6,
1439                                         const struct in6_addr *gateway)
1440 {
1441         int flags = RT6_LOOKUP_F_HAS_SADDR;
1442         struct ip6rd_flowi rdfl;
1443
1444         rdfl.fl6 = *fl6;
1445         rdfl.gateway = *gateway;
1446
1447         return fib6_rule_lookup(net, &rdfl.fl6,
1448                                 flags, __ip6_route_redirect);
1449 }
1450
1451 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1452 {
1453         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1454         struct dst_entry *dst;
1455         struct flowi6 fl6;
1456
1457         memset(&fl6, 0, sizeof(fl6));
1458         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1459         fl6.flowi6_oif = oif;
1460         fl6.flowi6_mark = mark;
1461         fl6.daddr = iph->daddr;
1462         fl6.saddr = iph->saddr;
1463         fl6.flowlabel = ip6_flowinfo(iph);
1464
1465         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1466         rt6_do_redirect(dst, NULL, skb);
1467         dst_release(dst);
1468 }
1469 EXPORT_SYMBOL_GPL(ip6_redirect);
1470
1471 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1472                             u32 mark)
1473 {
1474         const struct ipv6hdr *iph = ipv6_hdr(skb);
1475         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1476         struct dst_entry *dst;
1477         struct flowi6 fl6;
1478
1479         memset(&fl6, 0, sizeof(fl6));
1480         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1481         fl6.flowi6_oif = oif;
1482         fl6.flowi6_mark = mark;
1483         fl6.daddr = msg->dest;
1484         fl6.saddr = iph->daddr;
1485
1486         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1487         rt6_do_redirect(dst, NULL, skb);
1488         dst_release(dst);
1489 }
1490
1491 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1492 {
1493         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1494 }
1495 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1496
1497 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1498 {
1499         struct net_device *dev = dst->dev;
1500         unsigned int mtu = dst_mtu(dst);
1501         struct net *net = dev_net(dev);
1502
1503         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1504
1505         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1506                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1507
1508         /*
1509          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1510          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1511          * IPV6_MAXPLEN is also valid and means: "any MSS,
1512          * rely only on pmtu discovery"
1513          */
1514         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1515                 mtu = IPV6_MAXPLEN;
1516         return mtu;
1517 }
1518
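/* Effective MTU of the route: prefer the learned per-route PMTU
 * (rt6i_pmtu), then the RTAX_MTU metric, then the device's IPv6 MTU,
 * capped at IP6_MAX_MTU.
 */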
1519 static unsigned int ip6_mtu(const struct dst_entry *dst)
1520 {
1521         const struct rt6_info *rt = (const struct rt6_info *)dst;
1522         unsigned int mtu = rt->rt6i_pmtu;
1523         struct inet6_dev *idev;
1524
1525         if (mtu)
1526                 goto out;
1527
1528         mtu = dst_metric_raw(dst, RTAX_MTU);
1529         if (mtu)
1530                 goto out;
1531
1532         mtu = IPV6_MIN_MTU;
1533
1534         rcu_read_lock();
1535         idev = __in6_dev_get(dst->dev);
1536         if (idev)
1537                 mtu = idev->cnf.mtu6;
1538         rcu_read_unlock();
1539
1540 out:
1541         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1542 }
1543
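/* dsts allocated for ICMPv6 messages are not inserted into the fib6 tree;
 * they are chained on icmp6_dst_gc_list and reclaimed by icmp6_dst_gc()
 * once their refcount drops to zero.
 */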
1544 static struct dst_entry *icmp6_dst_gc_list;
1545 static DEFINE_SPINLOCK(icmp6_dst_lock);
1546
1547 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1548                                   struct flowi6 *fl6)
1549 {
1550         struct dst_entry *dst;
1551         struct rt6_info *rt;
1552         struct inet6_dev *idev = in6_dev_get(dev);
1553         struct net *net = dev_net(dev);
1554
1555         if (unlikely(!idev))
1556                 return ERR_PTR(-ENODEV);
1557
1558         rt = ip6_dst_alloc(net, dev, 0, NULL);
1559         if (unlikely(!rt)) {
1560                 in6_dev_put(idev);
1561                 dst = ERR_PTR(-ENOMEM);
1562                 goto out;
1563         }
1564
1565         rt->dst.flags |= DST_HOST;
1566         rt->dst.output  = ip6_output;
1567         atomic_set(&rt->dst.__refcnt, 1);
1568         rt->rt6i_gateway  = fl6->daddr;
1569         rt->rt6i_dst.addr = fl6->daddr;
1570         rt->rt6i_dst.plen = 128;
1571         rt->rt6i_idev     = idev;
1572         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1573
1574         spin_lock_bh(&icmp6_dst_lock);
1575         rt->dst.next = icmp6_dst_gc_list;
1576         icmp6_dst_gc_list = &rt->dst;
1577         spin_unlock_bh(&icmp6_dst_lock);
1578
1579         fib6_force_start_gc(net);
1580
1581         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1582
1583 out:
1584         return dst;
1585 }
1586
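/*
 * Free every entry on icmp6_dst_gc_list whose refcount has dropped to zero.
 * Returns the number of entries still in use.
 */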
1587 int icmp6_dst_gc(void)
1588 {
1589         struct dst_entry *dst, **pprev;
1590         int more = 0;
1591
1592         spin_lock_bh(&icmp6_dst_lock);
1593         pprev = &icmp6_dst_gc_list;
1594
1595         while ((dst = *pprev) != NULL) {
1596                 if (!atomic_read(&dst->__refcnt)) {
1597                         *pprev = dst->next;
1598                         dst_free(dst);
1599                 } else {
1600                         pprev = &dst->next;
1601                         ++more;
1602                 }
1603         }
1604
1605         spin_unlock_bh(&icmp6_dst_lock);
1606
1607         return more;
1608 }
1609
1610 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1611                             void *arg)
1612 {
1613         struct dst_entry *dst, **pprev;
1614
1615         spin_lock_bh(&icmp6_dst_lock);
1616         pprev = &icmp6_dst_gc_list;
1617         while ((dst = *pprev) != NULL) {
1618                 struct rt6_info *rt = (struct rt6_info *) dst;
1619                 if (func(rt, arg)) {
1620                         *pprev = dst->next;
1621                         dst_free(dst);
1622                 } else {
1623                         pprev = &dst->next;
1624                 }
1625         }
1626         spin_unlock_bh(&icmp6_dst_lock);
1627 }
1628
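/*
 * dst_ops garbage-collection callback: do nothing while we are within
 * ip6_rt_gc_min_interval of the last run and under ip6_rt_max_size,
 * otherwise kick fib6_run_gc() and adjust ip6_rt_gc_expire.
 */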
1629 static int ip6_dst_gc(struct dst_ops *ops)
1630 {
1631         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1632         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1633         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1634         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1635         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1636         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1637         int entries;
1638
1639         entries = dst_entries_get_fast(ops);
1640         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1641             entries <= rt_max_size)
1642                 goto out;
1643
1644         net->ipv6.ip6_rt_gc_expire++;
1645         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1646         entries = dst_entries_get_slow(ops);
1647         if (entries < ops->gc_thresh)
1648                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1649 out:
1650         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1651         return entries > rt_max_size;
1652 }
1653
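/*
 * Convert the RTA_METRICS attributes in @cfg into the metrics array and
 * validity bitmap of @mxc.  RTAX_CC_ALGO is translated from the congestion
 * control algorithm name to its key; out-of-range types return -EINVAL.
 */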
1654 static int ip6_convert_metrics(struct mx6_config *mxc,
1655                                const struct fib6_config *cfg)
1656 {
1657         struct nlattr *nla;
1658         int remaining;
1659         u32 *mp;
1660
1661         if (!cfg->fc_mx)
1662                 return 0;
1663
1664         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1665         if (unlikely(!mp))
1666                 return -ENOMEM;
1667
1668         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1669                 int type = nla_type(nla);
1670
1671                 if (type) {
1672                         u32 val;
1673
1674                         if (unlikely(type > RTAX_MAX))
1675                                 goto err;
1676                         if (type == RTAX_CC_ALGO) {
1677                                 char tmp[TCP_CA_NAME_MAX];
1678
1679                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1680                                 val = tcp_ca_get_key_by_name(tmp);
1681                                 if (val == TCP_CA_UNSPEC)
1682                                         goto err;
1683                         } else {
1684                                 val = nla_get_u32(nla);
1685                         }
1686
1687                         mp[type - 1] = val;
1688                         __set_bit(type - 1, mxc->mx_valid);
1689                 }
1690         }
1691
1692         mxc->mx = mp;
1693
1694         return 0;
1695  err:
1696         kfree(mp);
1697         return -EINVAL;
1698 }
1699
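/*
 * Add the route described by @cfg to the FIB: validate the device, gateway
 * and preferred source address, promote non-local loopback routes to reject
 * routes, and insert the result with __ip6_ins_rt().
 */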
1700 int ip6_route_add(struct fib6_config *cfg)
1701 {
1702         int err;
1703         struct net *net = cfg->fc_nlinfo.nl_net;
1704         struct rt6_info *rt = NULL;
1705         struct net_device *dev = NULL;
1706         struct inet6_dev *idev = NULL;
1707         struct fib6_table *table;
1708         struct mx6_config mxc = { .mx = NULL, };
1709         int addr_type;
1710
1711         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1712                 return -EINVAL;
1713 #ifndef CONFIG_IPV6_SUBTREES
1714         if (cfg->fc_src_len)
1715                 return -EINVAL;
1716 #endif
1717         if (cfg->fc_ifindex) {
1718                 err = -ENODEV;
1719                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1720                 if (!dev)
1721                         goto out;
1722                 idev = in6_dev_get(dev);
1723                 if (!idev)
1724                         goto out;
1725         }
1726
1727         if (cfg->fc_metric == 0)
1728                 cfg->fc_metric = IP6_RT_PRIO_USER;
1729
1730         err = -ENOBUFS;
1731         if (cfg->fc_nlinfo.nlh &&
1732             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1733                 table = fib6_get_table(net, cfg->fc_table);
1734                 if (!table) {
1735                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1736                         table = fib6_new_table(net, cfg->fc_table);
1737                 }
1738         } else {
1739                 table = fib6_new_table(net, cfg->fc_table);
1740         }
1741
1742         if (!table)
1743                 goto out;
1744
1745         rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1746
1747         if (!rt) {
1748                 err = -ENOMEM;
1749                 goto out;
1750         }
1751
1752         if (cfg->fc_flags & RTF_EXPIRES)
1753                 rt6_set_expires(rt, jiffies +
1754                                 clock_t_to_jiffies(cfg->fc_expires));
1755         else
1756                 rt6_clean_expires(rt);
1757
1758         if (cfg->fc_protocol == RTPROT_UNSPEC)
1759                 cfg->fc_protocol = RTPROT_BOOT;
1760         rt->rt6i_protocol = cfg->fc_protocol;
1761
1762         addr_type = ipv6_addr_type(&cfg->fc_dst);
1763
1764         if (addr_type & IPV6_ADDR_MULTICAST)
1765                 rt->dst.input = ip6_mc_input;
1766         else if (cfg->fc_flags & RTF_LOCAL)
1767                 rt->dst.input = ip6_input;
1768         else
1769                 rt->dst.input = ip6_forward;
1770
1771         rt->dst.output = ip6_output;
1772
1773         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1774         rt->rt6i_dst.plen = cfg->fc_dst_len;
1775         if (rt->rt6i_dst.plen == 128)
1776                 rt->dst.flags |= DST_HOST;
1777
1778 #ifdef CONFIG_IPV6_SUBTREES
1779         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1780         rt->rt6i_src.plen = cfg->fc_src_len;
1781 #endif
1782
1783         rt->rt6i_metric = cfg->fc_metric;
1784
1785         /* We cannot add true routes via loopback here; they would result
1786            in kernel looping.  Promote them to reject routes instead.
1787          */
1788         if ((cfg->fc_flags & RTF_REJECT) ||
1789             (dev && (dev->flags & IFF_LOOPBACK) &&
1790              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1791              !(cfg->fc_flags & RTF_LOCAL))) {
1792                 /* hold loopback dev/idev if we haven't done so. */
1793                 if (dev != net->loopback_dev) {
1794                         if (dev) {
1795                                 dev_put(dev);
1796                                 in6_dev_put(idev);
1797                         }
1798                         dev = net->loopback_dev;
1799                         dev_hold(dev);
1800                         idev = in6_dev_get(dev);
1801                         if (!idev) {
1802                                 err = -ENODEV;
1803                                 goto out;
1804                         }
1805                 }
1806                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1807                 switch (cfg->fc_type) {
1808                 case RTN_BLACKHOLE:
1809                         rt->dst.error = -EINVAL;
1810                         rt->dst.output = dst_discard_sk;
1811                         rt->dst.input = dst_discard;
1812                         break;
1813                 case RTN_PROHIBIT:
1814                         rt->dst.error = -EACCES;
1815                         rt->dst.output = ip6_pkt_prohibit_out;
1816                         rt->dst.input = ip6_pkt_prohibit;
1817                         break;
1818                 case RTN_THROW:
1819                 default:
1820                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1821                                         : -ENETUNREACH;
1822                         rt->dst.output = ip6_pkt_discard_out;
1823                         rt->dst.input = ip6_pkt_discard;
1824                         break;
1825                 }
1826                 goto install_route;
1827         }
1828
1829         if (cfg->fc_flags & RTF_GATEWAY) {
1830                 const struct in6_addr *gw_addr;
1831                 int gwa_type;
1832
1833                 gw_addr = &cfg->fc_gateway;
1834
1835                 /* If gw_addr is local we will fail to detect this if the
1836                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
1837                  * will return the already-added prefix route via the interface
1838                  * the prefix route was assigned to, which might be non-loopback.
1839                  */
1840                 err = -EINVAL;
1841                 if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
1842                         goto out;
1843
1844                 rt->rt6i_gateway = *gw_addr;
1845                 gwa_type = ipv6_addr_type(gw_addr);
1846
1847                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1848                         struct rt6_info *grt;
1849
1850                         /* IPv6 strictly prohibits using non-link-local
1851                            addresses as the nexthop address.
1852                            Otherwise, the router will not be able to send
1853                            redirects.  That is a good rule, but in some (rare!)
1854                            circumstances (SIT, PtP, NBMA NOARP links) it is
1855                            handy to allow some exceptions. --ANK
1856                          */
1857                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1858                                 goto out;
1859
1860                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1861
1862                         err = -EHOSTUNREACH;
1863                         if (!grt)
1864                                 goto out;
1865                         if (dev) {
1866                                 if (dev != grt->dst.dev) {
1867                                         ip6_rt_put(grt);
1868                                         goto out;
1869                                 }
1870                         } else {
1871                                 dev = grt->dst.dev;
1872                                 idev = grt->rt6i_idev;
1873                                 dev_hold(dev);
1874                                 in6_dev_hold(grt->rt6i_idev);
1875                         }
1876                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1877                                 err = 0;
1878                         ip6_rt_put(grt);
1879
1880                         if (err)
1881                                 goto out;
1882                 }
1883                 err = -EINVAL;
1884                 if (!dev || (dev->flags & IFF_LOOPBACK))
1885                         goto out;
1886         }
1887
1888         err = -ENODEV;
1889         if (!dev)
1890                 goto out;
1891
1892         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1893                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1894                         err = -EINVAL;
1895                         goto out;
1896                 }
1897                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1898                 rt->rt6i_prefsrc.plen = 128;
1899         } else
1900                 rt->rt6i_prefsrc.plen = 0;
1901
1902         rt->rt6i_flags = cfg->fc_flags;
1903
1904 install_route:
1905         rt->dst.dev = dev;
1906         rt->rt6i_idev = idev;
1907         rt->rt6i_table = table;
1908
1909         cfg->fc_nlinfo.nl_net = dev_net(dev);
1910
1911         err = ip6_convert_metrics(&mxc, cfg);
1912         if (err)
1913                 goto out;
1914
1915         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1916
1917         kfree(mxc.mx);
1918         return err;
1919 out:
1920         if (dev)
1921                 dev_put(dev);
1922         if (idev)
1923                 in6_dev_put(idev);
1924         if (rt)
1925                 dst_free(&rt->dst);
1926         return err;
1927 }
1928
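/*
 * Delete @rt from its table under tb6_lock and drop the caller's reference.
 * The null entry is refused with -ENOENT.
 */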
1929 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1930 {
1931         int err;
1932         struct fib6_table *table;
1933         struct net *net = dev_net(rt->dst.dev);
1934
1935         if (rt == net->ipv6.ip6_null_entry) {
1936                 err = -ENOENT;
1937                 goto out;
1938         }
1939
1940         table = rt->rt6i_table;
1941         write_lock_bh(&table->tb6_lock);
1942         err = fib6_del(rt, info);
1943         write_unlock_bh(&table->tb6_lock);
1944
1945 out:
1946         ip6_rt_put(rt);
1947         return err;
1948 }
1949
1950 int ip6_del_rt(struct rt6_info *rt)
1951 {
1952         struct nl_info info = {
1953                 .nl_net = dev_net(rt->dst.dev),
1954         };
1955         return __ip6_del_rt(rt, &info);
1956 }
1957
1958 static int ip6_route_del(struct fib6_config *cfg)
1959 {
1960         struct fib6_table *table;
1961         struct fib6_node *fn;
1962         struct rt6_info *rt;
1963         int err = -ESRCH;
1964
1965         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1966         if (!table)
1967                 return err;
1968
1969         read_lock_bh(&table->tb6_lock);
1970
1971         fn = fib6_locate(&table->tb6_root,
1972                          &cfg->fc_dst, cfg->fc_dst_len,
1973                          &cfg->fc_src, cfg->fc_src_len);
1974
1975         if (fn) {
1976                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1977                         if ((rt->rt6i_flags & RTF_CACHE) &&
1978                             !(cfg->fc_flags & RTF_CACHE))
1979                                 continue;
1980                         if (cfg->fc_ifindex &&
1981                             (!rt->dst.dev ||
1982                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1983                                 continue;
1984                         if (cfg->fc_flags & RTF_GATEWAY &&
1985                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1986                                 continue;
1987                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1988                                 continue;
1989                         dst_hold(&rt->dst);
1990                         read_unlock_bh(&table->tb6_lock);
1991
1992                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1993                 }
1994         }
1995         read_unlock_bh(&table->tb6_lock);
1996
1997         return err;
1998 }
1999
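/*
 * Handle an ICMPv6 Redirect: validate the message and its ND options,
 * update the neighbour entry for the new target and install a cached
 * RTF_GATEWAY route towards the redirected destination.
 */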
2000 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2001 {
2002         struct net *net = dev_net(skb->dev);
2003         struct netevent_redirect netevent;
2004         struct rt6_info *rt, *nrt = NULL;
2005         struct ndisc_options ndopts;
2006         struct inet6_dev *in6_dev;
2007         struct neighbour *neigh;
2008         struct rd_msg *msg;
2009         int optlen, on_link;
2010         u8 *lladdr;
2011
2012         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2013         optlen -= sizeof(*msg);
2014
2015         if (optlen < 0) {
2016                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2017                 return;
2018         }
2019
2020         msg = (struct rd_msg *)icmp6_hdr(skb);
2021
2022         if (ipv6_addr_is_multicast(&msg->dest)) {
2023                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2024                 return;
2025         }
2026
2027         on_link = 0;
2028         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2029                 on_link = 1;
2030         } else if (ipv6_addr_type(&msg->target) !=
2031                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2032                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2033                 return;
2034         }
2035
2036         in6_dev = __in6_dev_get(skb->dev);
2037         if (!in6_dev)
2038                 return;
2039         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2040                 return;
2041
2042         /* RFC2461 8.1:
2043          *      The IP source address of the Redirect MUST be the same as the current
2044          *      first-hop router for the specified ICMP Destination Address.
2045          */
2046
2047         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2048                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2049                 return;
2050         }
2051
2052         lladdr = NULL;
2053         if (ndopts.nd_opts_tgt_lladdr) {
2054                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2055                                              skb->dev);
2056                 if (!lladdr) {
2057                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2058                         return;
2059                 }
2060         }
2061
2062         rt = (struct rt6_info *) dst;
2063         if (rt == net->ipv6.ip6_null_entry) {
2064                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2065                 return;
2066         }
2067
2068         /* Redirect received -> path was valid.
2069          * Redirects are sent only in response to data packets,
2070          * so this nexthop is apparently reachable. --ANK
2071          */
2072         dst_confirm(&rt->dst);
2073
2074         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2075         if (!neigh)
2076                 return;
2077
2078         /*
2079          *      We have finally decided to accept it.
2080          */
2081
2082         neigh_update(neigh, lladdr, NUD_STALE,
2083                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2084                      NEIGH_UPDATE_F_OVERRIDE|
2085                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2086                                      NEIGH_UPDATE_F_ISROUTER))
2087                      );
2088
2089         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2090         if (!nrt)
2091                 goto out;
2092
2093         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2094         if (on_link)
2095                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2096
2097         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2098
2099         if (ip6_ins_rt(nrt))
2100                 goto out;
2101
2102         netevent.old = &rt->dst;
2103         netevent.new = &nrt->dst;
2104         netevent.daddr = &msg->dest;
2105         netevent.neigh = neigh;
2106         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2107
2108         if (rt->rt6i_flags & RTF_CACHE) {
2109                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2110                 ip6_del_rt(rt);
2111         }
2112
2113 out:
2114         neigh_release(neigh);
2115 }
2116
2117 /*
2118  *      Misc support functions
2119  */
2120
2121 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2122 {
2123         BUG_ON(from->dst.from);
2124
2125         rt->rt6i_flags &= ~RTF_EXPIRES;
2126         dst_hold(&from->dst);
2127         rt->dst.from = &from->dst;
2128         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2129 }
2130
2131 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2132 {
2133         rt->dst.input = ort->dst.input;
2134         rt->dst.output = ort->dst.output;
2135         rt->rt6i_dst = ort->rt6i_dst;
2136         rt->dst.error = ort->dst.error;
2137         rt->rt6i_idev = ort->rt6i_idev;
2138         if (rt->rt6i_idev)
2139                 in6_dev_hold(rt->rt6i_idev);
2140         rt->dst.lastuse = jiffies;
2141         rt->rt6i_gateway = ort->rt6i_gateway;
2142         rt->rt6i_flags = ort->rt6i_flags;
2143         rt6_set_from(rt, ort);
2144         rt->rt6i_metric = ort->rt6i_metric;
2145 #ifdef CONFIG_IPV6_SUBTREES
2146         rt->rt6i_src = ort->rt6i_src;
2147 #endif
2148         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2149         rt->rt6i_table = ort->rt6i_table;
2150 }
2151
2152 #ifdef CONFIG_IPV6_ROUTE_INFO
2153 static struct rt6_info *rt6_get_route_info(struct net *net,
2154                                            const struct in6_addr *prefix, int prefixlen,
2155                                            const struct in6_addr *gwaddr, int ifindex)
2156 {
2157         struct fib6_node *fn;
2158         struct rt6_info *rt = NULL;
2159         struct fib6_table *table;
2160
2161         table = fib6_get_table(net, RT6_TABLE_INFO);
2162         if (!table)
2163                 return NULL;
2164
2165         read_lock_bh(&table->tb6_lock);
2166         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2167         if (!fn)
2168                 goto out;
2169
2170         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2171                 if (rt->dst.dev->ifindex != ifindex)
2172                         continue;
2173                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2174                         continue;
2175                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2176                         continue;
2177                 dst_hold(&rt->dst);
2178                 break;
2179         }
2180 out:
2181         read_unlock_bh(&table->tb6_lock);
2182         return rt;
2183 }
2184
2185 static struct rt6_info *rt6_add_route_info(struct net *net,
2186                                            const struct in6_addr *prefix, int prefixlen,
2187                                            const struct in6_addr *gwaddr, int ifindex,
2188                                            unsigned int pref)
2189 {
2190         struct fib6_config cfg = {
2191                 .fc_table       = RT6_TABLE_INFO,
2192                 .fc_metric      = IP6_RT_PRIO_USER,
2193                 .fc_ifindex     = ifindex,
2194                 .fc_dst_len     = prefixlen,
2195                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2196                                   RTF_UP | RTF_PREF(pref),
2197                 .fc_nlinfo.portid = 0,
2198                 .fc_nlinfo.nlh = NULL,
2199                 .fc_nlinfo.nl_net = net,
2200         };
2201
2202         cfg.fc_dst = *prefix;
2203         cfg.fc_gateway = *gwaddr;
2204
2205         /* We should treat it as a default route if prefix length is 0. */
2206         if (!prefixlen)
2207                 cfg.fc_flags |= RTF_DEFAULT;
2208
2209         ip6_route_add(&cfg);
2210
2211         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2212 }
2213 #endif
2214
2215 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2216 {
2217         struct rt6_info *rt;
2218         struct fib6_table *table;
2219
2220         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2221         if (!table)
2222                 return NULL;
2223
2224         read_lock_bh(&table->tb6_lock);
2225         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2226                 if (dev == rt->dst.dev &&
2227                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2228                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2229                         break;
2230         }
2231         if (rt)
2232                 dst_hold(&rt->dst);
2233         read_unlock_bh(&table->tb6_lock);
2234         return rt;
2235 }
2236
2237 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2238                                      struct net_device *dev,
2239                                      unsigned int pref)
2240 {
2241         struct fib6_config cfg = {
2242                 .fc_table       = RT6_TABLE_DFLT,
2243                 .fc_metric      = IP6_RT_PRIO_USER,
2244                 .fc_ifindex     = dev->ifindex,
2245                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2246                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2247                 .fc_nlinfo.portid = 0,
2248                 .fc_nlinfo.nlh = NULL,
2249                 .fc_nlinfo.nl_net = dev_net(dev),
2250         };
2251
2252         cfg.fc_gateway = *gwaddr;
2253
2254         ip6_route_add(&cfg);
2255
2256         return rt6_get_dflt_router(gwaddr, dev);
2257 }
2258
2259 void rt6_purge_dflt_routers(struct net *net)
2260 {
2261         struct rt6_info *rt;
2262         struct fib6_table *table;
2263
2264         /* NOTE: Keep consistent with rt6_get_dflt_router */
2265         table = fib6_get_table(net, RT6_TABLE_DFLT);
2266         if (!table)
2267                 return;
2268
2269 restart:
2270         read_lock_bh(&table->tb6_lock);
2271         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2272                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2273                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2274                         dst_hold(&rt->dst);
2275                         read_unlock_bh(&table->tb6_lock);
2276                         ip6_del_rt(rt);
2277                         goto restart;
2278                 }
2279         }
2280         read_unlock_bh(&table->tb6_lock);
2281 }
2282
2283 static void rtmsg_to_fib6_config(struct net *net,
2284                                  struct in6_rtmsg *rtmsg,
2285                                  struct fib6_config *cfg)
2286 {
2287         memset(cfg, 0, sizeof(*cfg));
2288
2289         cfg->fc_table = RT6_TABLE_MAIN;
2290         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2291         cfg->fc_metric = rtmsg->rtmsg_metric;
2292         cfg->fc_expires = rtmsg->rtmsg_info;
2293         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2294         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2295         cfg->fc_flags = rtmsg->rtmsg_flags;
2296
2297         cfg->fc_nlinfo.nl_net = net;
2298
2299         cfg->fc_dst = rtmsg->rtmsg_dst;
2300         cfg->fc_src = rtmsg->rtmsg_src;
2301         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2302 }
2303
2304 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2305 {
2306         struct fib6_config cfg;
2307         struct in6_rtmsg rtmsg;
2308         int err;
2309
2310         switch (cmd) {
2311         case SIOCADDRT:         /* Add a route */
2312         case SIOCDELRT:         /* Delete a route */
2313                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2314                         return -EPERM;
2315                 err = copy_from_user(&rtmsg, arg,
2316                                      sizeof(struct in6_rtmsg));
2317                 if (err)
2318                         return -EFAULT;
2319
2320                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2321
2322                 rtnl_lock();
2323                 switch (cmd) {
2324                 case SIOCADDRT:
2325                         err = ip6_route_add(&cfg);
2326                         break;
2327                 case SIOCDELRT:
2328                         err = ip6_route_del(&cfg);
2329                         break;
2330                 default:
2331                         err = -EINVAL;
2332                 }
2333                 rtnl_unlock();
2334
2335                 return err;
2336         }
2337
2338         return -EINVAL;
2339 }
2340
2341 /*
2342  *      Drop the packet on the floor
2343  */
2344
2345 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2346 {
2347         int type;
2348         struct dst_entry *dst = skb_dst(skb);
2349         switch (ipstats_mib_noroutes) {
2350         case IPSTATS_MIB_INNOROUTES:
2351                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2352                 if (type == IPV6_ADDR_ANY) {
2353                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2354                                       IPSTATS_MIB_INADDRERRORS);
2355                         break;
2356                 }
2357                 /* FALLTHROUGH */
2358         case IPSTATS_MIB_OUTNOROUTES:
2359                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2360                               ipstats_mib_noroutes);
2361                 break;
2362         }
2363         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2364         kfree_skb(skb);
2365         return 0;
2366 }
2367
2368 static int ip6_pkt_discard(struct sk_buff *skb)
2369 {
2370         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2371 }
2372
2373 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2374 {
2375         skb->dev = skb_dst(skb)->dev;
2376         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2377 }
2378
2379 static int ip6_pkt_prohibit(struct sk_buff *skb)
2380 {
2381         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2382 }
2383
2384 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2385 {
2386         skb->dev = skb_dst(skb)->dev;
2387         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2388 }
2389
2390 /*
2391  *      Allocate a dst for a local (unicast / anycast) address.
2392  */
2393
2394 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2395                                     const struct in6_addr *addr,
2396                                     bool anycast)
2397 {
2398         struct net *net = dev_net(idev->dev);
2399         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2400                                             DST_NOCOUNT, NULL);
2401         if (!rt)
2402                 return ERR_PTR(-ENOMEM);
2403
2404         in6_dev_hold(idev);
2405
2406         rt->dst.flags |= DST_HOST;
2407         rt->dst.input = ip6_input;
2408         rt->dst.output = ip6_output;
2409         rt->rt6i_idev = idev;
2410
2411         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2412         if (anycast)
2413                 rt->rt6i_flags |= RTF_ANYCAST;
2414         else
2415                 rt->rt6i_flags |= RTF_LOCAL;
2416
2417         rt->rt6i_gateway  = *addr;
2418         rt->rt6i_dst.addr = *addr;
2419         rt->rt6i_dst.plen = 128;
2420         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2421
2422         atomic_set(&rt->dst.__refcnt, 1);
2423
2424         return rt;
2425 }
2426
2427 int ip6_route_get_saddr(struct net *net,
2428                         struct rt6_info *rt,
2429                         const struct in6_addr *daddr,
2430                         unsigned int prefs,
2431                         struct in6_addr *saddr)
2432 {
2433         struct inet6_dev *idev =
2434                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2435         int err = 0;
2436         if (rt && rt->rt6i_prefsrc.plen)
2437                 *saddr = rt->rt6i_prefsrc.addr;
2438         else
2439                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2440                                          daddr, prefs, saddr);
2441         return err;
2442 }
2443
2444 /* remove a deleted IP from prefsrc entries */
2445 struct arg_dev_net_ip {
2446         struct net_device *dev;
2447         struct net *net;
2448         struct in6_addr *addr;
2449 };
2450
2451 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2452 {
2453         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2454         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2455         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2456
2457         if (((void *)rt->dst.dev == dev || !dev) &&
2458             rt != net->ipv6.ip6_null_entry &&
2459             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2460                 /* remove prefsrc entry */
2461                 rt->rt6i_prefsrc.plen = 0;
2462         }
2463         return 0;
2464 }
2465
2466 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2467 {
2468         struct net *net = dev_net(ifp->idev->dev);
2469         struct arg_dev_net_ip adni = {
2470                 .dev = ifp->idev->dev,
2471                 .net = net,
2472                 .addr = &ifp->addr,
2473         };
2474         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2475 }
2476
2477 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2478 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2479
2480 /* Remove routers and update dst entries when a gateway turns into a host. */
2481 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2482 {
2483         struct in6_addr *gateway = (struct in6_addr *)arg;
2484
2485         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2486              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2487              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2488                 return -1;
2489         }
2490         return 0;
2491 }
2492
2493 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2494 {
2495         fib6_clean_all(net, fib6_clean_tohost, gateway);
2496 }
2497
2498 struct arg_dev_net {
2499         struct net_device *dev;
2500         struct net *net;
2501 };
2502
2503 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2504 {
2505         const struct arg_dev_net *adn = arg;
2506         const struct net_device *dev = adn->dev;
2507
2508         if ((rt->dst.dev == dev || !dev) &&
2509             rt != adn->net->ipv6.ip6_null_entry)
2510                 return -1;
2511
2512         return 0;
2513 }
2514
2515 void rt6_ifdown(struct net *net, struct net_device *dev)
2516 {
2517         struct arg_dev_net adn = {
2518                 .dev = dev,
2519                 .net = net,
2520         };
2521
2522         fib6_clean_all(net, fib6_ifdown, &adn);
2523         icmp6_clean_all(fib6_ifdown, &adn);
2524         rt6_uncached_list_flush_dev(net, dev);
2525 }
2526
2527 struct rt6_mtu_change_arg {
2528         struct net_device *dev;
2529         unsigned int mtu;
2530 };
2531
2532 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2533 {
2534         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2535         struct inet6_dev *idev;
2536
2537         /* In IPv6 PMTU discovery is not optional,
2538            so the RTAX_MTU lock cannot disable it.
2539            We still use this lock to block changes
2540            caused by addrconf/ndisc.
2541         */
2542
2543         idev = __in6_dev_get(arg->dev);
2544         if (!idev)
2545                 return 0;
2546
2547         /* For an administrative MTU increase, there is no way to discover
2548            an IPv6 PMTU increase, so the PMTU must be updated here.
2549            Since RFC 1981 doesn't cover administrative MTU increases,
2550            updating the PMTU on increase is a MUST (e.g. jumbo frames).
2551          */
2552         /*
2553            If the new MTU is less than the route PMTU, the new MTU will be
2554            the lowest MTU in the path; update the route PMTU to reflect the
2555            decrease.  If the new MTU is greater than the route PMTU, and the
2556            old MTU was the lowest MTU in the path, update the route PMTU to
2557            reflect the increase; if some other node's MTU is now the lowest
2558            in the path, a Packet Too Big message will trigger PMTU discovery
2559            again.
2560          */
2561         if (rt->dst.dev == arg->dev &&
2562             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2563                 if (rt->rt6i_flags & RTF_CACHE) {
2564                         /* For RTF_CACHE with rt6i_pmtu == 0
2565                          * (i.e. a redirected route),
2566                          * the metrics of its rt->dst.from has already
2567                          * been updated.
2568                          */
2569                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2570                                 rt->rt6i_pmtu = arg->mtu;
2571                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2572                            (dst_mtu(&rt->dst) < arg->mtu &&
2573                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2574                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2575                 }
2576         }
2577         return 0;
2578 }
2579
2580 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2581 {
2582         struct rt6_mtu_change_arg arg = {
2583                 .dev = dev,
2584                 .mtu = mtu,
2585         };
2586
2587         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2588 }
2589
2590 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2591         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2592         [RTA_OIF]               = { .type = NLA_U32 },
2593         [RTA_IIF]               = { .type = NLA_U32 },
2594         [RTA_PRIORITY]          = { .type = NLA_U32 },
2595         [RTA_METRICS]           = { .type = NLA_NESTED },
2596         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2597         [RTA_PREF]              = { .type = NLA_U8 },
2598 };
2599
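/*
 * Translate an rtnetlink route message into a struct fib6_config, mapping
 * the route type and RTA_* attributes onto fc_flags and the corresponding
 * fields.
 */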
2600 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2601                               struct fib6_config *cfg)
2602 {
2603         struct rtmsg *rtm;
2604         struct nlattr *tb[RTA_MAX+1];
2605         unsigned int pref;
2606         int err;
2607
2608         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2609         if (err < 0)
2610                 goto errout;
2611
2612         err = -EINVAL;
2613         rtm = nlmsg_data(nlh);
2614         memset(cfg, 0, sizeof(*cfg));
2615
2616         cfg->fc_table = rtm->rtm_table;
2617         cfg->fc_dst_len = rtm->rtm_dst_len;
2618         cfg->fc_src_len = rtm->rtm_src_len;
2619         cfg->fc_flags = RTF_UP;
2620         cfg->fc_protocol = rtm->rtm_protocol;
2621         cfg->fc_type = rtm->rtm_type;
2622
2623         if (rtm->rtm_type == RTN_UNREACHABLE ||
2624             rtm->rtm_type == RTN_BLACKHOLE ||
2625             rtm->rtm_type == RTN_PROHIBIT ||
2626             rtm->rtm_type == RTN_THROW)
2627                 cfg->fc_flags |= RTF_REJECT;
2628
2629         if (rtm->rtm_type == RTN_LOCAL)
2630                 cfg->fc_flags |= RTF_LOCAL;
2631
2632         if (rtm->rtm_flags & RTM_F_CLONED)
2633                 cfg->fc_flags |= RTF_CACHE;
2634
2635         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2636         cfg->fc_nlinfo.nlh = nlh;
2637         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2638
2639         if (tb[RTA_GATEWAY]) {
2640                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2641                 cfg->fc_flags |= RTF_GATEWAY;
2642         }
2643
2644         if (tb[RTA_DST]) {
2645                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2646
2647                 if (nla_len(tb[RTA_DST]) < plen)
2648                         goto errout;
2649
2650                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2651         }
2652
2653         if (tb[RTA_SRC]) {
2654                 int plen = (rtm->rtm_src_len + 7) >> 3;
2655
2656                 if (nla_len(tb[RTA_SRC]) < plen)
2657                         goto errout;
2658
2659                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2660         }
2661
2662         if (tb[RTA_PREFSRC])
2663                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2664
2665         if (tb[RTA_OIF])
2666                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2667
2668         if (tb[RTA_PRIORITY])
2669                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2670
2671         if (tb[RTA_METRICS]) {
2672                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2673                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2674         }
2675
2676         if (tb[RTA_TABLE])
2677                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2678
2679         if (tb[RTA_MULTIPATH]) {
2680                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2681                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2682         }
2683
2684         if (tb[RTA_PREF]) {
2685                 pref = nla_get_u8(tb[RTA_PREF]);
2686                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2687                     pref != ICMPV6_ROUTER_PREF_HIGH)
2688                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2689                 cfg->fc_flags |= RTF_PREF(pref);
2690         }
2691
2692         err = 0;
2693 errout:
2694         return err;
2695 }
2696
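/*
 * Add or delete (according to @add) every nexthop carried in the
 * RTA_MULTIPATH attribute.  If an add fails, the nexthops installed so far
 * are rolled back; on delete we keep going so all nexthops get removed.
 */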
2697 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2698 {
2699         struct fib6_config r_cfg;
2700         struct rtnexthop *rtnh;
2701         int remaining;
2702         int attrlen;
2703         int err = 0, last_err = 0;
2704
2705         remaining = cfg->fc_mp_len;
2706 beginning:
2707         rtnh = (struct rtnexthop *)cfg->fc_mp;
2708
2709         /* Parse a Multipath Entry */
2710         while (rtnh_ok(rtnh, remaining)) {
2711                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2712                 if (rtnh->rtnh_ifindex)
2713                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2714
2715                 attrlen = rtnh_attrlen(rtnh);
2716                 if (attrlen > 0) {
2717                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2718
2719                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2720                         if (nla) {
2721                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2722                                 r_cfg.fc_flags |= RTF_GATEWAY;
2723                         }
2724                 }
2725                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2726                 if (err) {
2727                         last_err = err;
2728                         /* If we are trying to remove a route, do not stop the
2729                          * loop when ip6_route_del() fails (because the next hop
2730                          * is already gone); we should try to remove all next hops.
2731                          */
2732                         if (add) {
2733                                 /* If add fails, we should try to delete all
2734                                  * next hops that have been already added.
2735                                  */
2736                                 add = 0;
2737                                 remaining = cfg->fc_mp_len - remaining;
2738                                 goto beginning;
2739                         }
2740                 }
2741                 /* Because each route is added like a single route, we clear
2742                  * these flags after the first nexthop: if there is a collision,
2743                  * we have already failed to add the first nexthop
2744                  * (fib6_add_rt2node() has rejected it); when replacing, the old
2745                  * nexthops have been replaced by the first new one, and the rest
2746                  * should be added to it.
2747                  */
2748                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2749                                                      NLM_F_REPLACE);
2750                 rtnh = rtnh_next(rtnh, &remaining);
2751         }
2752
2753         return last_err;
2754 }
2755
2756 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2757 {
2758         struct fib6_config cfg;
2759         int err;
2760
2761         err = rtm_to_fib6_config(skb, nlh, &cfg);
2762         if (err < 0)
2763                 return err;
2764
2765         if (cfg.fc_mp)
2766                 return ip6_route_multipath(&cfg, 0);
2767         else
2768                 return ip6_route_del(&cfg);
2769 }
2770
2771 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2772 {
2773         struct fib6_config cfg;
2774         int err;
2775
2776         err = rtm_to_fib6_config(skb, nlh, &cfg);
2777         if (err < 0)
2778                 return err;
2779
2780         if (cfg.fc_mp)
2781                 return ip6_route_multipath(&cfg, 1);
2782         else
2783                 return ip6_route_add(&cfg);
2784 }
2785
2786 static inline size_t rt6_nlmsg_size(void)
2787 {
2788         return NLMSG_ALIGN(sizeof(struct rtmsg))
2789                + nla_total_size(16) /* RTA_SRC */
2790                + nla_total_size(16) /* RTA_DST */
2791                + nla_total_size(16) /* RTA_GATEWAY */
2792                + nla_total_size(16) /* RTA_PREFSRC */
2793                + nla_total_size(4) /* RTA_TABLE */
2794                + nla_total_size(4) /* RTA_IIF */
2795                + nla_total_size(4) /* RTA_OIF */
2796                + nla_total_size(4) /* RTA_PRIORITY */
2797                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2798                + nla_total_size(sizeof(struct rta_cacheinfo))
2799                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2800                + nla_total_size(1); /* RTA_PREF */
2801 }
2802
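/*
 * Fill one route message for @rt into @skb.  With @prefix set, non-prefix
 * routes are skipped (return 1); if the message does not fit, it is
 * cancelled and -EMSGSIZE is returned.
 */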
2803 static int rt6_fill_node(struct net *net,
2804                          struct sk_buff *skb, struct rt6_info *rt,
2805                          struct in6_addr *dst, struct in6_addr *src,
2806                          int iif, int type, u32 portid, u32 seq,
2807                          int prefix, int nowait, unsigned int flags)
2808 {
2809         u32 metrics[RTAX_MAX];
2810         struct rtmsg *rtm;
2811         struct nlmsghdr *nlh;
2812         long expires;
2813         u32 table;
2814
2815         if (prefix) {   /* user wants prefix routes only */
2816                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2817                         /* success since this is not a prefix route */
2818                         return 1;
2819                 }
2820         }
2821
2822         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2823         if (!nlh)
2824                 return -EMSGSIZE;
2825
2826         rtm = nlmsg_data(nlh);
2827         rtm->rtm_family = AF_INET6;
2828         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2829         rtm->rtm_src_len = rt->rt6i_src.plen;
2830         rtm->rtm_tos = 0;
2831         if (rt->rt6i_table)
2832                 table = rt->rt6i_table->tb6_id;
2833         else
2834                 table = RT6_TABLE_UNSPEC;
2835         rtm->rtm_table = table;
2836         if (nla_put_u32(skb, RTA_TABLE, table))
2837                 goto nla_put_failure;
2838         if (rt->rt6i_flags & RTF_REJECT) {
2839                 switch (rt->dst.error) {
2840                 case -EINVAL:
2841                         rtm->rtm_type = RTN_BLACKHOLE;
2842                         break;
2843                 case -EACCES:
2844                         rtm->rtm_type = RTN_PROHIBIT;
2845                         break;
2846                 case -EAGAIN:
2847                         rtm->rtm_type = RTN_THROW;
2848                         break;
2849                 default:
2850                         rtm->rtm_type = RTN_UNREACHABLE;
2851                         break;
2852                 }
2853         }
2854         else if (rt->rt6i_flags & RTF_LOCAL)
2855                 rtm->rtm_type = RTN_LOCAL;
2856         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2857                 rtm->rtm_type = RTN_LOCAL;
2858         else
2859                 rtm->rtm_type = RTN_UNICAST;
2860         rtm->rtm_flags = 0;
2861         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2862         rtm->rtm_protocol = rt->rt6i_protocol;
2863         if (rt->rt6i_flags & RTF_DYNAMIC)
2864                 rtm->rtm_protocol = RTPROT_REDIRECT;
2865         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2866                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2867                         rtm->rtm_protocol = RTPROT_RA;
2868                 else
2869                         rtm->rtm_protocol = RTPROT_KERNEL;
2870         }
2871
2872         if (rt->rt6i_flags & RTF_CACHE)
2873                 rtm->rtm_flags |= RTM_F_CLONED;
2874
2875         if (dst) {
2876                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2877                         goto nla_put_failure;
2878                 rtm->rtm_dst_len = 128;
2879         } else if (rtm->rtm_dst_len)
2880                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2881                         goto nla_put_failure;
2882 #ifdef CONFIG_IPV6_SUBTREES
2883         if (src) {
2884                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2885                         goto nla_put_failure;
2886                 rtm->rtm_src_len = 128;
2887         } else if (rtm->rtm_src_len &&
2888                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2889                 goto nla_put_failure;
2890 #endif
2891         if (iif) {
2892 #ifdef CONFIG_IPV6_MROUTE
2893                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2894                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2895                         if (err <= 0) {
2896                                 if (!nowait) {
2897                                         if (err == 0)
2898                                                 return 0;
2899                                         goto nla_put_failure;
2900                                 } else {
2901                                         if (err == -EMSGSIZE)
2902                                                 goto nla_put_failure;
2903                                 }
2904                         }
2905                 } else
2906 #endif
2907                         if (nla_put_u32(skb, RTA_IIF, iif))
2908                                 goto nla_put_failure;
2909         } else if (dst) {
2910                 struct in6_addr saddr_buf;
2911                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2912                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2913                         goto nla_put_failure;
2914         }
2915
2916         if (rt->rt6i_prefsrc.plen) {
2917                 struct in6_addr saddr_buf;
2918                 saddr_buf = rt->rt6i_prefsrc.addr;
2919                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2920                         goto nla_put_failure;
2921         }
2922
2923         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2924         if (rt->rt6i_pmtu)
2925                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2926         if (rtnetlink_put_metrics(skb, metrics) < 0)
2927                 goto nla_put_failure;
2928
2929         if (rt->rt6i_flags & RTF_GATEWAY) {
2930                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2931                         goto nla_put_failure;
2932         }
2933
2934         if (rt->dst.dev &&
2935             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2936                 goto nla_put_failure;
2937         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2938                 goto nla_put_failure;
2939
2940         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2941
2942         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2943                 goto nla_put_failure;
2944
2945         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2946                 goto nla_put_failure;
2947
2948         nlmsg_end(skb, nlh);
2949         return 0;
2950
2951 nla_put_failure:
2952         nlmsg_cancel(skb, nlh);
2953         return -EMSGSIZE;
2954 }
2955
2956 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2957 {
2958         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2959         int prefix;
2960
2961         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2962                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2963                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2964         } else
2965                 prefix = 0;
2966
2967         return rt6_fill_node(arg->net,
2968                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2969                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2970                      prefix, 0, NLM_F_MULTI);
2971 }
2972
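/*
 * RTM_GETROUTE handler: build a flow from the request attributes, resolve
 * it via ip6_route_input_lookup() (when RTA_IIF is given) or
 * ip6_route_output(), and unicast the resulting route to the requester.
 */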
2973 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2974 {
2975         struct net *net = sock_net(in_skb->sk);
2976         struct nlattr *tb[RTA_MAX+1];
2977         struct rt6_info *rt;
2978         struct sk_buff *skb;
2979         struct rtmsg *rtm;
2980         struct flowi6 fl6;
2981         int err, iif = 0, oif = 0;
2982
2983         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2984         if (err < 0)
2985                 goto errout;
2986
2987         err = -EINVAL;
2988         memset(&fl6, 0, sizeof(fl6));
2989
2990         if (tb[RTA_SRC]) {
2991                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2992                         goto errout;
2993
2994                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2995         }
2996
2997         if (tb[RTA_DST]) {
2998                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2999                         goto errout;
3000
3001                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3002         }
3003
3004         if (tb[RTA_IIF])
3005                 iif = nla_get_u32(tb[RTA_IIF]);
3006
3007         if (tb[RTA_OIF])
3008                 oif = nla_get_u32(tb[RTA_OIF]);
3009
3010         if (tb[RTA_MARK])
3011                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3012
3013         if (iif) {
3014                 struct net_device *dev;
3015                 int flags = 0;
3016
3017                 dev = __dev_get_by_index(net, iif);
3018                 if (!dev) {
3019                         err = -ENODEV;
3020                         goto errout;
3021                 }
3022
3023                 fl6.flowi6_iif = iif;
3024
3025                 if (!ipv6_addr_any(&fl6.saddr))
3026                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3027
3028                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3029                                                                flags);
3030         } else {
3031                 fl6.flowi6_oif = oif;
3032
3033                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3034         }
3035
3036         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3037         if (!skb) {
3038                 ip6_rt_put(rt);
3039                 err = -ENOBUFS;
3040                 goto errout;
3041         }
3042
3043         /* Reserve room for dummy headers; this skb can pass
3044          * through a good chunk of the routing engine.
3045          */
3046         skb_reset_mac_header(skb);
3047         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3048
3049         skb_dst_set(skb, &rt->dst);
3050
3051         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3052                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3053                             nlh->nlmsg_seq, 0, 0, 0);
3054         if (err < 0) {
3055                 kfree_skb(skb);
3056                 goto errout;
3057         }
3058
3059         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3060 errout:
3061         return err;
3062 }
3063
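/* Notify RTNLGRP_IPV6_ROUTE listeners of a route change.  The skb is
 * sized with rt6_nlmsg_size(), so -EMSGSIZE from rt6_fill_node() would
 * indicate a sizing bug rather than a transient failure.
 */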
3064 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3065 {
3066         struct sk_buff *skb;
3067         struct net *net = info->nl_net;
3068         u32 seq;
3069         int err;
3070
3071         err = -ENOBUFS;
3072         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3073
3074         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
3075         if (!skb)
3076                 goto errout;
3077
3078         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3079                                 event, info->portid, seq, 0, 0, 0);
3080         if (err < 0) {
3081                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3082                 WARN_ON(err == -EMSGSIZE);
3083                 kfree_skb(skb);
3084                 goto errout;
3085         }
3086         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3087                     info->nlh, gfp_any());
3088         return;
3089 errout:
3090         if (err < 0)
3091                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3092 }
3093
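/* Device notifier: when the loopback device registers, attach the
 * per-namespace null (and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit
 * and blackhole) template routes to it so they always carry a valid
 * device and inet6_dev reference.
 */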
3094 static int ip6_route_dev_notify(struct notifier_block *this,
3095                                 unsigned long event, void *ptr)
3096 {
3097         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3098         struct net *net = dev_net(dev);
3099
3100         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3101                 net->ipv6.ip6_null_entry->dst.dev = dev;
3102                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3103 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3104                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3105                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3106                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3107                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3108 #endif
3109         }
3110
3111         return NOTIFY_OK;
3112 }
3113
3114 /*
3115  *      /proc
3116  */
3117
3118 #ifdef CONFIG_PROC_FS
3119
3120 static const struct file_operations ipv6_route_proc_fops = {
3121         .owner          = THIS_MODULE,
3122         .open           = ipv6_route_open,
3123         .read           = seq_read,
3124         .llseek         = seq_lseek,
3125         .release        = seq_release_net,
3126 };
3127
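/* /proc/net/rt6_stats: seven hex fields - fib nodes, route nodes,
 * route allocations, route entries, cached routes, dst entries in use
 * and discarded routes.
 */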
3128 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3129 {
3130         struct net *net = (struct net *)seq->private;
3131         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3132                    net->ipv6.rt6_stats->fib_nodes,
3133                    net->ipv6.rt6_stats->fib_route_nodes,
3134                    net->ipv6.rt6_stats->fib_rt_alloc,
3135                    net->ipv6.rt6_stats->fib_rt_entries,
3136                    net->ipv6.rt6_stats->fib_rt_cache,
3137                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3138                    net->ipv6.rt6_stats->fib_discarded_routes);
3139
3140         return 0;
3141 }
3142
3143 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3144 {
3145         return single_open_net(inode, file, rt6_stats_seq_show);
3146 }
3147
3148 static const struct file_operations rt6_stats_seq_fops = {
3149         .owner   = THIS_MODULE,
3150         .open    = rt6_stats_seq_open,
3151         .read    = seq_read,
3152         .llseek  = seq_lseek,
3153         .release = single_release_net,
3154 };
3155 #endif  /* CONFIG_PROC_FS */
3156
3157 #ifdef CONFIG_SYSCTL
3158
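/* Write-only "flush" sysctl handler (typically exposed as
 * /proc/sys/net/ipv6/route/flush): any write stores the new value via
 * proc_dointvec() and immediately kicks fib6_run_gc() with the
 * previously configured flush_delay.
 */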
3159 static
3160 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3161                               void __user *buffer, size_t *lenp, loff_t *ppos)
3162 {
3163         struct net *net;
3164         int delay;
3165         if (!write)
3166                 return -EINVAL;
3167
3168         net = (struct net *)ctl->extra1;
3169         delay = net->ipv6.sysctl.flush_delay;
3170         proc_dointvec(ctl, write, buffer, lenp, ppos);
3171         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3172         return 0;
3173 }
3174
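/* Template for the per-namespace routing sysctls (expected under
 * net/ipv6/route/); ipv6_route_sysctl_init() below duplicates it and
 * rewires each .data pointer, so the entry order here must match the
 * indices used there.
 */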
3175 struct ctl_table ipv6_route_table_template[] = {
3176         {
3177                 .procname       =       "flush",
3178                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3179                 .maxlen         =       sizeof(int),
3180                 .mode           =       0200,
3181                 .proc_handler   =       ipv6_sysctl_rtcache_flush,
3182         },
3183         {
3184                 .procname       =       "gc_thresh",
3185                 .data           =       &ip6_dst_ops_template.gc_thresh,
3186                 .maxlen         =       sizeof(int),
3187                 .mode           =       0644,
3188                 .proc_handler   =       proc_dointvec,
3189         },
3190         {
3191                 .procname       =       "max_size",
3192                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3193                 .maxlen         =       sizeof(int),
3194                 .mode           =       0644,
3195                 .proc_handler   =       proc_dointvec,
3196         },
3197         {
3198                 .procname       =       "gc_min_interval",
3199                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3200                 .maxlen         =       sizeof(int),
3201                 .mode           =       0644,
3202                 .proc_handler   =       proc_dointvec_jiffies,
3203         },
3204         {
3205                 .procname       =       "gc_timeout",
3206                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3207                 .maxlen         =       sizeof(int),
3208                 .mode           =       0644,
3209                 .proc_handler   =       proc_dointvec_jiffies,
3210         },
3211         {
3212                 .procname       =       "gc_interval",
3213                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3214                 .maxlen         =       sizeof(int),
3215                 .mode           =       0644,
3216                 .proc_handler   =       proc_dointvec_jiffies,
3217         },
3218         {
3219                 .procname       =       "gc_elasticity",
3220                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3221                 .maxlen         =       sizeof(int),
3222                 .mode           =       0644,
3223                 .proc_handler   =       proc_dointvec,
3224         },
3225         {
3226                 .procname       =       "mtu_expires",
3227                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3228                 .maxlen         =       sizeof(int),
3229                 .mode           =       0644,
3230                 .proc_handler   =       proc_dointvec_jiffies,
3231         },
3232         {
3233                 .procname       =       "min_adv_mss",
3234                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3235                 .maxlen         =       sizeof(int),
3236                 .mode           =       0644,
3237                 .proc_handler   =       proc_dointvec,
3238         },
3239         {
3240                 .procname       =       "gc_min_interval_ms",
3241                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3242                 .maxlen         =       sizeof(int),
3243                 .mode           =       0644,
3244                 .proc_handler   =       proc_dointvec_ms_jiffies,
3245         },
3246         { }
3247 };
3248
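/* Duplicate the sysctl template for a namespace and point every entry
 * at that namespace's own fields.  For non-init user namespaces the
 * first procname is cleared, which effectively truncates the table so
 * none of these sysctls are exported there.
 */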
3249 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3250 {
3251         struct ctl_table *table;
3252
3253         table = kmemdup(ipv6_route_table_template,
3254                         sizeof(ipv6_route_table_template),
3255                         GFP_KERNEL);
3256
3257         if (table) {
3258                 table[0].data = &net->ipv6.sysctl.flush_delay;
3259                 table[0].extra1 = net;
3260                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3261                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3262                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3263                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3264                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3265                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3266                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3267                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3268                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3269
3270                 /* Don't export sysctls to unprivileged users */
3271                 if (net->user_ns != &init_user_ns)
3272                         table[0].procname = NULL;
3273         }
3274
3275         return table;
3276 }
3277 #endif
3278
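/* Per-namespace init: copy the dst_ops template, allocate the null
 * (and, with multiple tables, prohibit/blackhole) template routes and
 * seed the routing sysctl defaults.  Failures unwind in reverse order
 * through the labels at the bottom of the function.
 */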
3279 static int __net_init ip6_route_net_init(struct net *net)
3280 {
3281         int ret = -ENOMEM;
3282
3283         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3284                sizeof(net->ipv6.ip6_dst_ops));
3285
3286         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3287                 goto out_ip6_dst_ops;
3288
3289         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3290                                            sizeof(*net->ipv6.ip6_null_entry),
3291                                            GFP_KERNEL);
3292         if (!net->ipv6.ip6_null_entry)
3293                 goto out_ip6_dst_entries;
3294         net->ipv6.ip6_null_entry->dst.path =
3295                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3296         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3297         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3298                          ip6_template_metrics, true);
3299
3300 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3301         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3302                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3303                                                GFP_KERNEL);
3304         if (!net->ipv6.ip6_prohibit_entry)
3305                 goto out_ip6_null_entry;
3306         net->ipv6.ip6_prohibit_entry->dst.path =
3307                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3308         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3309         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3310                          ip6_template_metrics, true);
3311
3312         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3313                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3314                                                GFP_KERNEL);
3315         if (!net->ipv6.ip6_blk_hole_entry)
3316                 goto out_ip6_prohibit_entry;
3317         net->ipv6.ip6_blk_hole_entry->dst.path =
3318                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3319         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3320         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3321                          ip6_template_metrics, true);
3322 #endif
3323
3324         net->ipv6.sysctl.flush_delay = 0;
3325         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3326         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3327         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3328         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3329         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3330         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3331         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3332
3333         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3334
3335         ret = 0;
3336 out:
3337         return ret;
3338
3339 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3340 out_ip6_prohibit_entry:
3341         kfree(net->ipv6.ip6_prohibit_entry);
3342 out_ip6_null_entry:
3343         kfree(net->ipv6.ip6_null_entry);
3344 #endif
3345 out_ip6_dst_entries:
3346         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3347 out_ip6_dst_ops:
3348         goto out;
3349 }
3350
3351 static void __net_exit ip6_route_net_exit(struct net *net)
3352 {
3353         kfree(net->ipv6.ip6_null_entry);
3354 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3355         kfree(net->ipv6.ip6_prohibit_entry);
3356         kfree(net->ipv6.ip6_blk_hole_entry);
3357 #endif
3358         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3359 }
3360
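/* Late per-namespace init/exit: create and remove the /proc/net
 * "ipv6_route" and "rt6_stats" entries.
 */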
3361 static int __net_init ip6_route_net_init_late(struct net *net)
3362 {
3363 #ifdef CONFIG_PROC_FS
3364         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3365         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3366 #endif
3367         return 0;
3368 }
3369
3370 static void __net_exit ip6_route_net_exit_late(struct net *net)
3371 {
3372 #ifdef CONFIG_PROC_FS
3373         remove_proc_entry("ipv6_route", net->proc_net);
3374         remove_proc_entry("rt6_stats", net->proc_net);
3375 #endif
3376 }
3377
3378 static struct pernet_operations ip6_route_net_ops = {
3379         .init = ip6_route_net_init,
3380         .exit = ip6_route_net_exit,
3381 };
3382
3383 static int __net_init ipv6_inetpeer_init(struct net *net)
3384 {
3385         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3386
3387         if (!bp)
3388                 return -ENOMEM;
3389         inet_peer_base_init(bp);
3390         net->ipv6.peers = bp;
3391         return 0;
3392 }
3393
3394 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3395 {
3396         struct inet_peer_base *bp = net->ipv6.peers;
3397
3398         net->ipv6.peers = NULL;
3399         inetpeer_invalidate_tree(bp);
3400         kfree(bp);
3401 }
3402
3403 static struct pernet_operations ipv6_inetpeer_ops = {
3404         .init   =       ipv6_inetpeer_init,
3405         .exit   =       ipv6_inetpeer_exit,
3406 };
3407
3408 static struct pernet_operations ip6_route_net_late_ops = {
3409         .init = ip6_route_net_init_late,
3410         .exit = ip6_route_net_exit_late,
3411 };
3412
3413 static struct notifier_block ip6_route_dev_notifier = {
3414         .notifier_call = ip6_route_dev_notify,
3415         .priority = 0,
3416 };
3417
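/* Subsystem init: set up the dst cache, register the per-namespace
 * operations, bring up fib6, xfrm6 and fib6 rules, hook up the
 * rtnetlink route handlers and the device notifier, and initialise the
 * per-cpu uncached route lists.  Error paths unwind in reverse
 * registration order; ip6_route_cleanup() mirrors that order on
 * removal.
 */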
3418 int __init ip6_route_init(void)
3419 {
3420         int ret;
3421         int cpu;
3422
3423         ret = -ENOMEM;
3424         ip6_dst_ops_template.kmem_cachep =
3425                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3426                                   SLAB_HWCACHE_ALIGN, NULL);
3427         if (!ip6_dst_ops_template.kmem_cachep)
3428                 goto out;
3429
3430         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3431         if (ret)
3432                 goto out_kmem_cache;
3433
3434         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3435         if (ret)
3436                 goto out_dst_entries;
3437
3438         ret = register_pernet_subsys(&ip6_route_net_ops);
3439         if (ret)
3440                 goto out_register_inetpeer;
3441
3442         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3443
3444         /* The loopback device is registered before this portion of code
3445          * runs, so the loopback reference in rt6_info is not taken
3446          * automatically; take it manually for init_net. */
3447         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3448         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3449 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3450         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3451         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3452         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3453         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3454 #endif
3455         ret = fib6_init();
3456         if (ret)
3457                 goto out_register_subsys;
3458
3459         ret = xfrm6_init();
3460         if (ret)
3461                 goto out_fib6_init;
3462
3463         ret = fib6_rules_init();
3464         if (ret)
3465                 goto xfrm6_init;
3466
3467         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3468         if (ret)
3469                 goto fib6_rules_init;
3470
3471         ret = -ENOBUFS;
3472         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3473             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3474             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3475                 goto out_register_late_subsys;
3476
3477         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3478         if (ret)
3479                 goto out_register_late_subsys;
3480
3481         for_each_possible_cpu(cpu) {
3482                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3483
3484                 INIT_LIST_HEAD(&ul->head);
3485                 spin_lock_init(&ul->lock);
3486         }
3487
3488 out:
3489         return ret;
3490
3491 out_register_late_subsys:
3492         unregister_pernet_subsys(&ip6_route_net_late_ops);
3493 fib6_rules_init:
3494         fib6_rules_cleanup();
3495 xfrm6_init:
3496         xfrm6_fini();
3497 out_fib6_init:
3498         fib6_gc_cleanup();
3499 out_register_subsys:
3500         unregister_pernet_subsys(&ip6_route_net_ops);
3501 out_register_inetpeer:
3502         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3503 out_dst_entries:
3504         dst_entries_destroy(&ip6_dst_blackhole_ops);
3505 out_kmem_cache:
3506         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3507         goto out;
3508 }
3509
3510 void ip6_route_cleanup(void)
3511 {
3512         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3513         unregister_pernet_subsys(&ip6_route_net_late_ops);
3514         fib6_rules_cleanup();
3515         xfrm6_fini();
3516         fib6_gc_cleanup();
3517         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3518         unregister_pernet_subsys(&ip6_route_net_ops);
3519         dst_entries_destroy(&ip6_dst_blackhole_ops);
3520         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3521 }