8dddb45c433e53ad35d30cd4263a5a5080cd0da9
[linux-2.6-block.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
 16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/overflow-arith.h>
32 #include <linux/string.h>
33 #include <linux/socket.h>
34 #include <linux/net.h>
35 #include <linux/netdevice.h>
36 #include <linux/if_arp.h>
37 #include <linux/in6.h>
38 #include <linux/tcp.h>
39 #include <linux/route.h>
40 #include <linux/module.h>
41 #include <linux/slab.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59
/* ip6_finish_output2 - final IPv6 output step: resolve the neighbour for the
 * route's next hop and hand the packet to the neighbour output path.
 * Handles multicast loopback delivery and node-local scope filtering first.
 * Returns the neighbour-output result, 0 when the packet was consumed here,
 * or -EINVAL when no neighbour entry could be created.
 */
static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket has
		 * multicast loopback enabled and either a multicast-router
		 * socket exists (packet not already forwarded) or the group
		 * has a local member on this device.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: deliver the looped copy only; never
			 * put the packet on the wire.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		/* Node-local scope multicast must never leave the host. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
125
126 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
127 {
128         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
129             dst_allfrag(skb_dst(skb)) ||
130             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
131                 return ip6_fragment(sk, skb, ip6_finish_output2);
132         else
133                 return ip6_finish_output2(sk, skb);
134 }
135
136 int ip6_output(struct sock *sk, struct sk_buff *skb)
137 {
138         struct net_device *dev = skb_dst(skb)->dev;
139         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
140         if (unlikely(idev->cnf.disable_ipv6)) {
141                 IP6_INC_STATS(dev_net(dev), idev,
142                               IPSTATS_MIB_OUTDISCARDS);
143                 kfree_skb(skb);
144                 return 0;
145         }
146
147         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
148                             NULL, dev,
149                             ip6_finish_output,
150                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
151 }
152
153 /*
154  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
155  */
156
/* ip6_xmit - prepend the IPv6 header (and any extension headers from @opt)
 * to @skb and send it through the NF_INET_LOCAL_OUT hook.
 * @tclass is the traffic class for the flow header.
 * Returns the hook verdict on success, -ENOBUFS when headroom reallocation
 * fails, or -EMSGSIZE when the packet exceeds the path MTU and may not be
 * sent (non-GSO, ignore_df unset); the skb is consumed on error.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;	/* -1: fall back to the route's hop limit */
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* Charge the reallocated skb to the socket. */
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			/* May rewrite first_hop when a routing header is present. */
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* NOTE(review): np is NULL-checked above but dereferenced
	 * unconditionally here for autoflowlabel; presumably all callers
	 * (TCP/SCTP/DCCP) pass full IPv6 sockets -- confirm.
	 */
	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel, fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
			       NULL, dst->dev, dst_output_sk);
	}

	/* Packet exceeds the MTU and cannot be sent as-is: report
	 * EMSGSIZE to the local sender and drop.
	 */
	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
239
/* Deliver a packet carrying a Router Alert option to every raw socket
 * registered for the matching alert value @sel.  All matching sockets but
 * the last receive a clone; the last one consumes the original skb.
 * Returns 1 when the packet was delivered (and consumed), 0 otherwise.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Match the alert selector and honour device binding. */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* Earlier match gets a clone; clone failure
				 * is silently skipped (best effort).
				 */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
268
/* Classify a packet whose destination we answer proxy NDP for.
 * Returns 1 when it is an ICMPv6 neighbour-discovery message that should
 * be handed to the local input path, -1 when the packet must be dropped
 * (link-local destination cannot be proxied), and 0 to forward normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Walk past any extension headers to the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure the ICMPv6 type byte is in the linear area. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
320
/* Last step of ip6_forward() after the NF_INET_FORWARD hook: clear the
 * sender-CPU hint and push the packet into the output path.
 */
static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output_sk(sk, skb);
}
326
327 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
328 {
329         unsigned int mtu;
330         struct inet6_dev *idev;
331
332         if (dst_metric_locked(dst, RTAX_MTU)) {
333                 mtu = dst_metric_raw(dst, RTAX_MTU);
334                 if (mtu)
335                         return mtu;
336         }
337
338         mtu = IPV6_MIN_MTU;
339         rcu_read_lock();
340         idev = __in6_dev_get(dst->dev);
341         if (idev)
342                 mtu = idev->cnf.mtu6;
343         rcu_read_unlock();
344
345         return mtu;
346 }
347
348 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
349 {
350         if (skb->len <= mtu)
351                 return false;
352
353         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
354         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
355                 return true;
356
357         if (skb->ignore_df)
358                 return false;
359
360         if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
361                 return false;
362
363         return true;
364 }
365
/* ip6_forward - forward a received packet toward its destination.
 * Validates forwarding policy, xfrm policy, hop limit, source-address
 * sanity, and MTU; sends redirects when appropriate; then decrements
 * hop_limit and passes the packet to the NF_INET_FORWARD hook.
 * Returns 0 when handled/consumed, or a negative errno after dropping.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only forward frames addressed to us at the link layer. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	/* LRO-merged frames must not be forwarded. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have rerouted the packet; reload the dst. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
		       skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
529
530 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
531 {
532         to->pkt_type = from->pkt_type;
533         to->priority = from->priority;
534         to->protocol = from->protocol;
535         skb_dst_drop(to);
536         skb_dst_set(to, dst_clone(skb_dst(from)));
537         to->dev = from->dev;
538         to->mark = from->mark;
539
540 #ifdef CONFIG_NET_SCHED
541         to->tc_index = from->tc_index;
542 #endif
543         nf_copy(to, from);
544         skb_copy_secmark(to, from);
545 }
546
/* ip6_fragment - split @skb into fragments no larger than the path MTU and
 * pass each one to @output.  Uses a fast path that turns an existing frag
 * list into fragments when the geometry allows; otherwise copies payload
 * into newly allocated skbs (slow path).  Returns 0 on success or a
 * negative errno; the original skb is consumed either way.
 */
int ip6_fragment(struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* hlen: length of the unfragmentable part (IPv6 header plus any
	 * extension headers that must precede the fragment header).
	 */
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}

	/* mtu becomes the per-fragment payload budget; bail out when the
	 * headers leave no usable room (overflow_usub detects underflow).
	 */
	if (overflow_usub(mtu, hlen + sizeof(struct frag_hdr), &mtu) ||
	    mtu <= 7)
		goto fail_toobig;

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	/* Fast path: the existing frag list members already have the right
	 * geometry, so each one becomes a fragment without copying payload.
	 */
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		/* Insert the fragment header between the unfragmentable
		 * part and the payload of the first fragment.
		 */
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		/* Undo the socket-ownership transfer done above before
		 * falling back to the slow path.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	/* GSO would keep producing oversized packets; disable it on this
	 * socket when the route demands fragmentation of everything.
	 */
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
838
839 static inline int ip6_rt_check(const struct rt6key *rt_key,
840                                const struct in6_addr *fl_addr,
841                                const struct in6_addr *addr_cache)
842 {
843         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
844                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
845 }
846
/* Validate a socket-cached dst for use with flow @fl6.
 *
 * Returns @dst when it is still usable for this flow, or NULL after
 * dropping the reference when the cache must be bypassed: no dst at
 * all, not an IPv6 dst, stale destination (or, with subtrees, source)
 * route key, or an output-interface mismatch.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached dst of another family (e.g. IPv4-mapped traffic) can
	 * never satisfy an IPv6 flow lookup. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
893
/* Core of the ip6_dst_lookup*() family: fill *@dst with a route for
 * @fl6, selecting a source address first when the flow has none.
 *
 * On entry *@dst may already hold a candidate (e.g. a validated socket
 * cache); on success it holds a referenced dst and 0 is returned.  On
 * failure the dst reference is dropped, *@dst is set to NULL and a
 * negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}
	}

	/* Second lookup: either we had no dst yet, or the first result
	 * was discarded above and fl6->saddr is now filled in. */
	if (!*dst)
		*dst = ip6_route_output(net, sk, fl6);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			/* zero daddr -> route towards the default gateway */
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
995
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace the lookup is performed in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Start from scratch: no pre-validated candidate dst. */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1013
1014 /**
1015  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1016  *      @sk: socket which provides route info
1017  *      @fl6: flow to lookup
1018  *      @final_dst: final destination address for ipsec lookup
1019  *
1020  *      This function performs a route lookup on the given flow.
1021  *
1022  *      It returns a valid dst pointer on success, or a pointer encoded
1023  *      error code.
1024  */
1025 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1026                                       const struct in6_addr *final_dst)
1027 {
1028         struct dst_entry *dst = NULL;
1029         int err;
1030
1031         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1032         if (err)
1033                 return ERR_PTR(err);
1034         if (final_dst)
1035                 fl6->daddr = *final_dst;
1036         if (!fl6->flowi6_oif)
1037                 fl6->flowi6_oif = dst->dev->ifindex;
1038
1039         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1040 }
1041 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1042
1043 /**
1044  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1045  *      @sk: socket which provides the dst cache and route info
1046  *      @fl6: flow to lookup
1047  *      @final_dst: final destination address for ipsec lookup
1048  *
1049  *      This function performs a route lookup on the given flow with the
1050  *      possibility of using the cached route in the socket if it is valid.
1051  *      It will take the socket dst lock when operating on the dst cache.
1052  *      As a result, this function can only be used in process context.
1053  *
1054  *      It returns a valid dst pointer on success, or a pointer encoded
1055  *      error code.
1056  */
1057 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1058                                          const struct in6_addr *final_dst)
1059 {
1060         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1061         int err;
1062
1063         dst = ip6_sk_dst_check(sk, dst, fl6);
1064
1065         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1066         if (err)
1067                 return ERR_PTR(err);
1068         if (final_dst)
1069                 fl6->daddr = *final_dst;
1070
1071         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1072 }
1073 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1074
/* Append datagram data as one large GSO skb for UDP fragmentation
 * offload: the device segments it, so we never software-fragment.
 *
 * The first call allocates the single skb holding the IPv6/UDP header
 * template and marks it SKB_GSO_UDP; subsequent calls just append more
 * payload to its frag list.  Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			const struct flowi6 *fl6)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		/* NOTE(review): the extra 20 bytes of allocation slack
		 * presumably leave headroom for trailing bytes; confirm
		 * against the IPv4 UFO path. */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		/* Tail skb is already GSO-initialized: just append. */
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	/* One fragment identification covers the whole GSO datagram. */
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1134
1135 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1136                                                gfp_t gfp)
1137 {
1138         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1139 }
1140
1141 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1142                                                 gfp_t gfp)
1143 {
1144         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1145 }
1146
1147 static void ip6_append_data_mtu(unsigned int *mtu,
1148                                 int *maxfraglen,
1149                                 unsigned int fragheaderlen,
1150                                 struct sk_buff *skb,
1151                                 struct rt6_info *rt,
1152                                 unsigned int orig_mtu)
1153 {
1154         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1155                 if (!skb) {
1156                         /* first fragment, reserve header_len */
1157                         *mtu = orig_mtu - rt->dst.header_len;
1158
1159                 } else {
1160                         /*
1161                          * this fragment is not first, the headers
1162                          * space is regarded as data space.
1163                          */
1164                         *mtu = orig_mtu;
1165                 }
1166                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1167                               + fragheaderlen - sizeof(struct frag_hdr);
1168         }
1169 }
1170
/* Prepare @cork / @v6_cork for a corked transmit on @sk: duplicate the
 * supplied tx options, take a reference on the route, and record the
 * flow, hop limit, traffic class and fragment size.
 *
 * Returns 0 on success, -EINVAL if options are already set up, or
 * -ENOBUFS when duplicating the options fails.
 *
 * NOTE(review): on -ENOBUFS a partially duplicated v6_cork->opt is
 * left in place - presumably freed later via ip6_cork_release() by the
 * caller's error path; verify.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = opt->tot_len;
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	/* xfrm tunnel dsts carry their own header overhead, so the raw
	 * dst mtu is used; otherwise size fragments to the path mtu. */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	/* A smaller per-socket IPV6_MTU overrides the route mtu. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
1238
/* Append @length bytes, fetched via @getfrag, to the pending packet
 * queue @queue, extending the tail skb or allocating new ones so every
 * queued packet fits within the corked fragment size.  Shared worker
 * behind ip6_append_data(); the queued skbs are later merged and sent
 * by the corresponding push/flush path.
 *
 * Returns 0 on success or a negative errno; on failure the bytes that
 * were not appended are subtracted back out of cork->length.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;

	/* Only the first packet of a cork reserves room for extension
	 * headers and the dst's extra (e.g. ipsec) header space. */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest 8-byte-aligned fragment payload end, leaving room for
	 * the fragment header. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			/* Report the path mtu to the socket instead of
			 * fragmenting (IPV6_DONTFRAG semantics). */
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/* If this is the first and only packet and device
	 * supports checksum offloading, let's use it.
	 * Use transhdrlen, same as IPv4, because partial
	 * sums only work when transhdrlen is set.
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    length + fragheaderlen < mtu &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;
	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	/* Hand oversized UDP datagrams to the UFO path when the device
	 * can segment them in hardware. */
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			/* With a transport header still to write we may
			 * block on socket memory; otherwise only allocate
			 * when within twice the send buffer. */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the fraggap overhang from the previous skb
			 * into this one, keeping checksums consistent. */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Non-SG device: copy linearly into the skb tail.
		 * SG device: coalesce into page fragments instead. */
		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1553
1554 int ip6_append_data(struct sock *sk,
1555                     int getfrag(void *from, char *to, int offset, int len,
1556                                 int odd, struct sk_buff *skb),
1557                     void *from, int length, int transhdrlen, int hlimit,
1558                     int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1559                     struct rt6_info *rt, unsigned int flags, int dontfrag)
1560 {
1561         struct inet_sock *inet = inet_sk(sk);
1562         struct ipv6_pinfo *np = inet6_sk(sk);
1563         int exthdrlen;
1564         int err;
1565
1566         if (flags&MSG_PROBE)
1567                 return 0;
1568         if (skb_queue_empty(&sk->sk_write_queue)) {
1569                 /*
1570                  * setup for corking
1571                  */
1572                 err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1573                                      tclass, opt, rt, fl6);
1574                 if (err)
1575                         return err;
1576
1577                 exthdrlen = (opt ? opt->opt_flen : 0);
1578                 length += exthdrlen;
1579                 transhdrlen += exthdrlen;
1580         } else {
1581                 fl6 = &inet->cork.fl.u.ip6;
1582                 transhdrlen = 0;
1583         }
1584
1585         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1586                                  &np->cork, sk_page_frag(sk), getfrag,
1587                                  from, length, transhdrlen, flags, dontfrag);
1588 }
1589 EXPORT_SYMBOL_GPL(ip6_append_data);
1590
1591 static void ip6_cork_release(struct inet_cork_full *cork,
1592                              struct inet6_cork *v6_cork)
1593 {
1594         if (v6_cork->opt) {
1595                 kfree(v6_cork->opt->dst0opt);
1596                 kfree(v6_cork->opt->dst1opt);
1597                 kfree(v6_cork->opt->hopopt);
1598                 kfree(v6_cork->opt->srcrt);
1599                 kfree(v6_cork->opt);
1600                 v6_cork->opt = NULL;
1601         }
1602
1603         if (cork->base.dst) {
1604                 dst_release(cork->base.dst);
1605                 cork->base.dst = NULL;
1606                 cork->base.flags &= ~IPCORK_ALLFRAG;
1607         }
1608         memset(&cork->fl, 0, sizeof(cork->fl));
1609 }
1610
/*
 *	Collapse the corked queue into a single skb (trailing skbs become
 *	the frag_list of the first), push any queued extension headers and
 *	the IPv6 header, update output stats, and release the cork.
 *
 *	Returns the finished skb ready for ip6_send_skb(), or NULL if the
 *	queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	/* Remaining queue entries are chained onto the first skb's
	 * frag_list via this moving tail pointer.
	 */
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		/* Strip the duplicated network headers from follow-on
		 * fragments and absorb their accounting into the head skb.
		 */
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* Ownership moves to the head skb; drop the per-fragment
		 * socket accounting so memory is not charged twice.
		 */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* A routing header in opt may rewrite the destination; keep the
	 * final one in a local buffer for the header below.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel, fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	/* proto was updated by the ipv6_push_*_opts() calls above to point
	 * at the first extension header, if any.
	 */
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1684
1685 int ip6_send_skb(struct sk_buff *skb)
1686 {
1687         struct net *net = sock_net(skb->sk);
1688         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1689         int err;
1690
1691         err = ip6_local_out(skb);
1692         if (err) {
1693                 if (err > 0)
1694                         err = net_xmit_errno(err);
1695                 if (err)
1696                         IP6_INC_STATS(net, rt->rt6i_idev,
1697                                       IPSTATS_MIB_OUTDISCARDS);
1698         }
1699
1700         return err;
1701 }
1702
/*
 *	Finalize the socket's corked queue into one packet and send it.
 *	An empty queue is not an error.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1714
1715 static void __ip6_flush_pending_frames(struct sock *sk,
1716                                        struct sk_buff_head *queue,
1717                                        struct inet_cork_full *cork,
1718                                        struct inet6_cork *v6_cork)
1719 {
1720         struct sk_buff *skb;
1721
1722         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1723                 if (skb_dst(skb))
1724                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1725                                       IPSTATS_MIB_OUTDISCARDS);
1726                 kfree_skb(skb);
1727         }
1728
1729         ip6_cork_release(cork, v6_cork);
1730 }
1731
1732 void ip6_flush_pending_frames(struct sock *sk)
1733 {
1734         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1735                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1736 }
1737 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1738
1739 struct sk_buff *ip6_make_skb(struct sock *sk,
1740                              int getfrag(void *from, char *to, int offset,
1741                                          int len, int odd, struct sk_buff *skb),
1742                              void *from, int length, int transhdrlen,
1743                              int hlimit, int tclass,
1744                              struct ipv6_txoptions *opt, struct flowi6 *fl6,
1745                              struct rt6_info *rt, unsigned int flags,
1746                              int dontfrag)
1747 {
1748         struct inet_cork_full cork;
1749         struct inet6_cork v6_cork;
1750         struct sk_buff_head queue;
1751         int exthdrlen = (opt ? opt->opt_flen : 0);
1752         int err;
1753
1754         if (flags & MSG_PROBE)
1755                 return NULL;
1756
1757         __skb_queue_head_init(&queue);
1758
1759         cork.base.flags = 0;
1760         cork.base.addr = 0;
1761         cork.base.opt = NULL;
1762         v6_cork.opt = NULL;
1763         err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1764         if (err)
1765                 return ERR_PTR(err);
1766
1767         if (dontfrag < 0)
1768                 dontfrag = inet6_sk(sk)->dontfrag;
1769
1770         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1771                                 &current->task_frag, getfrag, from,
1772                                 length + exthdrlen, transhdrlen + exthdrlen,
1773                                 flags, dontfrag);
1774         if (err) {
1775                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1776                 return ERR_PTR(err);
1777         }
1778
1779         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1780 }