net: l3mdev: remove redundant calls
[linux-2.6-block.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59 #include <net/lwtunnel.h>
60
61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
62 {
63         struct dst_entry *dst = skb_dst(skb);
64         struct net_device *dev = dst->dev;
65         struct neighbour *neigh;
66         struct in6_addr *nexthop;
67         int ret;
68
69         skb->protocol = htons(ETH_P_IPV6);
70         skb->dev = dev;
71
72         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
73                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
74
75                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
76                     ((mroute6_socket(net, skb) &&
77                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
78                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
79                                          &ipv6_hdr(skb)->saddr))) {
80                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
81
82                         /* Do not check for IFF_ALLMULTI; multicast routing
83                            is not supported in any case.
84                          */
85                         if (newskb)
86                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
87                                         net, sk, newskb, NULL, newskb->dev,
88                                         dev_loopback_xmit);
89
90                         if (ipv6_hdr(skb)->hop_limit == 0) {
91                                 IP6_INC_STATS(net, idev,
92                                               IPSTATS_MIB_OUTDISCARDS);
93                                 kfree_skb(skb);
94                                 return 0;
95                         }
96                 }
97
98                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
99
100                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
101                     IPV6_ADDR_SCOPE_NODELOCAL &&
102                     !(dev->flags & IFF_LOOPBACK)) {
103                         kfree_skb(skb);
104                         return 0;
105                 }
106         }
107
108         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
109                 int res = lwtunnel_xmit(skb);
110
111                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
112                         return res;
113         }
114
115         rcu_read_lock_bh();
116         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
117         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
118         if (unlikely(!neigh))
119                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
120         if (!IS_ERR(neigh)) {
121                 ret = dst_neigh_output(dst, neigh, skb);
122                 rcu_read_unlock_bh();
123                 return ret;
124         }
125         rcu_read_unlock_bh();
126
127         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
128         kfree_skb(skb);
129         return -EINVAL;
130 }
131
132 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
133 {
134         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
135             dst_allfrag(skb_dst(skb)) ||
136             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
137                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
138         else
139                 return ip6_finish_output2(net, sk, skb);
140 }
141
142 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
143 {
144         struct net_device *dev = skb_dst(skb)->dev;
145         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
146
147         if (unlikely(idev->cnf.disable_ipv6)) {
148                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
149                 kfree_skb(skb);
150                 return 0;
151         }
152
153         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
154                             net, sk, skb, NULL, dev,
155                             ip6_finish_output,
156                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
157 }
158
159 /*
160  * xmit an sk_buff (used by TCP, SCTP and DCCP)
161  * Note : socket lock is not held for SYNACK packets, but might be modified
162  * by calls to skb_set_owner_w() and ipv6_local_error(),
163  * which are using proper atomic operations or spinlocks.
164  */
165 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
166              struct ipv6_txoptions *opt, int tclass)
167 {
168         struct net *net = sock_net(sk);
169         const struct ipv6_pinfo *np = inet6_sk(sk);
170         struct in6_addr *first_hop = &fl6->daddr;
171         struct dst_entry *dst = skb_dst(skb);
172         struct ipv6hdr *hdr;
173         u8  proto = fl6->flowi6_proto;
174         int seg_len = skb->len;
175         int hlimit = -1;
176         u32 mtu;
177
178         if (opt) {
179                 unsigned int head_room;
180
181                 /* First: exthdrs may take lots of space (~8K for now)
182                    MAX_HEADER is not enough.
183                  */
184                 head_room = opt->opt_nflen + opt->opt_flen;
185                 seg_len += head_room;
186                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
187
188                 if (skb_headroom(skb) < head_room) {
189                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
190                         if (!skb2) {
191                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
192                                               IPSTATS_MIB_OUTDISCARDS);
193                                 kfree_skb(skb);
194                                 return -ENOBUFS;
195                         }
196                         consume_skb(skb);
197                         skb = skb2;
198                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
199                          * it is safe to call in our context (socket lock not held)
200                          */
201                         skb_set_owner_w(skb, (struct sock *)sk);
202                 }
203                 if (opt->opt_flen)
204                         ipv6_push_frag_opts(skb, opt, &proto);
205                 if (opt->opt_nflen)
206                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
207         }
208
209         skb_push(skb, sizeof(struct ipv6hdr));
210         skb_reset_network_header(skb);
211         hdr = ipv6_hdr(skb);
212
213         /*
214          *      Fill in the IPv6 header
215          */
216         if (np)
217                 hlimit = np->hop_limit;
218         if (hlimit < 0)
219                 hlimit = ip6_dst_hoplimit(dst);
220
221         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
222                                                      np->autoflowlabel, fl6));
223
224         hdr->payload_len = htons(seg_len);
225         hdr->nexthdr = proto;
226         hdr->hop_limit = hlimit;
227
228         hdr->saddr = fl6->saddr;
229         hdr->daddr = *first_hop;
230
231         skb->protocol = htons(ETH_P_IPV6);
232         skb->priority = sk->sk_priority;
233         skb->mark = sk->sk_mark;
234
235         mtu = dst_mtu(dst);
236         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
237                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
238                               IPSTATS_MIB_OUT, skb->len);
239
240                 /* if egress device is enslaved to an L3 master device pass the
241                  * skb to its handler for processing
242                  */
243                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
244                 if (unlikely(!skb))
245                         return 0;
246
247                 /* hooks should never assume socket lock is held.
248                  * we promote our socket to non const
249                  */
250                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
251                                net, (struct sock *)sk, skb, NULL, dst->dev,
252                                dst_output);
253         }
254
255         skb->dev = dst->dev;
256         /* ipv6_local_error() does not require socket lock,
257          * we promote our socket to non const
258          */
259         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
260
261         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
262         kfree_skb(skb);
263         return -EMSGSIZE;
264 }
265 EXPORT_SYMBOL(ip6_xmit);
266
267 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
268 {
269         struct ip6_ra_chain *ra;
270         struct sock *last = NULL;
271
272         read_lock(&ip6_ra_lock);
273         for (ra = ip6_ra_chain; ra; ra = ra->next) {
274                 struct sock *sk = ra->sk;
275                 if (sk && ra->sel == sel &&
276                     (!sk->sk_bound_dev_if ||
277                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
278                         if (last) {
279                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
280                                 if (skb2)
281                                         rawv6_rcv(last, skb2);
282                         }
283                         last = sk;
284                 }
285         }
286
287         if (last) {
288                 rawv6_rcv(last, skb);
289                 read_unlock(&ip6_ra_lock);
290                 return 1;
291         }
292         read_unlock(&ip6_ra_lock);
293         return 0;
294 }
295
296 static int ip6_forward_proxy_check(struct sk_buff *skb)
297 {
298         struct ipv6hdr *hdr = ipv6_hdr(skb);
299         u8 nexthdr = hdr->nexthdr;
300         __be16 frag_off;
301         int offset;
302
303         if (ipv6_ext_hdr(nexthdr)) {
304                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
305                 if (offset < 0)
306                         return 0;
307         } else
308                 offset = sizeof(struct ipv6hdr);
309
310         if (nexthdr == IPPROTO_ICMPV6) {
311                 struct icmp6hdr *icmp6;
312
313                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
314                                          offset + 1 - skb->data)))
315                         return 0;
316
317                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
318
319                 switch (icmp6->icmp6_type) {
320                 case NDISC_ROUTER_SOLICITATION:
321                 case NDISC_ROUTER_ADVERTISEMENT:
322                 case NDISC_NEIGHBOUR_SOLICITATION:
323                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
324                 case NDISC_REDIRECT:
325                         /* For reaction involving unicast neighbor discovery
326                          * message destined to the proxied address, pass it to
327                          * input function.
328                          */
329                         return 1;
330                 default:
331                         break;
332                 }
333         }
334
335         /*
336          * The proxying router can't forward traffic sent to a link-local
337          * address, so signal the sender and discard the packet. This
338          * behavior is clarified by the MIPv6 specification.
339          */
340         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
341                 dst_link_failure(skb);
342                 return -1;
343         }
344
345         return 0;
346 }
347
/*
 * Continuation of the NF_INET_FORWARD hook: pass the forwarded packet on
 * to dst_output() and report its result.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc = dst_output(net, sk, skb);

	return rc;
}
353
354 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
355 {
356         unsigned int mtu;
357         struct inet6_dev *idev;
358
359         if (dst_metric_locked(dst, RTAX_MTU)) {
360                 mtu = dst_metric_raw(dst, RTAX_MTU);
361                 if (mtu)
362                         return mtu;
363         }
364
365         mtu = IPV6_MIN_MTU;
366         rcu_read_lock();
367         idev = __in6_dev_get(dst->dev);
368         if (idev)
369                 mtu = idev->cnf.mtu6;
370         rcu_read_unlock();
371
372         return mtu;
373 }
374
375 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
376 {
377         if (skb->len <= mtu)
378                 return false;
379
380         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
381         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
382                 return true;
383
384         if (skb->ignore_df)
385                 return false;
386
387         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
388                 return false;
389
390         return true;
391 }
392
/*
 * ip6_forward - forward a received IPv6 packet along its route.
 *
 * Processing order:
 *   - refuse when forwarding is disabled, the packet is not PACKET_HOST,
 *     it is attached to a socket, it fails the LRO check, or it fails the
 *     xfrm forward policy check;
 *   - offer Router Alert packets to the RA socket chain;
 *   - answer hop limit <= 1 with ICMPv6 Time Exceeded;
 *   - handle proxied neighbours (local input or discard);
 *   - send a redirect when ingress and egress device are the same,
 *     otherwise validate the source address;
 *   - answer oversized packets with ICMPv6 Packet Too Big;
 *   - make the header writable, decrement the hop limit and run the
 *     NF_INET_FORWARD hook with ip6_forward_finish() as continuation.
 *
 * Returns 0 when the RA chain took the packet, the result of ip6_input()
 * or NF_HOOK() when the packet was passed on, -ETIMEDOUT for an expired
 * hop limit, -EMSGSIZE for an oversized packet, and -EINVAL for every other
 * discard (the "error" label additionally counts INADDRERRORS).
 */
393 int ip6_forward(struct sk_buff *skb)
394 {
395         struct dst_entry *dst = skb_dst(skb);
396         struct ipv6hdr *hdr = ipv6_hdr(skb);
397         struct inet6_skb_parm *opt = IP6CB(skb);
398         struct net *net = dev_net(dst->dev);
399         u32 mtu;
400
401         if (net->ipv6.devconf_all->forwarding == 0)
402                 goto error;
403
404         if (skb->pkt_type != PACKET_HOST)
405                 goto drop;
406
            /* Packets that already belong to a socket are not forwarded. */
407         if (unlikely(skb->sk))
408                 goto drop;
409
410         if (skb_warn_if_lro(skb))
411                 goto drop;
412
413         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
414                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
415                                 IPSTATS_MIB_INDISCARDS);
416                 goto drop;
417         }
418
419         skb_forward_csum(skb);
420
421         /*
422          *      We DO NOT make any processing on
423          *      RA packets, pushing them to user level AS IS
424          *      without any WARRANTY that application will be able
425          *      to interpret them. The reason is that we
426          *      cannot make anything clever here.
427          *
428          *      We are not end-node, so that if packet contains
429          *      AH/ESP, we cannot make anything.
430          *      Defragmentation also would be mistake, RA packets
431          *      cannot be fragmented, because there is no warranty
432          *      that different fragments will go along one path. --ANK
433          */
434         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
435                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
436                         return 0;
437         }
438
439         /*
440          *      check and decrement ttl
441          */
442         if (hdr->hop_limit <= 1) {
443                 /* Force OUTPUT device used as source address */
444                 skb->dev = dst->dev;
445                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
446                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
447                                 IPSTATS_MIB_INHDRERRORS);
448
449                 kfree_skb(skb);
450                 return -ETIMEDOUT;
451         }
452
453         /* XXX: idev->cnf.proxy_ndp? */
            /* NOTE(review): ip6_forward_proxy_check() calls pskb_may_pull();
             * if that can move the header, 'hdr' (used again below before it
             * is reloaded) may need refreshing here - confirm.
             */
454         if (net->ipv6.devconf_all->proxy_ndp &&
455             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
456                 int proxied = ip6_forward_proxy_check(skb);
457                 if (proxied > 0)
458                         return ip6_input(skb);
459                 else if (proxied < 0) {
460                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
461                                         IPSTATS_MIB_INDISCARDS);
462                         goto drop;
463                 }
464         }
465
466         if (!xfrm6_route_forward(skb)) {
467                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
468                                 IPSTATS_MIB_INDISCARDS);
469                 goto drop;
470         }
            /* Reload the route from the skb after xfrm6_route_forward(). */
471         dst = skb_dst(skb);
472
473         /* IPv6 specs say nothing about it, but it is clear that we cannot
474            send redirects to source routed frames.
475            We don't send redirects to frames decapsulated from IPsec.
476          */
477         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
478                 struct in6_addr *target = NULL;
479                 struct inet_peer *peer;
480                 struct rt6_info *rt;
481
482                 /*
483                  *      incoming and outgoing devices are the same
484                  *      send a redirect.
485                  */
486
487                 rt = (struct rt6_info *) dst;
                    /* Redirect target: the gateway for gateway routes,
                     * otherwise the destination itself.
                     */
488                 if (rt->rt6i_flags & RTF_GATEWAY)
489                         target = &rt->rt6i_gateway;
490                 else
491                         target = &hdr->daddr;
492
493                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
494
495                 /* Limit redirects both by destination (here)
496                    and by source (inside ndisc_send_redirect)
497                  */
498                 if (inet_peer_xrlim_allow(peer, 1*HZ))
499                         ndisc_send_redirect(skb, target);
500                 if (peer)
501                         inet_putpeer(peer);
502         } else {
503                 int addrtype = ipv6_addr_type(&hdr->saddr);
504
505                 /* This check is security critical. */
506                 if (addrtype == IPV6_ADDR_ANY ||
507                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
508                         goto error;
509                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
510                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
511                                     ICMPV6_NOT_NEIGHBOUR, 0);
512                         goto error;
513                 }
514         }
515
            /* Never use a forwarding MTU below the IPv6 minimum. */
516         mtu = ip6_dst_mtu_forward(dst);
517         if (mtu < IPV6_MIN_MTU)
518                 mtu = IPV6_MIN_MTU;
519
520         if (ip6_pkt_too_big(skb, mtu)) {
521                 /* Again, force OUTPUT device used as source address */
522                 skb->dev = dst->dev;
523                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
524                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
525                                 IPSTATS_MIB_INTOOBIGERRORS);
526                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
527                                 IPSTATS_MIB_FRAGFAILS);
528                 kfree_skb(skb);
529                 return -EMSGSIZE;
530         }
531
532         if (skb_cow(skb, dst->dev->hard_header_len)) {
533                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
534                                 IPSTATS_MIB_OUTDISCARDS);
535                 goto drop;
536         }
537
            /* Reload the header pointer after skb_cow(). */
538         hdr = ipv6_hdr(skb);
539
540         /* Mangling hops number delayed to point after skb COW */
541
542         hdr->hop_limit--;
543
544         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
545         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
546         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
547                        net, NULL, skb, skb->dev, dst->dev,
548                        ip6_forward_finish);
549
            /* error: count an address error, then fall through to drop. */
550 error:
551         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
552 drop:
553         kfree_skb(skb);
554         return -EINVAL;
555 }
556
557 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
558 {
559         to->pkt_type = from->pkt_type;
560         to->priority = from->priority;
561         to->protocol = from->protocol;
562         skb_dst_drop(to);
563         skb_dst_set(to, dst_clone(skb_dst(from)));
564         to->dev = from->dev;
565         to->mark = from->mark;
566
567 #ifdef CONFIG_NET_SCHED
568         to->tc_index = from->tc_index;
569 #endif
570         nf_copy(to, from);
571         skb_copy_secmark(to, from);
572 }
573
574 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
575                  int (*output)(struct net *, struct sock *, struct sk_buff *))
576 {
577         struct sk_buff *frag;
578         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
579         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
580                                 inet6_sk(skb->sk) : NULL;
581         struct ipv6hdr *tmp_hdr;
582         struct frag_hdr *fh;
583         unsigned int mtu, hlen, left, len;
584         int hroom, troom;
585         __be32 frag_id;
586         int ptr, offset = 0, err = 0;
587         u8 *prevhdr, nexthdr = 0;
588
589         hlen = ip6_find_1stfragopt(skb, &prevhdr);
590         nexthdr = *prevhdr;
591
592         mtu = ip6_skb_dst_mtu(skb);
593
594         /* We must not fragment if the socket is set to force MTU discovery
595          * or if the skb it not generated by a local socket.
596          */
597         if (unlikely(!skb->ignore_df && skb->len > mtu))
598                 goto fail_toobig;
599
600         if (IP6CB(skb)->frag_max_size) {
601                 if (IP6CB(skb)->frag_max_size > mtu)
602                         goto fail_toobig;
603
604                 /* don't send fragments larger than what we received */
605                 mtu = IP6CB(skb)->frag_max_size;
606                 if (mtu < IPV6_MIN_MTU)
607                         mtu = IPV6_MIN_MTU;
608         }
609
610         if (np && np->frag_size < mtu) {
611                 if (np->frag_size)
612                         mtu = np->frag_size;
613         }
614         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
615                 goto fail_toobig;
616         mtu -= hlen + sizeof(struct frag_hdr);
617
618         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
619                                     &ipv6_hdr(skb)->saddr);
620
621         if (skb->ip_summed == CHECKSUM_PARTIAL &&
622             (err = skb_checksum_help(skb)))
623                 goto fail;
624
625         hroom = LL_RESERVED_SPACE(rt->dst.dev);
626         if (skb_has_frag_list(skb)) {
627                 int first_len = skb_pagelen(skb);
628                 struct sk_buff *frag2;
629
630                 if (first_len - hlen > mtu ||
631                     ((first_len - hlen) & 7) ||
632                     skb_cloned(skb) ||
633                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
634                         goto slow_path;
635
636                 skb_walk_frags(skb, frag) {
637                         /* Correct geometry. */
638                         if (frag->len > mtu ||
639                             ((frag->len & 7) && frag->next) ||
640                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
641                                 goto slow_path_clean;
642
643                         /* Partially cloned skb? */
644                         if (skb_shared(frag))
645                                 goto slow_path_clean;
646
647                         BUG_ON(frag->sk);
648                         if (skb->sk) {
649                                 frag->sk = skb->sk;
650                                 frag->destructor = sock_wfree;
651                         }
652                         skb->truesize -= frag->truesize;
653                 }
654
655                 err = 0;
656                 offset = 0;
657                 /* BUILD HEADER */
658
659                 *prevhdr = NEXTHDR_FRAGMENT;
660                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
661                 if (!tmp_hdr) {
662                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
663                                       IPSTATS_MIB_FRAGFAILS);
664                         err = -ENOMEM;
665                         goto fail;
666                 }
667                 frag = skb_shinfo(skb)->frag_list;
668                 skb_frag_list_init(skb);
669
670                 __skb_pull(skb, hlen);
671                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
672                 __skb_push(skb, hlen);
673                 skb_reset_network_header(skb);
674                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
675
676                 fh->nexthdr = nexthdr;
677                 fh->reserved = 0;
678                 fh->frag_off = htons(IP6_MF);
679                 fh->identification = frag_id;
680
681                 first_len = skb_pagelen(skb);
682                 skb->data_len = first_len - skb_headlen(skb);
683                 skb->len = first_len;
684                 ipv6_hdr(skb)->payload_len = htons(first_len -
685                                                    sizeof(struct ipv6hdr));
686
687                 dst_hold(&rt->dst);
688
689                 for (;;) {
690                         /* Prepare header of the next frame,
691                          * before previous one went down. */
692                         if (frag) {
693                                 frag->ip_summed = CHECKSUM_NONE;
694                                 skb_reset_transport_header(frag);
695                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
696                                 __skb_push(frag, hlen);
697                                 skb_reset_network_header(frag);
698                                 memcpy(skb_network_header(frag), tmp_hdr,
699                                        hlen);
700                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
701                                 fh->nexthdr = nexthdr;
702                                 fh->reserved = 0;
703                                 fh->frag_off = htons(offset);
704                                 if (frag->next)
705                                         fh->frag_off |= htons(IP6_MF);
706                                 fh->identification = frag_id;
707                                 ipv6_hdr(frag)->payload_len =
708                                                 htons(frag->len -
709                                                       sizeof(struct ipv6hdr));
710                                 ip6_copy_metadata(frag, skb);
711                         }
712
713                         err = output(net, sk, skb);
714                         if (!err)
715                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
716                                               IPSTATS_MIB_FRAGCREATES);
717
718                         if (err || !frag)
719                                 break;
720
721                         skb = frag;
722                         frag = skb->next;
723                         skb->next = NULL;
724                 }
725
726                 kfree(tmp_hdr);
727
728                 if (err == 0) {
729                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
730                                       IPSTATS_MIB_FRAGOKS);
731                         ip6_rt_put(rt);
732                         return 0;
733                 }
734
735                 kfree_skb_list(frag);
736
737                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738                               IPSTATS_MIB_FRAGFAILS);
739                 ip6_rt_put(rt);
740                 return err;
741
742 slow_path_clean:
743                 skb_walk_frags(skb, frag2) {
744                         if (frag2 == frag)
745                                 break;
746                         frag2->sk = NULL;
747                         frag2->destructor = NULL;
748                         skb->truesize += frag2->truesize;
749                 }
750         }
751
752 slow_path:
753         left = skb->len - hlen;         /* Space per frame */
754         ptr = hlen;                     /* Where to start from */
755
756         /*
757          *      Fragment the datagram.
758          */
759
760         *prevhdr = NEXTHDR_FRAGMENT;
761         troom = rt->dst.dev->needed_tailroom;
762
763         /*
764          *      Keep copying data until we run out.
765          */
766         while (left > 0)        {
767                 len = left;
768                 /* IF: it doesn't fit, use 'mtu' - the data space left */
769                 if (len > mtu)
770                         len = mtu;
771                 /* IF: we are not sending up to and including the packet end
772                    then align the next start on an eight byte boundary */
773                 if (len < left) {
774                         len &= ~7;
775                 }
776
777                 /* Allocate buffer */
778                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
779                                  hroom + troom, GFP_ATOMIC);
780                 if (!frag) {
781                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
782                                       IPSTATS_MIB_FRAGFAILS);
783                         err = -ENOMEM;
784                         goto fail;
785                 }
786
787                 /*
788                  *      Set up data on packet
789                  */
790
791                 ip6_copy_metadata(frag, skb);
792                 skb_reserve(frag, hroom);
793                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
794                 skb_reset_network_header(frag);
795                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
796                 frag->transport_header = (frag->network_header + hlen +
797                                           sizeof(struct frag_hdr));
798
799                 /*
800                  *      Charge the memory for the fragment to any owner
801                  *      it might possess
802                  */
803                 if (skb->sk)
804                         skb_set_owner_w(frag, skb->sk);
805
806                 /*
807                  *      Copy the packet header into the new buffer.
808                  */
809                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
810
811                 /*
812                  *      Build fragment header.
813                  */
814                 fh->nexthdr = nexthdr;
815                 fh->reserved = 0;
816                 fh->identification = frag_id;
817
818                 /*
819                  *      Copy a block of the IP datagram.
820                  */
821                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
822                                      len));
823                 left -= len;
824
825                 fh->frag_off = htons(offset);
826                 if (left > 0)
827                         fh->frag_off |= htons(IP6_MF);
828                 ipv6_hdr(frag)->payload_len = htons(frag->len -
829                                                     sizeof(struct ipv6hdr));
830
831                 ptr += len;
832                 offset += len;
833
834                 /*
835                  *      Put this fragment into the sending queue.
836                  */
837                 err = output(net, sk, frag);
838                 if (err)
839                         goto fail;
840
841                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
842                               IPSTATS_MIB_FRAGCREATES);
843         }
844         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
845                       IPSTATS_MIB_FRAGOKS);
846         consume_skb(skb);
847         return err;
848
849 fail_toobig:
850         if (skb->sk && dst_allfrag(skb_dst(skb)))
851                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
852
853         skb->dev = skb_dst(skb)->dev;
854         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
855         err = -EMSGSIZE;
856
857 fail:
858         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
859                       IPSTATS_MIB_FRAGFAILS);
860         kfree_skb(skb);
861         return err;
862 }
863
864 static inline int ip6_rt_check(const struct rt6key *rt_key,
865                                const struct in6_addr *fl_addr,
866                                const struct in6_addr *addr_cache)
867 {
868         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
869                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
870 }
871
872 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
873                                           struct dst_entry *dst,
874                                           const struct flowi6 *fl6)
875 {
876         struct ipv6_pinfo *np = inet6_sk(sk);
877         struct rt6_info *rt;
878
879         if (!dst)
880                 goto out;
881
882         if (dst->ops->family != AF_INET6) {
883                 dst_release(dst);
884                 return NULL;
885         }
886
887         rt = (struct rt6_info *)dst;
888         /* Yes, checking route validity in not connected
889          * case is not very simple. Take into account,
890          * that we do not support routing by source, TOS,
891          * and MSG_DONTROUTE            --ANK (980726)
892          *
893          * 1. ip6_rt_check(): If route was host route,
894          *    check that cached destination is current.
895          *    If it is network route, we still may
896          *    check its validity using saved pointer
897          *    to the last used address: daddr_cache.
898          *    We do not want to save whole address now,
899          *    (because main consumer of this service
900          *    is tcp, which has not this problem),
901          *    so that the last trick works only on connected
902          *    sockets.
903          * 2. oif also should be the same.
904          */
905         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
906 #ifdef CONFIG_IPV6_SUBTREES
907             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
908 #endif
909            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
910               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
911                 dst_release(dst);
912                 dst = NULL;
913         }
914
915 out:
916         return dst;
917 }
918
919 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
920                                struct dst_entry **dst, struct flowi6 *fl6)
921 {
922 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
923         struct neighbour *n;
924         struct rt6_info *rt;
925 #endif
926         int err;
927         int flags = 0;
928
929         if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
930             (!*dst || !(*dst)->error)) {
931                 err = l3mdev_get_saddr6(net, sk, fl6);
932                 if (err)
933                         goto out_err;
934         }
935
936         /* The correct way to handle this would be to do
937          * ip6_route_get_saddr, and then ip6_route_output; however,
938          * the route-specific preferred source forces the
939          * ip6_route_output call _before_ ip6_route_get_saddr.
940          *
941          * In source specific routing (no src=any default route),
942          * ip6_route_output will fail given src=any saddr, though, so
943          * that's why we try it again later.
944          */
945         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
946                 struct rt6_info *rt;
947                 bool had_dst = *dst != NULL;
948
949                 if (!had_dst)
950                         *dst = ip6_route_output(net, sk, fl6);
951                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
952                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
953                                           sk ? inet6_sk(sk)->srcprefs : 0,
954                                           &fl6->saddr);
955                 if (err)
956                         goto out_err_release;
957
958                 /* If we had an erroneous initial result, pretend it
959                  * never existed and let the SA-enabled version take
960                  * over.
961                  */
962                 if (!had_dst && (*dst)->error) {
963                         dst_release(*dst);
964                         *dst = NULL;
965                 }
966
967                 if (fl6->flowi6_oif)
968                         flags |= RT6_LOOKUP_F_IFACE;
969         }
970
971         if (!*dst)
972                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
973
974         err = (*dst)->error;
975         if (err)
976                 goto out_err_release;
977
978 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
979         /*
980          * Here if the dst entry we've looked up
981          * has a neighbour entry that is in the INCOMPLETE
982          * state and the src address from the flow is
983          * marked as OPTIMISTIC, we release the found
984          * dst entry and replace it instead with the
985          * dst entry of the nexthop router
986          */
987         rt = (struct rt6_info *) *dst;
988         rcu_read_lock_bh();
989         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
990                                       rt6_nexthop(rt, &fl6->daddr));
991         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
992         rcu_read_unlock_bh();
993
994         if (err) {
995                 struct inet6_ifaddr *ifp;
996                 struct flowi6 fl_gw6;
997                 int redirect;
998
999                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1000                                       (*dst)->dev, 1);
1001
1002                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1003                 if (ifp)
1004                         in6_ifa_put(ifp);
1005
1006                 if (redirect) {
1007                         /*
1008                          * We need to get the dst entry for the
1009                          * default router instead
1010                          */
1011                         dst_release(*dst);
1012                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1013                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1014                         *dst = ip6_route_output(net, sk, &fl_gw6);
1015                         err = (*dst)->error;
1016                         if (err)
1017                                 goto out_err_release;
1018                 }
1019         }
1020 #endif
1021
1022         return 0;
1023
1024 out_err_release:
1025         dst_release(*dst);
1026         *dst = NULL;
1027 out_err:
1028         if (err == -ENETUNREACH)
1029                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1030         return err;
1031 }
1032
1033 /**
1034  *      ip6_dst_lookup - perform route lookup on flow
1035  *      @sk: socket which provides route info
1036  *      @dst: pointer to dst_entry * for result
1037  *      @fl6: flow to lookup
1038  *
1039  *      This function performs a route lookup on the given flow.
1040  *
1041  *      It returns zero on success, or a standard errno code on error.
1042  */
1043 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1044                    struct flowi6 *fl6)
1045 {
1046         *dst = NULL;
1047         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1048 }
1049 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1050
1051 /**
1052  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1053  *      @sk: socket which provides route info
1054  *      @fl6: flow to lookup
1055  *      @final_dst: final destination address for ipsec lookup
1056  *
1057  *      This function performs a route lookup on the given flow.
1058  *
1059  *      It returns a valid dst pointer on success, or a pointer encoded
1060  *      error code.
1061  */
1062 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1063                                       const struct in6_addr *final_dst)
1064 {
1065         struct dst_entry *dst = NULL;
1066         int err;
1067
1068         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1069         if (err)
1070                 return ERR_PTR(err);
1071         if (final_dst)
1072                 fl6->daddr = *final_dst;
1073
1074         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1075 }
1076 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1077
1078 /**
1079  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1080  *      @sk: socket which provides the dst cache and route info
1081  *      @fl6: flow to lookup
1082  *      @final_dst: final destination address for ipsec lookup
1083  *
1084  *      This function performs a route lookup on the given flow with the
1085  *      possibility of using the cached route in the socket if it is valid.
1086  *      It will take the socket dst lock when operating on the dst cache.
1087  *      As a result, this function can only be used in process context.
1088  *
1089  *      It returns a valid dst pointer on success, or a pointer encoded
1090  *      error code.
1091  */
1092 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1093                                          const struct in6_addr *final_dst)
1094 {
1095         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1096
1097         dst = ip6_sk_dst_check(sk, dst, fl6);
1098         if (!dst)
1099                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1100
1101         return dst;
1102 }
1103 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1104
1105 static inline int ip6_ufo_append_data(struct sock *sk,
1106                         struct sk_buff_head *queue,
1107                         int getfrag(void *from, char *to, int offset, int len,
1108                         int odd, struct sk_buff *skb),
1109                         void *from, int length, int hh_len, int fragheaderlen,
1110                         int exthdrlen, int transhdrlen, int mtu,
1111                         unsigned int flags, const struct flowi6 *fl6)
1112
1113 {
1114         struct sk_buff *skb;
1115         int err;
1116
1117         /* There is support for UDP large send offload by network
1118          * device, so create one single skb packet containing complete
1119          * udp datagram
1120          */
1121         skb = skb_peek_tail(queue);
1122         if (!skb) {
1123                 skb = sock_alloc_send_skb(sk,
1124                         hh_len + fragheaderlen + transhdrlen + 20,
1125                         (flags & MSG_DONTWAIT), &err);
1126                 if (!skb)
1127                         return err;
1128
1129                 /* reserve space for Hardware header */
1130                 skb_reserve(skb, hh_len);
1131
1132                 /* create space for UDP/IP header */
1133                 skb_put(skb, fragheaderlen + transhdrlen);
1134
1135                 /* initialize network header pointer */
1136                 skb_set_network_header(skb, exthdrlen);
1137
1138                 /* initialize protocol header pointer */
1139                 skb->transport_header = skb->network_header + fragheaderlen;
1140
1141                 skb->protocol = htons(ETH_P_IPV6);
1142                 skb->csum = 0;
1143
1144                 __skb_queue_tail(queue, skb);
1145         } else if (skb_is_gso(skb)) {
1146                 goto append;
1147         }
1148
1149         skb->ip_summed = CHECKSUM_PARTIAL;
1150         /* Specify the length of each IPv6 datagram fragment.
1151          * It has to be a multiple of 8.
1152          */
1153         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1154                                      sizeof(struct frag_hdr)) & ~7;
1155         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1156         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1157                                                          &fl6->daddr,
1158                                                          &fl6->saddr);
1159
1160 append:
1161         return skb_append_datato_frags(sk, skb, getfrag, from,
1162                                        (length - transhdrlen));
1163 }
1164
1165 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1166                                                gfp_t gfp)
1167 {
1168         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1169 }
1170
1171 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1172                                                 gfp_t gfp)
1173 {
1174         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1175 }
1176
1177 static void ip6_append_data_mtu(unsigned int *mtu,
1178                                 int *maxfraglen,
1179                                 unsigned int fragheaderlen,
1180                                 struct sk_buff *skb,
1181                                 struct rt6_info *rt,
1182                                 unsigned int orig_mtu)
1183 {
1184         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1185                 if (!skb) {
1186                         /* first fragment, reserve header_len */
1187                         *mtu = orig_mtu - rt->dst.header_len;
1188
1189                 } else {
1190                         /*
1191                          * this fragment is not first, the headers
1192                          * space is regarded as data space.
1193                          */
1194                         *mtu = orig_mtu;
1195                 }
1196                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1197                               + fragheaderlen - sizeof(struct frag_hdr);
1198         }
1199 }
1200
1201 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1202                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1203                           struct rt6_info *rt, struct flowi6 *fl6)
1204 {
1205         struct ipv6_pinfo *np = inet6_sk(sk);
1206         unsigned int mtu;
1207         struct ipv6_txoptions *opt = ipc6->opt;
1208
1209         /*
1210          * setup for corking
1211          */
1212         if (opt) {
1213                 if (WARN_ON(v6_cork->opt))
1214                         return -EINVAL;
1215
1216                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1217                 if (unlikely(!v6_cork->opt))
1218                         return -ENOBUFS;
1219
1220                 v6_cork->opt->tot_len = opt->tot_len;
1221                 v6_cork->opt->opt_flen = opt->opt_flen;
1222                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1223
1224                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1225                                                     sk->sk_allocation);
1226                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1227                         return -ENOBUFS;
1228
1229                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1230                                                     sk->sk_allocation);
1231                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1232                         return -ENOBUFS;
1233
1234                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1235                                                    sk->sk_allocation);
1236                 if (opt->hopopt && !v6_cork->opt->hopopt)
1237                         return -ENOBUFS;
1238
1239                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1240                                                     sk->sk_allocation);
1241                 if (opt->srcrt && !v6_cork->opt->srcrt)
1242                         return -ENOBUFS;
1243
1244                 /* need source address above miyazawa*/
1245         }
1246         dst_hold(&rt->dst);
1247         cork->base.dst = &rt->dst;
1248         cork->fl.u.ip6 = *fl6;
1249         v6_cork->hop_limit = ipc6->hlimit;
1250         v6_cork->tclass = ipc6->tclass;
1251         if (rt->dst.flags & DST_XFRM_TUNNEL)
1252                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1253                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1254         else
1255                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1256                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1257         if (np->frag_size < mtu) {
1258                 if (np->frag_size)
1259                         mtu = np->frag_size;
1260         }
1261         cork->base.fragsize = mtu;
1262         if (dst_allfrag(rt->dst.path))
1263                 cork->base.flags |= IPCORK_ALLFRAG;
1264         cork->base.length = 0;
1265
1266         return 0;
1267 }
1268
1269 static int __ip6_append_data(struct sock *sk,
1270                              struct flowi6 *fl6,
1271                              struct sk_buff_head *queue,
1272                              struct inet_cork *cork,
1273                              struct inet6_cork *v6_cork,
1274                              struct page_frag *pfrag,
1275                              int getfrag(void *from, char *to, int offset,
1276                                          int len, int odd, struct sk_buff *skb),
1277                              void *from, int length, int transhdrlen,
1278                              unsigned int flags, struct ipcm6_cookie *ipc6,
1279                              const struct sockcm_cookie *sockc)
1280 {
1281         struct sk_buff *skb, *skb_prev = NULL;
1282         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1283         int exthdrlen = 0;
1284         int dst_exthdrlen = 0;
1285         int hh_len;
1286         int copy;
1287         int err;
1288         int offset = 0;
1289         __u8 tx_flags = 0;
1290         u32 tskey = 0;
1291         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1292         struct ipv6_txoptions *opt = v6_cork->opt;
1293         int csummode = CHECKSUM_NONE;
1294         unsigned int maxnonfragsize, headersize;
1295
1296         skb = skb_peek_tail(queue);
1297         if (!skb) {
1298                 exthdrlen = opt ? opt->opt_flen : 0;
1299                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1300         }
1301
1302         mtu = cork->fragsize;
1303         orig_mtu = mtu;
1304
1305         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1306
1307         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1308                         (opt ? opt->opt_nflen : 0);
1309         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1310                      sizeof(struct frag_hdr);
1311
1312         headersize = sizeof(struct ipv6hdr) +
1313                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1314                      (dst_allfrag(&rt->dst) ?
1315                       sizeof(struct frag_hdr) : 0) +
1316                      rt->rt6i_nfheader_len;
1317
1318         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1319             (sk->sk_protocol == IPPROTO_UDP ||
1320              sk->sk_protocol == IPPROTO_RAW)) {
1321                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1322                                 sizeof(struct ipv6hdr));
1323                 goto emsgsize;
1324         }
1325
1326         if (ip6_sk_ignore_df(sk))
1327                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1328         else
1329                 maxnonfragsize = mtu;
1330
1331         if (cork->length + length > maxnonfragsize - headersize) {
1332 emsgsize:
1333                 ipv6_local_error(sk, EMSGSIZE, fl6,
1334                                  mtu - headersize +
1335                                  sizeof(struct ipv6hdr));
1336                 return -EMSGSIZE;
1337         }
1338
1339         /* CHECKSUM_PARTIAL only with no extension headers and when
1340          * we are not going to fragment
1341          */
1342         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1343             headersize == sizeof(struct ipv6hdr) &&
1344             length < mtu - headersize &&
1345             !(flags & MSG_MORE) &&
1346             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1347                 csummode = CHECKSUM_PARTIAL;
1348
1349         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1350                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1351                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1352                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1353                         tskey = sk->sk_tskey++;
1354         }
1355
1356         /*
1357          * Let's try using as much space as possible.
1358          * Use MTU if total length of the message fits into the MTU.
1359          * Otherwise, we need to reserve fragment header and
1360          * fragment alignment (= 8-15 octects, in total).
1361          *
1362          * Note that we may need to "move" the data from the tail of
1363          * of the buffer to the new fragment when we split
1364          * the message.
1365          *
1366          * FIXME: It may be fragmented into multiple chunks
1367          *        at once if non-fragmentable extension headers
1368          *        are too large.
1369          * --yoshfuji
1370          */
1371
1372         cork->length += length;
1373         if (((length > mtu) ||
1374              (skb && skb_is_gso(skb))) &&
1375             (sk->sk_protocol == IPPROTO_UDP) &&
1376             (rt->dst.dev->features & NETIF_F_UFO) &&
1377             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1378                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1379                                           hh_len, fragheaderlen, exthdrlen,
1380                                           transhdrlen, mtu, flags, fl6);
1381                 if (err)
1382                         goto error;
1383                 return 0;
1384         }
1385
1386         if (!skb)
1387                 goto alloc_new_skb;
1388
1389         while (length > 0) {
1390                 /* Check if the remaining data fits into current packet. */
1391                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1392                 if (copy < length)
1393                         copy = maxfraglen - skb->len;
1394
1395                 if (copy <= 0) {
1396                         char *data;
1397                         unsigned int datalen;
1398                         unsigned int fraglen;
1399                         unsigned int fraggap;
1400                         unsigned int alloclen;
1401 alloc_new_skb:
1402                         /* There's no room in the current skb */
1403                         if (skb)
1404                                 fraggap = skb->len - maxfraglen;
1405                         else
1406                                 fraggap = 0;
1407                         /* update mtu and maxfraglen if necessary */
1408                         if (!skb || !skb_prev)
1409                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1410                                                     fragheaderlen, skb, rt,
1411                                                     orig_mtu);
1412
1413                         skb_prev = skb;
1414
1415                         /*
1416                          * If remaining data exceeds the mtu,
1417                          * we know we need more fragment(s).
1418                          */
1419                         datalen = length + fraggap;
1420
1421                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1422                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1423                         if ((flags & MSG_MORE) &&
1424                             !(rt->dst.dev->features&NETIF_F_SG))
1425                                 alloclen = mtu;
1426                         else
1427                                 alloclen = datalen + fragheaderlen;
1428
1429                         alloclen += dst_exthdrlen;
1430
1431                         if (datalen != length + fraggap) {
1432                                 /*
1433                                  * this is not the last fragment, the trailer
1434                                  * space is regarded as data space.
1435                                  */
1436                                 datalen += rt->dst.trailer_len;
1437                         }
1438
1439                         alloclen += rt->dst.trailer_len;
1440                         fraglen = datalen + fragheaderlen;
1441
1442                         /*
1443                          * We just reserve space for fragment header.
1444                          * Note: this may be overallocation if the message
1445                          * (without MSG_MORE) fits into the MTU.
1446                          */
1447                         alloclen += sizeof(struct frag_hdr);
1448
1449                         if (transhdrlen) {
1450                                 skb = sock_alloc_send_skb(sk,
1451                                                 alloclen + hh_len,
1452                                                 (flags & MSG_DONTWAIT), &err);
1453                         } else {
1454                                 skb = NULL;
1455                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1456                                     2 * sk->sk_sndbuf)
1457                                         skb = sock_wmalloc(sk,
1458                                                            alloclen + hh_len, 1,
1459                                                            sk->sk_allocation);
1460                                 if (unlikely(!skb))
1461                                         err = -ENOBUFS;
1462                         }
1463                         if (!skb)
1464                                 goto error;
1465                         /*
1466                          *      Fill in the control structures
1467                          */
1468                         skb->protocol = htons(ETH_P_IPV6);
1469                         skb->ip_summed = csummode;
1470                         skb->csum = 0;
1471                         /* reserve for fragmentation and ipsec header */
1472                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1473                                     dst_exthdrlen);
1474
1475                         /* Only the initial fragment is time stamped */
1476                         skb_shinfo(skb)->tx_flags = tx_flags;
1477                         tx_flags = 0;
1478                         skb_shinfo(skb)->tskey = tskey;
1479                         tskey = 0;
1480
1481                         /*
1482                          *      Find where to start putting bytes
1483                          */
1484                         data = skb_put(skb, fraglen);
1485                         skb_set_network_header(skb, exthdrlen);
1486                         data += fragheaderlen;
1487                         skb->transport_header = (skb->network_header +
1488                                                  fragheaderlen);
1489                         if (fraggap) {
1490                                 skb->csum = skb_copy_and_csum_bits(
1491                                         skb_prev, maxfraglen,
1492                                         data + transhdrlen, fraggap, 0);
1493                                 skb_prev->csum = csum_sub(skb_prev->csum,
1494                                                           skb->csum);
1495                                 data += fraggap;
1496                                 pskb_trim_unique(skb_prev, maxfraglen);
1497                         }
1498                         copy = datalen - transhdrlen - fraggap;
1499
1500                         if (copy < 0) {
1501                                 err = -EINVAL;
1502                                 kfree_skb(skb);
1503                                 goto error;
1504                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1505                                 err = -EFAULT;
1506                                 kfree_skb(skb);
1507                                 goto error;
1508                         }
1509
1510                         offset += copy;
1511                         length -= datalen - fraggap;
1512                         transhdrlen = 0;
1513                         exthdrlen = 0;
1514                         dst_exthdrlen = 0;
1515
1516                         /*
1517                          * Put the packet on the pending queue
1518                          */
1519                         __skb_queue_tail(queue, skb);
1520                         continue;
1521                 }
1522
1523                 if (copy > length)
1524                         copy = length;
1525
1526                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1527                         unsigned int off;
1528
1529                         off = skb->len;
1530                         if (getfrag(from, skb_put(skb, copy),
1531                                                 offset, copy, off, skb) < 0) {
1532                                 __skb_trim(skb, off);
1533                                 err = -EFAULT;
1534                                 goto error;
1535                         }
1536                 } else {
1537                         int i = skb_shinfo(skb)->nr_frags;
1538
1539                         err = -ENOMEM;
1540                         if (!sk_page_frag_refill(sk, pfrag))
1541                                 goto error;
1542
1543                         if (!skb_can_coalesce(skb, i, pfrag->page,
1544                                               pfrag->offset)) {
1545                                 err = -EMSGSIZE;
1546                                 if (i == MAX_SKB_FRAGS)
1547                                         goto error;
1548
1549                                 __skb_fill_page_desc(skb, i, pfrag->page,
1550                                                      pfrag->offset, 0);
1551                                 skb_shinfo(skb)->nr_frags = ++i;
1552                                 get_page(pfrag->page);
1553                         }
1554                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1555                         if (getfrag(from,
1556                                     page_address(pfrag->page) + pfrag->offset,
1557                                     offset, copy, skb->len, skb) < 0)
1558                                 goto error_efault;
1559
1560                         pfrag->offset += copy;
1561                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1562                         skb->len += copy;
1563                         skb->data_len += copy;
1564                         skb->truesize += copy;
1565                         atomic_add(copy, &sk->sk_wmem_alloc);
1566                 }
1567                 offset += copy;
1568                 length -= copy;
1569         }
1570
1571         return 0;
1572
1573 error_efault:
1574         err = -EFAULT;
1575 error:
1576         cork->length -= length;
1577         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1578         return err;
1579 }
1580
1581 int ip6_append_data(struct sock *sk,
1582                     int getfrag(void *from, char *to, int offset, int len,
1583                                 int odd, struct sk_buff *skb),
1584                     void *from, int length, int transhdrlen,
1585                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1586                     struct rt6_info *rt, unsigned int flags,
1587                     const struct sockcm_cookie *sockc)
1588 {
1589         struct inet_sock *inet = inet_sk(sk);
1590         struct ipv6_pinfo *np = inet6_sk(sk);
1591         int exthdrlen;
1592         int err;
1593
1594         if (flags&MSG_PROBE)
1595                 return 0;
1596         if (skb_queue_empty(&sk->sk_write_queue)) {
1597                 /*
1598                  * setup for corking
1599                  */
1600                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1601                                      ipc6, rt, fl6);
1602                 if (err)
1603                         return err;
1604
1605                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1606                 length += exthdrlen;
1607                 transhdrlen += exthdrlen;
1608         } else {
1609                 fl6 = &inet->cork.fl.u.ip6;
1610                 transhdrlen = 0;
1611         }
1612
1613         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1614                                  &np->cork, sk_page_frag(sk), getfrag,
1615                                  from, length, transhdrlen, flags, ipc6, sockc);
1616 }
1617 EXPORT_SYMBOL_GPL(ip6_append_data);
1618
1619 static void ip6_cork_release(struct inet_cork_full *cork,
1620                              struct inet6_cork *v6_cork)
1621 {
1622         if (v6_cork->opt) {
1623                 kfree(v6_cork->opt->dst0opt);
1624                 kfree(v6_cork->opt->dst1opt);
1625                 kfree(v6_cork->opt->hopopt);
1626                 kfree(v6_cork->opt->srcrt);
1627                 kfree(v6_cork->opt);
1628                 v6_cork->opt = NULL;
1629         }
1630
1631         if (cork->base.dst) {
1632                 dst_release(cork->base.dst);
1633                 cork->base.dst = NULL;
1634                 cork->base.flags &= ~IPCORK_ALLFRAG;
1635         }
1636         memset(&cork->fl, 0, sizeof(cork->fl));
1637 }
1638
1639 struct sk_buff *__ip6_make_skb(struct sock *sk,
1640                                struct sk_buff_head *queue,
1641                                struct inet_cork_full *cork,
1642                                struct inet6_cork *v6_cork)
1643 {
1644         struct sk_buff *skb, *tmp_skb;
1645         struct sk_buff **tail_skb;
1646         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1647         struct ipv6_pinfo *np = inet6_sk(sk);
1648         struct net *net = sock_net(sk);
1649         struct ipv6hdr *hdr;
1650         struct ipv6_txoptions *opt = v6_cork->opt;
1651         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1652         struct flowi6 *fl6 = &cork->fl.u.ip6;
1653         unsigned char proto = fl6->flowi6_proto;
1654
1655         skb = __skb_dequeue(queue);
1656         if (!skb)
1657                 goto out;
1658         tail_skb = &(skb_shinfo(skb)->frag_list);
1659
1660         /* move skb->data to ip header from ext header */
1661         if (skb->data < skb_network_header(skb))
1662                 __skb_pull(skb, skb_network_offset(skb));
1663         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1664                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1665                 *tail_skb = tmp_skb;
1666                 tail_skb = &(tmp_skb->next);
1667                 skb->len += tmp_skb->len;
1668                 skb->data_len += tmp_skb->len;
1669                 skb->truesize += tmp_skb->truesize;
1670                 tmp_skb->destructor = NULL;
1671                 tmp_skb->sk = NULL;
1672         }
1673
1674         /* Allow local fragmentation. */
1675         skb->ignore_df = ip6_sk_ignore_df(sk);
1676
1677         *final_dst = fl6->daddr;
1678         __skb_pull(skb, skb_network_header_len(skb));
1679         if (opt && opt->opt_flen)
1680                 ipv6_push_frag_opts(skb, opt, &proto);
1681         if (opt && opt->opt_nflen)
1682                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1683
1684         skb_push(skb, sizeof(struct ipv6hdr));
1685         skb_reset_network_header(skb);
1686         hdr = ipv6_hdr(skb);
1687
1688         ip6_flow_hdr(hdr, v6_cork->tclass,
1689                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1690                                         np->autoflowlabel, fl6));
1691         hdr->hop_limit = v6_cork->hop_limit;
1692         hdr->nexthdr = proto;
1693         hdr->saddr = fl6->saddr;
1694         hdr->daddr = *final_dst;
1695
1696         skb->priority = sk->sk_priority;
1697         skb->mark = sk->sk_mark;
1698
1699         skb_dst_set(skb, dst_clone(&rt->dst));
1700         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1701         if (proto == IPPROTO_ICMPV6) {
1702                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1703
1704                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1705                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1706         }
1707
1708         ip6_cork_release(cork, v6_cork);
1709 out:
1710         return skb;
1711 }
1712
1713 int ip6_send_skb(struct sk_buff *skb)
1714 {
1715         struct net *net = sock_net(skb->sk);
1716         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1717         int err;
1718
1719         err = ip6_local_out(net, skb->sk, skb);
1720         if (err) {
1721                 if (err > 0)
1722                         err = net_xmit_errno(err);
1723                 if (err)
1724                         IP6_INC_STATS(net, rt->rt6i_idev,
1725                                       IPSTATS_MIB_OUTDISCARDS);
1726         }
1727
1728         return err;
1729 }
1730
/*
 * ip6_push_pending_frames - build and send whatever is pending on @sk.
 *
 * ip6_finish_skb() turns the pending data into a single skb; when it yields
 * nothing there is nothing to send and 0 is returned.  Otherwise the result
 * of ip6_send_skb() is returned.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1742
1743 static void __ip6_flush_pending_frames(struct sock *sk,
1744                                        struct sk_buff_head *queue,
1745                                        struct inet_cork_full *cork,
1746                                        struct inet6_cork *v6_cork)
1747 {
1748         struct sk_buff *skb;
1749
1750         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1751                 if (skb_dst(skb))
1752                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1753                                       IPSTATS_MIB_OUTDISCARDS);
1754                 kfree_skb(skb);
1755         }
1756
1757         ip6_cork_release(cork, v6_cork);
1758 }
1759
1760 void ip6_flush_pending_frames(struct sock *sk)
1761 {
1762         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1763                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1764 }
1765 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1766
1767 struct sk_buff *ip6_make_skb(struct sock *sk,
1768                              int getfrag(void *from, char *to, int offset,
1769                                          int len, int odd, struct sk_buff *skb),
1770                              void *from, int length, int transhdrlen,
1771                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1772                              struct rt6_info *rt, unsigned int flags,
1773                              const struct sockcm_cookie *sockc)
1774 {
1775         struct inet_cork_full cork;
1776         struct inet6_cork v6_cork;
1777         struct sk_buff_head queue;
1778         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1779         int err;
1780
1781         if (flags & MSG_PROBE)
1782                 return NULL;
1783
1784         __skb_queue_head_init(&queue);
1785
1786         cork.base.flags = 0;
1787         cork.base.addr = 0;
1788         cork.base.opt = NULL;
1789         v6_cork.opt = NULL;
1790         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1791         if (err)
1792                 return ERR_PTR(err);
1793
1794         if (ipc6->dontfrag < 0)
1795                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1796
1797         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1798                                 &current->task_frag, getfrag, from,
1799                                 length + exthdrlen, transhdrlen + exthdrlen,
1800                                 flags, ipc6, sockc);
1801         if (err) {
1802                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1803                 return ERR_PTR(err);
1804         }
1805
1806         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1807 }