Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi
[linux-2.6-block.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142         /* Policy lookup after SNAT yielded a new policy */
143         if (skb_dst(skb)->xfrm) {
144                 IPCB(skb)->flags |= IPSKB_REROUTED;
145                 return dst_output(net, sk, skb);
146         }
147 #endif
148
149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150             dst_allfrag(skb_dst(skb)) ||
151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
184
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194         struct net *net = sock_net(sk);
195         const struct ipv6_pinfo *np = inet6_sk(sk);
196         struct in6_addr *first_hop = &fl6->daddr;
197         struct dst_entry *dst = skb_dst(skb);
198         struct ipv6hdr *hdr;
199         u8  proto = fl6->flowi6_proto;
200         int seg_len = skb->len;
201         int hlimit = -1;
202         u32 mtu;
203
204         if (opt) {
205                 unsigned int head_room;
206
207                 /* First: exthdrs may take lots of space (~8K for now)
208                    MAX_HEADER is not enough.
209                  */
210                 head_room = opt->opt_nflen + opt->opt_flen;
211                 seg_len += head_room;
212                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213
214                 if (skb_headroom(skb) < head_room) {
215                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216                         if (!skb2) {
217                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218                                               IPSTATS_MIB_OUTDISCARDS);
219                                 kfree_skb(skb);
220                                 return -ENOBUFS;
221                         }
222                         consume_skb(skb);
223                         skb = skb2;
224                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225                          * it is safe to call in our context (socket lock not held)
226                          */
227                         skb_set_owner_w(skb, (struct sock *)sk);
228                 }
229                 if (opt->opt_flen)
230                         ipv6_push_frag_opts(skb, opt, &proto);
231                 if (opt->opt_nflen)
232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233                                              &fl6->saddr);
234         }
235
236         skb_push(skb, sizeof(struct ipv6hdr));
237         skb_reset_network_header(skb);
238         hdr = ipv6_hdr(skb);
239
240         /*
241          *      Fill in the IPv6 header
242          */
243         if (np)
244                 hlimit = np->hop_limit;
245         if (hlimit < 0)
246                 hlimit = ip6_dst_hoplimit(dst);
247
248         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249                                 ip6_autoflowlabel(net, np), fl6));
250
251         hdr->payload_len = htons(seg_len);
252         hdr->nexthdr = proto;
253         hdr->hop_limit = hlimit;
254
255         hdr->saddr = fl6->saddr;
256         hdr->daddr = *first_hop;
257
258         skb->protocol = htons(ETH_P_IPV6);
259         skb->priority = sk->sk_priority;
260         skb->mark = mark;
261
262         mtu = dst_mtu(dst);
263         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265                               IPSTATS_MIB_OUT, skb->len);
266
267                 /* if egress device is enslaved to an L3 master device pass the
268                  * skb to its handler for processing
269                  */
270                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
271                 if (unlikely(!skb))
272                         return 0;
273
274                 /* hooks should never assume socket lock is held.
275                  * we promote our socket to non const
276                  */
277                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278                                net, (struct sock *)sk, skb, NULL, dst->dev,
279                                dst_output);
280         }
281
282         skb->dev = dst->dev;
283         /* ipv6_local_error() does not require socket lock,
284          * we promote our socket to non const
285          */
286         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287
288         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289         kfree_skb(skb);
290         return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
293
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296         struct ip6_ra_chain *ra;
297         struct sock *last = NULL;
298
299         read_lock(&ip6_ra_lock);
300         for (ra = ip6_ra_chain; ra; ra = ra->next) {
301                 struct sock *sk = ra->sk;
302                 if (sk && ra->sel == sel &&
303                     (!sk->sk_bound_dev_if ||
304                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
305                         if (last) {
306                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307                                 if (skb2)
308                                         rawv6_rcv(last, skb2);
309                         }
310                         last = sk;
311                 }
312         }
313
314         if (last) {
315                 rawv6_rcv(last, skb);
316                 read_unlock(&ip6_ra_lock);
317                 return 1;
318         }
319         read_unlock(&ip6_ra_lock);
320         return 0;
321 }
322
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325         struct ipv6hdr *hdr = ipv6_hdr(skb);
326         u8 nexthdr = hdr->nexthdr;
327         __be16 frag_off;
328         int offset;
329
330         if (ipv6_ext_hdr(nexthdr)) {
331                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332                 if (offset < 0)
333                         return 0;
334         } else
335                 offset = sizeof(struct ipv6hdr);
336
337         if (nexthdr == IPPROTO_ICMPV6) {
338                 struct icmp6hdr *icmp6;
339
340                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
341                                          offset + 1 - skb->data)))
342                         return 0;
343
344                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345
346                 switch (icmp6->icmp6_type) {
347                 case NDISC_ROUTER_SOLICITATION:
348                 case NDISC_ROUTER_ADVERTISEMENT:
349                 case NDISC_NEIGHBOUR_SOLICITATION:
350                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
351                 case NDISC_REDIRECT:
352                         /* For reaction involving unicast neighbor discovery
353                          * message destined to the proxied address, pass it to
354                          * input function.
355                          */
356                         return 1;
357                 default:
358                         break;
359                 }
360         }
361
362         /*
363          * The proxying router can't forward traffic sent to a link-local
364          * address, so signal the sender and discard the packet. This
365          * behavior is clarified by the MIPv6 specification.
366          */
367         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368                 dst_link_failure(skb);
369                 return -1;
370         }
371
372         return 0;
373 }
374
/*
 * Continuation used by ip6_forward() once the NF_INET_FORWARD hook has
 * accepted the packet: pass it on to the route's output function and
 * report that function's result.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc = dst_output(net, sk, skb);

	return rc;
}
380
381 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
382 {
383         unsigned int mtu;
384         struct inet6_dev *idev;
385
386         if (dst_metric_locked(dst, RTAX_MTU)) {
387                 mtu = dst_metric_raw(dst, RTAX_MTU);
388                 if (mtu)
389                         return mtu;
390         }
391
392         mtu = IPV6_MIN_MTU;
393         rcu_read_lock();
394         idev = __in6_dev_get(dst->dev);
395         if (idev)
396                 mtu = idev->cnf.mtu6;
397         rcu_read_unlock();
398
399         return mtu;
400 }
401 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
402
403 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
404 {
405         if (skb->len <= mtu)
406                 return false;
407
408         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
409         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
410                 return true;
411
412         if (skb->ignore_df)
413                 return false;
414
415         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
416                 return false;
417
418         return true;
419 }
420
421 int ip6_forward(struct sk_buff *skb)
422 {
423         struct dst_entry *dst = skb_dst(skb);
424         struct ipv6hdr *hdr = ipv6_hdr(skb);
425         struct inet6_skb_parm *opt = IP6CB(skb);
426         struct net *net = dev_net(dst->dev);
427         u32 mtu;
428
429         if (net->ipv6.devconf_all->forwarding == 0)
430                 goto error;
431
432         if (skb->pkt_type != PACKET_HOST)
433                 goto drop;
434
435         if (unlikely(skb->sk))
436                 goto drop;
437
438         if (skb_warn_if_lro(skb))
439                 goto drop;
440
441         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
442                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
443                                 IPSTATS_MIB_INDISCARDS);
444                 goto drop;
445         }
446
447         skb_forward_csum(skb);
448
449         /*
450          *      We DO NOT make any processing on
451          *      RA packets, pushing them to user level AS IS
452          *      without ane WARRANTY that application will be able
453          *      to interpret them. The reason is that we
454          *      cannot make anything clever here.
455          *
456          *      We are not end-node, so that if packet contains
457          *      AH/ESP, we cannot make anything.
458          *      Defragmentation also would be mistake, RA packets
459          *      cannot be fragmented, because there is no warranty
460          *      that different fragments will go along one path. --ANK
461          */
462         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
463                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
464                         return 0;
465         }
466
467         /*
468          *      check and decrement ttl
469          */
470         if (hdr->hop_limit <= 1) {
471                 /* Force OUTPUT device used as source address */
472                 skb->dev = dst->dev;
473                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
474                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
475                                 IPSTATS_MIB_INHDRERRORS);
476
477                 kfree_skb(skb);
478                 return -ETIMEDOUT;
479         }
480
481         /* XXX: idev->cnf.proxy_ndp? */
482         if (net->ipv6.devconf_all->proxy_ndp &&
483             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
484                 int proxied = ip6_forward_proxy_check(skb);
485                 if (proxied > 0)
486                         return ip6_input(skb);
487                 else if (proxied < 0) {
488                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
489                                         IPSTATS_MIB_INDISCARDS);
490                         goto drop;
491                 }
492         }
493
494         if (!xfrm6_route_forward(skb)) {
495                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
496                                 IPSTATS_MIB_INDISCARDS);
497                 goto drop;
498         }
499         dst = skb_dst(skb);
500
501         /* IPv6 specs say nothing about it, but it is clear that we cannot
502            send redirects to source routed frames.
503            We don't send redirects to frames decapsulated from IPsec.
504          */
505         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
506                 struct in6_addr *target = NULL;
507                 struct inet_peer *peer;
508                 struct rt6_info *rt;
509
510                 /*
511                  *      incoming and outgoing devices are the same
512                  *      send a redirect.
513                  */
514
515                 rt = (struct rt6_info *) dst;
516                 if (rt->rt6i_flags & RTF_GATEWAY)
517                         target = &rt->rt6i_gateway;
518                 else
519                         target = &hdr->daddr;
520
521                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
522
523                 /* Limit redirects both by destination (here)
524                    and by source (inside ndisc_send_redirect)
525                  */
526                 if (inet_peer_xrlim_allow(peer, 1*HZ))
527                         ndisc_send_redirect(skb, target);
528                 if (peer)
529                         inet_putpeer(peer);
530         } else {
531                 int addrtype = ipv6_addr_type(&hdr->saddr);
532
533                 /* This check is security critical. */
534                 if (addrtype == IPV6_ADDR_ANY ||
535                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
536                         goto error;
537                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
538                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
539                                     ICMPV6_NOT_NEIGHBOUR, 0);
540                         goto error;
541                 }
542         }
543
544         mtu = ip6_dst_mtu_forward(dst);
545         if (mtu < IPV6_MIN_MTU)
546                 mtu = IPV6_MIN_MTU;
547
548         if (ip6_pkt_too_big(skb, mtu)) {
549                 /* Again, force OUTPUT device used as source address */
550                 skb->dev = dst->dev;
551                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
552                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
553                                 IPSTATS_MIB_INTOOBIGERRORS);
554                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
555                                 IPSTATS_MIB_FRAGFAILS);
556                 kfree_skb(skb);
557                 return -EMSGSIZE;
558         }
559
560         if (skb_cow(skb, dst->dev->hard_header_len)) {
561                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
562                                 IPSTATS_MIB_OUTDISCARDS);
563                 goto drop;
564         }
565
566         hdr = ipv6_hdr(skb);
567
568         /* Mangling hops number delayed to point after skb COW */
569
570         hdr->hop_limit--;
571
572         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
573         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
574         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
575                        net, NULL, skb, skb->dev, dst->dev,
576                        ip6_forward_finish);
577
578 error:
579         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
580 drop:
581         kfree_skb(skb);
582         return -EINVAL;
583 }
584
585 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
586 {
587         to->pkt_type = from->pkt_type;
588         to->priority = from->priority;
589         to->protocol = from->protocol;
590         skb_dst_drop(to);
591         skb_dst_set(to, dst_clone(skb_dst(from)));
592         to->dev = from->dev;
593         to->mark = from->mark;
594
595 #ifdef CONFIG_NET_SCHED
596         to->tc_index = from->tc_index;
597 #endif
598         nf_copy(to, from);
599         skb_copy_secmark(to, from);
600 }
601
602 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
603                  int (*output)(struct net *, struct sock *, struct sk_buff *))
604 {
605         struct sk_buff *frag;
606         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
607         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
608                                 inet6_sk(skb->sk) : NULL;
609         struct ipv6hdr *tmp_hdr;
610         struct frag_hdr *fh;
611         unsigned int mtu, hlen, left, len;
612         int hroom, troom;
613         __be32 frag_id;
614         int ptr, offset = 0, err = 0;
615         u8 *prevhdr, nexthdr = 0;
616
617         err = ip6_find_1stfragopt(skb, &prevhdr);
618         if (err < 0)
619                 goto fail;
620         hlen = err;
621         nexthdr = *prevhdr;
622
623         mtu = ip6_skb_dst_mtu(skb);
624
625         /* We must not fragment if the socket is set to force MTU discovery
626          * or if the skb it not generated by a local socket.
627          */
628         if (unlikely(!skb->ignore_df && skb->len > mtu))
629                 goto fail_toobig;
630
631         if (IP6CB(skb)->frag_max_size) {
632                 if (IP6CB(skb)->frag_max_size > mtu)
633                         goto fail_toobig;
634
635                 /* don't send fragments larger than what we received */
636                 mtu = IP6CB(skb)->frag_max_size;
637                 if (mtu < IPV6_MIN_MTU)
638                         mtu = IPV6_MIN_MTU;
639         }
640
641         if (np && np->frag_size < mtu) {
642                 if (np->frag_size)
643                         mtu = np->frag_size;
644         }
645         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
646                 goto fail_toobig;
647         mtu -= hlen + sizeof(struct frag_hdr);
648
649         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
650                                     &ipv6_hdr(skb)->saddr);
651
652         if (skb->ip_summed == CHECKSUM_PARTIAL &&
653             (err = skb_checksum_help(skb)))
654                 goto fail;
655
656         hroom = LL_RESERVED_SPACE(rt->dst.dev);
657         if (skb_has_frag_list(skb)) {
658                 unsigned int first_len = skb_pagelen(skb);
659                 struct sk_buff *frag2;
660
661                 if (first_len - hlen > mtu ||
662                     ((first_len - hlen) & 7) ||
663                     skb_cloned(skb) ||
664                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
665                         goto slow_path;
666
667                 skb_walk_frags(skb, frag) {
668                         /* Correct geometry. */
669                         if (frag->len > mtu ||
670                             ((frag->len & 7) && frag->next) ||
671                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
672                                 goto slow_path_clean;
673
674                         /* Partially cloned skb? */
675                         if (skb_shared(frag))
676                                 goto slow_path_clean;
677
678                         BUG_ON(frag->sk);
679                         if (skb->sk) {
680                                 frag->sk = skb->sk;
681                                 frag->destructor = sock_wfree;
682                         }
683                         skb->truesize -= frag->truesize;
684                 }
685
686                 err = 0;
687                 offset = 0;
688                 /* BUILD HEADER */
689
690                 *prevhdr = NEXTHDR_FRAGMENT;
691                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692                 if (!tmp_hdr) {
693                         err = -ENOMEM;
694                         goto fail;
695                 }
696                 frag = skb_shinfo(skb)->frag_list;
697                 skb_frag_list_init(skb);
698
699                 __skb_pull(skb, hlen);
700                 fh = __skb_push(skb, sizeof(struct frag_hdr));
701                 __skb_push(skb, hlen);
702                 skb_reset_network_header(skb);
703                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
704
705                 fh->nexthdr = nexthdr;
706                 fh->reserved = 0;
707                 fh->frag_off = htons(IP6_MF);
708                 fh->identification = frag_id;
709
710                 first_len = skb_pagelen(skb);
711                 skb->data_len = first_len - skb_headlen(skb);
712                 skb->len = first_len;
713                 ipv6_hdr(skb)->payload_len = htons(first_len -
714                                                    sizeof(struct ipv6hdr));
715
716                 for (;;) {
717                         /* Prepare header of the next frame,
718                          * before previous one went down. */
719                         if (frag) {
720                                 frag->ip_summed = CHECKSUM_NONE;
721                                 skb_reset_transport_header(frag);
722                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
723                                 __skb_push(frag, hlen);
724                                 skb_reset_network_header(frag);
725                                 memcpy(skb_network_header(frag), tmp_hdr,
726                                        hlen);
727                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
728                                 fh->nexthdr = nexthdr;
729                                 fh->reserved = 0;
730                                 fh->frag_off = htons(offset);
731                                 if (frag->next)
732                                         fh->frag_off |= htons(IP6_MF);
733                                 fh->identification = frag_id;
734                                 ipv6_hdr(frag)->payload_len =
735                                                 htons(frag->len -
736                                                       sizeof(struct ipv6hdr));
737                                 ip6_copy_metadata(frag, skb);
738                         }
739
740                         err = output(net, sk, skb);
741                         if (!err)
742                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743                                               IPSTATS_MIB_FRAGCREATES);
744
745                         if (err || !frag)
746                                 break;
747
748                         skb = frag;
749                         frag = skb->next;
750                         skb->next = NULL;
751                 }
752
753                 kfree(tmp_hdr);
754
755                 if (err == 0) {
756                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757                                       IPSTATS_MIB_FRAGOKS);
758                         return 0;
759                 }
760
761                 kfree_skb_list(frag);
762
763                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764                               IPSTATS_MIB_FRAGFAILS);
765                 return err;
766
767 slow_path_clean:
768                 skb_walk_frags(skb, frag2) {
769                         if (frag2 == frag)
770                                 break;
771                         frag2->sk = NULL;
772                         frag2->destructor = NULL;
773                         skb->truesize += frag2->truesize;
774                 }
775         }
776
777 slow_path:
778         left = skb->len - hlen;         /* Space per frame */
779         ptr = hlen;                     /* Where to start from */
780
781         /*
782          *      Fragment the datagram.
783          */
784
785         troom = rt->dst.dev->needed_tailroom;
786
787         /*
788          *      Keep copying data until we run out.
789          */
790         while (left > 0)        {
791                 u8 *fragnexthdr_offset;
792
793                 len = left;
794                 /* IF: it doesn't fit, use 'mtu' - the data space left */
795                 if (len > mtu)
796                         len = mtu;
797                 /* IF: we are not sending up to and including the packet end
798                    then align the next start on an eight byte boundary */
799                 if (len < left) {
800                         len &= ~7;
801                 }
802
803                 /* Allocate buffer */
804                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
805                                  hroom + troom, GFP_ATOMIC);
806                 if (!frag) {
807                         err = -ENOMEM;
808                         goto fail;
809                 }
810
811                 /*
812                  *      Set up data on packet
813                  */
814
815                 ip6_copy_metadata(frag, skb);
816                 skb_reserve(frag, hroom);
817                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
818                 skb_reset_network_header(frag);
819                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
820                 frag->transport_header = (frag->network_header + hlen +
821                                           sizeof(struct frag_hdr));
822
823                 /*
824                  *      Charge the memory for the fragment to any owner
825                  *      it might possess
826                  */
827                 if (skb->sk)
828                         skb_set_owner_w(frag, skb->sk);
829
830                 /*
831                  *      Copy the packet header into the new buffer.
832                  */
833                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
834
835                 fragnexthdr_offset = skb_network_header(frag);
836                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
837                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
838
839                 /*
840                  *      Build fragment header.
841                  */
842                 fh->nexthdr = nexthdr;
843                 fh->reserved = 0;
844                 fh->identification = frag_id;
845
846                 /*
847                  *      Copy a block of the IP datagram.
848                  */
849                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
850                                      len));
851                 left -= len;
852
853                 fh->frag_off = htons(offset);
854                 if (left > 0)
855                         fh->frag_off |= htons(IP6_MF);
856                 ipv6_hdr(frag)->payload_len = htons(frag->len -
857                                                     sizeof(struct ipv6hdr));
858
859                 ptr += len;
860                 offset += len;
861
862                 /*
863                  *      Put this fragment into the sending queue.
864                  */
865                 err = output(net, sk, frag);
866                 if (err)
867                         goto fail;
868
869                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
870                               IPSTATS_MIB_FRAGCREATES);
871         }
872         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873                       IPSTATS_MIB_FRAGOKS);
874         consume_skb(skb);
875         return err;
876
877 fail_toobig:
878         if (skb->sk && dst_allfrag(skb_dst(skb)))
879                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
880
881         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
882         err = -EMSGSIZE;
883
884 fail:
885         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886                       IPSTATS_MIB_FRAGFAILS);
887         kfree_skb(skb);
888         return err;
889 }
890
891 static inline int ip6_rt_check(const struct rt6key *rt_key,
892                                const struct in6_addr *fl_addr,
893                                const struct in6_addr *addr_cache)
894 {
895         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
897 }
898
899 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
900                                           struct dst_entry *dst,
901                                           const struct flowi6 *fl6)
902 {
903         struct ipv6_pinfo *np = inet6_sk(sk);
904         struct rt6_info *rt;
905
906         if (!dst)
907                 goto out;
908
909         if (dst->ops->family != AF_INET6) {
910                 dst_release(dst);
911                 return NULL;
912         }
913
914         rt = (struct rt6_info *)dst;
915         /* Yes, checking route validity in not connected
916          * case is not very simple. Take into account,
917          * that we do not support routing by source, TOS,
918          * and MSG_DONTROUTE            --ANK (980726)
919          *
920          * 1. ip6_rt_check(): If route was host route,
921          *    check that cached destination is current.
922          *    If it is network route, we still may
923          *    check its validity using saved pointer
924          *    to the last used address: daddr_cache.
925          *    We do not want to save whole address now,
926          *    (because main consumer of this service
927          *    is tcp, which has not this problem),
928          *    so that the last trick works only on connected
929          *    sockets.
930          * 2. oif also should be the same.
931          */
932         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif
936            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
937               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
938                 dst_release(dst);
939                 dst = NULL;
940         }
941
942 out:
943         return dst;
944 }
945
/*
 * Shared worker behind ip6_dst_lookup() and ip6_dst_lookup_flow().
 *
 * @net: struct net handed to the route lookups and the SNMP counter
 * @sk:  socket the lookup is done for; may be NULL (it is tested before
 *       its source address preferences are read)
 * @dst: in/out.  A non-NULL *@dst on entry is used instead of doing a
 *       fresh lookup; on success *@dst is the route to use.
 * @fl6: flow to look up.  If its saddr is the unspecified address,
 *       ip6_route_get_saddr() is called with &fl6->saddr to pick one.
 *
 * Returns 0 on success.  On failure returns a negative errno after
 * releasing *@dst and setting it to NULL; -ENETUNREACH additionally bumps
 * IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                /* Block-local; shadows the function-level rt that exists
                 * under CONFIG_IPV6_OPTIMISTIC_DAD.
                 */
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                /* An error dst is not passed on as a route: use NULL. */
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                /* Only set on this path, i.e. when saddr was unspecified. */
                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        /* No usable route yet: look one up (saddr may now be filled in). */
        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is not in a NUD_VALID
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        /* err is only a local "neighbour not valid" marker here; unless
         * the redirected lookup below fails, the function still returns 0.
         */
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead: repeat the lookup on
                         * a copy of the flow with daddr zeroed.
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        /* A v4-mapped source is only accepted together with a v4-mapped
         * or unspecified destination.
         */
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        /* Error exit: the caller never sees a dst it would have to put. */
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
1057
/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: struct net the lookup is carried out in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *      Whatever *@dst held on entry is overwritten with NULL before the
 *      lookup, so no previously obtained route is reused.
 *
 *      It returns zero on success, or a negative errno code on error, in
 *      which case *@dst is NULL.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1075
1076 /**
1077  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1078  *      @sk: socket which provides route info
1079  *      @fl6: flow to lookup
1080  *      @final_dst: final destination address for ipsec lookup
1081  *
1082  *      This function performs a route lookup on the given flow.
1083  *
1084  *      It returns a valid dst pointer on success, or a pointer encoded
1085  *      error code.
1086  */
1087 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1088                                       const struct in6_addr *final_dst)
1089 {
1090         struct dst_entry *dst = NULL;
1091         int err;
1092
1093         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1094         if (err)
1095                 return ERR_PTR(err);
1096         if (final_dst)
1097                 fl6->daddr = *final_dst;
1098
1099         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1100 }
1101 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1102
1103 /**
1104  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1105  *      @sk: socket which provides the dst cache and route info
1106  *      @fl6: flow to lookup
1107  *      @final_dst: final destination address for ipsec lookup
1108  *
1109  *      This function performs a route lookup on the given flow with the
1110  *      possibility of using the cached route in the socket if it is valid.
1111  *      It will take the socket dst lock when operating on the dst cache.
1112  *      As a result, this function can only be used in process context.
1113  *
1114  *      It returns a valid dst pointer on success, or a pointer encoded
1115  *      error code.
1116  */
1117 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1118                                          const struct in6_addr *final_dst)
1119 {
1120         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1121
1122         dst = ip6_sk_dst_check(sk, dst, fl6);
1123         if (!dst)
1124                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1125
1126         return dst;
1127 }
1128 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1129
1130 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131                                                gfp_t gfp)
1132 {
1133         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134 }
1135
1136 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137                                                 gfp_t gfp)
1138 {
1139         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static void ip6_append_data_mtu(unsigned int *mtu,
1143                                 int *maxfraglen,
1144                                 unsigned int fragheaderlen,
1145                                 struct sk_buff *skb,
1146                                 struct rt6_info *rt,
1147                                 unsigned int orig_mtu)
1148 {
1149         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150                 if (!skb) {
1151                         /* first fragment, reserve header_len */
1152                         *mtu = orig_mtu - rt->dst.header_len;
1153
1154                 } else {
1155                         /*
1156                          * this fragment is not first, the headers
1157                          * space is regarded as data space.
1158                          */
1159                         *mtu = orig_mtu;
1160                 }
1161                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162                               + fragheaderlen - sizeof(struct frag_hdr);
1163         }
1164 }
1165
1166 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1167                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1168                           struct rt6_info *rt, struct flowi6 *fl6)
1169 {
1170         struct ipv6_pinfo *np = inet6_sk(sk);
1171         unsigned int mtu;
1172         struct ipv6_txoptions *opt = ipc6->opt;
1173
1174         /*
1175          * setup for corking
1176          */
1177         if (opt) {
1178                 if (WARN_ON(v6_cork->opt))
1179                         return -EINVAL;
1180
1181                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1182                 if (unlikely(!v6_cork->opt))
1183                         return -ENOBUFS;
1184
1185                 v6_cork->opt->tot_len = sizeof(*opt);
1186                 v6_cork->opt->opt_flen = opt->opt_flen;
1187                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1188
1189                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1190                                                     sk->sk_allocation);
1191                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1192                         return -ENOBUFS;
1193
1194                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1195                                                     sk->sk_allocation);
1196                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1197                         return -ENOBUFS;
1198
1199                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1200                                                    sk->sk_allocation);
1201                 if (opt->hopopt && !v6_cork->opt->hopopt)
1202                         return -ENOBUFS;
1203
1204                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1205                                                     sk->sk_allocation);
1206                 if (opt->srcrt && !v6_cork->opt->srcrt)
1207                         return -ENOBUFS;
1208
1209                 /* need source address above miyazawa*/
1210         }
1211         dst_hold(&rt->dst);
1212         cork->base.dst = &rt->dst;
1213         cork->fl.u.ip6 = *fl6;
1214         v6_cork->hop_limit = ipc6->hlimit;
1215         v6_cork->tclass = ipc6->tclass;
1216         if (rt->dst.flags & DST_XFRM_TUNNEL)
1217                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1218                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1219         else
1220                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1221                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1222         if (np->frag_size < mtu) {
1223                 if (np->frag_size)
1224                         mtu = np->frag_size;
1225         }
1226         if (mtu < IPV6_MIN_MTU)
1227                 return -EINVAL;
1228         cork->base.fragsize = mtu;
1229         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1230                 cork->base.flags |= IPCORK_ALLFRAG;
1231         cork->base.length = 0;
1232
1233         return 0;
1234 }
1235
/*
 * __ip6_append_data - append data to a queue of pending output skbs
 *
 * @sk:          sending socket (protocol, type, allocation mode, send
 *               buffer accounting and timestamp key are read from it)
 * @fl6:         flow; used in this function only when reporting
 *               "message too big" via ipv6_local_rxpmtu()/ipv6_local_error()
 * @queue:       queue of pending skbs; data goes into its tail skb and/or
 *               into newly allocated skbs queued at the tail
 * @cork:        cork state: supplies the route (dst), fragsize and flags;
 *               its length is increased by @length
 * @v6_cork:     IPv6 cork state; supplies the tx options
 * @pfrag:       page fragment used for the data when the device has
 *               NETIF_F_SG
 * @getfrag:     callback that copies @len bytes starting at @offset of
 *               @from to @to; a negative return is treated as failure
 * @from:        opaque cookie passed to @getfrag
 * @length:      number of bytes to append; this includes @transhdrlen
 *               bytes for which room is made but @getfrag is not called
 * @transhdrlen: bytes left for the transport header at the start of the
 *               first newly allocated skb; zeroed after that skb
 * @flags:       MSG_* flags (MSG_MORE, MSG_DONTWAIT and MSG_CONFIRM are
 *               consulted)
 * @ipc6:        only its dontfrag field is read here
 * @sockc:       only its tsflags field is read here
 *
 * Returns 0 on success or a negative errno: -EMSGSIZE when the data does
 * not fit, -EINVAL, -ENOBUFS, -ENOMEM, -EFAULT when @getfrag fails, or
 * whatever sock_alloc_send_skb() reported.  On the "error" exit the not
 * yet appended @length is subtracted from cork->length again and
 * IPSTATS_MIB_OUTDISCARDS is incremented; the early -EMSGSIZE return does
 * neither.
 */
static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6,
                             const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;

        /*
         * Empty queue: the first skb also has to account for the
         * fragmentable extension headers and the dst's extra header room.
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        mtu = cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        /* Headers repeated in every fragment, and the largest skb length
         * whose part after them is a multiple of 8 with a fragment header
         * still fitting into the mtu.
         */
        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        /* dontfrag on UDP/RAW sockets: anything beyond one mtu is refused
         * and the usable size is reported to the socket.
         */
        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                ipv6_local_error(sk, EMSGSIZE, fl6,
                                 mtu - headersize +
                                 sizeof(struct ipv6hdr));
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length <= mtu - headersize &&
            !(flags & MSG_MORE) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                        tskey = sk->sk_tskey++;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        cork->length += length;
        /* Nothing queued yet: jump straight into the allocation branch. */
        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
alloc_new_skb:
                        /* There's no room in the current skb */
                        /* fraggap: bytes of the current skb lying beyond
                         * maxfraglen; they are moved to the new skb below.
                         */
                        if (skb)
                                fraggap = skb->len - maxfraglen;
                        else
                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (!skb || !skb_prev)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt,
                                                    orig_mtu);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        /* MSG_MORE without scatter-gather: allocate a full
                         * mtu so later appends can go into the linear area.
                         */
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        alloclen += dst_exthdrlen;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        alloclen += rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        /* Bytes to fetch from the caller for this skb. */
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                goto error;
                        }
                        /* The skb carrying the transport header is
                         * allocated with sock_alloc_send_skb(); later ones
                         * use sock_wmalloc(), and only while sk_wmem_alloc
                         * is at most twice sk_sndbuf.
                         */
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (refcount_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /* Only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = tx_flags;
                        tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        /* Move the overhanging tail of the previous skb
                         * into this one, fix up both checksums and trim
                         * the previous skb back to maxfraglen.
                         */
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        /* The new skb is not queued yet, so it is freed
                         * here if the copy fails.
                         */
                        if (copy > 0 &&
                            getfrag(from, data + transhdrlen, offset,
                                    copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        /* These apply to the first new skb only. */
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        if ((flags & MSG_CONFIRM) && !skb_prev)
                                skb_set_dst_pending_confirm(skb, 1);

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                /* There is room in the tail skb: fill at most "copy" bytes. */
                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        /* No scatter-gather: extend the linear area and
                         * undo the extension if the copy fails.
                         */
                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;

                        /* Scatter-gather: put the data into the page
                         * fragment and attach it to the skb as a frag.
                         */
                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        /* Start a new frag slot unless the data can be
                         * merged into the last one.
                         */
                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        /* Account the added bytes in the frag, the skb and
                         * the socket's write memory.
                         */
                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        refcount_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error_efault:
        err = -EFAULT;
error:
        /* Give back the part of "length" that was never appended. */
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        return err;
}
1538
1539 int ip6_append_data(struct sock *sk,
1540                     int getfrag(void *from, char *to, int offset, int len,
1541                                 int odd, struct sk_buff *skb),
1542                     void *from, int length, int transhdrlen,
1543                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1544                     struct rt6_info *rt, unsigned int flags,
1545                     const struct sockcm_cookie *sockc)
1546 {
1547         struct inet_sock *inet = inet_sk(sk);
1548         struct ipv6_pinfo *np = inet6_sk(sk);
1549         int exthdrlen;
1550         int err;
1551
1552         if (flags&MSG_PROBE)
1553                 return 0;
1554         if (skb_queue_empty(&sk->sk_write_queue)) {
1555                 /*
1556                  * setup for corking
1557                  */
1558                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1559                                      ipc6, rt, fl6);
1560                 if (err)
1561                         return err;
1562
1563                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1564                 length += exthdrlen;
1565                 transhdrlen += exthdrlen;
1566         } else {
1567                 fl6 = &inet->cork.fl.u.ip6;
1568                 transhdrlen = 0;
1569         }
1570
1571         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1572                                  &np->cork, sk_page_frag(sk), getfrag,
1573                                  from, length, transhdrlen, flags, ipc6, sockc);
1574 }
1575 EXPORT_SYMBOL_GPL(ip6_append_data);
1576
1577 static void ip6_cork_release(struct inet_cork_full *cork,
1578                              struct inet6_cork *v6_cork)
1579 {
1580         if (v6_cork->opt) {
1581                 kfree(v6_cork->opt->dst0opt);
1582                 kfree(v6_cork->opt->dst1opt);
1583                 kfree(v6_cork->opt->hopopt);
1584                 kfree(v6_cork->opt->srcrt);
1585                 kfree(v6_cork->opt);
1586                 v6_cork->opt = NULL;
1587         }
1588
1589         if (cork->base.dst) {
1590                 dst_release(cork->base.dst);
1591                 cork->base.dst = NULL;
1592                 cork->base.flags &= ~IPCORK_ALLFRAG;
1593         }
1594         memset(&cork->fl, 0, sizeof(cork->fl));
1595 }
1596
1597 struct sk_buff *__ip6_make_skb(struct sock *sk,
1598                                struct sk_buff_head *queue,
1599                                struct inet_cork_full *cork,
1600                                struct inet6_cork *v6_cork)
1601 {
1602         struct sk_buff *skb, *tmp_skb;
1603         struct sk_buff **tail_skb;
1604         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1605         struct ipv6_pinfo *np = inet6_sk(sk);
1606         struct net *net = sock_net(sk);
1607         struct ipv6hdr *hdr;
1608         struct ipv6_txoptions *opt = v6_cork->opt;
1609         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1610         struct flowi6 *fl6 = &cork->fl.u.ip6;
1611         unsigned char proto = fl6->flowi6_proto;
1612
1613         skb = __skb_dequeue(queue);
1614         if (!skb)
1615                 goto out;
1616         tail_skb = &(skb_shinfo(skb)->frag_list);
1617
1618         /* move skb->data to ip header from ext header */
1619         if (skb->data < skb_network_header(skb))
1620                 __skb_pull(skb, skb_network_offset(skb));
1621         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1622                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1623                 *tail_skb = tmp_skb;
1624                 tail_skb = &(tmp_skb->next);
1625                 skb->len += tmp_skb->len;
1626                 skb->data_len += tmp_skb->len;
1627                 skb->truesize += tmp_skb->truesize;
1628                 tmp_skb->destructor = NULL;
1629                 tmp_skb->sk = NULL;
1630         }
1631
1632         /* Allow local fragmentation. */
1633         skb->ignore_df = ip6_sk_ignore_df(sk);
1634
1635         *final_dst = fl6->daddr;
1636         __skb_pull(skb, skb_network_header_len(skb));
1637         if (opt && opt->opt_flen)
1638                 ipv6_push_frag_opts(skb, opt, &proto);
1639         if (opt && opt->opt_nflen)
1640                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1641
1642         skb_push(skb, sizeof(struct ipv6hdr));
1643         skb_reset_network_header(skb);
1644         hdr = ipv6_hdr(skb);
1645
1646         ip6_flow_hdr(hdr, v6_cork->tclass,
1647                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1648                                         ip6_autoflowlabel(net, np), fl6));
1649         hdr->hop_limit = v6_cork->hop_limit;
1650         hdr->nexthdr = proto;
1651         hdr->saddr = fl6->saddr;
1652         hdr->daddr = *final_dst;
1653
1654         skb->priority = sk->sk_priority;
1655         skb->mark = sk->sk_mark;
1656
1657         skb_dst_set(skb, dst_clone(&rt->dst));
1658         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1659         if (proto == IPPROTO_ICMPV6) {
1660                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1661
1662                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1663                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1664         }
1665
1666         ip6_cork_release(cork, v6_cork);
1667 out:
1668         return skb;
1669 }
1670
1671 int ip6_send_skb(struct sk_buff *skb)
1672 {
1673         struct net *net = sock_net(skb->sk);
1674         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1675         int err;
1676
1677         err = ip6_local_out(net, skb->sk, skb);
1678         if (err) {
1679                 if (err > 0)
1680                         err = net_xmit_errno(err);
1681                 if (err)
1682                         IP6_INC_STATS(net, rt->rt6i_idev,
1683                                       IPSTATS_MIB_OUTDISCARDS);
1684         }
1685
1686         return err;
1687 }
1688
/*
 * Finish the datagram pending on @sk with ip6_finish_skb() and send it.
 *
 * Returns 0 when ip6_finish_skb() produces no skb, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb = ip6_finish_skb(sk);

        return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1700
1701 static void __ip6_flush_pending_frames(struct sock *sk,
1702                                        struct sk_buff_head *queue,
1703                                        struct inet_cork_full *cork,
1704                                        struct inet6_cork *v6_cork)
1705 {
1706         struct sk_buff *skb;
1707
1708         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1709                 if (skb_dst(skb))
1710                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1711                                       IPSTATS_MIB_OUTDISCARDS);
1712                 kfree_skb(skb);
1713         }
1714
1715         ip6_cork_release(cork, v6_cork);
1716 }
1717
1718 void ip6_flush_pending_frames(struct sock *sk)
1719 {
1720         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1721                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1722 }
1723 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1724
1725 struct sk_buff *ip6_make_skb(struct sock *sk,
1726                              int getfrag(void *from, char *to, int offset,
1727                                          int len, int odd, struct sk_buff *skb),
1728                              void *from, int length, int transhdrlen,
1729                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1730                              struct rt6_info *rt, unsigned int flags,
1731                              const struct sockcm_cookie *sockc)
1732 {
1733         struct inet_cork_full cork;
1734         struct inet6_cork v6_cork;
1735         struct sk_buff_head queue;
1736         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1737         int err;
1738
1739         if (flags & MSG_PROBE)
1740                 return NULL;
1741
1742         __skb_queue_head_init(&queue);
1743
1744         cork.base.flags = 0;
1745         cork.base.addr = 0;
1746         cork.base.opt = NULL;
1747         cork.base.dst = NULL;
1748         v6_cork.opt = NULL;
1749         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1750         if (err) {
1751                 ip6_cork_release(&cork, &v6_cork);
1752                 return ERR_PTR(err);
1753         }
1754         if (ipc6->dontfrag < 0)
1755                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1756
1757         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1758                                 &current->task_frag, getfrag, from,
1759                                 length + exthdrlen, transhdrlen + exthdrlen,
1760                                 flags, ipc6, sockc);
1761         if (err) {
1762                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1763                 return ERR_PTR(err);
1764         }
1765
1766         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1767 }