udpv6: Fix gso_size setting in ip6_ufo_append_data
[linux-2.6-block.git] / net/ipv6/ip6_output.c

/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
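
/* __ip6_local_out() fixes up the IPv6 payload length (set to zero when it
 * would exceed IPV6_MAXPLEN, the convention jumbograms use) and runs the
 * NF_INET_LOCAL_OUT netfilter hook; ip6_local_out() then hands the packet
 * to dst_output() when the hook verdict (1) lets it through.
 */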
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;

}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx(newskb);
	return 0;
}

static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

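/* Path-MTU helper: sockets with IPV6_PMTUDISC_PROBE set size packets
 * against the outgoing device MTU rather than the (possibly smaller)
 * cached route MTU.
 */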
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

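/* ip6_output() discards packets when IPv6 is administratively disabled on
 * the egress device; otherwise it fragments when the packet exceeds the
 * path MTU (and is not GSO), or when the route requires fragmentation on
 * every packet (dst_allfrag), and sends directly in all other cases.
 */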
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

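/* Deliver a packet carrying a Router Alert option to every raw socket that
 * registered for that alert value (via the IPV6_ROUTER_ALERT sockopt); each
 * listener but the last gets its own clone. Returns 1 when the packet was
 * consumed.
 */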
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

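/* Decide what to do with a packet that arrived for an address we proxy
 * (proxy_ndp): 1 means hand unicast neighbour-discovery ICMPv6 to the
 * local input path, 0 means forward as usual, and -1 means the packet
 * (destined to a link-local address) must be dropped.
 */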
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass unicast neighbour discovery messages that
			 * are destined to the proxied address to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

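/* Locate the end of the unfragmentable part of the packet (Hop-by-Hop
 * options, Destination options preceding a Routing header, the Routing
 * header itself, and HAO-carrying Destination options): this is where the
 * Fragment header must be inserted. Returns that offset and points
 * *nexthdr at the preceding nexthdr field.
 */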
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

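/* Two strategies for emitting fragments: a fast path that reuses the skbs
 * already chained on frag_list when their geometry fits (every fragment a
 * multiple of 8 bytes except the last, headroom for the headers, nothing
 * cloned or shared), and a slow path that allocates a fresh skb per
 * fragment and copies the payload into it.
 */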
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket. (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

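	/* Slow path: the original skb stays intact while each fragment is
	 * built in a freshly allocated skb and the payload is copied in
	 * with skb_copy_bits().
	 */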
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

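/* UDP fragmentation offload: instead of fragmenting in software, queue one
 * large skb and let the NIC segment it. gso_size is the per-fragment
 * payload and must be a multiple of 8 (frag_hdr offsets are in 8-byte
 * units); e.g. with a 1500-byte MTU and no extension headers,
 * fragheaderlen = 40 and gso_size = (1500 - 40 - 8) & ~7 = 1448.
 */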
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

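/* Append data to the pending (corked) queue for this socket. The first
 * call sets up the cork: it pins the route, duplicates the tx options,
 * and records hop limit, traffic class, and fragment size; later calls
 * reuse that state until the frames are pushed or flushed.
 */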
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}
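
	/* Non-UFO path: keep filling the tail skb until it reaches
	 * maxfraglen, then allocate the next fragment-sized skb, moving any
	 * bytes that spill past the 8-byte-aligned boundary (fraggap) from
	 * the previous skb into the new one.
	 */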
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

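/* Collapse the corked queue into a single skb (the trailing skbs become
 * its frag_list), prepend the extension headers and the IPv6 header, and
 * hand the result to ip6_local_out(); the cork is released either way.
 */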
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}