/* net/ipv6/ip6_output.c (linux-block.git) */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

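/* Final step of the transmit path for a single, already-built packet:
 * resolve (or create) the neighbour entry for the route's next hop and
 * hand the skb to it.  Multicast destinations are additionally looped
 * back to local listeners and accounted under OUTMCAST here.
 */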
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

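/* Post-routing stage: ip6_output() runs the NF_INET_POST_ROUTING hook
 * (unless the packet was rerouted) and ip6_finish_output() then either
 * fragments the packet or passes it straight to ip6_finish_output2().
 */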
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

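/* Deliver Router Alert packets to each raw socket that has registered an
 * interest in this alert value (ra->sel).  The last matching socket gets
 * the original skb, earlier ones get clones.  Returns 1 if any socket
 * took the packet, 0 otherwise.
 */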
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

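/* Helpers for the forwarding path below: pick the MTU to enforce on the
 * outgoing route and decide whether a packet (including GSO packets,
 * judged by their segment size) is too big to forward without
 * fragmentation.
 */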
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu || skb->local_df)
		return false;

	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

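/* Forwarding entry point: validates hop limit, router-alert and proxy-NDP
 * cases, sends redirects when a packet leaves via the interface it arrived
 * on, enforces the path MTU (ICMPV6_PKT_TOOBIG), and finally decrements
 * hop_limit before handing the packet to the NF_INET_FORWARD hook.
 */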
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

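/* Copy the per-packet metadata (type, priority, dst, device, marks,
 * conntrack and security state) from the original skb to a freshly
 * allocated fragment.
 */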
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

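/* Fragment an IPv6 packet that is too large for the path MTU.  The fast
 * path reuses an existing frag_list when its geometry already fits the
 * MTU; otherwise the slow path below allocates and copies each fragment
 * individually.
 */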
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

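/* Validation of a socket's cached dst entry: ip6_rt_check() tests whether
 * a cached route still matches the flow's address, and ip6_sk_dst_check()
 * drops the cached dst if the destination, source (with subtrees) or
 * outgoing interface no longer match.
 */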
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

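/* Common tail of the dst lookup helpers: do the route lookup if needed,
 * pick a source address when the flow has none, and (with optimistic DAD)
 * fall back to the default router's dst when the next hop neighbour is
 * not yet valid.
 */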
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

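/* UDP fragmentation offload: build (or extend) a single large skb on the
 * socket write queue, mark it SKB_GSO_UDP with a pre-selected fragment id,
 * and let the device or the GSO layer split it into fragments later.
 */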
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	struct frag_hdr fhdr;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(&sk->sk_write_queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	ipv6_select_ident(&fhdr, rt);
	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

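/* Small helpers for corked sends: duplicate the extension header blocks
 * that have to survive across ip6_append_data() calls, and recompute the
 * usable mtu/maxfraglen once the first fragment has been queued.
 */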
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				bool pmtuprobe)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = min(*mtu, pmtuprobe ?
				   rt->dst.dev->mtu :
				   dst_mtu(rt->dst.path));
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

1127
41a1f8ea
YH
1128int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1129 int offset, int len, int odd, struct sk_buff *skb),
1130 void *from, int length, int transhdrlen,
4c9483b2 1131 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
13b52cd4 1132 struct rt6_info *rt, unsigned int flags, int dontfrag)
1da177e4
LT
1133{
1134 struct inet_sock *inet = inet_sk(sk);
1135 struct ipv6_pinfo *np = inet6_sk(sk);
bdc712b4 1136 struct inet_cork *cork;
0c183379 1137 struct sk_buff *skb, *skb_prev = NULL;
75a493e6 1138 unsigned int maxfraglen, fragheaderlen, mtu;
1da177e4 1139 int exthdrlen;
299b0767 1140 int dst_exthdrlen;
1da177e4 1141 int hh_len;
1da177e4
LT
1142 int copy;
1143 int err;
1144 int offset = 0;
a693e698 1145 __u8 tx_flags = 0;
1da177e4
LT
1146
1147 if (flags&MSG_PROBE)
1148 return 0;
bdc712b4 1149 cork = &inet->cork.base;
1da177e4
LT
1150 if (skb_queue_empty(&sk->sk_write_queue)) {
1151 /*
1152 * setup for corking
1153 */
1154 if (opt) {
0178b695 1155 if (WARN_ON(np->cork.opt))
1da177e4 1156 return -EINVAL;
0178b695 1157
284041ef 1158 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
0178b695
HX
1159 if (unlikely(np->cork.opt == NULL))
1160 return -ENOBUFS;
1161
1162 np->cork.opt->tot_len = opt->tot_len;
1163 np->cork.opt->opt_flen = opt->opt_flen;
1164 np->cork.opt->opt_nflen = opt->opt_nflen;
1165
1166 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1167 sk->sk_allocation);
1168 if (opt->dst0opt && !np->cork.opt->dst0opt)
1169 return -ENOBUFS;
1170
1171 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1172 sk->sk_allocation);
1173 if (opt->dst1opt && !np->cork.opt->dst1opt)
1174 return -ENOBUFS;
1175
1176 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1177 sk->sk_allocation);
1178 if (opt->hopopt && !np->cork.opt->hopopt)
1179 return -ENOBUFS;
1180
1181 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1182 sk->sk_allocation);
1183 if (opt->srcrt && !np->cork.opt->srcrt)
1184 return -ENOBUFS;
1185
1da177e4
LT
1186 /* need source address above miyazawa*/
1187 }
d8d1f30b 1188 dst_hold(&rt->dst);
bdc712b4 1189 cork->dst = &rt->dst;
4c9483b2 1190 inet->cork.fl.u.ip6 = *fl6;
1da177e4 1191 np->cork.hop_limit = hlimit;
41a1f8ea 1192 np->cork.tclass = tclass;
0c183379 1193 if (rt->dst.flags & DST_XFRM_TUNNEL)
93b36cf3 1194 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
0c183379
G
1195 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1196 else
93b36cf3 1197 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
0c183379 1198 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
c7503609 1199 if (np->frag_size < mtu) {
d91675f9
YH
1200 if (np->frag_size)
1201 mtu = np->frag_size;
1202 }
bdc712b4 1203 cork->fragsize = mtu;
d8d1f30b 1204 if (dst_allfrag(rt->dst.path))
bdc712b4
DM
1205 cork->flags |= IPCORK_ALLFRAG;
1206 cork->length = 0;
7efdba5b 1207 exthdrlen = (opt ? opt->opt_flen : 0);
1da177e4
LT
1208 length += exthdrlen;
1209 transhdrlen += exthdrlen;
7efdba5b 1210 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1da177e4 1211 } else {
bdc712b4 1212 rt = (struct rt6_info *)cork->dst;
4c9483b2 1213 fl6 = &inet->cork.fl.u.ip6;
0178b695 1214 opt = np->cork.opt;
1da177e4
LT
1215 transhdrlen = 0;
1216 exthdrlen = 0;
299b0767 1217 dst_exthdrlen = 0;
bdc712b4 1218 mtu = cork->fragsize;
1da177e4
LT
1219 }
1220
d8d1f30b 1221 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1da177e4 1222
a1b05140 1223 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
b4ce9277 1224 (opt ? opt->opt_nflen : 0);
4df98e76
HFS
1225 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1226 sizeof(struct frag_hdr);
1da177e4
LT
1227
1228 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
4df98e76
HFS
1229 unsigned int maxnonfragsize, headersize;
1230
1231 headersize = sizeof(struct ipv6hdr) +
1232 (opt ? opt->tot_len : 0) +
1233 (dst_allfrag(&rt->dst) ?
1234 sizeof(struct frag_hdr) : 0) +
1235 rt->rt6i_nfheader_len;
1236
1237 maxnonfragsize = (np->pmtudisc >= IPV6_PMTUDISC_DO) ?
1238 mtu : sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1239
1240 /* dontfrag active */
1241 if ((cork->length + length > mtu - headersize) && dontfrag &&
1242 (sk->sk_protocol == IPPROTO_UDP ||
1243 sk->sk_protocol == IPPROTO_RAW)) {
1244 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1245 sizeof(struct ipv6hdr));
1246 goto emsgsize;
1247 }
1248
1249 if (cork->length + length > maxnonfragsize - headersize) {
1250emsgsize:
1251 ipv6_local_error(sk, EMSGSIZE, fl6,
1252 mtu - headersize +
1253 sizeof(struct ipv6hdr));
1da177e4
LT
1254 return -EMSGSIZE;
1255 }
1256 }
1257
a693e698 1258 /* For UDP, check if TX timestamp is enabled */
bf84a010
DB
1259 if (sk->sk_type == SOCK_DGRAM)
1260 sock_tx_timestamp(sk, &tx_flags);
a693e698 1261
1da177e4
LT
1262 /*
1263 * Let's try using as much space as possible.
1264 * Use MTU if total length of the message fits into the MTU.
1265 * Otherwise, we need to reserve fragment header and
1266 * fragment alignment (= 8-15 octects, in total).
1267 *
1268 * Note that we may need to "move" the data from the tail of
1ab1457c 1269 * of the buffer to the new fragment when we split
1da177e4
LT
1270 * the message.
1271 *
1ab1457c 1272 * FIXME: It may be fragmented into multiple chunks
1da177e4
LT
1273 * at once if non-fragmentable extension headers
1274 * are too large.
1ab1457c 1275 * --yoshfuji
1da177e4
LT
1276 */
1277
2811ebac
HFS
1278 skb = skb_peek_tail(&sk->sk_write_queue);
1279 cork->length += length;
1280 if (((length > mtu) ||
1281 (skb && skb_is_gso(skb))) &&
1282 (sk->sk_protocol == IPPROTO_UDP) &&
1283 (rt->dst.dev->features & NETIF_F_UFO)) {
1284 err = ip6_ufo_append_data(sk, getfrag, from, length,
1285 hh_len, fragheaderlen,
1286 transhdrlen, mtu, flags, rt);
1287 if (err)
1288 goto error;
1289 return 0;
e89e9cf5 1290 }
1da177e4 1291
2811ebac 1292 if (!skb)
1da177e4
LT
1293 goto alloc_new_skb;
1294
1295 while (length > 0) {
1296 /* Check if the remaining data fits into current packet. */
bdc712b4 1297 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1da177e4
LT
1298 if (copy < length)
1299 copy = maxfraglen - skb->len;
1300
1301 if (copy <= 0) {
1302 char *data;
1303 unsigned int datalen;
1304 unsigned int fraglen;
1305 unsigned int fraggap;
1306 unsigned int alloclen;
1da177e4 1307alloc_new_skb:
1da177e4 1308 /* There's no room in the current skb */
0c183379
G
1309 if (skb)
1310 fraggap = skb->len - maxfraglen;
1da177e4
LT
1311 else
1312 fraggap = 0;
0c183379
G
1313 /* update mtu and maxfraglen if necessary */
1314 if (skb == NULL || skb_prev == NULL)
1315 ip6_append_data_mtu(&mtu, &maxfraglen,
75a493e6 1316 fragheaderlen, skb, rt,
93b36cf3 1317 np->pmtudisc >=
75a493e6 1318 IPV6_PMTUDISC_PROBE);
0c183379
G
1319
1320 skb_prev = skb;
1da177e4
LT
1321
1322 /*
1323 * If remaining data exceeds the mtu,
1324 * we know we need more fragment(s).
1325 */
1326 datalen = length + fraggap;
1da177e4 1327
0c183379
G
1328 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1329 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1da177e4 1330 if ((flags & MSG_MORE) &&
d8d1f30b 1331 !(rt->dst.dev->features&NETIF_F_SG))
1da177e4
LT
1332 alloclen = mtu;
1333 else
1334 alloclen = datalen + fragheaderlen;
1335
299b0767
SK
1336 alloclen += dst_exthdrlen;
1337
0c183379
G
1338 if (datalen != length + fraggap) {
1339 /*
1340 * this is not the last fragment, the trailer
1341 * space is regarded as data space.
1342 */
1343 datalen += rt->dst.trailer_len;
1344 }
1345
1346 alloclen += rt->dst.trailer_len;
1347 fraglen = datalen + fragheaderlen;
1da177e4
LT
1348
1349 /*
1350 * We just reserve space for fragment header.
1ab1457c 1351 * Note: this may be overallocation if the message
1da177e4
LT
1352 * (without MSG_MORE) fits into the MTU.
1353 */
1354 alloclen += sizeof(struct frag_hdr);
1355
1356 if (transhdrlen) {
1357 skb = sock_alloc_send_skb(sk,
1358 alloclen + hh_len,
1359 (flags & MSG_DONTWAIT), &err);
1360 } else {
1361 skb = NULL;
1362 if (atomic_read(&sk->sk_wmem_alloc) <=
1363 2 * sk->sk_sndbuf)
1364 skb = sock_wmalloc(sk,
1365 alloclen + hh_len, 1,
1366 sk->sk_allocation);
1367 if (unlikely(skb == NULL))
1368 err = -ENOBUFS;
a693e698
AB
1369 else {
1370 /* Only the initial fragment
1371 * is time stamped.
1372 */
1373 tx_flags = 0;
1374 }
1da177e4
LT
1375 }
1376 if (skb == NULL)
1377 goto error;
1378 /*
1379 * Fill in the control structures
1380 */
9c9c9ad5 1381 skb->protocol = htons(ETH_P_IPV6);
d7f7c0ac 1382 skb->ip_summed = CHECKSUM_NONE;
1da177e4 1383 skb->csum = 0;
1f85851e
G
1384 /* reserve for fragmentation and ipsec header */
1385 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1386 dst_exthdrlen);
1da177e4 1387
a693e698
AB
1388 if (sk->sk_type == SOCK_DGRAM)
1389 skb_shinfo(skb)->tx_flags = tx_flags;
1390
1da177e4
LT
1391 /*
1392 * Find where to start putting bytes
1393 */
1f85851e
G
1394 data = skb_put(skb, fraglen);
1395 skb_set_network_header(skb, exthdrlen);
1396 data += fragheaderlen;
b0e380b1
ACM
1397 skb->transport_header = (skb->network_header +
1398 fragheaderlen);
1da177e4
LT
1399 if (fraggap) {
1400 skb->csum = skb_copy_and_csum_bits(
1401 skb_prev, maxfraglen,
1402 data + transhdrlen, fraggap, 0);
1403 skb_prev->csum = csum_sub(skb_prev->csum,
1404 skb->csum);
1405 data += fraggap;
e9fa4f7b 1406 pskb_trim_unique(skb_prev, maxfraglen);
1da177e4
LT
1407 }
1408 copy = datalen - transhdrlen - fraggap;
299b0767 1409
1da177e4
LT
1410 if (copy < 0) {
1411 err = -EINVAL;
1412 kfree_skb(skb);
1413 goto error;
1414 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1415 err = -EFAULT;
1416 kfree_skb(skb);
1417 goto error;
1418 }
1419
1420 offset += copy;
1421 length -= datalen - fraggap;
1422 transhdrlen = 0;
1423 exthdrlen = 0;
299b0767 1424 dst_exthdrlen = 0;
1da177e4
LT
1425
1426 /*
1427 * Put the packet on the pending queue
1428 */
1429 __skb_queue_tail(&sk->sk_write_queue, skb);
1430 continue;
1431 }
1432
1433 if (copy > length)
1434 copy = length;
1435
d8d1f30b 1436 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1da177e4
LT
1437 unsigned int off;
1438
1439 off = skb->len;
1440 if (getfrag(from, skb_put(skb, copy),
1441 offset, copy, off, skb) < 0) {
1442 __skb_trim(skb, off);
1443 err = -EFAULT;
1444 goto error;
1445 }
1446 } else {
1447 int i = skb_shinfo(skb)->nr_frags;
5640f768 1448 struct page_frag *pfrag = sk_page_frag(sk);
1da177e4 1449
5640f768
ED
1450 err = -ENOMEM;
1451 if (!sk_page_frag_refill(sk, pfrag))
1da177e4 1452 goto error;
5640f768
ED
1453
1454 if (!skb_can_coalesce(skb, i, pfrag->page,
1455 pfrag->offset)) {
1456 err = -EMSGSIZE;
1457 if (i == MAX_SKB_FRAGS)
1458 goto error;
1459
1460 __skb_fill_page_desc(skb, i, pfrag->page,
1461 pfrag->offset, 0);
1462 skb_shinfo(skb)->nr_frags = ++i;
1463 get_page(pfrag->page);
1da177e4 1464 }
5640f768 1465 copy = min_t(int, copy, pfrag->size - pfrag->offset);
9e903e08 1466 if (getfrag(from,
5640f768
ED
1467 page_address(pfrag->page) + pfrag->offset,
1468 offset, copy, skb->len, skb) < 0)
1469 goto error_efault;
1470
1471 pfrag->offset += copy;
1472 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1da177e4
LT
1473 skb->len += copy;
1474 skb->data_len += copy;
f945fa7a
HX
1475 skb->truesize += copy;
1476 atomic_add(copy, &sk->sk_wmem_alloc);
1da177e4
LT
1477 }
1478 offset += copy;
1479 length -= copy;
1480 }
5640f768 1481
1da177e4 1482 return 0;
5640f768
ED
1483
1484error_efault:
1485 err = -EFAULT;
1da177e4 1486error:
bdc712b4 1487 cork->length -= length;
3bd653c8 1488 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1489 return err;
1490}
a495f836 1491EXPORT_SYMBOL_GPL(ip6_append_data);
1da177e4 1492
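/* Transmission of a corked queue: ip6_cork_release() frees the duplicated
 * options and drops the cached route, ip6_push_pending_frames() glues the
 * queued skbs together, prepends the IPv6 header and pushes the result
 * out through ip6_local_out().
 */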
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

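/* Discard everything queued on the socket by ip6_append_data() without
 * sending it, then release the cork state.
 */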
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);