skbuff: introduce skb_expand_head()
[linux-block.git] / net / ipv6 / ip6_output.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * IPv6 output functions
1ab1457c 4 * Linux INET6 implementation
1da177e4
LT
5 *
6 * Authors:
1ab1457c 7 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 8 *
1da177e4
LT
9 * Based on linux/net/ipv4/ip_output.c
10 *
1da177e4
LT
11 * Changes:
12 * A.N.Kuznetsov : arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
67ba4152 19 * Imran Patel : frag id should be in NBO
1da177e4
LT
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
1da177e4 25#include <linux/errno.h>
ef76bc23 26#include <linux/kernel.h>
1da177e4
LT
27#include <linux/string.h>
28#include <linux/socket.h>
29#include <linux/net.h>
30#include <linux/netdevice.h>
31#include <linux/if_arp.h>
32#include <linux/in6.h>
33#include <linux/tcp.h>
34#include <linux/route.h>
b59f45d0 35#include <linux/module.h>
5a0e3ad6 36#include <linux/slab.h>
1da177e4 37
33b48679 38#include <linux/bpf-cgroup.h>
1da177e4
LT
39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv6.h>
41
42#include <net/sock.h>
43#include <net/snmp.h>
44
45#include <net/ipv6.h>
46#include <net/ndisc.h>
47#include <net/protocol.h>
48#include <net/ip6_route.h>
49#include <net/addrconf.h>
50#include <net/rawv6.h>
51#include <net/icmp.h>
52#include <net/xfrm.h>
53#include <net/checksum.h>
7bc570c8 54#include <linux/mroute6.h>
ca254490 55#include <net/l3mdev.h>
14972cbd 56#include <net/lwtunnel.h>
571912c6 57#include <net/ip_tunnels.h>
1da177e4 58
7d8c6e39 59static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 60{
adf30907 61 struct dst_entry *dst = skb_dst(skb);
1da177e4 62 struct net_device *dev = dst->dev;
5796015f
VA
63 unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 int delta = hh_len - skb_headroom(skb);
9b1c1ef1 65 const struct in6_addr *nexthop;
f6b72b62 66 struct neighbour *neigh;
6fd6ce20 67 int ret;
1da177e4 68
5796015f
VA
69 /* Be paranoid, rather than too clever. */
70 if (unlikely(delta > 0) && dev->header_ops) {
71 /* pskb_expand_head() might crash, if skb is shared */
72 if (skb_shared(skb)) {
73 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74
75 if (likely(nskb)) {
76 if (skb->sk)
2d85a1b3 77 skb_set_owner_w(nskb, skb->sk);
5796015f
VA
78 consume_skb(skb);
79 } else {
80 kfree_skb(skb);
81 }
82 skb = nskb;
83 }
84 if (skb &&
85 pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 kfree_skb(skb);
87 skb = NULL;
88 }
89 if (!skb) {
90 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91 return -ENOMEM;
92 }
93 }
94
0660e03f 95 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
adf30907 96 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1da177e4 97
7026b1dd 98 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
8571ab47 99 ((mroute6_is_socket(net, skb) &&
bd91b8bf 100 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
7bc570c8
YH
101 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 &ipv6_hdr(skb)->saddr))) {
1da177e4
LT
103 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104
105 /* Do not check for IFF_ALLMULTI; multicast routing
106 is not supported in any case.
107 */
108 if (newskb)
b2e0b385 109 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
29a26a56 110 net, sk, newskb, NULL, newskb->dev,
95603e22 111 dev_loopback_xmit);
1da177e4 112
0660e03f 113 if (ipv6_hdr(skb)->hop_limit == 0) {
78126c41 114 IP6_INC_STATS(net, idev,
3bd653c8 115 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
116 kfree_skb(skb);
117 return 0;
118 }
119 }
120
78126c41 121 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
dd408515
HFS
122
123 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124 IPV6_ADDR_SCOPE_NODELOCAL &&
125 !(dev->flags & IFF_LOOPBACK)) {
126 kfree_skb(skb);
127 return 0;
128 }
1da177e4
LT
129 }
130
14972cbd
RP
131 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132 int res = lwtunnel_xmit(skb);
133
134 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
135 return res;
136 }
137
6fd6ce20 138 rcu_read_lock_bh();
2647a9b0 139 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
6fd6ce20
YH
140 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141 if (unlikely(!neigh))
142 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143 if (!IS_ERR(neigh)) {
4ff06203 144 sock_confirm_neigh(skb, neigh);
0353f282 145 ret = neigh_output(neigh, skb, false);
6fd6ce20
YH
146 rcu_read_unlock_bh();
147 return ret;
148 }
149 rcu_read_unlock_bh();
05e3aa09 150
78126c41 151 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
9e508490
JE
152 kfree_skb(skb);
153 return -EINVAL;
1da177e4
LT
154}
155
b210de4f
AL
156static int
157ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158 struct sk_buff *skb, unsigned int mtu)
159{
160 struct sk_buff *segs, *nskb;
161 netdev_features_t features;
162 int ret = 0;
163
164 /* Please see corresponding comment in ip_finish_output_gso
165 * describing the cases where GSO segment length exceeds the
166 * egress MTU.
167 */
168 features = netif_skb_features(skb);
169 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170 if (IS_ERR_OR_NULL(segs)) {
171 kfree_skb(skb);
172 return -ENOMEM;
173 }
174
175 consume_skb(skb);
176
177 skb_list_walk_safe(segs, segs, nskb) {
178 int err;
179
180 skb_mark_not_on_list(segs);
181 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
182 if (err && ret == 0)
183 ret = err;
184 }
185
186 return ret;
187}
188
956fe219 189static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
9e508490 190{
b210de4f
AL
191 unsigned int mtu;
192
09ee9dba
TB
193#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
194 /* Policy lookup after SNAT yielded a new policy */
195 if (skb_dst(skb)->xfrm) {
196 IPCB(skb)->flags |= IPSKB_REROUTED;
197 return dst_output(net, sk, skb);
198 }
199#endif
200
b210de4f
AL
201 mtu = ip6_skb_dst_mtu(skb);
202 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
203 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
204
205 if ((skb->len > mtu && !skb_is_gso(skb)) ||
9037c357
JP
206 dst_allfrag(skb_dst(skb)) ||
207 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
7d8c6e39 208 return ip6_fragment(net, sk, skb, ip6_finish_output2);
9e508490 209 else
7d8c6e39 210 return ip6_finish_output2(net, sk, skb);
9e508490
JE
211}
212
956fe219 213static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214{
215 int ret;
216
217 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
218 switch (ret) {
219 case NET_XMIT_SUCCESS:
220 return __ip6_finish_output(net, sk, skb);
221 case NET_XMIT_CN:
222 return __ip6_finish_output(net, sk, skb) ? : ret;
223 default:
224 kfree_skb(skb);
225 return ret;
226 }
227}
228
ede2059d 229int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 230{
28f8bfd1 231 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
adf30907 232 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
be10de0a 233
97a7a37a
CF
234 skb->protocol = htons(ETH_P_IPV6);
235 skb->dev = dev;
236
778d80be 237 if (unlikely(idev->cnf.disable_ipv6)) {
19a0644c 238 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
778d80be
YH
239 kfree_skb(skb);
240 return 0;
241 }
242
29a26a56 243 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
28f8bfd1 244 net, sk, skb, indev, dev,
9c6eb28a
JE
245 ip6_finish_output,
246 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
1da177e4 247}
6585d7dc 248EXPORT_SYMBOL(ip6_output);
1da177e4 249
e9191ffb 250bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
513674b5
SL
251{
252 if (!np->autoflowlabel_set)
253 return ip6_default_np_autolabel(net);
254 else
255 return np->autoflowlabel;
256}
257
1da177e4 258/*
1c1e9d2b
ED
259 * xmit an sk_buff (used by TCP, SCTP and DCCP)
260 * Note : socket lock is not held for SYNACK packets, but might be modified
261 * by calls to skb_set_owner_w() and ipv6_local_error(),
262 * which are using proper atomic operations or spinlocks.
1da177e4 263 */
1c1e9d2b 264int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
4f6570d7 265 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
1da177e4 266{
3bd653c8 267 struct net *net = sock_net(sk);
1c1e9d2b 268 const struct ipv6_pinfo *np = inet6_sk(sk);
4c9483b2 269 struct in6_addr *first_hop = &fl6->daddr;
adf30907 270 struct dst_entry *dst = skb_dst(skb);
66033f47 271 unsigned int head_room;
1da177e4 272 struct ipv6hdr *hdr;
4c9483b2 273 u8 proto = fl6->flowi6_proto;
1da177e4 274 int seg_len = skb->len;
e651f03a 275 int hlimit = -1;
1da177e4
LT
276 u32 mtu;
277
66033f47
SB
278 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
279 if (opt)
280 head_room += opt->opt_nflen + opt->opt_flen;
281
282 if (unlikely(skb_headroom(skb) < head_room)) {
283 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
284 if (!skb2) {
285 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
286 IPSTATS_MIB_OUTDISCARDS);
287 kfree_skb(skb);
288 return -ENOBUFS;
1da177e4 289 }
66033f47
SB
290 if (skb->sk)
291 skb_set_owner_w(skb2, skb->sk);
292 consume_skb(skb);
293 skb = skb2;
294 }
295
296 if (opt) {
297 seg_len += opt->opt_nflen + opt->opt_flen;
298
1da177e4
LT
299 if (opt->opt_flen)
300 ipv6_push_frag_opts(skb, opt, &proto);
66033f47 301
1da177e4 302 if (opt->opt_nflen)
613fa3ca
DL
303 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
304 &fl6->saddr);
1da177e4
LT
305 }
306
e2d1bca7
ACM
307 skb_push(skb, sizeof(struct ipv6hdr));
308 skb_reset_network_header(skb);
0660e03f 309 hdr = ipv6_hdr(skb);
1da177e4
LT
310
311 /*
312 * Fill in the IPv6 header
313 */
b903d324 314 if (np)
1da177e4
LT
315 hlimit = np->hop_limit;
316 if (hlimit < 0)
6b75d090 317 hlimit = ip6_dst_hoplimit(dst);
1da177e4 318
cb1ce2ef 319 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
513674b5 320 ip6_autoflowlabel(net, np), fl6));
41a1f8ea 321
1da177e4
LT
322 hdr->payload_len = htons(seg_len);
323 hdr->nexthdr = proto;
324 hdr->hop_limit = hlimit;
325
4e3fd7a0
AD
326 hdr->saddr = fl6->saddr;
327 hdr->daddr = *first_hop;
1da177e4 328
9c9c9ad5 329 skb->protocol = htons(ETH_P_IPV6);
4f6570d7 330 skb->priority = priority;
92e55f41 331 skb->mark = mark;
a2c2064f 332
1da177e4 333 mtu = dst_mtu(dst);
60ff7467 334 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
adf30907 335 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
edf391ff 336 IPSTATS_MIB_OUT, skb->len);
a8e3e1a9
DA
337
338 /* if egress device is enslaved to an L3 master device pass the
339 * skb to its handler for processing
340 */
341 skb = l3mdev_ip6_out((struct sock *)sk, skb);
342 if (unlikely(!skb))
343 return 0;
344
1c1e9d2b
ED
345 /* hooks should never assume socket lock is held.
346 * we promote our socket to non const
347 */
29a26a56 348 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
1c1e9d2b 349 net, (struct sock *)sk, skb, NULL, dst->dev,
13206b6b 350 dst_output);
1da177e4
LT
351 }
352
1da177e4 353 skb->dev = dst->dev;
1c1e9d2b
ED
354 /* ipv6_local_error() does not require socket lock,
355 * we promote our socket to non const
356 */
357 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
358
adf30907 359 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
360 kfree_skb(skb);
361 return -EMSGSIZE;
362}
7159039a
YH
363EXPORT_SYMBOL(ip6_xmit);
364
1da177e4
LT
365static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
366{
367 struct ip6_ra_chain *ra;
368 struct sock *last = NULL;
369
370 read_lock(&ip6_ra_lock);
371 for (ra = ip6_ra_chain; ra; ra = ra->next) {
372 struct sock *sk = ra->sk;
0bd1b59b
AM
373 if (sk && ra->sel == sel &&
374 (!sk->sk_bound_dev_if ||
375 sk->sk_bound_dev_if == skb->dev->ifindex)) {
9036b2fe
FR
376 struct ipv6_pinfo *np = inet6_sk(sk);
377
378 if (np && np->rtalert_isolate &&
379 !net_eq(sock_net(sk), dev_net(skb->dev))) {
380 continue;
381 }
1da177e4
LT
382 if (last) {
383 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
384 if (skb2)
385 rawv6_rcv(last, skb2);
386 }
387 last = sk;
388 }
389 }
390
391 if (last) {
392 rawv6_rcv(last, skb);
393 read_unlock(&ip6_ra_lock);
394 return 1;
395 }
396 read_unlock(&ip6_ra_lock);
397 return 0;
398}
399
e21e0b5f
VN
400static int ip6_forward_proxy_check(struct sk_buff *skb)
401{
0660e03f 402 struct ipv6hdr *hdr = ipv6_hdr(skb);
e21e0b5f 403 u8 nexthdr = hdr->nexthdr;
75f2811c 404 __be16 frag_off;
e21e0b5f
VN
405 int offset;
406
407 if (ipv6_ext_hdr(nexthdr)) {
75f2811c 408 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
e21e0b5f
VN
409 if (offset < 0)
410 return 0;
411 } else
412 offset = sizeof(struct ipv6hdr);
413
414 if (nexthdr == IPPROTO_ICMPV6) {
415 struct icmp6hdr *icmp6;
416
d56f90a7
ACM
417 if (!pskb_may_pull(skb, (skb_network_header(skb) +
418 offset + 1 - skb->data)))
e21e0b5f
VN
419 return 0;
420
d56f90a7 421 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
e21e0b5f
VN
422
423 switch (icmp6->icmp6_type) {
424 case NDISC_ROUTER_SOLICITATION:
425 case NDISC_ROUTER_ADVERTISEMENT:
426 case NDISC_NEIGHBOUR_SOLICITATION:
427 case NDISC_NEIGHBOUR_ADVERTISEMENT:
428 case NDISC_REDIRECT:
429 /* For reaction involving unicast neighbor discovery
430 * message destined to the proxied address, pass it to
431 * input function.
432 */
433 return 1;
434 default:
435 break;
436 }
437 }
438
74553b09
VN
439 /*
440 * The proxying router can't forward traffic sent to a link-local
441 * address, so signal the sender and discard the packet. This
442 * behavior is clarified by the MIPv6 specification.
443 */
444 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
445 dst_link_failure(skb);
446 return -1;
447 }
448
e21e0b5f
VN
449 return 0;
450}
451
0c4b51f0
EB
452static inline int ip6_forward_finish(struct net *net, struct sock *sk,
453 struct sk_buff *skb)
1da177e4 454{
71a1c915
JB
455 struct dst_entry *dst = skb_dst(skb);
456
457 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
458 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
459
f839a6c9
IS
460#ifdef CONFIG_NET_SWITCHDEV
461 if (skb->offload_l3_fwd_mark) {
462 consume_skb(skb);
463 return 0;
464 }
465#endif
466
8203e2d8 467 skb->tstamp = 0;
13206b6b 468 return dst_output(net, sk, skb);
1da177e4
LT
469}
470
fe6cc55f
FW
471static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
472{
418a3156 473 if (skb->len <= mtu)
fe6cc55f
FW
474 return false;
475
60ff7467 476 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
fe6cc55f
FW
477 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
478 return true;
479
60ff7467 480 if (skb->ignore_df)
418a3156
FW
481 return false;
482
779b7931 483 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
fe6cc55f
FW
484 return false;
485
486 return true;
487}
488
1da177e4
LT
489int ip6_forward(struct sk_buff *skb)
490{
bdb7cc64 491 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
adf30907 492 struct dst_entry *dst = skb_dst(skb);
0660e03f 493 struct ipv6hdr *hdr = ipv6_hdr(skb);
1da177e4 494 struct inet6_skb_parm *opt = IP6CB(skb);
c346dca1 495 struct net *net = dev_net(dst->dev);
14f3ad6f 496 u32 mtu;
1ab1457c 497
53b7997f 498 if (net->ipv6.devconf_all->forwarding == 0)
1da177e4
LT
499 goto error;
500
090f1166
LR
501 if (skb->pkt_type != PACKET_HOST)
502 goto drop;
503
9ef2e965
HFS
504 if (unlikely(skb->sk))
505 goto drop;
506
4497b076
BH
507 if (skb_warn_if_lro(skb))
508 goto drop;
509
ccd27f05
ND
510 if (!net->ipv6.devconf_all->disable_policy &&
511 !idev->cnf.disable_policy &&
512 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
bdb7cc64 513 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
1da177e4
LT
514 goto drop;
515 }
516
35fc92a9 517 skb_forward_csum(skb);
1da177e4
LT
518
519 /*
520 * We DO NOT make any processing on
521 * RA packets, pushing them to user level AS IS
522 * without ane WARRANTY that application will be able
523 * to interpret them. The reason is that we
524 * cannot make anything clever here.
525 *
526 * We are not end-node, so that if packet contains
527 * AH/ESP, we cannot make anything.
528 * Defragmentation also would be mistake, RA packets
529 * cannot be fragmented, because there is no warranty
530 * that different fragments will go along one path. --ANK
531 */
ab4eb353
YH
532 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
533 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
1da177e4
LT
534 return 0;
535 }
536
537 /*
538 * check and decrement ttl
539 */
540 if (hdr->hop_limit <= 1) {
3ffe533c 541 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
bdb7cc64 542 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
1da177e4
LT
543
544 kfree_skb(skb);
545 return -ETIMEDOUT;
546 }
547
fbea49e1 548 /* XXX: idev->cnf.proxy_ndp? */
53b7997f 549 if (net->ipv6.devconf_all->proxy_ndp &&
8a3edd80 550 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
74553b09 551 int proxied = ip6_forward_proxy_check(skb);
46c7655f
KP
552 if (proxied > 0) {
553 hdr->hop_limit--;
e21e0b5f 554 return ip6_input(skb);
46c7655f 555 } else if (proxied < 0) {
bdb7cc64 556 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
74553b09
VN
557 goto drop;
558 }
e21e0b5f
VN
559 }
560
1da177e4 561 if (!xfrm6_route_forward(skb)) {
bdb7cc64 562 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
1da177e4
LT
563 goto drop;
564 }
adf30907 565 dst = skb_dst(skb);
1da177e4
LT
566
567 /* IPv6 specs say nothing about it, but it is clear that we cannot
568 send redirects to source routed frames.
1e5dc146 569 We don't send redirects to frames decapsulated from IPsec.
1da177e4 570 */
2f17becf
SS
571 if (IP6CB(skb)->iif == dst->dev->ifindex &&
572 opt->srcrt == 0 && !skb_sec_path(skb)) {
1da177e4 573 struct in6_addr *target = NULL;
fbfe95a4 574 struct inet_peer *peer;
1da177e4 575 struct rt6_info *rt;
1da177e4
LT
576
577 /*
578 * incoming and outgoing devices are the same
579 * send a redirect.
580 */
581
582 rt = (struct rt6_info *) dst;
c45a3dfb
DM
583 if (rt->rt6i_flags & RTF_GATEWAY)
584 target = &rt->rt6i_gateway;
1da177e4
LT
585 else
586 target = &hdr->daddr;
587
fd0273d7 588 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
92d86829 589
1da177e4
LT
590 /* Limit redirects both by destination (here)
591 and by source (inside ndisc_send_redirect)
592 */
fbfe95a4 593 if (inet_peer_xrlim_allow(peer, 1*HZ))
4991969a 594 ndisc_send_redirect(skb, target);
1d861aa4
DM
595 if (peer)
596 inet_putpeer(peer);
5bb1ab09
DS
597 } else {
598 int addrtype = ipv6_addr_type(&hdr->saddr);
599
1da177e4 600 /* This check is security critical. */
f81b2e7d
YH
601 if (addrtype == IPV6_ADDR_ANY ||
602 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
5bb1ab09
DS
603 goto error;
604 if (addrtype & IPV6_ADDR_LINKLOCAL) {
605 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
3ffe533c 606 ICMPV6_NOT_NEIGHBOUR, 0);
5bb1ab09
DS
607 goto error;
608 }
1da177e4
LT
609 }
610
427faee1 611 mtu = ip6_dst_mtu_maybe_forward(dst, true);
14f3ad6f
UW
612 if (mtu < IPV6_MIN_MTU)
613 mtu = IPV6_MIN_MTU;
614
fe6cc55f 615 if (ip6_pkt_too_big(skb, mtu)) {
1da177e4
LT
616 /* Again, force OUTPUT device used as source address */
617 skb->dev = dst->dev;
14f3ad6f 618 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
bdb7cc64 619 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
1d015503
ED
620 __IP6_INC_STATS(net, ip6_dst_idev(dst),
621 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
622 kfree_skb(skb);
623 return -EMSGSIZE;
624 }
625
626 if (skb_cow(skb, dst->dev->hard_header_len)) {
1d015503
ED
627 __IP6_INC_STATS(net, ip6_dst_idev(dst),
628 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
629 goto drop;
630 }
631
0660e03f 632 hdr = ipv6_hdr(skb);
1da177e4
LT
633
634 /* Mangling hops number delayed to point after skb COW */
1ab1457c 635
1da177e4
LT
636 hdr->hop_limit--;
637
29a26a56
EB
638 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
639 net, NULL, skb, skb->dev, dst->dev,
6e23ae2a 640 ip6_forward_finish);
1da177e4
LT
641
642error:
bdb7cc64 643 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
1da177e4
LT
644drop:
645 kfree_skb(skb);
646 return -EINVAL;
647}
648
649static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
650{
651 to->pkt_type = from->pkt_type;
652 to->priority = from->priority;
653 to->protocol = from->protocol;
adf30907
ED
654 skb_dst_drop(to);
655 skb_dst_set(to, dst_clone(skb_dst(from)));
1da177e4 656 to->dev = from->dev;
82e91ffe 657 to->mark = from->mark;
1da177e4 658
3dd1c9a1
PA
659 skb_copy_hash(to, from);
660
1da177e4
LT
661#ifdef CONFIG_NET_SCHED
662 to->tc_index = from->tc_index;
663#endif
e7ac05f3 664 nf_copy(to, from);
df5042f4 665 skb_ext_copy(to, from);
984bc16c 666 skb_copy_secmark(to, from);
1da177e4
LT
667}
668
0feca619
PNA
669int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
670 u8 nexthdr, __be32 frag_id,
671 struct ip6_fraglist_iter *iter)
672{
673 unsigned int first_len;
674 struct frag_hdr *fh;
675
676 /* BUILD HEADER */
677 *prevhdr = NEXTHDR_FRAGMENT;
678 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
679 if (!iter->tmp_hdr)
680 return -ENOMEM;
681
b7034146 682 iter->frag = skb_shinfo(skb)->frag_list;
0feca619
PNA
683 skb_frag_list_init(skb);
684
685 iter->offset = 0;
686 iter->hlen = hlen;
687 iter->frag_id = frag_id;
688 iter->nexthdr = nexthdr;
689
690 __skb_pull(skb, hlen);
691 fh = __skb_push(skb, sizeof(struct frag_hdr));
692 __skb_push(skb, hlen);
693 skb_reset_network_header(skb);
694 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
695
696 fh->nexthdr = nexthdr;
697 fh->reserved = 0;
698 fh->frag_off = htons(IP6_MF);
699 fh->identification = frag_id;
700
701 first_len = skb_pagelen(skb);
702 skb->data_len = first_len - skb_headlen(skb);
703 skb->len = first_len;
704 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
705
706 return 0;
707}
708EXPORT_SYMBOL(ip6_fraglist_init);
709
710void ip6_fraglist_prepare(struct sk_buff *skb,
711 struct ip6_fraglist_iter *iter)
712{
713 struct sk_buff *frag = iter->frag;
714 unsigned int hlen = iter->hlen;
715 struct frag_hdr *fh;
716
717 frag->ip_summed = CHECKSUM_NONE;
718 skb_reset_transport_header(frag);
719 fh = __skb_push(frag, sizeof(struct frag_hdr));
720 __skb_push(frag, hlen);
721 skb_reset_network_header(frag);
722 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
723 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
724 fh->nexthdr = iter->nexthdr;
725 fh->reserved = 0;
726 fh->frag_off = htons(iter->offset);
727 if (frag->next)
728 fh->frag_off |= htons(IP6_MF);
729 fh->identification = iter->frag_id;
730 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
731 ip6_copy_metadata(frag, skb);
732}
733EXPORT_SYMBOL(ip6_fraglist_prepare);
734
8a6a1f17
PNA
735void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
736 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
737 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
738{
739 state->prevhdr = prevhdr;
740 state->nexthdr = nexthdr;
741 state->frag_id = frag_id;
742
743 state->hlen = hlen;
744 state->mtu = mtu;
745
746 state->left = skb->len - hlen; /* Space per frame */
747 state->ptr = hlen; /* Where to start from */
748
749 state->hroom = hdr_room;
750 state->troom = needed_tailroom;
751
752 state->offset = 0;
753}
754EXPORT_SYMBOL(ip6_frag_init);
755
756struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
757{
758 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
759 struct sk_buff *frag;
760 struct frag_hdr *fh;
761 unsigned int len;
762
763 len = state->left;
764 /* IF: it doesn't fit, use 'mtu' - the data space left */
765 if (len > state->mtu)
766 len = state->mtu;
767 /* IF: we are not sending up to and including the packet end
768 then align the next start on an eight byte boundary */
769 if (len < state->left)
770 len &= ~7;
771
772 /* Allocate buffer */
773 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
774 state->hroom + state->troom, GFP_ATOMIC);
775 if (!frag)
776 return ERR_PTR(-ENOMEM);
777
778 /*
779 * Set up data on packet
780 */
781
782 ip6_copy_metadata(frag, skb);
783 skb_reserve(frag, state->hroom);
784 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
785 skb_reset_network_header(frag);
786 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
787 frag->transport_header = (frag->network_header + state->hlen +
788 sizeof(struct frag_hdr));
789
790 /*
791 * Charge the memory for the fragment to any owner
792 * it might possess
793 */
794 if (skb->sk)
795 skb_set_owner_w(frag, skb->sk);
796
797 /*
798 * Copy the packet header into the new buffer.
799 */
800 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
801
802 fragnexthdr_offset = skb_network_header(frag);
803 fragnexthdr_offset += prevhdr - skb_network_header(skb);
804 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
805
806 /*
807 * Build fragment header.
808 */
809 fh->nexthdr = state->nexthdr;
810 fh->reserved = 0;
811 fh->identification = state->frag_id;
812
813 /*
814 * Copy a block of the IP datagram.
815 */
816 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
817 len));
818 state->left -= len;
819
820 fh->frag_off = htons(state->offset);
821 if (state->left > 0)
822 fh->frag_off |= htons(IP6_MF);
823 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
824
825 state->ptr += len;
826 state->offset += len;
827
828 return frag;
829}
830EXPORT_SYMBOL(ip6_frag_next);
831
7d8c6e39
EB
832int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
833 int (*output)(struct net *, struct sock *, struct sk_buff *))
1da177e4 834{
1da177e4 835 struct sk_buff *frag;
67ba4152 836 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
f60e5990 837 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
838 inet6_sk(skb->sk) : NULL;
8a6a1f17
PNA
839 struct ip6_frag_state state;
840 unsigned int mtu, hlen, nexthdr_offset;
9669fffc 841 ktime_t tstamp = skb->tstamp;
8a6a1f17 842 int hroom, err = 0;
286c2349 843 __be32 frag_id;
1da177e4
LT
844 u8 *prevhdr, nexthdr = 0;
845
7dd7eb95
DM
846 err = ip6_find_1stfragopt(skb, &prevhdr);
847 if (err < 0)
2423496a 848 goto fail;
7dd7eb95 849 hlen = err;
1da177e4 850 nexthdr = *prevhdr;
ef0efcd3 851 nexthdr_offset = prevhdr - skb_network_header(skb);
1da177e4 852
628a5c56 853 mtu = ip6_skb_dst_mtu(skb);
b881ef76
JH
854
855 /* We must not fragment if the socket is set to force MTU discovery
14f3ad6f 856 * or if the skb it not generated by a local socket.
b881ef76 857 */
485fca66
FW
858 if (unlikely(!skb->ignore_df && skb->len > mtu))
859 goto fail_toobig;
a34a101e 860
485fca66
FW
861 if (IP6CB(skb)->frag_max_size) {
862 if (IP6CB(skb)->frag_max_size > mtu)
863 goto fail_toobig;
864
865 /* don't send fragments larger than what we received */
866 mtu = IP6CB(skb)->frag_max_size;
867 if (mtu < IPV6_MIN_MTU)
868 mtu = IPV6_MIN_MTU;
b881ef76
JH
869 }
870
d91675f9
YH
871 if (np && np->frag_size < mtu) {
872 if (np->frag_size)
873 mtu = np->frag_size;
874 }
89bc7848 875 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
b72a2b01 876 goto fail_toobig;
1e0d69a9 877 mtu -= hlen + sizeof(struct frag_hdr);
1da177e4 878
fd0273d7
MKL
879 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
880 &ipv6_hdr(skb)->saddr);
286c2349 881
405c92f7
HFS
882 if (skb->ip_summed == CHECKSUM_PARTIAL &&
883 (err = skb_checksum_help(skb)))
884 goto fail;
885
ef0efcd3 886 prevhdr = skb_network_header(skb) + nexthdr_offset;
1d325d21 887 hroom = LL_RESERVED_SPACE(rt->dst.dev);
21dc3301 888 if (skb_has_frag_list(skb)) {
c72d8cda 889 unsigned int first_len = skb_pagelen(skb);
0feca619 890 struct ip6_fraglist_iter iter;
3d13008e 891 struct sk_buff *frag2;
1da177e4
LT
892
893 if (first_len - hlen > mtu ||
894 ((first_len - hlen) & 7) ||
1d325d21
FW
895 skb_cloned(skb) ||
896 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
1da177e4
LT
897 goto slow_path;
898
4d9092bb 899 skb_walk_frags(skb, frag) {
1da177e4
LT
900 /* Correct geometry. */
901 if (frag->len > mtu ||
902 ((frag->len & 7) && frag->next) ||
1d325d21 903 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
3d13008e 904 goto slow_path_clean;
1da177e4 905
1da177e4
LT
906 /* Partially cloned skb? */
907 if (skb_shared(frag))
3d13008e 908 goto slow_path_clean;
2fdba6b0
HX
909
910 BUG_ON(frag->sk);
911 if (skb->sk) {
2fdba6b0
HX
912 frag->sk = skb->sk;
913 frag->destructor = sock_wfree;
2fdba6b0 914 }
3d13008e 915 skb->truesize -= frag->truesize;
1da177e4
LT
916 }
917
0feca619
PNA
918 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
919 &iter);
920 if (err < 0)
1d325d21 921 goto fail;
a11d206d 922
1da177e4
LT
923 for (;;) {
924 /* Prepare header of the next frame,
925 * before previous one went down. */
0feca619
PNA
926 if (iter.frag)
927 ip6_fraglist_prepare(skb, &iter);
1ab1457c 928
9669fffc 929 skb->tstamp = tstamp;
7d8c6e39 930 err = output(net, sk, skb);
67ba4152 931 if (!err)
d8d1f30b 932 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 933 IPSTATS_MIB_FRAGCREATES);
dafee490 934
0feca619 935 if (err || !iter.frag)
1da177e4
LT
936 break;
937
0feca619 938 skb = ip6_fraglist_next(&iter);
1da177e4
LT
939 }
940
0feca619 941 kfree(iter.tmp_hdr);
1da177e4
LT
942
943 if (err == 0) {
d8d1f30b 944 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 945 IPSTATS_MIB_FRAGOKS);
1da177e4
LT
946 return 0;
947 }
948
b7034146 949 kfree_skb_list(iter.frag);
1da177e4 950
d8d1f30b 951 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 952 IPSTATS_MIB_FRAGFAILS);
1da177e4 953 return err;
3d13008e
ED
954
955slow_path_clean:
956 skb_walk_frags(skb, frag2) {
957 if (frag2 == frag)
958 break;
959 frag2->sk = NULL;
960 frag2->destructor = NULL;
961 skb->truesize += frag2->truesize;
962 }
1da177e4
LT
963 }
964
965slow_path:
1da177e4
LT
966 /*
967 * Fragment the datagram.
968 */
969
8a6a1f17
PNA
970 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
971 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
972 &state);
1da177e4
LT
973
974 /*
975 * Keep copying data until we run out.
976 */
1da177e4 977
8a6a1f17
PNA
978 while (state.left > 0) {
979 frag = ip6_frag_next(skb, &state);
980 if (IS_ERR(frag)) {
981 err = PTR_ERR(frag);
1da177e4
LT
982 goto fail;
983 }
984
1da177e4
LT
985 /*
986 * Put this fragment into the sending queue.
987 */
9669fffc 988 frag->tstamp = tstamp;
7d8c6e39 989 err = output(net, sk, frag);
1da177e4
LT
990 if (err)
991 goto fail;
dafee490 992
adf30907 993 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 994 IPSTATS_MIB_FRAGCREATES);
1da177e4 995 }
adf30907 996 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 997 IPSTATS_MIB_FRAGOKS);
808db80a 998 consume_skb(skb);
1da177e4
LT
999 return err;
1000
485fca66
FW
1001fail_toobig:
1002 if (skb->sk && dst_allfrag(skb_dst(skb)))
1003 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1004
485fca66
FW
1005 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1006 err = -EMSGSIZE;
1007
1da177e4 1008fail:
adf30907 1009 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 1010 IPSTATS_MIB_FRAGFAILS);
1ab1457c 1011 kfree_skb(skb);
1da177e4
LT
1012 return err;
1013}
1014
b71d1d42
ED
1015static inline int ip6_rt_check(const struct rt6key *rt_key,
1016 const struct in6_addr *fl_addr,
1017 const struct in6_addr *addr_cache)
cf6b1982 1018{
a02cec21 1019 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
63159f29 1020 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
cf6b1982
YH
1021}
1022
497c615a
HX
1023static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1024 struct dst_entry *dst,
b71d1d42 1025 const struct flowi6 *fl6)
1da177e4 1026{
497c615a 1027 struct ipv6_pinfo *np = inet6_sk(sk);
a963a37d 1028 struct rt6_info *rt;
1da177e4 1029
497c615a
HX
1030 if (!dst)
1031 goto out;
1032
a963a37d
ED
1033 if (dst->ops->family != AF_INET6) {
1034 dst_release(dst);
1035 return NULL;
1036 }
1037
1038 rt = (struct rt6_info *)dst;
497c615a
HX
1039 /* Yes, checking route validity in not connected
1040 * case is not very simple. Take into account,
1041 * that we do not support routing by source, TOS,
67ba4152 1042 * and MSG_DONTROUTE --ANK (980726)
497c615a 1043 *
cf6b1982
YH
1044 * 1. ip6_rt_check(): If route was host route,
1045 * check that cached destination is current.
497c615a
HX
1046 * If it is network route, we still may
1047 * check its validity using saved pointer
1048 * to the last used address: daddr_cache.
1049 * We do not want to save whole address now,
1050 * (because main consumer of this service
1051 * is tcp, which has not this problem),
1052 * so that the last trick works only on connected
1053 * sockets.
1054 * 2. oif also should be the same.
1055 */
4c9483b2 1056 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
8e1ef0a9 1057#ifdef CONFIG_IPV6_SUBTREES
4c9483b2 1058 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
8e1ef0a9 1059#endif
ca254490
DA
1060 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1061 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
497c615a
HX
1062 dst_release(dst);
1063 dst = NULL;
1da177e4
LT
1064 }
1065
497c615a
HX
1066out:
1067 return dst;
1068}
1069
/*
 * ip6_dst_lookup_tail - route lookup helper shared by ip6_dst_lookup() and
 * ip6_dst_lookup_flow().
 *
 * On success returns 0 and leaves the route in *dst.  On failure the route
 * held in *dst is released, *dst is set to NULL, IPSTATS_MIB_OUTNOROUTES is
 * incremented when the error is -ENETUNREACH, and the error is returned.
 *
 * When fl6->saddr is unspecified, a source address is selected with
 * ip6_route_get_saddr() and written into fl6->saddr.
 */
3aef934f 1070static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
4c9483b2 1071 struct dst_entry **dst, struct flowi6 *fl6)
497c615a 1072{
69cce1d1
DM
1073#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1074 struct neighbour *n;
97cac082 1075 struct rt6_info *rt;
69cce1d1
DM
1076#endif
1077 int err;
6f21c96a 1078 int flags = 0;
497c615a 1079

e16e888b
MS
1080 /* The correct way to handle this would be to do
1081 * ip6_route_get_saddr, and then ip6_route_output; however,
1082 * the route-specific preferred source forces the
1083 * ip6_route_output call _before_ ip6_route_get_saddr.
1084 *
1085 * In source specific routing (no src=any default route),
1086 * ip6_route_output will fail given src=any saddr, though, so
1087 * that's why we try it again later.
1088 */
c305b9e6 1089 if (ipv6_addr_any(&fl6->saddr)) {
a68886a6 1090 struct fib6_info *from;
/* NOTE(review): with CONFIG_IPV6_OPTIMISTIC_DAD this shadows the outer rt */
e16e888b 1091 struct rt6_info *rt;
1da177e4 1092

/* NOTE(review): the incoming *dst is overwritten here; both callers in this file pass NULL */
c305b9e6 1093 *dst = ip6_route_output(net, sk, fl6);
/* an errored lookup result is not used as the source-address hint */
e16e888b 1094 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
a68886a6
DA
1095
1096 rcu_read_lock();
1097 from = rt ? rcu_dereference(rt->from) : NULL;
1098 err = ip6_route_get_saddr(net, from, &fl6->daddr,
c3968a85
DW
1099 sk ? inet6_sk(sk)->srcprefs : 0,
1100 &fl6->saddr);
a68886a6
DA
1101 rcu_read_unlock();
1102
44456d37 1103 if (err)
1da177e4 1104 goto out_err_release;
e16e888b
MS
1105
1106 /* If we had an erroneous initial result, pretend it
1107 * never existed and let the SA-enabled version take
1108 * over.
1109 */
c305b9e6 1110 if ((*dst)->error) {
e16e888b
MS
1111 dst_release(*dst);
1112 *dst = NULL;
1113 }
6f21c96a
PA
1114
1115 if (fl6->flowi6_oif)
1116 flags |= RT6_LOOKUP_F_IFACE;
1da177e4
LT
1117 }
1118
/* (re)do the lookup, now with whatever saddr fl6 carries */
e16e888b 1119 if (!*dst)
6f21c96a 1120 *dst = ip6_route_output_flags(net, sk, fl6, flags);
e16e888b
MS
1121
1122 err = (*dst)->error;
1123 if (err)
1124 goto out_err_release;
1125
95c385b4 1126#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
e550dfb0
NH
1127 /*
1128 * Here if the dst entry we've looked up
1129 * has a neighbour entry that is in the INCOMPLETE
1130 * state and the src address from the flow is
1131 * marked as OPTIMISTIC, we release the found
1132 * dst entry and replace it instead with the
1133 * dst entry of the nexthop router
1134 */
c56bf6fe 1135 rt = (struct rt6_info *) *dst;
707be1ff 1136 rcu_read_lock_bh();
2647a9b0
MKL
1137 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1138 rt6_nexthop(rt, &fl6->daddr));
/*
 * err is only a local flag at this point: it is set when a neighbour
 * entry exists and has none of the NUD_VALID state bits set.
 */
707be1ff
YH
1139 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1140 rcu_read_unlock_bh();
1141
1142 if (err) {
e550dfb0 1143 struct inet6_ifaddr *ifp;
4c9483b2 1144 struct flowi6 fl_gw6;
e550dfb0
NH
1145 int redirect;
1146
4c9483b2 1147 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
e550dfb0
NH
1148 (*dst)->dev, 1);
1149
1150 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1151 if (ifp)
1152 in6_ifa_put(ifp);
1153
1154 if (redirect) {
1155 /*
1156 * We need to get the dst entry for the
1157 * default router instead
1158 */
1159 dst_release(*dst);
4c9483b2
DM
1160 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1161 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1162 *dst = ip6_route_output(net, sk, &fl_gw6);
e5d08d71
IM
1163 err = (*dst)->error;
1164 if (err)
e550dfb0 1165 goto out_err_release;
95c385b4 1166 }
e550dfb0 1167 }
95c385b4 1168#endif
/*
 * A v4-mapped source address is only accepted together with a v4-mapped
 * or unspecified destination address.
 */
ec5e3b0a 1169 if (ipv6_addr_v4mapped(&fl6->saddr) &&
00ea1cee
WB
1170 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1171 err = -EAFNOSUPPORT;
1172 goto out_err_release;
1173 }
95c385b4 1174
1da177e4
LT
1175 return 0;
1176
/* common failure exit: drop the route and report err */
1177out_err_release:
1178 dst_release(*dst);
1179 *dst = NULL;
8a966fc0 1180
0d240e78
DA
1181 if (err == -ENETUNREACH)
1182 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
1183 return err;
1184}
34a0b3cd 1185
497c615a
HX
1186/**
1187 * ip6_dst_lookup - perform route lookup on flow
b51cd7c8 1188 * @net: Network namespace to perform lookup in
497c615a
HX
1189 * @sk: socket which provides route info
1190 * @dst: pointer to dst_entry * for result
4c9483b2 1191 * @fl6: flow to lookup
497c615a
HX
1192 *
1193 * This function performs a route lookup on the given flow.
1194 *
1195 * It returns zero on success, or a standard errno code on error.
1196 */
343d60aa
RP
1197int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1198 struct flowi6 *fl6)
497c615a
HX
1199{
1200 *dst = NULL;
343d60aa 1201 return ip6_dst_lookup_tail(net, sk, dst, fl6);
497c615a 1202}
3cf3dc6c
ACM
1203EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1204
497c615a 1205/**
68d0c6d3 1206 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
b51cd7c8 1207 * @net: Network namespace to perform lookup in
68d0c6d3 1208 * @sk: socket which provides route info
4c9483b2 1209 * @fl6: flow to lookup
68d0c6d3 1210 * @final_dst: final destination address for ipsec lookup
68d0c6d3
DM
1211 *
1212 * This function performs a route lookup on the given flow.
1213 *
1214 * It returns a valid dst pointer on success, or a pointer encoded
1215 * error code.
1216 */
c4e85f73 1217struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
0e0d44ab 1218 const struct in6_addr *final_dst)
68d0c6d3
DM
1219{
1220 struct dst_entry *dst = NULL;
1221 int err;
1222
c4e85f73 1223 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
68d0c6d3
DM
1224 if (err)
1225 return ERR_PTR(err);
1226 if (final_dst)
4e3fd7a0 1227 fl6->daddr = *final_dst;
2774c131 1228
c4e85f73 1229 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
68d0c6d3
DM
1230}
1231EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1232
1233/**
1234 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
497c615a 1235 * @sk: socket which provides the dst cache and route info
4c9483b2 1236 * @fl6: flow to lookup
68d0c6d3 1237 * @final_dst: final destination address for ipsec lookup
96818159 1238 * @connected: whether @sk is connected or not
497c615a
HX
1239 *
1240 * This function performs a route lookup on the given flow with the
1241 * possibility of using the cached route in the socket if it is valid.
1242 * It will take the socket dst lock when operating on the dst cache.
1243 * As a result, this function can only be used in process context.
1244 *
96818159
AK
1245 * In addition, for a connected socket, cache the dst in the socket
1246 * if the current cache is not valid.
1247 *
68d0c6d3
DM
1248 * It returns a valid dst pointer on success, or a pointer encoded
1249 * error code.
497c615a 1250 */
4c9483b2 1251struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
96818159
AK
1252 const struct in6_addr *final_dst,
1253 bool connected)
497c615a 1254{
68d0c6d3 1255 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
497c615a 1256
4c9483b2 1257 dst = ip6_sk_dst_check(sk, dst, fl6);
96818159
AK
1258 if (dst)
1259 return dst;
1260
c4e85f73 1261 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
96818159
AK
1262 if (connected && !IS_ERR(dst))
1263 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
68d0c6d3 1264
00bc0ef5 1265 return dst;
497c615a 1266}
68d0c6d3 1267EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
497c615a 1268
571912c6
MV
1269/**
1270 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1271 * @skb: Packet for which lookup is done
1272 * @dev: Tunnel device
1273 * @net: Network namespace of tunnel device
b51cd7c8 1274 * @sock: Socket which provides route info
571912c6
MV
1275 * @saddr: Memory to store the src ip address
1276 * @info: Tunnel information
1277 * @protocol: IP protocol
b51cd7c8 1278 * @use_cache: Flag to enable cache usage
571912c6
MV
1279 * This function performs a route lookup on a tunnel
1280 *
1281 * It returns a valid dst pointer and stores src address to be used in
1282 * tunnel in param saddr on success, else a pointer encoded error code.
1283 */
1284
1285struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1286 struct net_device *dev,
1287 struct net *net,
1288 struct socket *sock,
1289 struct in6_addr *saddr,
1290 const struct ip_tunnel_info *info,
1291 u8 protocol,
1292 bool use_cache)
1293{
1294 struct dst_entry *dst = NULL;
1295#ifdef CONFIG_DST_CACHE
1296 struct dst_cache *dst_cache;
1297#endif
1298 struct flowi6 fl6;
1299 __u8 prio;
1300
1301#ifdef CONFIG_DST_CACHE
1302 dst_cache = (struct dst_cache *)&info->dst_cache;
1303 if (use_cache) {
1304 dst = dst_cache_get_ip6(dst_cache, saddr);
1305 if (dst)
1306 return dst;
1307 }
1308#endif
1309 memset(&fl6, 0, sizeof(fl6));
1310 fl6.flowi6_mark = skb->mark;
1311 fl6.flowi6_proto = protocol;
1312 fl6.daddr = info->key.u.ipv6.dst;
1313 fl6.saddr = info->key.u.ipv6.src;
1314 prio = info->key.tos;
1315 fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1316 info->key.label);
1317
1318 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1319 NULL);
1320 if (IS_ERR(dst)) {
1321 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1322 return ERR_PTR(-ENETUNREACH);
1323 }
1324 if (dst->dev == dev) { /* is this necessary? */
1325 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1326 dst_release(dst);
1327 return ERR_PTR(-ELOOP);
1328 }
1329#ifdef CONFIG_DST_CACHE
1330 if (use_cache)
1331 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1332#endif
1333 *saddr = fl6.saddr;
1334 return dst;
1335}
1336EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1337
0178b695
HX
1338static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1339 gfp_t gfp)
1340{
1341 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1342}
1343
1344static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1345 gfp_t gfp)
1346{
1347 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1348}
1349
75a493e6 1350static void ip6_append_data_mtu(unsigned int *mtu,
0c183379
G
1351 int *maxfraglen,
1352 unsigned int fragheaderlen,
1353 struct sk_buff *skb,
75a493e6 1354 struct rt6_info *rt,
e367c2d0 1355 unsigned int orig_mtu)
0c183379
G
1356{
1357 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
63159f29 1358 if (!skb) {
0c183379 1359 /* first fragment, reserve header_len */
e367c2d0 1360 *mtu = orig_mtu - rt->dst.header_len;
0c183379
G
1361
1362 } else {
1363 /*
1364 * this fragment is not first, the headers
1365 * space is regarded as data space.
1366 */
e367c2d0 1367 *mtu = orig_mtu;
0c183379
G
1368 }
1369 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1370 + fragheaderlen - sizeof(struct frag_hdr);
1371 }
1372}
1373
366e41d9 1374static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
26879da5 1375 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
5fdaa88d 1376 struct rt6_info *rt, struct flowi6 *fl6)
366e41d9
VY
1377{
1378 struct ipv6_pinfo *np = inet6_sk(sk);
1379 unsigned int mtu;
26879da5 1380 struct ipv6_txoptions *opt = ipc6->opt;
366e41d9
VY
1381
1382 /*
1383 * setup for corking
1384 */
1385 if (opt) {
1386 if (WARN_ON(v6_cork->opt))
1387 return -EINVAL;
1388
864e2a1f 1389 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
63159f29 1390 if (unlikely(!v6_cork->opt))
366e41d9
VY
1391 return -ENOBUFS;
1392
864e2a1f 1393 v6_cork->opt->tot_len = sizeof(*opt);
366e41d9
VY
1394 v6_cork->opt->opt_flen = opt->opt_flen;
1395 v6_cork->opt->opt_nflen = opt->opt_nflen;
1396
1397 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1398 sk->sk_allocation);
1399 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1400 return -ENOBUFS;
1401
1402 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1403 sk->sk_allocation);
1404 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1405 return -ENOBUFS;
1406
1407 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1408 sk->sk_allocation);
1409 if (opt->hopopt && !v6_cork->opt->hopopt)
1410 return -ENOBUFS;
1411
1412 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1413 sk->sk_allocation);
1414 if (opt->srcrt && !v6_cork->opt->srcrt)
1415 return -ENOBUFS;
1416
1417 /* need source address above miyazawa*/
1418 }
1419 dst_hold(&rt->dst);
1420 cork->base.dst = &rt->dst;
1421 cork->fl.u.ip6 = *fl6;
26879da5
WW
1422 v6_cork->hop_limit = ipc6->hlimit;
1423 v6_cork->tclass = ipc6->tclass;
366e41d9
VY
1424 if (rt->dst.flags & DST_XFRM_TUNNEL)
1425 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
749439bf 1426 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
366e41d9
VY
1427 else
1428 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
c02b3741 1429 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
366e41d9
VY
1430 if (np->frag_size < mtu) {
1431 if (np->frag_size)
1432 mtu = np->frag_size;
1433 }
749439bf
MM
1434 if (mtu < IPV6_MIN_MTU)
1435 return -EINVAL;
366e41d9 1436 cork->base.fragsize = mtu;
fbf47813 1437 cork->base.gso_size = ipc6->gso_size;
678ca42d 1438 cork->base.tx_flags = 0;
c6af0c22 1439 cork->base.mark = ipc6->sockc.mark;
678ca42d 1440 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
bec1f6f6 1441
0f6c480f 1442 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
366e41d9
VY
1443 cork->base.flags |= IPCORK_ALLFRAG;
1444 cork->base.length = 0;
1445
5fdaa88d 1446 cork->base.transmit_time = ipc6->sockc.transmit_time;
a818f75e 1447
366e41d9
VY
1448 return 0;
1449}
1450
/*
 * __ip6_append_data - append @length bytes of user data to the skbs
 * pending on @queue.
 *
 * Data is fetched through @getfrag(@from, ...).  It is first placed into
 * the room left in the tail skb of @queue; once that is exhausted a new skb
 * is allocated and queued.  @transhdrlen is only consumed by the first skb
 * that gets allocated and is zeroed afterwards.
 *
 * cork->length is increased by @length up front; on failure the part that
 * was not consumed is subtracted again and IPSTATS_MIB_OUTDISCARDS is
 * incremented.
 *
 * Returns 0 on success.  Errors produced in this function are -EMSGSIZE,
 * -ENOBUFS, -EINVAL, -ENOMEM and -EFAULT, plus whatever
 * sock_alloc_send_skb() or skb_zerocopy_iter_dgram() report.
 */
0bbe84a6
VY
1451static int __ip6_append_data(struct sock *sk,
1452 struct flowi6 *fl6,
1453 struct sk_buff_head *queue,
1454 struct inet_cork *cork,
1455 struct inet6_cork *v6_cork,
1456 struct page_frag *pfrag,
1457 int getfrag(void *from, char *to, int offset,
1458 int len, int odd, struct sk_buff *skb),
1459 void *from, int length, int transhdrlen,
5fdaa88d 1460 unsigned int flags, struct ipcm6_cookie *ipc6)
1da177e4 1461{
0c183379 1462 struct sk_buff *skb, *skb_prev = NULL;
10b8a3de 1463 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
b5947e5d 1464 struct ubuf_info *uarg = NULL;
0bbe84a6
VY
1465 int exthdrlen = 0;
1466 int dst_exthdrlen = 0;
1da177e4 1467 int hh_len;
1da177e4
LT
1468 int copy;
1469 int err;
1470 int offset = 0;
09c2d251 1471 u32 tskey = 0;
0bbe84a6
VY
1472 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1473 struct ipv6_txoptions *opt = v6_cork->opt;
32dce968 1474 int csummode = CHECKSUM_NONE;
682b1a9d 1475 unsigned int maxnonfragsize, headersize;
/* truesize added to skbs here; applied to sk->sk_wmem_alloc once on exit */
1f4c6eb2 1476 unsigned int wmem_alloc_delta = 0;
100f6d8e 1477 bool paged, extra_uref = false;
1da177e4 1478

/* the extension header lengths are only set when the queue is still empty */
0bbe84a6
VY
1479 skb = skb_peek_tail(queue);
1480 if (!skb) {
1481 exthdrlen = opt ? opt->opt_flen : 0;
7efdba5b 1482 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1da177e4 1483 }
0bbe84a6 1484

/* a non-zero gso_size selects paged allocation and IP6_MAX_MTU instead of fragsize */
15e36f5b 1485 paged = !!cork->gso_size;
bec1f6f6 1486 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
e367c2d0 1487 orig_mtu = mtu;
1da177e4 1488

678ca42d
WB
1489 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1490 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1491 tskey = sk->sk_tskey++;
1492
d8d1f30b 1493 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1da177e4 1494

/* fragheaderlen: IPv6 header plus the headers counted in nfheader_len/opt_nflen */
a1b05140 1495 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
b4ce9277 1496 (opt ? opt->opt_nflen : 0);
4df98e76
HFS
1497 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1498 sizeof(struct frag_hdr);
1da177e4 1499

682b1a9d
HFS
1500 headersize = sizeof(struct ipv6hdr) +
1501 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1502 (dst_allfrag(&rt->dst) ?
1503 sizeof(struct frag_hdr) : 0) +
1504 rt->rt6i_nfheader_len;
1505
10b8a3de
PA
1506 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1507 * the first fragment
1508 */
1509 if (headersize + transhdrlen > mtu)
1510 goto emsgsize;
1511
26879da5 1512 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
682b1a9d
HFS
1513 (sk->sk_protocol == IPPROTO_UDP ||
1514 sk->sk_protocol == IPPROTO_RAW)) {
1515 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1516 sizeof(struct ipv6hdr));
1517 goto emsgsize;
1518 }
4df98e76 1519

682b1a9d
HFS
1520 if (ip6_sk_ignore_df(sk))
1521 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1522 else
1523 maxnonfragsize = mtu;
4df98e76 1524

/* the emsgsize label sits inside this block so the earlier checks can jump into it */
682b1a9d 1525 if (cork->length + length > maxnonfragsize - headersize) {
4df98e76 1526emsgsize:
10b8a3de
PA
1527 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1528 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
682b1a9d 1529 return -EMSGSIZE;
1da177e4
LT
1530 }
1531
682b1a9d
HFS
1532 /* CHECKSUM_PARTIAL only with no extension headers and when
1533 * we are not going to fragment
1534 */
1535 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1536 headersize == sizeof(struct ipv6hdr) &&
2b89ed65 1537 length <= mtu - headersize &&
bec1f6f6 1538 (!(flags & MSG_MORE) || cork->gso_size) &&
c8cd0989 1539 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
682b1a9d
HFS
1540 csummode = CHECKSUM_PARTIAL;
1541
b5947e5d 1542 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
8c793822 1543 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
b5947e5d
WB
1544 if (!uarg)
1545 return -ENOBUFS;
522924b5 1546 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
b5947e5d
WB
1547 if (rt->dst.dev->features & NETIF_F_SG &&
1548 csummode == CHECKSUM_PARTIAL) {
1549 paged = true;
1550 } else {
/* zerocopy is switched off on the uarg unless SG and CHECKSUM_PARTIAL both apply */
1551 uarg->zerocopy = 0;
52900d22 1552 skb_zcopy_set(skb, uarg, &extra_uref);
b5947e5d
WB
1553 }
1554 }
1555
1da177e4
LT
1556 /*
1557 * Let's try using as much space as possible.
1558 * Use MTU if total length of the message fits into the MTU.
1559 * Otherwise, we need to reserve fragment header and
1560 * fragment alignment (= 8-15 octets, in total).
1561 *
634a63e7 1562 * Note that we may need to "move" the data from the tail
1ab1457c 1563 * of the buffer to the new fragment when we split
1da177e4
LT
1564 * the message.
1565 *
1ab1457c 1566 * FIXME: It may be fragmented into multiple chunks
1da177e4
LT
1567 * at once if non-fragmentable extension headers
1568 * are too large.
1ab1457c 1569 * --yoshfuji
1da177e4
LT
1570 */
1571
/* account the whole request now; the error path subtracts what was not consumed */
2811ebac 1572 cork->length += length;
2811ebac 1573 if (!skb)
1da177e4
LT
1574 goto alloc_new_skb;
1575
1576 while (length > 0) {
1577 /* Check if the remaining data fits into current packet. */
bdc712b4 1578 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1da177e4
LT
1579 if (copy < length)
1580 copy = maxfraglen - skb->len;
1581
1582 if (copy <= 0) {
1583 char *data;
1584 unsigned int datalen;
1585 unsigned int fraglen;
1586 unsigned int fraggap;
6d123b81 1587 unsigned int alloclen, alloc_extra;
aba36930 1588 unsigned int pagedlen;
/* also entered directly by the goto above when the queue was empty */
1da177e4 1589alloc_new_skb:
1da177e4 1590 /* There's no room in the current skb */
/* fraggap: bytes of the current skb beyond maxfraglen, moved to the new skb below */
0c183379
G
1591 if (skb)
1592 fraggap = skb->len - maxfraglen;
1da177e4
LT
1593 else
1594 fraggap = 0;
0c183379 1595 /* update mtu and maxfraglen if necessary */
63159f29 1596 if (!skb || !skb_prev)
0c183379 1597 ip6_append_data_mtu(&mtu, &maxfraglen,
75a493e6 1598 fragheaderlen, skb, rt,
e367c2d0 1599 orig_mtu);
0c183379
G
1600
1601 skb_prev = skb;
1da177e4
LT
1602
1603 /*
1604 * If remaining data exceeds the mtu,
1605 * we know we need more fragment(s).
1606 */
1607 datalen = length + fraggap;
1da177e4 1608

0c183379
G
1609 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1610 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
15e36f5b 1611 fraglen = datalen + fragheaderlen;
aba36930 1612 pagedlen = 0;
15e36f5b 1613

6d123b81
JK
1614 alloc_extra = hh_len;
1615 alloc_extra += dst_exthdrlen;
1616 alloc_extra += rt->dst.trailer_len;
1617
1618 /* We just reserve space for fragment header.
1619 * Note: this may be overallocation if the message
1620 * (without MSG_MORE) fits into the MTU.
1621 */
1622 alloc_extra += sizeof(struct frag_hdr);
1623
/* choose how much goes into the linear area (alloclen) versus page frags (pagedlen) */
1da177e4 1624 if ((flags & MSG_MORE) &&
d8d1f30b 1625 !(rt->dst.dev->features&NETIF_F_SG))
1da177e4 1626 alloclen = mtu;
6d123b81
JK
1627 else if (!paged &&
1628 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1629 !(rt->dst.dev->features & NETIF_F_SG)))
15e36f5b
WB
1630 alloclen = fraglen;
1631 else {
1632 alloclen = min_t(int, fraglen, MAX_HEADER);
1633 pagedlen = fraglen - alloclen;
1634 }
6d123b81 1635 alloclen += alloc_extra;
299b0767 1636

0c183379
G
1637 if (datalen != length + fraggap) {
1638 /*
1639 * this is not the last fragment, the trailer
1640 * space is regarded as data space.
1641 */
1642 datalen += rt->dst.trailer_len;
1643 }
1644
0c183379 1645 fraglen = datalen + fragheaderlen;
1da177e4 1646

/* copy: bytes to fetch from the caller into the linear area of the new skb */
15e36f5b 1647 copy = datalen - transhdrlen - fraggap - pagedlen;
232cd35d
ED
1648 if (copy < 0) {
1649 err = -EINVAL;
1650 goto error;
1651 }
1da177e4 1652 if (transhdrlen) {
6d123b81 1653 skb = sock_alloc_send_skb(sk, alloclen,
1da177e4
LT
1654 (flags & MSG_DONTWAIT), &err);
1655 } else {
1656 skb = NULL;
1f4c6eb2 1657 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1da177e4 1658 2 * sk->sk_sndbuf)
6d123b81 1659 skb = alloc_skb(alloclen,
1f4c6eb2 1660 sk->sk_allocation);
63159f29 1661 if (unlikely(!skb))
1da177e4
LT
1662 err = -ENOBUFS;
1663 }
63159f29 1664 if (!skb)
1da177e4
LT
1665 goto error;
1666 /*
1667 * Fill in the control structures
1668 */
9c9c9ad5 1669 skb->protocol = htons(ETH_P_IPV6);
32dce968 1670 skb->ip_summed = csummode;
1da177e4 1671 skb->csum = 0;
1f85851e
G
1672 /* reserve for fragmentation and ipsec header */
1673 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1674 dst_exthdrlen);
1da177e4
LT
1675
1676 /*
1677 * Find where to start putting bytes
1678 */
15e36f5b 1679 data = skb_put(skb, fraglen - pagedlen);
1f85851e
G
1680 skb_set_network_header(skb, exthdrlen);
1681 data += fragheaderlen;
b0e380b1
ACM
1682 skb->transport_header = (skb->network_header +
1683 fragheaderlen);
/* move the excess tail of the previous skb over and fix up both checksums */
1da177e4
LT
1684 if (fraggap) {
1685 skb->csum = skb_copy_and_csum_bits(
1686 skb_prev, maxfraglen,
8d5930df 1687 data + transhdrlen, fraggap);
1da177e4
LT
1688 skb_prev->csum = csum_sub(skb_prev->csum,
1689 skb->csum);
1690 data += fraggap;
e9fa4f7b 1691 pskb_trim_unique(skb_prev, maxfraglen);
1da177e4 1692 }
232cd35d
ED
1693 if (copy > 0 &&
1694 getfrag(from, data + transhdrlen, offset,
1695 copy, fraggap, skb) < 0) {
1da177e4
LT
1696 err = -EFAULT;
1697 kfree_skb(skb);
1698 goto error;
1699 }
1700
1701 offset += copy;
15e36f5b 1702 length -= copy + transhdrlen;
/* these header lengths apply to the first allocated skb only */
1da177e4
LT
1703 transhdrlen = 0;
1704 exthdrlen = 0;
299b0767 1705 dst_exthdrlen = 0;
1da177e4 1706

52900d22
WB
1707 /* Only the initial fragment is time stamped */
1708 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1709 cork->tx_flags = 0;
1710 skb_shinfo(skb)->tskey = tskey;
1711 tskey = 0;
1712 skb_zcopy_set(skb, uarg, &extra_uref);
1713
0dec879f
JA
1714 if ((flags & MSG_CONFIRM) && !skb_prev)
1715 skb_set_dst_pending_confirm(skb, 1);
1716
1da177e4
LT
1717 /*
1718 * Put the packet on the pending queue
1719 */
/* skbs without a destructor get sock_wfree; their truesize is charged via wmem_alloc_delta */
1f4c6eb2
ED
1720 if (!skb->destructor) {
1721 skb->destructor = sock_wfree;
1722 skb->sk = sk;
1723 wmem_alloc_delta += skb->truesize;
1724 }
0bbe84a6 1725 __skb_queue_tail(queue, skb);
1da177e4
LT
1726 continue;
1727 }
1728
1729 if (copy > length)
1730 copy = length;
1731
/* no scatter-gather and enough tailroom: copy into the linear area */
113f99c3
WB
1732 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1733 skb_tailroom(skb) >= copy) {
1da177e4
LT
1734 unsigned int off;
1735
1736 off = skb->len;
1737 if (getfrag(from, skb_put(skb, copy),
1738 offset, copy, off, skb) < 0) {
1739 __skb_trim(skb, off);
1740 err = -EFAULT;
1741 goto error;
1742 }
/* otherwise, without zerocopy, copy into a page fragment attached to the skb */
b5947e5d 1743 } else if (!uarg || !uarg->zerocopy) {
1da177e4 1744 int i = skb_shinfo(skb)->nr_frags;
1da177e4 1745

5640f768
ED
1746 err = -ENOMEM;
1747 if (!sk_page_frag_refill(sk, pfrag))
1da177e4 1748 goto error;
5640f768
ED
1749
1750 if (!skb_can_coalesce(skb, i, pfrag->page,
1751 pfrag->offset)) {
1752 err = -EMSGSIZE;
1753 if (i == MAX_SKB_FRAGS)
1754 goto error;
1755
1756 __skb_fill_page_desc(skb, i, pfrag->page,
1757 pfrag->offset, 0);
1758 skb_shinfo(skb)->nr_frags = ++i;
1759 get_page(pfrag->page);
1da177e4 1760 }
5640f768 1761 copy = min_t(int, copy, pfrag->size - pfrag->offset);
9e903e08 1762 if (getfrag(from,
5640f768
ED
1763 page_address(pfrag->page) + pfrag->offset,
1764 offset, copy, skb->len, skb) < 0)
1765 goto error_efault;
1766
1767 pfrag->offset += copy;
1768 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1da177e4
LT
1769 skb->len += copy;
1770 skb->data_len += copy;
f945fa7a 1771 skb->truesize += copy;
1f4c6eb2 1772 wmem_alloc_delta += copy;
b5947e5d
WB
1773 } else {
1774 err = skb_zerocopy_iter_dgram(skb, from, copy);
1775 if (err < 0)
1776 goto error;
1da177e4
LT
1777 }
1778 offset += copy;
1779 length -= copy;
1780 }
5640f768 1781

9e8445a5
PA
1782 if (wmem_alloc_delta)
1783 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1da177e4 1784 return 0;
5640f768
ED
1785
1786error_efault:
1787 err = -EFAULT;
/* undo the accounting for the unconsumed part; skbs already queued stay queued */
1da177e4 1788error:
8e044917 1789 net_zcopy_put_abort(uarg, extra_uref);
bdc712b4 1790 cork->length -= length;
3bd653c8 1791 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1f4c6eb2 1792 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1da177e4
LT
1793 return err;
1794}
0bbe84a6
VY
1795
1796int ip6_append_data(struct sock *sk,
1797 int getfrag(void *from, char *to, int offset, int len,
1798 int odd, struct sk_buff *skb),
26879da5
WW
1799 void *from, int length, int transhdrlen,
1800 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
5fdaa88d 1801 struct rt6_info *rt, unsigned int flags)
0bbe84a6
VY
1802{
1803 struct inet_sock *inet = inet_sk(sk);
1804 struct ipv6_pinfo *np = inet6_sk(sk);
1805 int exthdrlen;
1806 int err;
1807
1808 if (flags&MSG_PROBE)
1809 return 0;
1810 if (skb_queue_empty(&sk->sk_write_queue)) {
1811 /*
1812 * setup for corking
1813 */
26879da5 1814 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
5fdaa88d 1815 ipc6, rt, fl6);
0bbe84a6
VY
1816 if (err)
1817 return err;
1818
26879da5 1819 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
0bbe84a6
VY
1820 length += exthdrlen;
1821 transhdrlen += exthdrlen;
1822 } else {
1823 fl6 = &inet->cork.fl.u.ip6;
1824 transhdrlen = 0;
1825 }
1826
1827 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1828 &np->cork, sk_page_frag(sk), getfrag,
5fdaa88d 1829 from, length, transhdrlen, flags, ipc6);
0bbe84a6 1830}
a495f836 1831EXPORT_SYMBOL_GPL(ip6_append_data);
1da177e4 1832
366e41d9
VY
1833static void ip6_cork_release(struct inet_cork_full *cork,
1834 struct inet6_cork *v6_cork)
bf138862 1835{
366e41d9
VY
1836 if (v6_cork->opt) {
1837 kfree(v6_cork->opt->dst0opt);
1838 kfree(v6_cork->opt->dst1opt);
1839 kfree(v6_cork->opt->hopopt);
1840 kfree(v6_cork->opt->srcrt);
1841 kfree(v6_cork->opt);
1842 v6_cork->opt = NULL;
0178b695
HX
1843 }
1844
366e41d9
VY
1845 if (cork->base.dst) {
1846 dst_release(cork->base.dst);
1847 cork->base.dst = NULL;
1848 cork->base.flags &= ~IPCORK_ALLFRAG;
bf138862 1849 }
366e41d9 1850 memset(&cork->fl, 0, sizeof(cork->fl));
bf138862
PE
1851}
1852
/*
 * __ip6_make_skb - collapse the skbs pending on @queue into one packet.
 *
 * The first skb dequeued becomes the head.  Every further skb is chained
 * on the head's frag_list, is accounted in the head's len, data_len and
 * truesize, and has its destructor and sk cleared.  The extension headers
 * from the corked options and the IPv6 header are then pushed in front of
 * the head, priority/mark/tstamp/dst are filled in from the socket and the
 * cork, output statistics are updated and the cork is released.
 *
 * Returns the head skb, or NULL when @queue was empty; in that case the
 * cork is not released here.
 */
6422398c
VY
1853struct sk_buff *__ip6_make_skb(struct sock *sk,
1854 struct sk_buff_head *queue,
1855 struct inet_cork_full *cork,
1856 struct inet6_cork *v6_cork)
1da177e4
LT
1857{
1858 struct sk_buff *skb, *tmp_skb;
1859 struct sk_buff **tail_skb;
1860 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1da177e4 1861 struct ipv6_pinfo *np = inet6_sk(sk);
3bd653c8 1862 struct net *net = sock_net(sk);
1da177e4 1863 struct ipv6hdr *hdr;
6422398c
VY
1864 struct ipv6_txoptions *opt = v6_cork->opt;
1865 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1866 struct flowi6 *fl6 = &cork->fl.u.ip6;
4c9483b2 1867 unsigned char proto = fl6->flowi6_proto;
1da177e4 1868

6422398c 1869 skb = __skb_dequeue(queue);
63159f29 1870 if (!skb)
1da177e4
LT
1871 goto out;
1872 tail_skb = &(skb_shinfo(skb)->frag_list);
1873
1874 /* move skb->data to ip header from ext header */
d56f90a7 1875 if (skb->data < skb_network_header(skb))
bbe735e4 1876 __skb_pull(skb, skb_network_offset(skb));
6422398c 1877 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
/* note: the pull length is taken from the head skb, not from tmp_skb */
cfe1fc77 1878 __skb_pull(tmp_skb, skb_network_header_len(skb));
1da177e4
LT
1879 *tail_skb = tmp_skb;
1880 tail_skb = &(tmp_skb->next);
1881 skb->len += tmp_skb->len;
1882 skb->data_len += tmp_skb->len;
1da177e4 1883 skb->truesize += tmp_skb->truesize;
1da177e4
LT
1884 tmp_skb->destructor = NULL;
1885 tmp_skb->sk = NULL;
1da177e4
LT
1886 }
1887
28a89453 1888 /* Allow local fragmentation. */
60ff7467 1889 skb->ignore_df = ip6_sk_ignore_df(sk);
28a89453 1890

4e3fd7a0 1891 *final_dst = fl6->daddr;
cfe1fc77 1892 __skb_pull(skb, skb_network_header_len(skb));
/* proto is passed by address so the option helpers can update the next-header value */
1da177e4
LT
1893 if (opt && opt->opt_flen)
1894 ipv6_push_frag_opts(skb, opt, &proto);
/* NOTE(review): final_dst is passed by address here - presumably it may be repointed; confirm in ipv6_push_nfrag_opts() */
1895 if (opt && opt->opt_nflen)
613fa3ca 1896 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1da177e4 1897

e2d1bca7
ACM
1898 skb_push(skb, sizeof(struct ipv6hdr));
1899 skb_reset_network_header(skb);
0660e03f 1900 hdr = ipv6_hdr(skb);
1ab1457c 1901

6422398c 1902 ip6_flow_hdr(hdr, v6_cork->tclass,
cb1ce2ef 1903 ip6_make_flowlabel(net, skb, fl6->flowlabel,
513674b5 1904 ip6_autoflowlabel(net, np), fl6));
6422398c 1905 hdr->hop_limit = v6_cork->hop_limit;
1da177e4 1906 hdr->nexthdr = proto;
4e3fd7a0
AD
1907 hdr->saddr = fl6->saddr;
1908 hdr->daddr = *final_dst;
1da177e4 1909

a2c2064f 1910 skb->priority = sk->sk_priority;
c6af0c22 1911 skb->mark = cork->base.mark;
a2c2064f 1912

a818f75e
JSP
1913 skb->tstamp = cork->base.transmit_time;
1914
/* the skb takes its own route reference; the cork's reference is dropped by ip6_cork_release() below */
d8d1f30b 1915 skb_dst_set(skb, dst_clone(&rt->dst));
edf391ff 1916 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
14878f75 1917 if (proto == IPPROTO_ICMPV6) {
adf30907 1918 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
14878f75 1919

43a43b60
HFS
1920 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1921 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
14878f75
DS
1922 }
1923
6422398c
VY
1924 ip6_cork_release(cork, v6_cork);
1925out:
1926 return skb;
1927}
1928
1929int ip6_send_skb(struct sk_buff *skb)
1930{
1931 struct net *net = sock_net(skb->sk);
1932 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1933 int err;
1934
33224b16 1935 err = ip6_local_out(net, skb->sk, skb);
1da177e4
LT
1936 if (err) {
1937 if (err > 0)
6ce9e7b5 1938 err = net_xmit_errno(err);
1da177e4 1939 if (err)
6422398c
VY
1940 IP6_INC_STATS(net, rt->rt6i_idev,
1941 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1942 }
1943
1da177e4 1944 return err;
6422398c
VY
1945}
1946
/*
 * ip6_push_pending_frames - build the pending datagram on @sk and send it.
 *
 * Returns 0 when ip6_finish_skb() yields no packet, otherwise the result
 * of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1da177e4 1958
0bbe84a6 1959static void __ip6_flush_pending_frames(struct sock *sk,
6422398c
VY
1960 struct sk_buff_head *queue,
1961 struct inet_cork_full *cork,
1962 struct inet6_cork *v6_cork)
1da177e4 1963{
1da177e4
LT
1964 struct sk_buff *skb;
1965
0bbe84a6 1966 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
adf30907
ED
1967 if (skb_dst(skb))
1968 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
e1f52208 1969 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1970 kfree_skb(skb);
1971 }
1972
6422398c 1973 ip6_cork_release(cork, v6_cork);
1da177e4 1974}
0bbe84a6
VY
1975
1976void ip6_flush_pending_frames(struct sock *sk)
1977{
6422398c
VY
1978 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1979 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
0bbe84a6 1980}
a495f836 1981EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
6422398c
VY
1982
1983struct sk_buff *ip6_make_skb(struct sock *sk,
1984 int getfrag(void *from, char *to, int offset,
1985 int len, int odd, struct sk_buff *skb),
1986 void *from, int length, int transhdrlen,
26879da5 1987 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
6422398c 1988 struct rt6_info *rt, unsigned int flags,
5fdaa88d 1989 struct inet_cork_full *cork)
6422398c 1990{
6422398c
VY
1991 struct inet6_cork v6_cork;
1992 struct sk_buff_head queue;
26879da5 1993 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
6422398c
VY
1994 int err;
1995
1996 if (flags & MSG_PROBE)
1997 return NULL;
1998
1999 __skb_queue_head_init(&queue);
2000
1cd7884d
WB
2001 cork->base.flags = 0;
2002 cork->base.addr = 0;
2003 cork->base.opt = NULL;
2004 cork->base.dst = NULL;
6422398c 2005 v6_cork.opt = NULL;
5fdaa88d 2006 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
862c03ee 2007 if (err) {
1cd7884d 2008 ip6_cork_release(cork, &v6_cork);
6422398c 2009 return ERR_PTR(err);
862c03ee 2010 }
26879da5
WW
2011 if (ipc6->dontfrag < 0)
2012 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
6422398c 2013
1cd7884d 2014 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
6422398c
VY
2015 &current->task_frag, getfrag, from,
2016 length + exthdrlen, transhdrlen + exthdrlen,
5fdaa88d 2017 flags, ipc6);
6422398c 2018 if (err) {
1cd7884d 2019 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
6422398c
VY
2020 return ERR_PTR(err);
2021 }
2022
1cd7884d 2023 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
6422398c 2024}