net/ipv6/ip6_output.c (linux-block.git)

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

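/* ip6_finish_output2() is the last step of the output path: it resolves
 * the next hop to a neighbour entry and hands the packet to the device
 * layer. Multicast packets may additionally be looped back to local
 * listeners, and lwtunnel encapsulation may redirect transmission.
 */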
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	const struct in6_addr *nexthop;
	struct neighbour *neigh;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

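/* __ip6_finish_output() applies post-routing policy: re-route after an
 * IPsec transform installed a new dst, segment GSO packets that exceed
 * the egress MTU, fragment oversized packets, or transmit directly via
 * ip6_finish_output2(). ip6_finish_output() wraps it with the cgroup
 * BPF egress hook.
 */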
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip6_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb(skb);
		return ret;
	}
}

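/* ip6_output() is the dst output callback for locally generated
 * packets: it runs the NF_INET_POST_ROUTING netfilter hook (unless the
 * packet was already rerouted) and continues in ip6_finish_output().
 */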
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

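/* Deliver a packet carrying a Router Alert option to the raw sockets
 * registered for that alert value: clones go to all but the last
 * matching socket, which receives the original. Returns 1 if the
 * packet was consumed.
 */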
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

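/* Decide whether a forwarded packet must be rejected as too big for
 * the egress MTU, taking ignore_df, GSO and the conntrack-supplied
 * frag_max_size into account.
 */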
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

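/* Propagate per-packet metadata (dst, device, mark, priority,
 * conntrack/extension and security state) from the original skb to a
 * freshly built fragment.
 */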
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

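/* Fast-path fragmentation helpers: ip6_fraglist_init() turns an skb
 * whose frag_list members already have fragment-sized payloads into
 * the first fragment, and ip6_fraglist_prepare() converts each
 * successive list member into a complete IPv6 fragment.
 */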
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	 * then align the next start on an eight byte boundary
	 */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 * Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 * Charge the memory for the fragment to any owner
	 * it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 * Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 * Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 * Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

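/* Fragment an IPv6 packet and pass each fragment to @output. If the
 * skb carries a suitably laid out frag_list, the fast path above is
 * used; otherwise the slow path copies the payload into newly
 * allocated skbs via ip6_frag_init()/ip6_frag_next().
 */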
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb->tstamp = tstamp;
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		frag->tstamp = tstamp;
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

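/* Validation of a socket's cached dst: the cached route is only reused
 * when it still matches the flow's destination (and source, with
 * subtrees enabled) and the requested outgoing interface.
 */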
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 *	ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *	@skb: Packet for which lookup is done
 *	@dev: Tunnel device
 *	@net: Network namespace of tunnel device
 *	@sock: Socket which provides route info
 *	@saddr: Memory to store the src ip address
 *	@info: Tunnel information
 *	@protocol: IP protocol
 *	@use_cache: Flag to enable cache usage
 *
 *	This function performs a route lookup on a tunnel.
 *
 *	It returns a valid dst pointer and stores src address to be used in
 *	tunnel in param saddr on success, else a pointer encoded error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					  info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

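/* Deep-copy helpers for extension headers: the cork must keep its own
 * copy of the transmit options for as long as data is being appended.
 */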
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

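/* Prepare the cork for a series of ip6_append_data() calls: duplicate
 * the transmit options, pin the route, and record the MTU, hop limit,
 * traffic class and timestamping state to apply when the queued data
 * is finally sent.
 */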
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

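/* Work horse of ip6_append_data(): append user data to the queue,
 * filling the tail skb and allocating new ones as needed so that each
 * queued skb stays within the (fragment-aligned) MTU, with optional
 * zerocopy and GSO handling.
 */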
0bbe84a6
VY
1422static int __ip6_append_data(struct sock *sk,
1423 struct flowi6 *fl6,
1424 struct sk_buff_head *queue,
1425 struct inet_cork *cork,
1426 struct inet6_cork *v6_cork,
1427 struct page_frag *pfrag,
1428 int getfrag(void *from, char *to, int offset,
1429 int len, int odd, struct sk_buff *skb),
1430 void *from, int length, int transhdrlen,
5fdaa88d 1431 unsigned int flags, struct ipcm6_cookie *ipc6)
1da177e4 1432{
0c183379 1433 struct sk_buff *skb, *skb_prev = NULL;
10b8a3de 1434 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
b5947e5d 1435 struct ubuf_info *uarg = NULL;
0bbe84a6
VY
1436 int exthdrlen = 0;
1437 int dst_exthdrlen = 0;
1da177e4 1438 int hh_len;
1da177e4
LT
1439 int copy;
1440 int err;
1441 int offset = 0;
09c2d251 1442 u32 tskey = 0;
0bbe84a6
VY
1443 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1444 struct ipv6_txoptions *opt = v6_cork->opt;
32dce968 1445 int csummode = CHECKSUM_NONE;
682b1a9d 1446 unsigned int maxnonfragsize, headersize;
1f4c6eb2 1447 unsigned int wmem_alloc_delta = 0;
100f6d8e 1448 bool paged, extra_uref = false;
1da177e4 1449
0bbe84a6
VY
1450 skb = skb_peek_tail(queue);
1451 if (!skb) {
1452 exthdrlen = opt ? opt->opt_flen : 0;
7efdba5b 1453 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1da177e4 1454 }
0bbe84a6 1455
15e36f5b 1456 paged = !!cork->gso_size;
bec1f6f6 1457 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
e367c2d0 1458 orig_mtu = mtu;
1da177e4 1459
678ca42d
WB
1460 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1461 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1462 tskey = sk->sk_tskey++;
1463
d8d1f30b 1464 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1da177e4 1465
a1b05140 1466 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
b4ce9277 1467 (opt ? opt->opt_nflen : 0);
4df98e76
HFS
1468 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1469 sizeof(struct frag_hdr);
1da177e4 1470
682b1a9d
HFS
1471 headersize = sizeof(struct ipv6hdr) +
1472 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1473 (dst_allfrag(&rt->dst) ?
1474 sizeof(struct frag_hdr) : 0) +
1475 rt->rt6i_nfheader_len;
1476
10b8a3de
PA
1477 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1478 * the first fragment
1479 */
1480 if (headersize + transhdrlen > mtu)
1481 goto emsgsize;
1482
26879da5 1483 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
682b1a9d
HFS
1484 (sk->sk_protocol == IPPROTO_UDP ||
1485 sk->sk_protocol == IPPROTO_RAW)) {
1486 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1487 sizeof(struct ipv6hdr));
1488 goto emsgsize;
1489 }
4df98e76 1490
682b1a9d
HFS
1491 if (ip6_sk_ignore_df(sk))
1492 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1493 else
1494 maxnonfragsize = mtu;
4df98e76 1495
682b1a9d 1496 if (cork->length + length > maxnonfragsize - headersize) {
4df98e76 1497emsgsize:
10b8a3de
PA
1498 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1499 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
682b1a9d 1500 return -EMSGSIZE;
1da177e4
LT
1501 }
1502
682b1a9d
HFS
1503 /* CHECKSUM_PARTIAL only with no extension headers and when
1504 * we are not going to fragment
1505 */
1506 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1507 headersize == sizeof(struct ipv6hdr) &&
2b89ed65 1508 length <= mtu - headersize &&
bec1f6f6 1509 (!(flags & MSG_MORE) || cork->gso_size) &&
c8cd0989 1510 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
682b1a9d
HFS
1511 csummode = CHECKSUM_PARTIAL;
1512
b5947e5d 1513 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
8c793822 1514 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
b5947e5d
WB
1515 if (!uarg)
1516 return -ENOBUFS;
522924b5 1517 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
b5947e5d
WB
1518 if (rt->dst.dev->features & NETIF_F_SG &&
1519 csummode == CHECKSUM_PARTIAL) {
1520 paged = true;
1521 } else {
1522 uarg->zerocopy = 0;
52900d22 1523 skb_zcopy_set(skb, uarg, &extra_uref);
b5947e5d
WB
1524 }
1525 }
1526
1da177e4
LT
1527 /*
1528 * Let's try using as much space as possible.
1529 * Use MTU if total length of the message fits into the MTU.
1530 * Otherwise, we need to reserve fragment header and
1531 * fragment alignment (= 8-15 octects, in total).
1532 *
634a63e7 1533 * Note that we may need to "move" the data from the tail
1ab1457c 1534 * of the buffer to the new fragment when we split
1da177e4
LT
1535 * the message.
1536 *
1ab1457c 1537 * FIXME: It may be fragmented into multiple chunks
1da177e4
LT
1538 * at once if non-fragmentable extension headers
1539 * are too large.
1ab1457c 1540 * --yoshfuji
1da177e4
LT
1541 */
1542
2811ebac 1543 cork->length += length;
2811ebac 1544 if (!skb)
1da177e4
LT
1545 goto alloc_new_skb;
1546
1547 while (length > 0) {
1548 /* Check if the remaining data fits into current packet. */
bdc712b4 1549 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1da177e4
LT
1550 if (copy < length)
1551 copy = maxfraglen - skb->len;
1552
1553 if (copy <= 0) {
1554 char *data;
1555 unsigned int datalen;
1556 unsigned int fraglen;
1557 unsigned int fraggap;
1558 unsigned int alloclen;
aba36930 1559 unsigned int pagedlen;
1da177e4 1560alloc_new_skb:
1da177e4 1561 /* There's no room in the current skb */
0c183379
G
1562 if (skb)
1563 fraggap = skb->len - maxfraglen;
1da177e4
LT
1564 else
1565 fraggap = 0;
0c183379 1566 /* update mtu and maxfraglen if necessary */
63159f29 1567 if (!skb || !skb_prev)
0c183379 1568 ip6_append_data_mtu(&mtu, &maxfraglen,
75a493e6 1569 fragheaderlen, skb, rt,
e367c2d0 1570 orig_mtu);
0c183379
G
1571
1572 skb_prev = skb;
1da177e4
LT
1573
1574 /*
1575 * If remaining data exceeds the mtu,
1576 * we know we need more fragment(s).
1577 */
1578 datalen = length + fraggap;
1da177e4 1579
0c183379
G
1580 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1581 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
15e36f5b 1582 fraglen = datalen + fragheaderlen;
aba36930 1583 pagedlen = 0;
15e36f5b 1584
1da177e4 1585 if ((flags & MSG_MORE) &&
d8d1f30b 1586 !(rt->dst.dev->features&NETIF_F_SG))
1da177e4 1587 alloclen = mtu;
15e36f5b
WB
1588 else if (!paged)
1589 alloclen = fraglen;
1590 else {
1591 alloclen = min_t(int, fraglen, MAX_HEADER);
1592 pagedlen = fraglen - alloclen;
1593 }
1da177e4 1594
299b0767
SK
1595 alloclen += dst_exthdrlen;
1596
0c183379
G
1597 if (datalen != length + fraggap) {
1598 /*
1599 * this is not the last fragment, the trailer
1600 * space is regarded as data space.
1601 */
1602 datalen += rt->dst.trailer_len;
1603 }
1604
1605 alloclen += rt->dst.trailer_len;
1606 fraglen = datalen + fragheaderlen;
1da177e4
LT
1607
1608 /*
1609 * We just reserve space for fragment header.
1ab1457c 1610 * Note: this may be overallocation if the message
1da177e4
LT
1611 * (without MSG_MORE) fits into the MTU.
1612 */
1613 alloclen += sizeof(struct frag_hdr);
1614
15e36f5b 1615 copy = datalen - transhdrlen - fraggap - pagedlen;
232cd35d
ED
1616 if (copy < 0) {
1617 err = -EINVAL;
1618 goto error;
1619 }
1da177e4
LT
1620 if (transhdrlen) {
1621 skb = sock_alloc_send_skb(sk,
1622 alloclen + hh_len,
1623 (flags & MSG_DONTWAIT), &err);
1624 } else {
1625 skb = NULL;
1f4c6eb2 1626 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1da177e4 1627 2 * sk->sk_sndbuf)
1f4c6eb2
ED
1628 skb = alloc_skb(alloclen + hh_len,
1629 sk->sk_allocation);
63159f29 1630 if (unlikely(!skb))
1da177e4
LT
1631 err = -ENOBUFS;
1632 }
63159f29 1633 if (!skb)
1da177e4
LT
1634 goto error;
1635 /*
1636 * Fill in the control structures
1637 */
9c9c9ad5 1638 skb->protocol = htons(ETH_P_IPV6);
32dce968 1639 skb->ip_summed = csummode;
1da177e4 1640 skb->csum = 0;
1f85851e
G
1641 /* reserve for fragmentation and ipsec header */
1642 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1643 dst_exthdrlen);
1da177e4
LT
1644
1645 /*
1646 * Find where to start putting bytes
1647 */
15e36f5b 1648 data = skb_put(skb, fraglen - pagedlen);
1f85851e
G
1649 skb_set_network_header(skb, exthdrlen);
1650 data += fragheaderlen;
b0e380b1
ACM
1651 skb->transport_header = (skb->network_header +
1652 fragheaderlen);
1da177e4
LT
1653 if (fraggap) {
1654 skb->csum = skb_copy_and_csum_bits(
1655 skb_prev, maxfraglen,
8d5930df 1656 data + transhdrlen, fraggap);
1da177e4
LT
1657 skb_prev->csum = csum_sub(skb_prev->csum,
1658 skb->csum);
1659 data += fraggap;
e9fa4f7b 1660 pskb_trim_unique(skb_prev, maxfraglen);
1da177e4 1661 }
232cd35d
ED
1662 if (copy > 0 &&
1663 getfrag(from, data + transhdrlen, offset,
1664 copy, fraggap, skb) < 0) {
1da177e4
LT
1665 err = -EFAULT;
1666 kfree_skb(skb);
1667 goto error;
1668 }
1669
1670 offset += copy;
15e36f5b 1671 length -= copy + transhdrlen;
1da177e4
LT
1672 transhdrlen = 0;
1673 exthdrlen = 0;
299b0767 1674 dst_exthdrlen = 0;
1da177e4 1675
52900d22
WB
1676 /* Only the initial fragment is time stamped */
1677 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1678 cork->tx_flags = 0;
1679 skb_shinfo(skb)->tskey = tskey;
1680 tskey = 0;
1681 skb_zcopy_set(skb, uarg, &extra_uref);
1682
0dec879f
JA
1683 if ((flags & MSG_CONFIRM) && !skb_prev)
1684 skb_set_dst_pending_confirm(skb, 1);
1685
1da177e4
LT
1686 /*
1687 * Put the packet on the pending queue
1688 */
1f4c6eb2
ED
1689 if (!skb->destructor) {
1690 skb->destructor = sock_wfree;
1691 skb->sk = sk;
1692 wmem_alloc_delta += skb->truesize;
1693 }
0bbe84a6 1694 __skb_queue_tail(queue, skb);
1da177e4
LT
1695 continue;
1696 }
1697
1698 if (copy > length)
1699 copy = length;
1700
113f99c3
WB
1701 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1702 skb_tailroom(skb) >= copy) {
1da177e4
LT
1703 unsigned int off;
1704
1705 off = skb->len;
1706 if (getfrag(from, skb_put(skb, copy),
1707 offset, copy, off, skb) < 0) {
1708 __skb_trim(skb, off);
1709 err = -EFAULT;
1710 goto error;
1711 }
b5947e5d 1712 } else if (!uarg || !uarg->zerocopy) {
1da177e4 1713 int i = skb_shinfo(skb)->nr_frags;
1da177e4 1714
5640f768
ED
1715 err = -ENOMEM;
1716 if (!sk_page_frag_refill(sk, pfrag))
1da177e4 1717 goto error;
5640f768
ED
1718
1719 if (!skb_can_coalesce(skb, i, pfrag->page,
1720 pfrag->offset)) {
1721 err = -EMSGSIZE;
1722 if (i == MAX_SKB_FRAGS)
1723 goto error;
1724
1725 __skb_fill_page_desc(skb, i, pfrag->page,
1726 pfrag->offset, 0);
1727 skb_shinfo(skb)->nr_frags = ++i;
1728 get_page(pfrag->page);
1da177e4 1729 }
5640f768 1730 copy = min_t(int, copy, pfrag->size - pfrag->offset);
9e903e08 1731 if (getfrag(from,
5640f768
ED
1732 page_address(pfrag->page) + pfrag->offset,
1733 offset, copy, skb->len, skb) < 0)
1734 goto error_efault;
1735
1736 pfrag->offset += copy;
1737 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1da177e4
LT
1738 skb->len += copy;
1739 skb->data_len += copy;
f945fa7a 1740 skb->truesize += copy;
1f4c6eb2 1741 wmem_alloc_delta += copy;
b5947e5d
WB
1742 } else {
1743 err = skb_zerocopy_iter_dgram(skb, from, copy);
1744 if (err < 0)
1745 goto error;
1da177e4
LT
1746 }
1747 offset += copy;
1748 length -= copy;
1749 }
5640f768 1750
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
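
/*
 * A minimal sketch of the getfrag() callback consumed above: it must
 * copy @len bytes of the caller's data at @offset into @to, folding a
 * partial checksum in at block offset @odd when the skb is not
 * CHECKSUM_PARTIAL. This example assumes a msghdr source and is
 * modeled loosely on the IPv4 helper ip_generic_getfrag();
 * "example_getfrag" is a hypothetical name, not code from this file:
 *
 *	static int example_getfrag(void *from, char *to, int offset,
 *				   int len, int odd, struct sk_buff *skb)
 *	{
 *		struct msghdr *msg = from;
 *
 *		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 *			if (!copy_from_iter_full(to, len, &msg->msg_iter))
 *				return -EFAULT;
 *		} else {
 *			__wsum csum = 0;
 *
 *			if (!csum_and_copy_from_iter_full(to, len, &csum,
 *							  &msg->msg_iter))
 *				return -EFAULT;
 *			skb->csum = csum_block_add(skb->csum, csum, odd);
 *		}
 *		return 0;
 *	}
 */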

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
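
/*
 * Usage sketch (assumed caller state; example_getfrag, ipc6, fl6 and
 * dst are placeholders): a datagram sendmsg() implementation corks,
 * appends, then pushes, flushing on error, all under the socket lock:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, example_getfrag, msg, len,
 *			      sizeof(struct udphdr), &ipc6, &fl6,
 *			      (struct rt6_info *)dst, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */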

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

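/*
 * Collapse everything queued by __ip6_append_data() into one packet:
 * the first skb keeps its headroom, every later skb is chained onto
 * its frag_list, then the extension headers and the IPv6 header are
 * pushed in front.
 */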
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

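/*
 * Hand a finished packet to the IP layer. ip6_local_out() runs the
 * netfilter LOCAL_OUT hook and then the dst output path; its positive
 * NET_XMIT return codes are remapped through net_xmit_errno(), and
 * real failures are counted as output discards.
 */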
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

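/*
 * ip6_finish_skb() (a static inline in include/net/ipv6.h) runs
 * __ip6_make_skb() on the socket's own write queue and cork state.
 */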
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

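/* Discard everything still queued, counting routed skbs as output
 * discards, and release the cork state. Shared by the error path of
 * ip6_make_skb() and by ip6_flush_pending_frames() below.
 */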
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

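/*
 * Single-shot variant of the cork/append/push sequence: the datagram
 * is built on a private queue with a caller-provided cork, so no
 * pending state is left on the socket's write queue if anything fails.
 */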
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
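
/*
 * Usage sketch for the single-shot path (assumed caller state;
 * example_getfrag, ipc6, fl6, dst and cork are placeholders, and the
 * transport-header setup done by real callers such as UDPv6 is
 * elided):
 *
 *	skb = ip6_make_skb(sk, example_getfrag, msg, len,
 *			   sizeof(struct udphdr), &ipc6, &fl6,
 *			   (struct rt6_info *)dst, msg->msg_flags, &cork);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = ip6_send_skb(skb);
 */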