// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

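/*
 * ip6_finish_output2() is the last step before handing a packet to the
 * neighbour layer: it makes sure there is enough headroom for the link
 * layer header, loops multicast packets back when required, honours
 * lwtunnel redirects, resolves (or creates) the neighbour entry for the
 * next hop and finally calls neigh_output().
 */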
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *daddr, *nexthop;
        struct ipv6hdr *hdr;
        struct neighbour *neigh;
        int ret;

        /* Be paranoid, rather than too clever. */
        if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
                /* Make sure idev stays alive */
                rcu_read_lock();
                skb = skb_expand_head(skb, hh_len);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        rcu_read_unlock();
                        return -ENOMEM;
                }
                rcu_read_unlock();
        }

        hdr = ipv6_hdr(skb);
        daddr = &hdr->daddr;
        if (ipv6_addr_is_multicast(daddr)) {
                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (hdr->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
                if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res != LWTUNNEL_XMIT_CONTINUE)
                        return res;
        }

        IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

        rcu_read_lock();
        nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
        neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

        if (IS_ERR_OR_NULL(neigh)) {
                if (unlikely(!neigh))
                        neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
                if (IS_ERR(neigh)) {
                        rcu_read_unlock();
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
                        kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
                        return -EINVAL;
                }
        }
        sock_confirm_neigh(skb, neigh);
        ret = neigh_output(neigh, skb, false);
        rcu_read_unlock();
        return ret;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                /* Last GSO segment can be smaller than gso_size (and MTU).
                 * Adding a fragment header would produce an "atomic fragment",
                 * which is considered harmful (RFC 8021). Avoid that.
                 */
                err = segs->len > mtu ?
                        ip6_fragment(net, sk, segs, ip6_finish_output2) :
                        ip6_finish_output2(net, sk, segs);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

static int ip6_finish_output_gso(struct net *net, struct sock *sk,
                                 struct sk_buff *skb, unsigned int mtu)
{
        if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
            !skb_gso_validate_network_len(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        return ip6_finish_output2(net, sk, skb);
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb))
                return ip6_finish_output_gso(net, sk, skb, mtu);

        if (skb->len > mtu ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);

        return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        switch (ret) {
        case NET_XMIT_SUCCESS:
        case NET_XMIT_CN:
                return __ip6_finish_output(net, sk, skb) ? : ret;
        default:
                kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
                return ret;
        }
}

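/*
 * ip6_output() is the dst_output() entry point for packets that already
 * have a route attached.  It runs the NF_INET_POST_ROUTING hook and then
 * ip6_finish_output(), unless the packet was rerouted by an XFRM policy
 * (IP6SKB_REROUTED), in which case the hook is skipped.
 */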
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, indev, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
        if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
                return ip6_default_np_autolabel(net);
        return inet6_test_bit(AUTOFLOWLABEL, sk);
}

/*
 * xmit an sk_buff (used by TCP and SCTP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        struct hop_jumbo_hdr *hop_jumbo;
        int hoplen = sizeof(*hop_jumbo);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8 proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(head_room > skb_headroom(skb))) {
                /* Make sure idev stays alive */
                rcu_read_lock();
                skb = skb_expand_head(skb, head_room);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        rcu_read_unlock();
                        return -ENOBUFS;
                }
                rcu_read_unlock();
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        if (unlikely(seg_len > IPV6_MAXPLEN)) {
                hop_jumbo = skb_push(skb, hoplen);

                hop_jumbo->nexthdr = proto;
                hop_jumbo->hdrlen = 0;
                hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
                hop_jumbo->tlv_len = 4;
                hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

                proto = IPPROTO_HOPOPTS;
                seg_len = 0;
                IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         * Fill in the IPv6 header
         */
        if (np)
                hlimit = READ_ONCE(np->hop_limit);
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, sk), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dev,
                               dst_output);
        }

        skb->dev = dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

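/*
 * Deliver the packet to every raw socket that registered on the
 * ip6_ra_chain for the given Router Alert value.  All but the last
 * matching socket receive a clone; returns 1 if at least one socket
 * consumed the packet.
 */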
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {

                        if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
                            !net_eq(sock_net(sk), dev_net(skb->dev))) {
                                continue;
                        }
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For reaction involving unicast neighbor discovery
                         * message destined to the proxied address, pass it to
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb_clear_tstamp(skb);
        return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}

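/*
 * ip6_forward() implements the forwarding path: it validates the packet
 * (hop limit, LRO, xfrm policy), hands Router Alert packets to interested
 * raw sockets, lets proxy-NDP packets go to local input, possibly emits a
 * redirect, enforces the path MTU and finally decrements the hop limit
 * before queueing the packet through the NF_INET_FORWARD hook.
 */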
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        SKB_DR(reason);
        u32 mtu;

        idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
            (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
            !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         * We DO NOT make any processing on
         * RA packets, pushing them to user level AS IS
         * without any WARRANTY that application will be able
         * to interpret them. The reason is that we
         * cannot make anything clever here.
         *
         * We are not end-node, so that if packet contains
         * AH/ESP, we cannot make anything.
         * Defragmentation also would be mistake, RA packets
         * cannot be fragmented, because there is no warranty
         * that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         * check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0) {
                        /* It's tempting to decrease the hop limit
                         * here by 1, as we do at the end of the
                         * function too.
                         *
                         * But that would be incorrect, as proxying is
                         * not forwarding. The ip6_input function
                         * will handle this packet locally, and it
                         * depends on the hop limit being unchanged.
                         *
                         * One example is the NDP hop limit, that
                         * always has to stay 255, but others would be
                         * similar checks around RA packets, where the
                         * user can even change the desired limit.
                         */
                        return ip6_input(skb);
                } else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                SKB_DR_SET(reason, XFRM_POLICY);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
         * send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 * incoming and outgoing devices are the same
                 * send a redirect.
                 */

                rt = dst_rt6_info(dst);
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                rcu_read_lock();
                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                rcu_read_unlock();
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

        mtu = ip6_dst_mtu_maybe_forward(dst, true);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
        SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
        kfree_skb_reason(skb, reason);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

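/*
 * Fast-path fragmentation helpers: when a packet already carries a
 * well-formed frag_list, ip6_fraglist_init() turns the head skb into the
 * first fragment and ip6_fraglist_prepare() fixes up each queued fragment
 * in turn, so no payload data has to be copied.
 */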
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter)
{
        unsigned int first_len;
        struct frag_hdr *fh;

        /* BUILD HEADER */
        *prevhdr = NEXTHDR_FRAGMENT;
        iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
        if (!iter->tmp_hdr)
                return -ENOMEM;

        iter->frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        iter->offset = 0;
        iter->hlen = hlen;
        iter->frag_id = frag_id;
        iter->nexthdr = nexthdr;

        __skb_pull(skb, hlen);
        fh = __skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

        return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
                          struct ip6_fraglist_iter *iter)
{
        struct sk_buff *frag = iter->frag;
        unsigned int hlen = iter->hlen;
        struct frag_hdr *fh;

        frag->ip_summed = CHECKSUM_NONE;
        skb_reset_transport_header(frag);
        fh = __skb_push(frag, sizeof(struct frag_hdr));
        __skb_push(frag, hlen);
        skb_reset_network_header(frag);
        memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
        iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
        fh->nexthdr = iter->nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(iter->offset);
        if (frag->next)
                fh->frag_off |= htons(IP6_MF);
        fh->identification = iter->frag_id;
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
        ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
        state->prevhdr = prevhdr;
        state->nexthdr = nexthdr;
        state->frag_id = frag_id;

        state->hlen = hlen;
        state->mtu = mtu;

        state->left = skb->len - hlen;  /* Space per frame */
        state->ptr = hlen;              /* Where to start from */

        state->hroom = hdr_room;
        state->troom = needed_tailroom;

        state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
        u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
        struct sk_buff *frag;
        struct frag_hdr *fh;
        unsigned int len;

        len = state->left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > state->mtu)
                len = state->mtu;
        /* IF: we are not sending up to and including the packet end
         * then align the next start on an eight byte boundary
         */
        if (len < state->left)
                len &= ~7;

        /* Allocate buffer */
        frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
                         state->hroom + state->troom, GFP_ATOMIC);
        if (!frag)
                return ERR_PTR(-ENOMEM);

        /*
         * Set up data on packet
         */

        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, state->hroom);
        skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
        frag->transport_header = (frag->network_header + state->hlen +
                                  sizeof(struct frag_hdr));

        /*
         * Charge the memory for the fragment to any owner
         * it might possess
         */
        if (skb->sk)
                skb_set_owner_w(frag, skb->sk);

        /*
         * Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

        fragnexthdr_offset = skb_network_header(frag);
        fragnexthdr_offset += prevhdr - skb_network_header(skb);
        *fragnexthdr_offset = NEXTHDR_FRAGMENT;

        /*
         * Build fragment header.
         */
        fh->nexthdr = state->nexthdr;
        fh->reserved = 0;
        fh->identification = state->frag_id;

        /*
         * Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
                             len));
        state->left -= len;

        fh->frag_off = htons(state->offset);
        if (state->left > 0)
                fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

        state->ptr += len;
        state->offset += len;

        return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

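/*
 * ip6_fragment() splits an oversized packet and feeds each fragment to
 * the given output callback.  It first tries the zero-copy frag_list fast
 * path (see ip6_fraglist_init() above); if the geometry does not allow
 * that, it falls back to the slow path that copies the payload into newly
 * allocated fragments via ip6_frag_next().
 */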
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        u8 tstamp_type = skb->tstamp_type;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np) {
                u32 frag_size = READ_ONCE(np->frag_size);

                if (frag_size && frag_size < mtu)
                        mtu = frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                /* We prevent @rt from being freed. */
                rcu_read_lock();

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb_set_delivery_time(skb, tstamp, tstamp_type);
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        rcu_read_unlock();
                        return 0;
                }

                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                rcu_read_unlock();
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         * Fragment the datagram.
         */

        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         * Keep copying data until we run out.
         */

        while (state.left > 0) {
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 * Put this fragment into the sending queue.
                 */
                skb_set_delivery_time(frag, tstamp, tstamp_type);
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = dst_rt6_info(dst);
        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account,
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

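/*
 * Common part of the route lookup helpers below: resolve the flow to a
 * dst entry, picking a source address first when the caller left it
 * unspecified, and work around optimistic-DAD source addresses by
 * rerouting through the default router when needed.
 */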
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
                                          fl6->flowi6_l3mdev,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = dst_rt6_info(*dst);
        rcu_read_lock();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

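/*
 * Duplicate a destination-options/hop-by-hop or routing header; its wire
 * length is encoded as (hdrlen + 1) * 8 octets.
 */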
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

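/*
 * ip6_setup_cork() snapshots everything a corked send needs to survive
 * across multiple append calls: a private copy of the TX options, the
 * effective MTU, and per-message parameters such as hop limit, traffic
 * class, mark, priority and timestamping state.
 */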
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu, frag_size;
        struct ipv6_txoptions *nopt, *opt = ipc6->opt;

        /* callers pass dst together with a reference, set it first so
         * ip6_cork_release() can put it down even in case of an error.
         */
        cork->base.dst = &rt->dst;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!nopt))
                        return -ENOBUFS;

                nopt->tot_len = sizeof(*opt);
                nopt->opt_flen = opt->opt_flen;
                nopt->opt_nflen = opt->opt_nflen;

                nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
                if (opt->dst0opt && !nopt->dst0opt)
                        return -ENOBUFS;

                nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
                if (opt->dst1opt && !nopt->dst1opt)
                        return -ENOBUFS;

                nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
                if (opt->hopopt && !nopt->hopopt)
                        return -ENOBUFS;

                nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
                if (opt->srcrt && !nopt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa*/
        }
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        v6_cork->dontfrag = ipc6->dontfrag;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

        frag_size = READ_ONCE(np->frag_size);
        if (frag_size && frag_size < mtu)
                mtu = frag_size;

        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        cork->base.mark = ipc6->sockc.mark;
        cork->base.priority = ipc6->sockc.priority;
        sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
        if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
                cork->base.flags |= IPCORK_TS_OPT_ID;
                cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
        }
        cork->base.length = 0;
        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}

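/*
 * Append user data to the queue of pending skbs for a corked send: data
 * is pulled in via getfrag() (or spliced / zero-copied when
 * MSG_SPLICE_PAGES or MSG_ZEROCOPY allow it), and each skb is sized so
 * the resulting packets respect the path MTU and the RFC 7112 rule that
 * the whole header chain fits in the first fragment.
 */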
0bbe84a6 1417static int __ip6_append_data(struct sock *sk,
0bbe84a6 1418 struct sk_buff_head *queue,
f3b46a3e 1419 struct inet_cork_full *cork_full,
0bbe84a6
VY
1420 struct inet6_cork *v6_cork,
1421 struct page_frag *pfrag,
1422 int getfrag(void *from, char *to, int offset,
1423 int len, int odd, struct sk_buff *skb),
f93431c8 1424 void *from, size_t length, int transhdrlen,
a18dfa99 1425 unsigned int flags)
1da177e4 1426{
0c183379 1427 struct sk_buff *skb, *skb_prev = NULL;
f3b46a3e 1428 struct inet_cork *cork = &cork_full->base;
f37a4cc6 1429 struct flowi6 *fl6 = &cork_full->fl.u.ip6;
10b8a3de 1430 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
b5947e5d 1431 struct ubuf_info *uarg = NULL;
0bbe84a6
VY
1432 int exthdrlen = 0;
1433 int dst_exthdrlen = 0;
1da177e4 1434 int hh_len;
1da177e4
LT
1435 int copy;
1436 int err;
1437 int offset = 0;
773ba4fe 1438 bool zc = false;
09c2d251 1439 u32 tskey = 0;
e8dfd42c 1440 struct rt6_info *rt = dst_rt6_info(cork->dst);
4aecca4c 1441 bool paged, hold_tskey = false, extra_uref = false;
0bbe84a6 1442 struct ipv6_txoptions *opt = v6_cork->opt;
32dce968 1443 int csummode = CHECKSUM_NONE;
682b1a9d 1444 unsigned int maxnonfragsize, headersize;
1f4c6eb2 1445 unsigned int wmem_alloc_delta = 0;
1da177e4 1446
0bbe84a6
VY
1447 skb = skb_peek_tail(queue);
1448 if (!skb) {
1449 exthdrlen = opt ? opt->opt_flen : 0;
7efdba5b 1450 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1da177e4 1451 }
0bbe84a6 1452
15e36f5b 1453 paged = !!cork->gso_size;
bec1f6f6 1454 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
e367c2d0 1455 orig_mtu = mtu;
1da177e4 1456
d8d1f30b 1457 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1da177e4 1458
a1b05140 1459 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
b4ce9277 1460 (opt ? opt->opt_nflen : 0);
1da177e4 1461
682b1a9d
HFS
1462 headersize = sizeof(struct ipv6hdr) +
1463 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
682b1a9d
HFS
1464 rt->rt6i_nfheader_len;
1465
5e34af41
TS
1466 if (mtu <= fragheaderlen ||
1467 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
6596a022
JB
1468 goto emsgsize;
1469
1470 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1471 sizeof(struct frag_hdr);
1472
10b8a3de
PA
1473 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1474 * the first fragment
1475 */
1476 if (headersize + transhdrlen > mtu)
1477 goto emsgsize;
1478
a18dfa99 1479 if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
682b1a9d 1480 (sk->sk_protocol == IPPROTO_UDP ||
13651224 1481 sk->sk_protocol == IPPROTO_ICMPV6 ||
682b1a9d
HFS
1482 sk->sk_protocol == IPPROTO_RAW)) {
1483 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1484 sizeof(struct ipv6hdr));
1485 goto emsgsize;
1486 }
4df98e76 1487
682b1a9d
HFS
1488 if (ip6_sk_ignore_df(sk))
1489 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1490 else
1491 maxnonfragsize = mtu;
4df98e76 1492
682b1a9d 1493 if (cork->length + length > maxnonfragsize - headersize) {
4df98e76 1494emsgsize:
10b8a3de
PA
1495 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1496 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
682b1a9d 1497 return -EMSGSIZE;
1da177e4
LT
1498 }
1499
682b1a9d
HFS
1500 /* CHECKSUM_PARTIAL only with no extension headers and when
1501 * we are not going to fragment
1502 */
1503 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1504 headersize == sizeof(struct ipv6hdr) &&
2b89ed65 1505 length <= mtu - headersize &&
bec1f6f6 1506 (!(flags & MSG_MORE) || cork->gso_size) &&
c8cd0989 1507 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
682b1a9d
HFS
1508 csummode = CHECKSUM_PARTIAL;
1509
1fd3ae8c
PB
1510 if ((flags & MSG_ZEROCOPY) && length) {
1511 struct msghdr *msg = from;
1512
1513 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1514 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1515 return -EINVAL;
1516
1517 /* Leave uarg NULL if can't zerocopy, callers should
1518 * be able to handle it.
1519 */
1520 if ((rt->dst.dev->features & NETIF_F_SG) &&
1521 csummode == CHECKSUM_PARTIAL) {
1522 paged = true;
1523 zc = true;
1524 uarg = msg->msg_ubuf;
1525 }
1526 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
bd618489
MA
1527 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
1528 false);
1fd3ae8c
PB
1529 if (!uarg)
1530 return -ENOBUFS;
1531 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1532 if (rt->dst.dev->features & NETIF_F_SG &&
1533 csummode == CHECKSUM_PARTIAL) {
1534 paged = true;
1535 zc = true;
1536 } else {
e7d2b510 1537 uarg_to_msgzc(uarg)->zerocopy = 0;
1fd3ae8c
PB
1538 skb_zcopy_set(skb, uarg, &extra_uref);
1539 }
b5947e5d 1540 }
6d8192bd 1541 } else if ((flags & MSG_SPLICE_PAGES) && length) {
cafbe182 1542 if (inet_test_bit(HDRINCL, sk))
6d8192bd 1543 return -EPERM;
5a6f6873
DH
1544 if (rt->dst.dev->features & NETIF_F_SG &&
1545 getfrag == ip_generic_getfrag)
6d8192bd
DH
1546 /* We need an empty buffer to attach stuff to */
1547 paged = true;
1548 else
1549 flags &= ~MSG_SPLICE_PAGES;
b5947e5d
WB
1550 }
1551
4aecca4c
VF
1552 if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1553 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
1554 if (cork->flags & IPCORK_TS_OPT_ID) {
1555 tskey = cork->ts_opt_id;
1556 } else {
1557 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1558 hold_tskey = true;
1559 }
1560 }
488b6d91 1561
1da177e4
LT
1562 /*
1563 * Let's try using as much space as possible.
1564 * Use MTU if total length of the message fits into the MTU.
1565 * Otherwise, we need to reserve fragment header and
1566 * fragment alignment (= 8-15 octects, in total).
1567 *
634a63e7 1568 * Note that we may need to "move" the data from the tail
1ab1457c 1569 * of the buffer to the new fragment when we split
1da177e4
LT
1570 * the message.
1571 *
1ab1457c 1572 * FIXME: It may be fragmented into multiple chunks
1da177e4
LT
1573 * at once if non-fragmentable extension headers
1574 * are too large.
1ab1457c 1575 * --yoshfuji
1da177e4
LT
1576 */
1577
2811ebac 1578 cork->length += length;
2811ebac 1579 if (!skb)
1da177e4
LT
1580 goto alloc_new_skb;
1581
1582 while (length > 0) {
1583 /* Check if the remaining data fits into current packet. */
e57a3447 1584 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1da177e4
LT
1585 if (copy < length)
1586 copy = maxfraglen - skb->len;
1587
1588 if (copy <= 0) {
1589 char *data;
1590 unsigned int datalen;
1591 unsigned int fraglen;
1592 unsigned int fraggap;
6d123b81 1593 unsigned int alloclen, alloc_extra;
aba36930 1594 unsigned int pagedlen;
1da177e4 1595alloc_new_skb:
1da177e4 1596 /* There's no room in the current skb */
0c183379
G
1597 if (skb)
1598 fraggap = skb->len - maxfraglen;
1da177e4
LT
1599 else
1600 fraggap = 0;
0c183379 1601 /* update mtu and maxfraglen if necessary */
63159f29 1602 if (!skb || !skb_prev)
0c183379 1603 ip6_append_data_mtu(&mtu, &maxfraglen,
75a493e6 1604 fragheaderlen, skb, rt,
e367c2d0 1605 orig_mtu);
0c183379
G
1606
1607 skb_prev = skb;
1da177e4
LT
1608
1609 /*
1610 * If remaining data exceeds the mtu,
1611 * we know we need more fragment(s).
1612 */
1613 datalen = length + fraggap;
1da177e4 1614
e57a3447 1615 if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
0c183379 1616 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
15e36f5b 1617 fraglen = datalen + fragheaderlen;
aba36930 1618 pagedlen = 0;
15e36f5b 1619
6d123b81
JK
1620 alloc_extra = hh_len;
1621 alloc_extra += dst_exthdrlen;
1622 alloc_extra += rt->dst.trailer_len;
1623
1624 /* We just reserve space for fragment header.
1625 * Note: this may be overallocation if the message
1626 * (without MSG_MORE) fits into the MTU.
1627 */
1628 alloc_extra += sizeof(struct frag_hdr);
1629
1da177e4 1630 if ((flags & MSG_MORE) &&
d8d1f30b 1631 !(rt->dst.dev->features&NETIF_F_SG))
1da177e4 1632 alloclen = mtu;
6d123b81
JK
1633 else if (!paged &&
1634 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1635 !(rt->dst.dev->features & NETIF_F_SG)))
15e36f5b 1636 alloclen = fraglen;
47cf8899 1637 else {
773ba4fe
PB
1638 alloclen = fragheaderlen + transhdrlen;
1639 pagedlen = datalen - transhdrlen;
15e36f5b 1640 }
6d123b81 1641 alloclen += alloc_extra;
299b0767 1642
0c183379
G
1643 if (datalen != length + fraggap) {
1644 /*
1645 * this is not the last fragment, the trailer
1646 * space is regarded as data space.
1647 */
1648 datalen += rt->dst.trailer_len;
1649 }
1650
0c183379 1651 fraglen = datalen + fragheaderlen;
1da177e4 1652
15e36f5b 1653 copy = datalen - transhdrlen - fraggap - pagedlen;
ce650a16
DH
1654 /* [!] NOTE: copy may be negative if pagedlen>0
1655 * because then the equation may reduces to -fraggap.
1656 */
1657 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
232cd35d
ED
1658 err = -EINVAL;
1659 goto error;
1660 }
1da177e4 1661 if (transhdrlen) {
6d123b81 1662 skb = sock_alloc_send_skb(sk, alloclen,
1da177e4
LT
1663 (flags & MSG_DONTWAIT), &err);
1664 } else {
1665 skb = NULL;
1f4c6eb2 1666 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1da177e4 1667 2 * sk->sk_sndbuf)
6d123b81 1668 skb = alloc_skb(alloclen,
1f4c6eb2 1669 sk->sk_allocation);
63159f29 1670 if (unlikely(!skb))
1da177e4
LT
1671 err = -ENOBUFS;
1672 }
63159f29 1673 if (!skb)
1da177e4
LT
1674 goto error;
1675 /*
1676 * Fill in the control structures
1677 */
9c9c9ad5 1678 skb->protocol = htons(ETH_P_IPV6);
32dce968 1679 skb->ip_summed = csummode;
1da177e4 1680 skb->csum = 0;
1f85851e
G
1681 /* reserve for fragmentation and ipsec header */
1682 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1683 dst_exthdrlen);
1da177e4
LT
1684
1685 /*
1686 * Find where to start putting bytes
1687 */
15e36f5b 1688 data = skb_put(skb, fraglen - pagedlen);
1f85851e
G
1689 skb_set_network_header(skb, exthdrlen);
1690 data += fragheaderlen;
b0e380b1
ACM
1691 skb->transport_header = (skb->network_header +
1692 fragheaderlen);
1da177e4
LT
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, data + transhdrlen, offset,
					    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
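			/* Splice the caller's pages straight into the skb's
			 * fragment array, avoiding a payload copy.
			 */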
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
						   sk->sk_allocation);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
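			/* Plain copy path: fill the supplied page_frag and
			 * coalesce with the previous fragment when the data
			 * lands adjacent to it.
			 */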
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from,
					    page_address(pfrag->page) + pfrag->offset,
					    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

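	/* sk_wmem_alloc is charged once here rather than per skb above,
	 * keeping atomic operations off the copy hot path.
	 */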
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
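
/* Usage sketch (illustrative only, not part of this file): a datagram
 * protocol such as UDPv6 drives the corked-send API roughly as below,
 * assuming msg, ulen, ipc6, fl6 and the routed dst were prepared by the
 * caller's sendmsg path:
 *
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen,
 *			      sizeof(struct udphdr), &ipc6, fl6,
 *			      dst_rt6_info(dst), msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */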
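/* Hand the cork's dst reference over to the skb: skb_dst_set() takes
 * ownership, so no extra refcount round trip is needed.
 */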
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

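/* Collapse the queue built by __ip6_append_data() into a single skb: the
 * first skb keeps the headers and every later one is chained onto its
 * frag_list, after which the IPv6 header and any extension headers are
 * pushed in front.
 */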
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
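	/* For raw ICMPv6 sockets (unless the flow is marked
	 * FLOWI_FLAG_KNOWN_NH) the message type recorded in the flow info at
	 * sendmsg time is used; otherwise it is read from the header that
	 * was just built.
	 */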
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

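	/* Hold RCU so rt->rt6i_idev stays valid for the stats update below:
	 * ip6_local_out() consumes the skb, and with it the dst reference
	 * this function was relying on.
	 */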
	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
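
/* Usage sketch (illustrative only, not part of this file): callers that do
 * not need corking build the whole datagram in one shot on a private queue,
 * keeping the cork on their own stack, roughly as UDPv6's lockless fast path
 * does (msg, ulen, ipc6 and dst assumed prepared by the caller; the real
 * caller also fills its transport header before transmission):
 *
 *	struct inet_cork_full cork;
 *	struct sk_buff *skb;
 *
 *	skb = ip6_make_skb(sk, ip_generic_getfrag, msg, ulen,
 *			   sizeof(struct udphdr), &ipc6,
 *			   dst_rt6_info(dst), msg->msg_flags, &cork);
 *	err = PTR_ERR_OR_ZERO(skb);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = ip6_send_skb(skb);
 */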