net/ipv6/ip6_output.c — git blame (CommitLineData) listing
[linux-block.git] / net / ipv6 / ip6_output.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * IPv6 output functions
1ab1457c 4 * Linux INET6 implementation
1da177e4
LT
5 *
6 * Authors:
1ab1457c 7 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 8 *
1da177e4
LT
9 * Based on linux/net/ipv4/ip_output.c
10 *
1da177e4
LT
11 * Changes:
12 * A.N.Kuznetsov : airthmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
67ba4152 19 * Imran Patel : frag id should be in NBO
1da177e4
LT
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
1da177e4 25#include <linux/errno.h>
ef76bc23 26#include <linux/kernel.h>
1da177e4
LT
27#include <linux/string.h>
28#include <linux/socket.h>
29#include <linux/net.h>
30#include <linux/netdevice.h>
31#include <linux/if_arp.h>
32#include <linux/in6.h>
33#include <linux/tcp.h>
34#include <linux/route.h>
b59f45d0 35#include <linux/module.h>
5a0e3ad6 36#include <linux/slab.h>
1da177e4 37
33b48679 38#include <linux/bpf-cgroup.h>
1da177e4
LT
39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv6.h>
41
42#include <net/sock.h>
43#include <net/snmp.h>
44
45#include <net/ipv6.h>
46#include <net/ndisc.h>
47#include <net/protocol.h>
48#include <net/ip6_route.h>
49#include <net/addrconf.h>
50#include <net/rawv6.h>
51#include <net/icmp.h>
52#include <net/xfrm.h>
53#include <net/checksum.h>
7bc570c8 54#include <linux/mroute6.h>
ca254490 55#include <net/l3mdev.h>
14972cbd 56#include <net/lwtunnel.h>
571912c6 57#include <net/ip_tunnels.h>
1da177e4 58
/*
 * ip6_finish_output2 - hand a fully routed skb to the neighbour layer.
 *
 * Expands headroom when the device's link-layer header would not fit,
 * loops a clone of multicast packets back via NF_INET_POST_ROUTING when
 * the socket requests multicast loopback, honours lwtunnel xmit
 * redirection, then resolves (or creates) the neighbour entry under RCU
 * and transmits through neigh_output().  Returns 0/positive from the
 * lower layer or a negative errno (-ENOMEM, -EINVAL) on failure.
 */
7d8c6e39 59static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 60{
adf30907 61 struct dst_entry *dst = skb_dst(skb);
1da177e4 62 struct net_device *dev = dst->dev;
e415ed3a 63 struct inet6_dev *idev = ip6_dst_idev(dst);
5796015f 64 unsigned int hh_len = LL_RESERVED_SPACE(dev);
e415ed3a
VA
 65 const struct in6_addr *daddr, *nexthop;
 66 struct ipv6hdr *hdr;
f6b72b62 67 struct neighbour *neigh;
6fd6ce20 68 int ret;
1da177e4 69
5796015f 70 /* Be paranoid, rather than too clever. */
e415ed3a
VA
 71 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
 72 skb = skb_expand_head(skb, hh_len);
5796015f 73 if (!skb) {
e415ed3a 74 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
5796015f
VA
 75 return -ENOMEM;
 76 }
 77 }
 78
e415ed3a
VA
 79 hdr = ipv6_hdr(skb);
 80 daddr = &hdr->daddr;
 81 if (ipv6_addr_is_multicast(daddr)) {
/* Multicast loopback: clone and re-inject a copy locally when the
 * sending socket asked for it (sk_mc_loop) and either an mrouter
 * socket exists or the device has joined the group. */
7026b1dd 82 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
8571ab47 83 ((mroute6_is_socket(net, skb) &&
bd91b8bf 84 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
e415ed3a 85 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
1da177e4
LT
 86 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 87
 88 /* Do not check for IFF_ALLMULTI; multicast routing
 89 is not supported in any case.
 90 */
 91 if (newskb)
b2e0b385 92 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
29a26a56 93 net, sk, newskb, NULL, newskb->dev,
95603e22 94 dev_loopback_xmit);
1da177e4 95
e415ed3a 96 if (hdr->hop_limit == 0) {
78126c41 97 IP6_INC_STATS(net, idev,
3bd653c8 98 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
 99 kfree_skb(skb);
 100 return 0;
 101 }
 102 }
 103
78126c41 104 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
/* Node-local-scope multicast must never leave the host. */
e415ed3a 105 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
dd408515
HFS
 106 !(dev->flags & IFF_LOOPBACK)) {
 107 kfree_skb(skb);
 108 return 0;
 109 }
1da177e4
LT
 110 }
 111
14972cbd
RP
 112 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 113 int res = lwtunnel_xmit(skb);
 114
 115 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 116 return res;
 117 }
 118
/* Neighbour lookup and output run under RCU; a missing entry is
 * created on demand with __neigh_create(). */
09eed119 119 rcu_read_lock();
e415ed3a
VA
 120 nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 121 neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
58f71be5
PB
 122
 123 if (unlikely(IS_ERR_OR_NULL(neigh))) {
 124 if (unlikely(!neigh))
 125 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 126 if (IS_ERR(neigh)) {
09eed119 127 rcu_read_unlock();
58f71be5
PB
 128 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 129 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
 130 return -EINVAL;
 131 }
6fd6ce20 132 }
58f71be5
PB
 133 sock_confirm_neigh(skb, neigh);
 134 ret = neigh_output(neigh, skb, false);
09eed119 135 rcu_read_unlock();
58f71be5 136 return ret;
1da177e4
LT
 137}
138
/*
 * ip6_finish_output_gso_slowpath_drop - software-segment a GSO skb whose
 * segments would exceed the egress MTU, then push each resulting segment
 * through ip6_fragment().  The original skb is consumed.  Returns 0 on
 * success, -ENOMEM if segmentation failed, or the first fragmentation
 * error encountered (later segments are still attempted).
 */
b210de4f
AL
 139static int
 140ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 141 struct sk_buff *skb, unsigned int mtu)
 142{
 143 struct sk_buff *segs, *nskb;
 144 netdev_features_t features;
 145 int ret = 0;
 146
 147 /* Please see corresponding comment in ip_finish_output_gso
 148 * describing the cases where GSO segment length exceeds the
 149 * egress MTU.
 150 */
 151 features = netif_skb_features(skb);
 152 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 153 if (IS_ERR_OR_NULL(segs)) {
 154 kfree_skb(skb);
 155 return -ENOMEM;
 156 }
 157
 158 consume_skb(skb);
 159
 160 skb_list_walk_safe(segs, segs, nskb) {
 161 int err;
 162
 163 skb_mark_not_on_list(segs);
/* Each segment is fragmented independently; keep the first error. */
 164 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 165 if (err && ret == 0)
 166 ret = err;
 167 }
 168
 169 return ret;
 170}
171
/*
 * __ip6_finish_output - choose how a routed skb leaves the stack:
 * re-enter dst_output() if an xfrm policy was attached after SNAT,
 * take the GSO slow path when validated segment length exceeds the MTU,
 * fragment when the packet is oversized (or dst/conntrack demand it),
 * otherwise transmit directly via ip6_finish_output2().
 */
956fe219 172static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
9e508490 173{
b210de4f
AL
 174 unsigned int mtu;
 175
09ee9dba
TB
 176#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 177 /* Policy lookup after SNAT yielded a new policy */
 178 if (skb_dst(skb)->xfrm) {
/* IP6SKB_REROUTED prevents the POST_ROUTING hook from re-running. */
19d36c5f 179 IP6CB(skb)->flags |= IP6SKB_REROUTED;
09ee9dba
TB
 180 return dst_output(net, sk, skb);
 181 }
 182#endif
 183
b210de4f 184 mtu = ip6_skb_dst_mtu(skb);
/* A fake-jumbo skb carries a Jumbo Payload hop-by-hop option and is
 * exempt from the GSO network-length validation. */
80e425b6
CL
 185 if (skb_is_gso(skb) &&
 186 !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
 187 !skb_gso_validate_network_len(skb, mtu))
b210de4f
AL
 188 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 189
 190 if ((skb->len > mtu && !skb_is_gso(skb)) ||
9037c357
JP
 191 dst_allfrag(skb_dst(skb)) ||
 192 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
7d8c6e39 193 return ip6_fragment(net, sk, skb, ip6_finish_output2);
9e508490 194 else
7d8c6e39 195 return ip6_finish_output2(net, sk, skb);
9e508490
JE
 196}
197
/*
 * ip6_finish_output - run the cgroup BPF egress program before output.
 * On SUCCESS/CN verdicts the packet proceeds through
 * __ip6_finish_output() (its error, if any, wins over the verdict);
 * any other verdict drops the skb with a BPF_CGROUP_EGRESS reason.
 */
956fe219 198static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 199{
 200 int ret;
 201
 202 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 203 switch (ret) {
 204 case NET_XMIT_SUCCESS:
956fe219 205 case NET_XMIT_CN:
 206 return __ip6_finish_output(net, sk, skb) ? : ret;
 207 default:
5e187189 208 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
956fe219 209 return ret;
 210 }
 211}
212
/*
 * ip6_output - IPv6 dst_output entry point.  Sets protocol/dev on the
 * skb, drops it (counted as OUTDISCARDS) when IPv6 is administratively
 * disabled on the egress device, and otherwise traverses the
 * NF_INET_POST_ROUTING hook into ip6_finish_output() — skipped when the
 * packet was already rerouted (IP6SKB_REROUTED).
 */
ede2059d 213int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 214{
28f8bfd1 215 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
adf30907 216 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
be10de0a 217
97a7a37a
CF
 218 skb->protocol = htons(ETH_P_IPV6);
 219 skb->dev = dev;
 220
778d80be 221 if (unlikely(idev->cnf.disable_ipv6)) {
19a0644c 222 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
5e187189 223 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
778d80be
YH
 224 return 0;
 225 }
 226
29a26a56 227 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
28f8bfd1 228 net, sk, skb, indev, dev,
9c6eb28a
JE
 229 ip6_finish_output,
 230 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
1da177e4 231}
6585d7dc 232EXPORT_SYMBOL(ip6_output);
1da177e4 233
/*
 * ip6_autoflowlabel - whether to auto-generate the IPv6 flow label for
 * this socket: the per-socket setting wins when it was explicitly set,
 * otherwise fall back to the per-netns default.
 */
e9191ffb 234bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
513674b5
SL
 235{
 236 if (!np->autoflowlabel_set)
 237 return ip6_default_np_autolabel(net);
 238 else
 239 return np->autoflowlabel;
 240}
241
/*
 * ip6_xmit - build the IPv6 header around an upper-layer payload and
 * transmit it through NF_INET_LOCAL_OUT.  Handles txoptions (fragmentable
 * and non-fragmentable parts), inserts a Jumbo Payload hop-by-hop option
 * when the payload exceeds IPV6_MAXPLEN, and chooses the hop limit from
 * the socket or the route.  Returns 0 / hook verdict on success,
 * -ENOBUFS if headroom expansion failed, -EMSGSIZE when the packet is
 * too big and may not be sent (ipv6_local_error() is reported first).
 */
1da177e4 242/*
1c1e9d2b
ED
 243 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 244 * Note : socket lock is not held for SYNACK packets, but might be modified
 245 * by calls to skb_set_owner_w() and ipv6_local_error(),
 246 * which are using proper atomic operations or spinlocks.
1da177e4 247 */
1c1e9d2b 248int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
4f6570d7 249 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
1da177e4 250{
3bd653c8 251 struct net *net = sock_net(sk);
1c1e9d2b 252 const struct ipv6_pinfo *np = inet6_sk(sk);
4c9483b2 253 struct in6_addr *first_hop = &fl6->daddr;
adf30907 254 struct dst_entry *dst = skb_dst(skb);
0c9f227b
VA
 255 struct net_device *dev = dst->dev;
 256 struct inet6_dev *idev = ip6_dst_idev(dst);
80e425b6
CL
 257 struct hop_jumbo_hdr *hop_jumbo;
 258 int hoplen = sizeof(*hop_jumbo);
66033f47 259 unsigned int head_room;
1da177e4 260 struct ipv6hdr *hdr;
4c9483b2 261 u8 proto = fl6->flowi6_proto;
1da177e4 262 int seg_len = skb->len;
e651f03a 263 int hlimit = -1;
1da177e4
LT
 264 u32 mtu;
 265
/* Reserve room for the IPv6 header, a possible jumbo option and the
 * device link-layer header, plus any extension-header options. */
80e425b6 266 head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
66033f47
SB
 267 if (opt)
 268 head_room += opt->opt_nflen + opt->opt_flen;
 269
0c9f227b
VA
 270 if (unlikely(head_room > skb_headroom(skb))) {
 271 skb = skb_expand_head(skb, head_room);
 272 if (!skb) {
 273 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
66033f47 274 return -ENOBUFS;
1da177e4 275 }
66033f47
SB
 276 }
 277
 278 if (opt) {
 279 seg_len += opt->opt_nflen + opt->opt_flen;
 280
1da177e4
LT
 281 if (opt->opt_flen)
 282 ipv6_push_frag_opts(skb, opt, &proto);
66033f47 283
1da177e4 284 if (opt->opt_nflen)
613fa3ca
DL
 285 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 286 &fl6->saddr);
1da177e4
LT
 287 }
 288
/* Payload larger than 64k: emit a hop-by-hop Jumbo Payload option and
 * mark the skb as fake-jumbo (payload_len becomes 0 below). */
80e425b6
CL
 289 if (unlikely(seg_len > IPV6_MAXPLEN)) {
 290 hop_jumbo = skb_push(skb, hoplen);
 291
 292 hop_jumbo->nexthdr = proto;
 293 hop_jumbo->hdrlen = 0;
 294 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
 295 hop_jumbo->tlv_len = 4;
 296 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
 297
 298 proto = IPPROTO_HOPOPTS;
 299 seg_len = 0;
 300 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
 301 }
 302
e2d1bca7
ACM
 303 skb_push(skb, sizeof(struct ipv6hdr));
 304 skb_reset_network_header(skb);
0660e03f 305 hdr = ipv6_hdr(skb);
1da177e4
LT
 306
 307 /*
 308 * Fill in the IPv6 header
 309 */
b903d324 310 if (np)
1da177e4
LT
 311 hlimit = np->hop_limit;
 312 if (hlimit < 0)
6b75d090 313 hlimit = ip6_dst_hoplimit(dst);
1da177e4 314
cb1ce2ef 315 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
513674b5 316 ip6_autoflowlabel(net, np), fl6));
41a1f8ea 317
1da177e4
LT
 318 hdr->payload_len = htons(seg_len);
 319 hdr->nexthdr = proto;
 320 hdr->hop_limit = hlimit;
 321
4e3fd7a0
AD
 322 hdr->saddr = fl6->saddr;
 323 hdr->daddr = *first_hop;
1da177e4 324
9c9c9ad5 325 skb->protocol = htons(ETH_P_IPV6);
4f6570d7 326 skb->priority = priority;
92e55f41 327 skb->mark = mark;
a2c2064f 328
1da177e4 329 mtu = dst_mtu(dst);
60ff7467 330 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
0c9f227b 331 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
a8e3e1a9
DA
 332
 333 /* if egress device is enslaved to an L3 master device pass the
 334 * skb to its handler for processing
 335 */
 336 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 337 if (unlikely(!skb))
 338 return 0;
 339
1c1e9d2b
ED
 340 /* hooks should never assume socket lock is held.
 341 * we promote our socket to non const
 342 */
29a26a56 343 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
0c9f227b 344 net, (struct sock *)sk, skb, NULL, dev,
13206b6b 345 dst_output);
1da177e4
LT
 346 }
 347
/* Too big and fragmentation not permitted: report EMSGSIZE locally. */
0c9f227b 348 skb->dev = dev;
1c1e9d2b
ED
 349 /* ipv6_local_error() does not require socket lock,
 350 * we promote our socket to non const
 351 */
 352 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 353
0c9f227b 354 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
 355 kfree_skb(skb);
 356 return -EMSGSIZE;
 357}
7159039a
YH
 358EXPORT_SYMBOL(ip6_xmit);
359
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to raw sockets
 * registered on the global ip6_ra_chain whose selector matches @sel.
 * All matching sockets but the last receive a clone; the last one gets
 * the original skb.  Returns 1 when the skb was consumed by at least
 * one socket, 0 otherwise (caller keeps ownership).
 */
1da177e4
LT
 360static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 361{
 362 struct ip6_ra_chain *ra;
 363 struct sock *last = NULL;
 364
 365 read_lock(&ip6_ra_lock);
 366 for (ra = ip6_ra_chain; ra; ra = ra->next) {
 367 struct sock *sk = ra->sk;
0bd1b59b
AM
 368 if (sk && ra->sel == sel &&
 369 (!sk->sk_bound_dev_if ||
 370 sk->sk_bound_dev_if == skb->dev->ifindex)) {
9036b2fe
FR
 371 struct ipv6_pinfo *np = inet6_sk(sk);
 372
/* rtalert_isolate keeps RA delivery within the socket's netns. */
 373 if (np && np->rtalert_isolate &&
 374 !net_eq(sock_net(sk), dev_net(skb->dev))) {
 375 continue;
 376 }
1da177e4
LT
 377 if (last) {
 378 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 379 if (skb2)
 380 rawv6_rcv(last, skb2);
 381 }
 382 last = sk;
 383 }
 384 }
 385
 386 if (last) {
 387 rawv6_rcv(last, skb);
 388 read_unlock(&ip6_ra_lock);
 389 return 1;
 390 }
 391 read_unlock(&ip6_ra_lock);
 392 return 0;
 393}
394
/*
 * ip6_forward_proxy_check - classify a packet destined to a proxied
 * (pneigh) address.  Returns 1 when it is a unicast NDP message that
 * must be handled by local input rather than forwarded, -1 when the
 * destination is link-local (signalled via dst_link_failure() and to be
 * dropped), and 0 for everything else (normal forwarding).
 */
e21e0b5f
VN
 395static int ip6_forward_proxy_check(struct sk_buff *skb)
 396{
0660e03f 397 struct ipv6hdr *hdr = ipv6_hdr(skb);
e21e0b5f 398 u8 nexthdr = hdr->nexthdr;
75f2811c 399 __be16 frag_off;
e21e0b5f
VN
 400 int offset;
 401
 402 if (ipv6_ext_hdr(nexthdr)) {
75f2811c 403 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
e21e0b5f
VN
 404 if (offset < 0)
 405 return 0;
 406 } else
 407 offset = sizeof(struct ipv6hdr);
 408
 409 if (nexthdr == IPPROTO_ICMPV6) {
 410 struct icmp6hdr *icmp6;
 411
/* Only the ICMPv6 type byte is needed — pull exactly that far. */
d56f90a7
ACM
 412 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 413 offset + 1 - skb->data)))
e21e0b5f
VN
 414 return 0;
 415
d56f90a7 416 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
e21e0b5f
VN
 417
 418 switch (icmp6->icmp6_type) {
 419 case NDISC_ROUTER_SOLICITATION:
 420 case NDISC_ROUTER_ADVERTISEMENT:
 421 case NDISC_NEIGHBOUR_SOLICITATION:
 422 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 423 case NDISC_REDIRECT:
 424 /* For reaction involving unicast neighbor discovery
 425 * message destined to the proxied address, pass it to
 426 * input function.
 427 */
 428 return 1;
 429 default:
 430 break;
 431 }
 432 }
 433
74553b09
VN
 434 /*
 435 * The proxying router can't forward traffic sent to a link-local
 436 * address, so signal the sender and discard the packet. This
 437 * behavior is clarified by the MIPv6 specification.
 438 */
 439 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 440 dst_link_failure(skb);
 441 return -1;
 442 }
 443
e21e0b5f
VN
 444 return 0;
 445}
446
/*
 * ip6_forward_finish - final step of the forward path after the
 * NF_INET_FORWARD hook: account OUTFORWDATAGRAMS/OUTOCTETS, consume
 * packets already forwarded in hardware (switchdev l3 offload mark),
 * clear the timestamp and hand the skb to dst_output().
 */
0c4b51f0
EB
 447static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 448 struct sk_buff *skb)
1da177e4 449{
71a1c915
JB
 450 struct dst_entry *dst = skb_dst(skb);
 451
 452 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 453 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 454
f839a6c9
IS
 455#ifdef CONFIG_NET_SWITCHDEV
 456 if (skb->offload_l3_fwd_mark) {
 457 consume_skb(skb);
 458 return 0;
 459 }
 460#endif
 461
de799101 462 skb_clear_tstamp(skb);
13206b6b 463 return dst_output(net, sk, skb);
1da177e4
LT
 464}
465
/*
 * ip6_pkt_too_big - decide whether a forwarded skb exceeds @mtu and
 * must trigger ICMPV6_PKT_TOOBIG.  Packets within the MTU, packets
 * with ignore_df set, and GSO packets whose segments validate against
 * the MTU are allowed through; a conntrack-recorded frag_max_size
 * larger than the MTU forces a "too big" verdict.
 */
fe6cc55f
FW
 466static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 467{
418a3156 468 if (skb->len <= mtu)
fe6cc55f
FW
 469 return false;
 470
60ff7467 471 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
fe6cc55f
FW
 472 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 473 return true;
 474
60ff7467 475 if (skb->ignore_df)
418a3156
FW
 476 return false;
 477
779b7931 478 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
fe6cc55f
FW
 479 return false;
 480
 481 return true;
 482}
483
/*
 * ip6_forward - main IPv6 forwarding path.  Validates that forwarding
 * is enabled and the packet is eligible (host-type, no owning socket,
 * no LRO, passes FWD xfrm policy), special-cases Router Alert and
 * proxied-NDP packets, decrements the hop limit after any needed COW,
 * sends redirects back when in/out interface match, enforces the path
 * MTU, and finally traverses NF_INET_FORWARD into ip6_forward_finish().
 * Returns 0 on success, -ETIMEDOUT (hop limit exceeded), -EMSGSIZE
 * (too big) or -EINVAL on the error/drop paths.
 */
1da177e4
LT
 484int ip6_forward(struct sk_buff *skb)
 485{
adf30907 486 struct dst_entry *dst = skb_dst(skb);
0660e03f 487 struct ipv6hdr *hdr = ipv6_hdr(skb);
1da177e4 488 struct inet6_skb_parm *opt = IP6CB(skb);
c346dca1 489 struct net *net = dev_net(dst->dev);
0857d6f8 490 struct inet6_dev *idev;
2edc1a38 491 SKB_DR(reason);
14f3ad6f 492 u32 mtu;
1ab1457c 493
/* idev of the *input* interface — used for the IN* stats below. */
0857d6f8 494 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
53b7997f 495 if (net->ipv6.devconf_all->forwarding == 0)
1da177e4
LT
 496 goto error;
 497
090f1166
LR
 498 if (skb->pkt_type != PACKET_HOST)
 499 goto drop;
 500
9ef2e965
HFS
 501 if (unlikely(skb->sk))
 502 goto drop;
 503
4497b076
BH
 504 if (skb_warn_if_lro(skb))
 505 goto drop;
 506
ccd27f05 507 if (!net->ipv6.devconf_all->disable_policy &&
e3fa461d 508 (!idev || !idev->cnf.disable_policy) &&
ccd27f05 509 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
bdb7cc64 510 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
1da177e4
LT
 511 goto drop;
 512 }
 513
35fc92a9 514 skb_forward_csum(skb);
1da177e4
LT
 515
 516 /*
 517 * We DO NOT make any processing on
 518 * RA packets, pushing them to user level AS IS
 519 * without ane WARRANTY that application will be able
 520 * to interpret them. The reason is that we
 521 * cannot make anything clever here.
 522 *
 523 * We are not end-node, so that if packet contains
 524 * AH/ESP, we cannot make anything.
 525 * Defragmentation also would be mistake, RA packets
 526 * cannot be fragmented, because there is no warranty
 527 * that different fragments will go along one path. --ANK
 528 */
ab4eb353
YH
 529 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 530 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
1da177e4
LT
 531 return 0;
 532 }
 533
 534 /*
 535 * check and decrement ttl
 536 */
 537 if (hdr->hop_limit <= 1) {
3ffe533c 538 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
bdb7cc64 539 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
1da177e4 540
2edc1a38 541 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
1da177e4
LT
 542 return -ETIMEDOUT;
 543 }
 544
fbea49e1 545 /* XXX: idev->cnf.proxy_ndp? */
53b7997f 546 if (net->ipv6.devconf_all->proxy_ndp &&
8a3edd80 547 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
74553b09 548 int proxied = ip6_forward_proxy_check(skb);
46c7655f 549 if (proxied > 0) {
9f535c87
GR
 550 /* It's tempting to decrease the hop limit
 551 * here by 1, as we do at the end of the
 552 * function too.
 553 *
 554 * But that would be incorrect, as proxying is
 555 * not forwarding. The ip6_input function
 556 * will handle this packet locally, and it
 557 * depends on the hop limit being unchanged.
 558 *
 559 * One example is the NDP hop limit, that
 560 * always has to stay 255, but other would be
 561 * similar checks around RA packets, where the
 562 * user can even change the desired limit.
 563 */
e21e0b5f 564 return ip6_input(skb);
46c7655f 565 } else if (proxied < 0) {
bdb7cc64 566 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
74553b09
VN
 567 goto drop;
 568 }
e21e0b5f
VN
 569 }
 570
1da177e4 571 if (!xfrm6_route_forward(skb)) {
bdb7cc64 572 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
2edc1a38 573 SKB_DR_SET(reason, XFRM_POLICY);
1da177e4
LT
 574 goto drop;
 575 }
/* xfrm6_route_forward() may have swapped the dst — re-read it. */
adf30907 576 dst = skb_dst(skb);
1da177e4
LT
 577
 578 /* IPv6 specs say nothing about it, but it is clear that we cannot
 579 send redirects to source routed frames.
1e5dc146 580 We don't send redirects to frames decapsulated from IPsec.
1da177e4 581 */
2f17becf
SS
 582 if (IP6CB(skb)->iif == dst->dev->ifindex &&
 583 opt->srcrt == 0 && !skb_sec_path(skb)) {
1da177e4 584 struct in6_addr *target = NULL;
fbfe95a4 585 struct inet_peer *peer;
1da177e4 586 struct rt6_info *rt;
1da177e4
LT
 587
 588 /*
 589 * incoming and outgoing devices are the same
 590 * send a redirect.
 591 */
 592
 593 rt = (struct rt6_info *) dst;
c45a3dfb
DM
 594 if (rt->rt6i_flags & RTF_GATEWAY)
 595 target = &rt->rt6i_gateway;
1da177e4
LT
 596 else
 597 target = &hdr->daddr;
 598
fd0273d7 599 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
92d86829 600
1da177e4
LT
 601 /* Limit redirects both by destination (here)
 602 and by source (inside ndisc_send_redirect)
 603 */
fbfe95a4 604 if (inet_peer_xrlim_allow(peer, 1*HZ))
4991969a 605 ndisc_send_redirect(skb, target);
1d861aa4
DM
 606 if (peer)
 607 inet_putpeer(peer);
5bb1ab09
DS
 608 } else {
 609 int addrtype = ipv6_addr_type(&hdr->saddr);
 610
1da177e4 611 /* This check is security critical. */
f81b2e7d
YH
 612 if (addrtype == IPV6_ADDR_ANY ||
 613 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
5bb1ab09
DS
 614 goto error;
 615 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 616 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
3ffe533c 617 ICMPV6_NOT_NEIGHBOUR, 0);
5bb1ab09
DS
 618 goto error;
 619 }
1da177e4
LT
 620 }
 621
427faee1 622 mtu = ip6_dst_mtu_maybe_forward(dst, true);
14f3ad6f
UW
 623 if (mtu < IPV6_MIN_MTU)
 624 mtu = IPV6_MIN_MTU;
 625
fe6cc55f 626 if (ip6_pkt_too_big(skb, mtu)) {
1da177e4
LT
 627 /* Again, force OUTPUT device used as source address */
 628 skb->dev = dst->dev;
14f3ad6f 629 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
bdb7cc64 630 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
1d015503
ED
 631 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 632 IPSTATS_MIB_FRAGFAILS);
2edc1a38 633 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
1da177e4
LT
 634 return -EMSGSIZE;
 635 }
 636
 637 if (skb_cow(skb, dst->dev->hard_header_len)) {
1d015503
ED
 638 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 639 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
 640 goto drop;
 641 }
 642
/* skb_cow() may have reallocated the header — refetch hdr. */
0660e03f 643 hdr = ipv6_hdr(skb);
1da177e4
LT
 644
 645 /* Mangling hops number delayed to point after skb COW */
1ab1457c 646
1da177e4
LT
 647 hdr->hop_limit--;
 648
29a26a56
EB
 649 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 650 net, NULL, skb, skb->dev, dst->dev,
6e23ae2a 651 ip6_forward_finish);
1da177e4
LT
 652
 653error:
bdb7cc64 654 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
2edc1a38 655 SKB_DR_SET(reason, IP_INADDRERRORS);
1da177e4 656drop:
2edc1a38 657 kfree_skb_reason(skb, reason);
1da177e4
LT
 658 return -EINVAL;
 659}
660
/*
 * ip6_copy_metadata - propagate per-packet metadata (type, priority,
 * protocol, dst, dev, mark, hash, tc index, netfilter state, skb
 * extensions and secmark) from the original skb to a fragment.
 */
 661static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 662{
 663 to->pkt_type = from->pkt_type;
 664 to->priority = from->priority;
 665 to->protocol = from->protocol;
adf30907
ED
 666 skb_dst_drop(to);
 667 skb_dst_set(to, dst_clone(skb_dst(from)));
1da177e4 668 to->dev = from->dev;
82e91ffe 669 to->mark = from->mark;
1da177e4 670
3dd1c9a1
PA
 671 skb_copy_hash(to, from);
 672
1da177e4
LT
 673#ifdef CONFIG_NET_SCHED
 674 to->tc_index = from->tc_index;
 675#endif
e7ac05f3 676 nf_copy(to, from);
df5042f4 677 skb_ext_copy(to, from);
984bc16c 678 skb_copy_secmark(to, from);
1da177e4
LT
 679}
680
/*
 * ip6_fraglist_init - set up @iter for fast-path fragmentation of an skb
 * that already carries a frag_list.  Saves a copy of the network headers
 * (iter->tmp_hdr, kmalloc'd — freed by the caller), detaches the frag
 * list, inserts a Fragment header with IP6_MF into the first fragment and
 * trims the head skb to its page-resident length.  Returns 0 or -ENOMEM.
 */
0feca619
PNA
 681int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 682 u8 nexthdr, __be32 frag_id,
 683 struct ip6_fraglist_iter *iter)
 684{
 685 unsigned int first_len;
 686 struct frag_hdr *fh;
 687
 688 /* BUILD HEADER */
 689 *prevhdr = NEXTHDR_FRAGMENT;
 690 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 691 if (!iter->tmp_hdr)
 692 return -ENOMEM;
 693
b7034146 694 iter->frag = skb_shinfo(skb)->frag_list;
0feca619
PNA
 695 skb_frag_list_init(skb);
 696
 697 iter->offset = 0;
 698 iter->hlen = hlen;
 699 iter->frag_id = frag_id;
 700 iter->nexthdr = nexthdr;
 701
/* Open a gap after the copied headers and splice in the frag header. */
 702 __skb_pull(skb, hlen);
 703 fh = __skb_push(skb, sizeof(struct frag_hdr));
 704 __skb_push(skb, hlen);
 705 skb_reset_network_header(skb);
 706 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 707
 708 fh->nexthdr = nexthdr;
 709 fh->reserved = 0;
 710 fh->frag_off = htons(IP6_MF);
 711 fh->identification = frag_id;
 712
 713 first_len = skb_pagelen(skb);
 714 skb->data_len = first_len - skb_headlen(skb);
 715 skb->len = first_len;
 716 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 717
 718 return 0;
 719}
 720EXPORT_SYMBOL(ip6_fraglist_init);
721
/*
 * ip6_fraglist_prepare - turn the next frag_list member (iter->frag)
 * into a standalone fragment: prepend the saved headers plus a Fragment
 * header, advance iter->offset past the previous fragment's payload,
 * set IP6_MF unless this is the last fragment, fix payload_len and copy
 * the per-packet metadata over.
 */
 722void ip6_fraglist_prepare(struct sk_buff *skb,
 723 struct ip6_fraglist_iter *iter)
 724{
 725 struct sk_buff *frag = iter->frag;
 726 unsigned int hlen = iter->hlen;
 727 struct frag_hdr *fh;
 728
 729 frag->ip_summed = CHECKSUM_NONE;
 730 skb_reset_transport_header(frag);
 731 fh = __skb_push(frag, sizeof(struct frag_hdr));
 732 __skb_push(frag, hlen);
 733 skb_reset_network_header(frag);
 734 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
/* Offset grows by the payload length of the previous fragment. */
 735 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 736 fh->nexthdr = iter->nexthdr;
 737 fh->reserved = 0;
 738 fh->frag_off = htons(iter->offset);
 739 if (frag->next)
 740 fh->frag_off |= htons(IP6_MF);
 741 fh->identification = iter->frag_id;
 742 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 743 ip6_copy_metadata(frag, skb);
 744}
 745EXPORT_SYMBOL(ip6_fraglist_prepare);
746
/*
 * ip6_frag_init - initialize the slow-path fragmentation cursor @state:
 * records header length, per-fragment MTU, head/tail room, and the
 * remaining payload (state->left) starting right after the unfragmentable
 * headers (state->ptr = hlen).
 */
8a6a1f17
PNA
 747void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 748 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 749 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 750{
 751 state->prevhdr = prevhdr;
 752 state->nexthdr = nexthdr;
 753 state->frag_id = frag_id;
 754
 755 state->hlen = hlen;
 756 state->mtu = mtu;
 757
 758 state->left = skb->len - hlen; /* Space per frame */
 759 state->ptr = hlen; /* Where to start from */
 760
 761 state->hroom = hdr_room;
 762 state->troom = needed_tailroom;
 763
 764 state->offset = 0;
 765}
 766EXPORT_SYMBOL(ip6_frag_init);
767
/*
 * ip6_frag_next - allocate and fill the next slow-path fragment from
 * @skb according to @state: copies the unfragmentable headers, rewrites
 * the previous next-header byte to NEXTHDR_FRAGMENT, builds the Fragment
 * header (IP6_MF while payload remains) and copies up to state->mtu
 * bytes of payload (8-byte aligned except for the last fragment).
 * Advances state->{left,ptr,offset}.  Returns the new skb or
 * ERR_PTR(-ENOMEM).
 */
 768struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 769{
 770 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 771 struct sk_buff *frag;
 772 struct frag_hdr *fh;
 773 unsigned int len;
 774
 775 len = state->left;
 776 /* IF: it doesn't fit, use 'mtu' - the data space left */
 777 if (len > state->mtu)
 778 len = state->mtu;
 779 /* IF: we are not sending up to and including the packet end
 780 then align the next start on an eight byte boundary */
 781 if (len < state->left)
 782 len &= ~7;
 783
 784 /* Allocate buffer */
 785 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 786 state->hroom + state->troom, GFP_ATOMIC);
 787 if (!frag)
 788 return ERR_PTR(-ENOMEM);
 789
 790 /*
 791 * Set up data on packet
 792 */
 793
 794 ip6_copy_metadata(frag, skb);
 795 skb_reserve(frag, state->hroom);
 796 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 797 skb_reset_network_header(frag);
 798 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 799 frag->transport_header = (frag->network_header + state->hlen +
 800 sizeof(struct frag_hdr));
 801
 802 /*
 803 * Charge the memory for the fragment to any owner
 804 * it might possess
 805 */
 806 if (skb->sk)
 807 skb_set_owner_w(frag, skb->sk);
 808
 809 /*
 810 * Copy the packet header into the new buffer.
 811 */
 812 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 813
 814 fragnexthdr_offset = skb_network_header(frag);
 815 fragnexthdr_offset += prevhdr - skb_network_header(skb);
 816 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 817
 818 /*
 819 * Build fragment header.
 820 */
 821 fh->nexthdr = state->nexthdr;
 822 fh->reserved = 0;
 823 fh->identification = state->frag_id;
 824
 825 /*
 826 * Copy a block of the IP datagram.
 827 */
 828 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 829 len));
 830 state->left -= len;
 831
 832 fh->frag_off = htons(state->offset);
 833 if (state->left > 0)
 834 fh->frag_off |= htons(IP6_MF);
 835 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 836
 837 state->ptr += len;
 838 state->offset += len;
 839
 840 return frag;
 841}
 842EXPORT_SYMBOL(ip6_frag_next);
843
/*
 * ip6_fragment - fragment @skb and pass every fragment to @output.
 * Validates that fragmentation is allowed (ignore_df, conntrack
 * frag_max_size, socket frag_size, minimum workable MTU), picks a
 * fragment id, then uses the fast path (reusing an existing frag_list
 * via the ip6_fraglist_* helpers) when geometry permits, otherwise the
 * slow path (allocating each fragment via ip6_frag_next()).  Consumes
 * @skb on success; sends ICMPV6_PKT_TOOBIG and returns -EMSGSIZE when
 * fragmentation is forbidden.
 */
7d8c6e39
EB
 844int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 845 int (*output)(struct net *, struct sock *, struct sk_buff *))
1da177e4 846{
1da177e4 847 struct sk_buff *frag;
67ba4152 848 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
f60e5990 849 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 850 inet6_sk(skb->sk) : NULL;
a1ac9c8a 851 bool mono_delivery_time = skb->mono_delivery_time;
8a6a1f17
PNA
 852 struct ip6_frag_state state;
 853 unsigned int mtu, hlen, nexthdr_offset;
9669fffc 854 ktime_t tstamp = skb->tstamp;
8a6a1f17 855 int hroom, err = 0;
286c2349 856 __be32 frag_id;
1da177e4
LT
 857 u8 *prevhdr, nexthdr = 0;
 858
7dd7eb95
DM
 859 err = ip6_find_1stfragopt(skb, &prevhdr);
 860 if (err < 0)
2423496a 861 goto fail;
7dd7eb95 862 hlen = err;
1da177e4 863 nexthdr = *prevhdr;
/* Remember prevhdr as an offset: skb_checksum_help() below may
 * reallocate the header and invalidate the raw pointer. */
ef0efcd3 864 nexthdr_offset = prevhdr - skb_network_header(skb);
1da177e4 865
628a5c56 866 mtu = ip6_skb_dst_mtu(skb);
b881ef76
JH
 867
 868 /* We must not fragment if the socket is set to force MTU discovery
14f3ad6f 869 * or if the skb it not generated by a local socket.
b881ef76 870 */
485fca66
FW
 871 if (unlikely(!skb->ignore_df && skb->len > mtu))
 872 goto fail_toobig;
a34a101e 873
485fca66
FW
 874 if (IP6CB(skb)->frag_max_size) {
 875 if (IP6CB(skb)->frag_max_size > mtu)
 876 goto fail_toobig;
 877
 878 /* don't send fragments larger than what we received */
 879 mtu = IP6CB(skb)->frag_max_size;
 880 if (mtu < IPV6_MIN_MTU)
 881 mtu = IPV6_MIN_MTU;
b881ef76
JH
 882 }
 883
d91675f9
YH
 884 if (np && np->frag_size < mtu) {
 885 if (np->frag_size)
 886 mtu = np->frag_size;
 887 }
89bc7848 888 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
b72a2b01 889 goto fail_toobig;
/* From here on, mtu is the payload budget per fragment. */
1e0d69a9 890 mtu -= hlen + sizeof(struct frag_hdr);
1da177e4 891

fd0273d7
MKL
 892 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 893 &ipv6_hdr(skb)->saddr);
286c2349 894
405c92f7
HFS
 895 if (skb->ip_summed == CHECKSUM_PARTIAL &&
 896 (err = skb_checksum_help(skb)))
 897 goto fail;
 898
ef0efcd3 899 prevhdr = skb_network_header(skb) + nexthdr_offset;
1d325d21 900 hroom = LL_RESERVED_SPACE(rt->dst.dev);
21dc3301 901 if (skb_has_frag_list(skb)) {
c72d8cda 902 unsigned int first_len = skb_pagelen(skb);
0feca619 903 struct ip6_fraglist_iter iter;
3d13008e 904 struct sk_buff *frag2;
1da177e4
LT
 905
/* Fast path only when every piece already has frag-friendly
 * geometry; otherwise fall through to the copying slow path. */
 906 if (first_len - hlen > mtu ||
 907 ((first_len - hlen) & 7) ||
1d325d21
FW
 908 skb_cloned(skb) ||
 909 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
1da177e4
LT
 910 goto slow_path;
 911
4d9092bb 912 skb_walk_frags(skb, frag) {
1da177e4
LT
 913 /* Correct geometry. */
 914 if (frag->len > mtu ||
 915 ((frag->len & 7) && frag->next) ||
1d325d21 916 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
3d13008e 917 goto slow_path_clean;
1da177e4 918
1da177e4
LT
 919 /* Partially cloned skb? */
 920 if (skb_shared(frag))
3d13008e 921 goto slow_path_clean;
2fdba6b0
HX
 922
 923 BUG_ON(frag->sk);
 924 if (skb->sk) {
2fdba6b0
HX
 925 frag->sk = skb->sk;
 926 frag->destructor = sock_wfree;
2fdba6b0 927 }
3d13008e 928 skb->truesize -= frag->truesize;
1da177e4
LT
 929 }
 930
0feca619
PNA
 931 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 932 &iter);
 933 if (err < 0)
1d325d21 934 goto fail;
a11d206d 935
803e8486
ED
 936 /* We prevent @rt from being freed. */
 937 rcu_read_lock();
 938
1da177e4
LT
 939 for (;;) {
 940 /* Prepare header of the next frame,
 941 * before previous one went down. */
0feca619
PNA
 942 if (iter.frag)
 943 ip6_fraglist_prepare(skb, &iter);
1ab1457c 944
a1ac9c8a 945 skb_set_delivery_time(skb, tstamp, mono_delivery_time);
7d8c6e39 946 err = output(net, sk, skb);
67ba4152 947 if (!err)
d8d1f30b 948 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 949 IPSTATS_MIB_FRAGCREATES);
dafee490 950
0feca619 951 if (err || !iter.frag)
1da177e4
LT
 952 break;
 953
0feca619 954 skb = ip6_fraglist_next(&iter);
1da177e4
LT
 955 }
 956
0feca619 957 kfree(iter.tmp_hdr);
1da177e4
LT
 958
 959 if (err == 0) {
d8d1f30b 960 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 961 IPSTATS_MIB_FRAGOKS);
803e8486 962 rcu_read_unlock();
1da177e4
LT
 963 return 0;
 964 }
 965
b7034146 966 kfree_skb_list(iter.frag);
1da177e4 967
d8d1f30b 968 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 969 IPSTATS_MIB_FRAGFAILS);
803e8486 970 rcu_read_unlock();
1da177e4 971 return err;
3d13008e
ED
 972
 973slow_path_clean:
/* Undo the ownership transfer done above for frags already visited. */
 974 skb_walk_frags(skb, frag2) {
 975 if (frag2 == frag)
 976 break;
 977 frag2->sk = NULL;
 978 frag2->destructor = NULL;
 979 skb->truesize += frag2->truesize;
 980 }
1da177e4
LT
 981 }
 982
 983slow_path:
1da177e4
LT
 984 /*
 985 * Fragment the datagram.
 986 */
 987
8a6a1f17
PNA
 988 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 989 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 990 &state);
1da177e4
LT
 991
 992 /*
 993 * Keep copying data until we run out.
 994 */
1da177e4 995
8a6a1f17
PNA
 996 while (state.left > 0) {
 997 frag = ip6_frag_next(skb, &state);
 998 if (IS_ERR(frag)) {
 999 err = PTR_ERR(frag);
1da177e4
LT
 1000 goto fail;
 1001 }
 1002
1da177e4
LT
 1003 /*
 1004 * Put this fragment into the sending queue.
 1005 */
a1ac9c8a 1006 skb_set_delivery_time(frag, tstamp, mono_delivery_time);
7d8c6e39 1007 err = output(net, sk, frag);
1da177e4
LT
 1008 if (err)
 1009 goto fail;
dafee490 1010
adf30907 1011 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 1012 IPSTATS_MIB_FRAGCREATES);
1da177e4 1013 }
adf30907 1014 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 1015 IPSTATS_MIB_FRAGOKS);
808db80a 1016 consume_skb(skb);
1da177e4
LT
 1017 return err;
 1018
485fca66
FW
 1019fail_toobig:
 1020 if (skb->sk && dst_allfrag(skb_dst(skb)))
aba54656 1021 sk_gso_disable(skb->sk);
485fca66 1022

485fca66
FW
 1023 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 1024 err = -EMSGSIZE;
 1025
1da177e4 1026fail:
adf30907 1027 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 1028 IPSTATS_MIB_FRAGFAILS);
1ab1457c 1029 kfree_skb(skb);
1da177e4
LT
 1030 return err;
 1031}
1032
b71d1d42
ED
1033static inline int ip6_rt_check(const struct rt6key *rt_key,
1034 const struct in6_addr *fl_addr,
1035 const struct in6_addr *addr_cache)
cf6b1982 1036{
a02cec21 1037 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
63159f29 1038 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
cf6b1982
YH
1039}
1040
497c615a
HX
/* Validate a dst cached on the socket against flow @fl6.
 * Returns the dst when it is still usable for this flow, or NULL after
 * dropping the reference when it is not (wrong family, stale route key,
 * or mismatched outgoing interface).  Consumes the caller's reference
 * to @dst in the failure cases.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A non-IPv6 dst (e.g. after an IPv4-mapped path; NOTE(review):
	 * exact origin not visible here) can never serve an IPv6 flow.
	 */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1086
/* Core of the output route lookup: resolve a dst for @fl6, selecting a
 * source address first when the flow has none.  On success *dst holds a
 * referenced dst and 0 is returned; on failure *dst is set to NULL, the
 * reference (if any) is dropped, and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is RCU-protected; hold the read lock across the
		 * dereference and the saddr selection that uses it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry (or do the first) lookup now that a source address is set. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* An IPv4-mapped source is only valid with an IPv4-mapped (or
	 * unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
34a0b3cd 1202
497c615a
HX
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *	*@dst is always (re)initialized; on failure it is left NULL.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.  When
 *	@final_dst is given it overwrites @fl6->daddr before the xfrm
 *	lookup, so the returned route targets the final destination.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1249
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	/* sk_dst_check() returns a referenced dst or NULL if the cached
	 * entry is stale; ip6_sk_dst_check() further validates it against
	 * this flow and drops the reference when it does not match.
	 */
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
497c615a 1285
571912c6
MV
/**
 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *      @skb: Packet for which lookup is done
 *      @dev: Tunnel device
 *      @net: Network namespace of tunnel device
 *      @sock: Socket which provides route info
 *      @saddr: Memory to store the src ip address
 *      @info: Tunnel information
 *      @protocol: IP protocol
 *      @use_cache: Flag to enable cache usage
 *      This function performs a route lookup on a tunnel
 *
 *      It returns a valid dst pointer and stores src address to be used in
 *      tunnel in param saddr on success, else a pointer encoded error code.
 */

struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	/* Fast path: reuse the per-tunnel cached dst (cast drops const on
	 * @info; the cache field is designed to be mutated).
	 */
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	/* Report the source address the route lookup selected. */
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1353
0178b695
HX
1354static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1355 gfp_t gfp)
1356{
1357 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1358}
1359
1360static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1361 gfp_t gfp)
1362{
1363 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364}
1365
75a493e6 1366static void ip6_append_data_mtu(unsigned int *mtu,
0c183379
G
1367 int *maxfraglen,
1368 unsigned int fragheaderlen,
1369 struct sk_buff *skb,
75a493e6 1370 struct rt6_info *rt,
e367c2d0 1371 unsigned int orig_mtu)
0c183379
G
1372{
1373 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
63159f29 1374 if (!skb) {
0c183379 1375 /* first fragment, reserve header_len */
e367c2d0 1376 *mtu = orig_mtu - rt->dst.header_len;
0c183379
G
1377
1378 } else {
1379 /*
1380 * this fragment is not first, the headers
1381 * space is regarded as data space.
1382 */
e367c2d0 1383 *mtu = orig_mtu;
0c183379
G
1384 }
1385 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1386 + fragheaderlen - sizeof(struct frag_hdr);
1387 }
1388}
1389
/* Initialize the cork state for a corked (ip6_append_data) transmission:
 * take over the caller's dst reference, deep-copy the tx options, and
 * derive the fragmentation mtu.  Returns 0 or a negative errno; on error
 * any partially duplicated options remain attached to @v6_cork so that
 * ip6_cork_release() can free them.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* Deep-copy each extension header; a NULL result with a
		 * non-NULL source means the kmemdup failed.
		 */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* With PMTUDISC_PROBE use the raw device mtu, otherwise the path
	 * mtu (for XFRM, of the innermost path element).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1462
/* Append @length bytes (pulled via @getfrag) to the pending-send @queue,
 * growing the tail skb or allocating new ones sized to the cork mtu.
 * Handles checksum offload eligibility, MSG_ZEROCOPY, GSO-sized corks and
 * page-frag coalescing.  Returns 0 on success or a negative errno; on
 * error the bytes already queued stay on @queue (cork->length is rolled
 * back by the unappended remainder only).
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	/* Extension-header room is only reserved on the very first skb of
	 * the cork; later appends continue an existing tail skb.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* Reject mtus too small to carry even one 8-byte fragment unit. */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG: instead of fragmenting, report the path mtu to
	 * the application (for datagram-style protocols only).
	 */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* Device can't do SG/csum: fall back to a
				 * copying path but keep uarg for completion
				 * notification.
				 */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: linear part holds only the
				 * headers, the payload goes into frags.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overflow bytes from the previous
				 * skb into this one, fixing both checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* True zerocopy: reference the user pages directly. */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
0bbe84a6
VY
1834
/* Queue data on the socket's write queue for a corked send.  The first
 * call for an empty queue sets up the cork (taking a dst reference and
 * copying the flow/options); later calls just append.  Returns 0 or a
 * negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	/* MSG_PROBE only exercises path-mtu discovery; send nothing. */
	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		/* Destination options travel with the payload on the first
		 * skb, so count them into both lengths.
		 */
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1da177e4 1872
cd3c7480
PB
1873static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1874{
1875 struct dst_entry *dst = cork->base.dst;
1876
1877 cork->base.dst = NULL;
1878 cork->base.flags &= ~IPCORK_ALLFRAG;
1879 skb_dst_set(skb, dst);
1880}
1881
366e41d9
VY
1882static void ip6_cork_release(struct inet_cork_full *cork,
1883 struct inet6_cork *v6_cork)
bf138862 1884{
366e41d9 1885 if (v6_cork->opt) {
d656b2ea
PB
1886 struct ipv6_txoptions *opt = v6_cork->opt;
1887
1888 kfree(opt->dst0opt);
1889 kfree(opt->dst1opt);
1890 kfree(opt->hopopt);
1891 kfree(opt->srcrt);
1892 kfree(opt);
366e41d9 1893 v6_cork->opt = NULL;
0178b695
HX
1894 }
1895
366e41d9
VY
1896 if (cork->base.dst) {
1897 dst_release(cork->base.dst);
1898 cork->base.dst = NULL;
1899 cork->base.flags &= ~IPCORK_ALLFRAG;
bf138862 1900 }
bf138862
PE
1901}
1902
6422398c
VY
/* Collapse all pending skbs on @queue into one packet (frag_list chain),
 * prepend the extension headers and the IPv6 header, account stats, and
 * release the cork.  Returns the finished skb, or NULL if the queue was
 * empty.  The cork's dst reference moves onto the returned skb.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head's frag_list and
	 * take over their memory accounting.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	/* A routing header may rewrite the wire destination; final_dst
	 * tracks what actually goes into hdr->daddr.
	 */
	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* Raw sockets without IPV6_HDRINCL carry the ICMPv6 type in
		 * the flow; otherwise read it from the packet itself.
		 */
		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1982
1983int ip6_send_skb(struct sk_buff *skb)
1984{
1985 struct net *net = sock_net(skb->sk);
1986 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1987 int err;
1988
33224b16 1989 err = ip6_local_out(net, skb->sk, skb);
1da177e4
LT
1990 if (err) {
1991 if (err > 0)
6ce9e7b5 1992 err = net_xmit_errno(err);
1da177e4 1993 if (err)
6422398c
VY
1994 IP6_INC_STATS(net, rt->rt6i_idev,
1995 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1996 }
1997
1da177e4 1998 return err;
6422398c
VY
1999}
2000
/*
 * ip6_push_pending_frames - finalize and transmit the socket's corked data.
 *
 * Collapses sk->sk_write_queue into a single skb and sends it.
 * An empty queue is not an error: returns 0 without transmitting.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1da177e4 2012
0bbe84a6 2013static void __ip6_flush_pending_frames(struct sock *sk,
6422398c
VY
2014 struct sk_buff_head *queue,
2015 struct inet_cork_full *cork,
2016 struct inet6_cork *v6_cork)
1da177e4 2017{
1da177e4
LT
2018 struct sk_buff *skb;
2019
0bbe84a6 2020 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
adf30907
ED
2021 if (skb_dst(skb))
2022 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
e1f52208 2023 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
2024 kfree_skb(skb);
2025 }
2026
6422398c 2027 ip6_cork_release(cork, v6_cork);
1da177e4 2028}
0bbe84a6
VY
2029
2030void ip6_flush_pending_frames(struct sock *sk)
2031{
6422398c
VY
2032 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2033 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
0bbe84a6 2034}
a495f836 2035EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
6422398c
VY
2036
/*
 * ip6_make_skb - build a single datagram skb without corking the socket.
 *
 * Uses a private on-stack queue and cork so the socket's own cork state
 * is untouched (the "uncorked sendmsg" fast path).
 *
 * @sk:          sending socket
 * @getfrag:     copy callback used by __ip6_append_data() to pull payload
 * @from:        opaque cookie handed to @getfrag
 * @length:      payload length
 * @transhdrlen: transport header length to reserve
 * @ipc6:        per-call cmsg state (tx options, hop limit, dontfrag, ...)
 * @rt:          route for the datagram; ownership of its dst reference is
 *               taken here (released on MSG_PROBE, transferred to the skb
 *               otherwise)
 * @flags:       MSG_* flags
 * @cork:        caller-provided cork scratch space
 *
 * Returns the built skb, NULL for MSG_PROBE, or ERR_PTR() on failure.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	/* Fragmentable extension headers consume payload space too. */
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		/* Path-MTU probe: nothing is sent, so drop the route ref. */
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		/* Setup may have partially populated the cork; release it. */
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		/* Append may have queued partial skbs; flush them and the cork. */
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}