Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next...
[linux-block.git] / net / ipv6 / ip6_output.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * IPv6 output functions
1ab1457c 4 * Linux INET6 implementation
1da177e4
LT
5 *
6 * Authors:
1ab1457c 7 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 8 *
1da177e4
LT
9 * Based on linux/net/ipv4/ip_output.c
10 *
1da177e4
LT
11 * Changes:
12 * A.N.Kuznetsov : airthmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
67ba4152 19 * Imran Patel : frag id should be in NBO
1da177e4
LT
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
1da177e4 25#include <linux/errno.h>
ef76bc23 26#include <linux/kernel.h>
1da177e4
LT
27#include <linux/string.h>
28#include <linux/socket.h>
29#include <linux/net.h>
30#include <linux/netdevice.h>
31#include <linux/if_arp.h>
32#include <linux/in6.h>
33#include <linux/tcp.h>
34#include <linux/route.h>
b59f45d0 35#include <linux/module.h>
5a0e3ad6 36#include <linux/slab.h>
1da177e4 37
33b48679 38#include <linux/bpf-cgroup.h>
1da177e4
LT
39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv6.h>
41
42#include <net/sock.h>
43#include <net/snmp.h>
44
45#include <net/ipv6.h>
46#include <net/ndisc.h>
47#include <net/protocol.h>
48#include <net/ip6_route.h>
49#include <net/addrconf.h>
50#include <net/rawv6.h>
51#include <net/icmp.h>
52#include <net/xfrm.h>
53#include <net/checksum.h>
7bc570c8 54#include <linux/mroute6.h>
ca254490 55#include <net/l3mdev.h>
14972cbd 56#include <net/lwtunnel.h>
571912c6 57#include <net/ip_tunnels.h>
1da177e4 58
/* Final L2 transmission step: resolve the IPv6 next hop to a neighbour
 * entry and hand the skb to neigh_output().  Handles multicast loopback
 * (cloning the skb back through NF_INET_POST_ROUTING when the local host
 * should also receive it) and lwtunnel xmit redirection first.
 * Returns 0 / -errno; consumes the skb on all paths.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* Not enough room for the link-layer header: reallocate head.
		 * skb_expand_head() frees the original skb on failure.
		 */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback (or an mrouted socket needs it).
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0 means "do not leave this host". */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scope multicast never goes on the wire. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* Lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	/* Neighbour entry could not be created: count and drop. */
	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
	return -EINVAL;
}
136
/* Software-segment a GSO skb whose segments would exceed the egress MTU,
 * then push each resulting segment through ip6_fragment().  Consumes the
 * original skb; returns 0 or the first error seen while fragmenting.
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* Masking out GSO features forces full software segmentation. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		/* Remember the first failure but keep sending the rest. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
169
/* Post-POSTROUTING output step: re-route skbs that picked up an xfrm
 * policy after SNAT, then either fragment or transmit directly depending
 * on size vs. the destination MTU.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* Mark as rerouted so ip6_output's NF_HOOK_COND skips the
		 * POST_ROUTING hook the second time around.
		 */
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	/* GSO skb whose segments won't fit the MTU: segment in software. */
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
193
/* NF_INET_POST_ROUTING okfn: run the cgroup-BPF egress program before
 * completing output.  NET_XMIT_CN still transmits but reports congestion
 * to the caller unless the output path itself fails.
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip6_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		/* Propagate CN unless __ip6_finish_output() errs first. */
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		/* BPF program rejected the packet. */
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}
209
/* dst_output() entry point for IPv6: pass the skb through the
 * NF_INET_POST_ROUTING hook (unless already rerouted) and on to
 * ip6_finish_output().  Drops immediately if IPv6 is administratively
 * disabled on the egress device.
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	/* Skip POST_ROUTING when the xfrm reroute path already ran it. */
	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);
1da177e4 230
e9191ffb 231bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
513674b5
SL
232{
233 if (!np->autoflowlabel_set)
234 return ip6_default_np_autolabel(net);
235 else
236 return np->autoflowlabel;
237}
238
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Builds the IPv6 header (plus any extension headers from @opt) in front
 * of the payload and sends the packet through NF_INET_LOCAL_OUT.  Packets
 * exceeding the path MTU that cannot be sent (no ignore_df, not GSO) are
 * reported to the socket via ipv6_local_error() and dropped with -EMSGSIZE.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Headroom needed: IPv6 header, link-layer header, and any
	 * extension headers we are about to push.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* skb_expand_head() frees the skb on failure. */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		/* Non-fragmentable options may rewrite first_hop (routing
		 * header) so they must be pushed after the frag options.
		 */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 * Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
340
/* Deliver a Router Alert packet to every raw socket registered on the
 * ip6_ra_chain with a matching selector.  Each intermediate listener gets
 * a clone; the final one receives the original skb.  Returns 1 if the skb
 * was consumed by a listener, 0 if the caller still owns it.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			/* Honour IPV6_ROUTER_ALERT_ISOLATE: skip sockets
			 * from a different netns.
			 */
			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Clone for the previous match; defer the original
			 * so the last listener avoids a copy.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
375
/* Decide how a to-be-forwarded packet matching a proxy-NDP entry should be
 * handled.  Returns 1 to divert neighbour-discovery ICMPv6 to local input,
 * -1 to drop (link-local destination cannot be proxied), 0 to forward.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Walk past any extension headers to find the transport header. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
427
/* NF_INET_FORWARD okfn: account the forwarded datagram and pass it to
 * dst_output().  Skbs already forwarded in hardware (switchdev L3 offload)
 * are consumed here without retransmission.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	/* Hardware already forwarded this packet; just drop our copy. */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	/* Forwarded packets must not carry a stale receive timestamp. */
	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}
446
fe6cc55f
FW
447static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
448{
418a3156 449 if (skb->len <= mtu)
fe6cc55f
FW
450 return false;
451
60ff7467 452 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
fe6cc55f
FW
453 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
454 return true;
455
60ff7467 456 if (skb->ignore_df)
418a3156
FW
457 return false;
458
779b7931 459 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
fe6cc55f
FW
460 return false;
461
462 return true;
463}
464
/* Forward an IPv6 packet received on one interface out another.
 * Performs the full router-side checks: forwarding enabled, xfrm policy,
 * Router Alert delivery, hop limit, proxy NDP, redirects, source address
 * sanity, and PMTU.  Returns 0 on success/consumption, -errno on drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	u32 mtu;

	/* Stats are charged to the *input* interface. */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* Locally-owned skbs must not be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    !idev->cnf.disable_policy &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 * We DO NOT make any processing on
	 * RA packets, pushing them to user level AS IS
	 * without ane WARRANTY that application will be able
	 * to interpret them. The reason is that we
	 * cannot make anything clever here.
	 *
	 * We are not end-node, so that if packet contains
	 * AH/ESP, we cannot make anything.
	 * Defragmentation also would be mistake, RA packets
	 * cannot be fragmented, because there is no warranty
	 * that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 * check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* ND message for a proxied address: receive locally. */
			hdr->hop_limit--;
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have replaced the dst; reload it. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 * incoming and outgoing devices are the same
		 * send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have reallocated the header. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
625
/* Copy per-packet metadata from @from onto a freshly built fragment @to:
 * routing entry, device, mark, hash, tc index, netfilter state, skb
 * extensions and security mark.  Payload is not touched.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Fragment shares the original's dst (refcounted clone). */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
645
/* Set up fast-path fragmentation over an existing frag_list: save a copy
 * of the network headers, detach the frag list into @iter, and turn @skb
 * into the first fragment by inserting a fragment header.
 * Returns 0 or -ENOMEM.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take ownership of the frag list; it now lives in the iterator. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header, then restore the copied
	 * network headers in front of it.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
686
/* Turn the next queued frag-list member into a complete IPv6 fragment:
 * prepend the saved network headers plus a fragment header, and advance
 * the iterator's running fragment offset.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Offset advances by the payload length of the *previous* fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* More-fragments bit set on every fragment but the last. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
711
/* Initialize the slow-path fragmentation state for @skb: how much payload
 * remains, where to start copying from, and the head/tail room each new
 * fragment skb must reserve.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);
732
/* Allocate and build the next slow-path fragment from @skb according to
 * @state: copy the unfragmentable headers, insert a fragment header, and
 * copy the next chunk of payload.  Returns the new fragment or ERR_PTR.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied header chain so prevhdr points at FRAGMENT. */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
808
/* Fragment @skb to fit the path MTU and send each fragment via @output.
 * Uses the fast path (re-using an existing, suitably shaped frag_list)
 * when possible, otherwise the allocate-and-copy slow path.  Consumes the
 * skb; returns 0 or -errno (notably -EMSGSIZE when fragmentation is not
 * permitted).
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	/* Only trust inet6_sk() for genuinely local, non-nested xmits. */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	bool mono_delivery_time = skb->mono_delivery_time;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Keep the offset, not the pointer: the head may be reallocated
	 * by skb_checksum_help() below.
	 */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* Need room for headers, a fragment header, and >= 8 payload bytes. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* Revalidate prevhdr against the (possibly moved) header. */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path only works when existing geometry matches. */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Free whatever fragments were never handed to output(). */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the ownership transfers done before bailing out. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_gso_disable(skb->sk);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
992
b71d1d42
ED
993static inline int ip6_rt_check(const struct rt6key *rt_key,
994 const struct in6_addr *fl_addr,
995 const struct in6_addr *addr_cache)
cf6b1982 996{
a02cec21 997 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
63159f29 998 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
cf6b1982
YH
999}
1000
/* Validate a socket's cached dst against the current flow.  Returns the
 * dst if still usable, or NULL after releasing it when the family, the
 * destination/source route keys, or the output interface no longer match.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* Cached dst may be IPv4 (mapped addresses); unusable here. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1047
/* Core of the IPv6 route lookup: fill *@dst for flow @fl6, selecting a
 * source address first when the flow has none.  On success returns 0
 * with *@dst holding a reference; on failure returns a negative errno
 * with *@dst set to NULL (any intermediate dst is released).
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is RCU-protected; hold the read lock while
		 * deriving the preferred source address from it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry (or perform the first) lookup now that saddr is known. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* -EINVAL here only marks "neighbour not yet valid"; it is not
	 * returned to the caller unless the redirect lookup below fails.
	 */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* Reject mixed v4-mapped source with a non-v4-mapped destination. */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
34a0b3cd 1163
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error,
 *	in which case *@dst is left NULL.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Pre-clear the result so the tail helper starts from scratch. */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1182
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	/* Rewrite the flow's destination before handing it to xfrm so the
	 * IPsec policy lookup sees the final (post-routing-header) target.
	 */
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1210
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	/* Fast path: reuse the socket-cached dst when still valid. */
	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	/* Only connected sockets may cache; dst_clone() keeps our return
	 * reference separate from the one stored in the socket.
	 */
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
497c615a 1246
/**
 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *      @skb: Packet for which lookup is done
 *      @dev: Tunnel device
 *      @net: Network namespace of tunnel device
 *      @sock: Socket which provides route info
 *      @saddr: Memory to store the src ip address
 *      @info: Tunnel information
 *      @protocol: IP protocol
 *      @use_cache: Flag to enable cache usage
 *      This function performs a route lookup on a tunnel
 *
 *      It returns a valid dst pointer and stores src address to be used in
 *      tunnel in param saddr on success, else a pointer encoded error code.
 */

struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	/* cast away const: dst_cache keeps per-cpu mutable state */
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	/* Build the lookup key from the tunnel metadata. */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					  info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	/* fl6.saddr was filled in by the route lookup; report it back. */
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1315
0178b695
HX
1316static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1317 gfp_t gfp)
1318{
1319 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1320}
1321
1322static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1323 gfp_t gfp)
1324{
1325 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1326}
1327
/* Recompute *@mtu and *@maxfraglen for the next fragment while appending
 * data.  Only applies to non-XFRM-tunnel routes: the first fragment
 * (@skb == NULL) reserves the route's header_len, subsequent fragments
 * reclaim that space as payload.
 */
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		/* Fragment payloads must be multiples of 8 octets; keep room
		 * for the fragment header itself.
		 */
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
1351
/* Initialise the cork state for a corked-send sequence: take ownership of
 * @rt, deep-copy the tx options from @ipc6, and compute the fragment size
 * from the path MTU / socket settings.
 *
 * Returns 0 on success or a negative errno.  On error the (possibly
 * partially filled) cork still owns the dst and any duplicated options;
 * the caller is expected to dispose of them via ip6_cork_release().
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* Duplicate each sub-option; a NULL result with a non-NULL
		 * source means the allocation failed.
		 */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A smaller user-configured frag_size overrides the path MTU. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1424
/* Append @length bytes (pulled via @getfrag) to the pending-send @queue,
 * growing the tail skb or allocating new ones sized so that later
 * fragmentation falls on valid boundaries.  Shared by ip6_append_data()
 * (socket write queue) and ip6_make_skb() (private queue).
 *
 * Returns 0 on success or a negative errno; on error the bytes already
 * queued remain on @queue and cork->length is rolled back by the
 * unconsumed remainder.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First append: extension headers still need to be emitted. */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu < fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		/* IPV6_DONTFRAG: report the path MTU instead of fragmenting. */
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			/* Device can't take zerocopy pages: fall back to
			 * copying, but keep the uarg for completion signalling.
			 */
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: only headers go in the linear
				 * area, the rest lands in page fragments.
				 */
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Follow-up fragments: only allocate while
				 * within twice the send buffer limit.
				 */
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang from the previous skb into
				 * this one, keeping both checksums consistent.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* Non-SG device: append into the linear tailroom. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			/* Copy into the per-task page fragment allocator,
			 * coalescing with the last frag when possible.
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* True zerocopy: pin the user pages directly. */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge all the accumulated truesize to the socket in one go. */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
0bbe84a6
/* Queue data on the socket's write queue for a (possibly corked) send.
 * The first call in a cork sequence sets up the cork state and accounts
 * the extension-header length once; later calls append payload only.
 * Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		/* The cork takes its own reference on the route. */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Subsequent appends carry no transport header. */
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1da177e4 1814
cd3c7480
PB
1815static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1816{
1817 struct dst_entry *dst = cork->base.dst;
1818
1819 cork->base.dst = NULL;
1820 cork->base.flags &= ~IPCORK_ALLFRAG;
1821 skb_dst_set(skb, dst);
1822}
1823
/* Free everything a corked send owns: the duplicated tx options made by
 * ip6_setup_cork() and the route reference.  Safe to call after a
 * partial/failed setup - both branches tolerate absent state.
 */
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
}
1844
/* Collapse all skbs on @queue into one packet: the first skb becomes the
 * head and the rest are chained on its frag_list, then the IPv6 header
 * (and any tx extension headers) is prepended.  Releases the cork.
 *
 * Returns the assembled skb ready for ip6_send_skb(), or NULL when the
 * queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* The head skb now accounts the memory; detach the child
		 * from the socket so it isn't uncharged twice.
		 */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		/* May rewrite final_dst when a routing header is present. */
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1919
/* Hand a fully built skb to the local output path.  Positive return
 * codes from the qdisc layer are translated via net_xmit_errno(); real
 * errors bump OUTDISCARDS.  Returns 0 or a negative errno.
 */
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
1937
/* Finalise the socket's pending write queue into one packet and send it.
 * Returns 0 when there was nothing queued, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1da177e4 1949
0bbe84a6 1950static void __ip6_flush_pending_frames(struct sock *sk,
6422398c
VY
1951 struct sk_buff_head *queue,
1952 struct inet_cork_full *cork,
1953 struct inet6_cork *v6_cork)
1da177e4 1954{
1da177e4
LT
1955 struct sk_buff *skb;
1956
0bbe84a6 1957 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
adf30907
ED
1958 if (skb_dst(skb))
1959 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
e1f52208 1960 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1961 kfree_skb(skb);
1962 }
1963
6422398c 1964 ip6_cork_release(cork, v6_cork);
1da177e4 1965}
0bbe84a6
VY
1966
1967void ip6_flush_pending_frames(struct sock *sk)
1968{
6422398c
VY
1969 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1970 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
0bbe84a6 1971}
a495f836 1972EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
6422398c
VY
1973
1974struct sk_buff *ip6_make_skb(struct sock *sk,
1975 int getfrag(void *from, char *to, int offset,
1976 int len, int odd, struct sk_buff *skb),
1977 void *from, int length, int transhdrlen,
f37a4cc6
PB
1978 struct ipcm6_cookie *ipc6, struct rt6_info *rt,
1979 unsigned int flags, struct inet_cork_full *cork)
6422398c 1980{
6422398c
VY
1981 struct inet6_cork v6_cork;
1982 struct sk_buff_head queue;
26879da5 1983 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
6422398c
VY
1984 int err;
1985
40ac240c
PB
1986 if (flags & MSG_PROBE) {
1987 dst_release(&rt->dst);
6422398c 1988 return NULL;
40ac240c 1989 }
6422398c
VY
1990
1991 __skb_queue_head_init(&queue);
1992
1cd7884d
WB
1993 cork->base.flags = 0;
1994 cork->base.addr = 0;
1995 cork->base.opt = NULL;
6422398c 1996 v6_cork.opt = NULL;
f37a4cc6 1997 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
862c03ee 1998 if (err) {
1cd7884d 1999 ip6_cork_release(cork, &v6_cork);
6422398c 2000 return ERR_PTR(err);
862c03ee 2001 }
26879da5
WW
2002 if (ipc6->dontfrag < 0)
2003 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
6422398c 2004
f37a4cc6 2005 err = __ip6_append_data(sk, &queue, cork, &v6_cork,
6422398c
VY
2006 &current->task_frag, getfrag, from,
2007 length + exthdrlen, transhdrlen + exthdrlen,
5fdaa88d 2008 flags, ipc6);
6422398c 2009 if (err) {
1cd7884d 2010 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
6422398c
VY
2011 return ERR_PTR(err);
2012 }
2013
1cd7884d 2014 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
6422398c 2015}