// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

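/* Final transmit step: make sure the skb has enough headroom for the
 * link-layer header, loop multicast packets back to local listeners
 * when required, honour lwtunnel xmit redirects, then resolve the
 * nexthop neighbour and hand the packet to neigh_output().
 */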
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

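/* Software-segment a GSO packet whose segments would exceed the egress
 * MTU, then fragment and send each resulting segment individually.
 */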
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

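/* Decide how to emit the packet: re-route skbs that gained an xfrm
 * policy after SNAT, fragment anything larger than the path MTU, and
 * send everything else straight to ip6_finish_output2().
 */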
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

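/* Run the cgroup BPF egress program before the packet is emitted;
 * NET_XMIT_CN is reported back to the caller but does not drop.
 */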
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip6_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb(skb);
		return ret;
	}
}

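/* Output entry point for locally generated packets, invoked via
 * dst_output(). Drops the packet if IPv6 is administratively disabled
 * on the egress device, then traverses the POST_ROUTING netfilter hook.
 */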
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 * Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

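/* Deliver a Router Alert packet to every raw socket registered for this
 * alert value. Returns 1 if at least one socket consumed the skb.
 */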
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

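/* Decide what to do with a packet destined to a proxied address:
 * 1 means hand it to ip6_input(), 0 means keep forwarding it, and
 * -1 means the packet was discarded (link-local destination).
 */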
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

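/* Last step of forwarding: bump the forwarding counters, let switchdev
 * consume packets already forwarded in hardware, and emit the rest.
 */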
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

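/* Check whether a forwarded packet exceeds the path MTU, honouring
 * ignore_df and GSO packets that will be segmented to fit.
 */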
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

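/* Forward one IPv6 packet: validate hop limit and source address,
 * handle router alerts and proxy NDP, emit redirects where appropriate,
 * enforce the path MTU, and finally pass the packet through the
 * NF_INET_FORWARD hook to ip6_forward_finish().
 */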
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    !idev->cnf.disable_policy &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			hdr->hop_limit--;
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

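/* Propagate per-packet metadata (dst, device, mark, hash, conntrack,
 * extensions, secmark, ...) from the original skb to a fragment.
 */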
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

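/* Set up fast-path fragmentation over an existing frag_list: duplicate
 * the header block, insert the fragment header into the first fragment
 * and initialise the iterator used by ip6_fraglist_prepare().
 */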
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

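/* Prepare the next queued fragment for transmission: rebuild its
 * network header and fragment header from the iterator state.
 */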
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

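/* Initialise the slow-path fragmentation state used by ip6_frag_next(). */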
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

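/* Allocate and fill the next slow-path fragment, copying the header
 * block and the next chunk of payload from the original skb.
 */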
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

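/* Split an oversized packet into fragments and transmit each one via
 * the supplied output callback. Uses the fast path when the skb
 * already carries a well-formed frag_list, the slow path otherwise.
 */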
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb->tstamp = tstamp;
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		frag->tstamp = tstamp;
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

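/* Validate a cached socket route against the current flow; returns the
 * dst if it is still usable, otherwise releases it and returns NULL.
 */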
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

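/* Core of the dst lookup: perform the route lookup, select a source
 * address when none was given, and handle optimistic-DAD fallback to
 * the default router.
 */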
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @connected: whether @sk is connected or not
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
 * @skb: Packet for which lookup is done
 * @dev: Tunnel device
 * @net: Network namespace of tunnel device
 * @sock: Socket which provides route info
 * @saddr: Memory to store the src ip address
 * @info: Tunnel information
 * @protocol: IP protocol
 * @use_cache: Flag to enable cache usage
 *
 * This function performs a route lookup on a tunnel.
 *
 * It returns a valid dst pointer and stores the src address to be used
 * in the tunnel in param saddr on success, else a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					  info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

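/* Recompute *mtu and *maxfraglen while appending: the first fragment
 * reserves rt->dst.header_len, later fragments may use the full mtu.
 * Nothing changes for DST_XFRM_TUNNEL dsts.
 */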
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

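/* Start a corked transmission: duplicate the supplied extension
 * headers, pin the route, and record MTU and transmission parameters
 * in the cork for later appends.
 */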
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

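/* Append user data to the queue of pending skbs, growing existing skbs
 * where possible and allocating new ones sized to the (GSO or path)
 * MTU, with support for checksum offload and MSG_ZEROCOPY.
 */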
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 * at once if non-fragmentable extension headers
	 * are too large.
	 * --yoshfuji
	 */
1548
2811ebac 1549 cork->length += length;
2811ebac 1550 if (!skb)
1da177e4
LT
1551 goto alloc_new_skb;
1552
1553 while (length > 0) {
1554 /* Check if the remaining data fits into current packet. */
bdc712b4 1555 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1da177e4
LT
1556 if (copy < length)
1557 copy = maxfraglen - skb->len;
1558
1559 if (copy <= 0) {
1560 char *data;
1561 unsigned int datalen;
1562 unsigned int fraglen;
1563 unsigned int fraggap;
6d123b81 1564 unsigned int alloclen, alloc_extra;
aba36930 1565 unsigned int pagedlen;
1da177e4 1566alloc_new_skb:
1da177e4 1567 /* There's no room in the current skb */
0c183379
G
1568 if (skb)
1569 fraggap = skb->len - maxfraglen;
1da177e4
LT
1570 else
1571 fraggap = 0;
0c183379 1572 /* update mtu and maxfraglen if necessary */
63159f29 1573 if (!skb || !skb_prev)
0c183379 1574 ip6_append_data_mtu(&mtu, &maxfraglen,
75a493e6 1575 fragheaderlen, skb, rt,
e367c2d0 1576 orig_mtu);
0c183379
G
1577
1578 skb_prev = skb;
1da177e4
LT
1579
1580 /*
1581 * If remaining data exceeds the mtu,
1582 * we know we need more fragment(s).
1583 */
1584 datalen = length + fraggap;
1da177e4 1585
0c183379
G
1586 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1587 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
15e36f5b 1588 fraglen = datalen + fragheaderlen;
aba36930 1589 pagedlen = 0;
15e36f5b 1590
6d123b81
JK
1591 alloc_extra = hh_len;
1592 alloc_extra += dst_exthdrlen;
1593 alloc_extra += rt->dst.trailer_len;
1594
1595 /* We just reserve space for fragment header.
1596 * Note: this may be overallocation if the message
1597 * (without MSG_MORE) fits into the MTU.
1598 */
1599 alloc_extra += sizeof(struct frag_hdr);
1600
1da177e4 1601 if ((flags & MSG_MORE) &&
d8d1f30b 1602 !(rt->dst.dev->features&NETIF_F_SG))
1da177e4 1603 alloclen = mtu;
6d123b81
JK
1604 else if (!paged &&
1605 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1606 !(rt->dst.dev->features & NETIF_F_SG)))
15e36f5b
WB
1607 alloclen = fraglen;
1608 else {
1609 alloclen = min_t(int, fraglen, MAX_HEADER);
1610 pagedlen = fraglen - alloclen;
1611 }
6d123b81 1612 alloclen += alloc_extra;
299b0767 1613
0c183379
G
1614 if (datalen != length + fraggap) {
1615 /*
1616 * this is not the last fragment, the trailer
1617 * space is regarded as data space.
1618 */
1619 datalen += rt->dst.trailer_len;
1620 }
1621
0c183379 1622 fraglen = datalen + fragheaderlen;
1da177e4 1623
15e36f5b 1624 copy = datalen - transhdrlen - fraggap - pagedlen;
232cd35d
ED
1625 if (copy < 0) {
1626 err = -EINVAL;
1627 goto error;
1628 }
1da177e4 1629 if (transhdrlen) {
6d123b81 1630 skb = sock_alloc_send_skb(sk, alloclen,
1da177e4
LT
1631 (flags & MSG_DONTWAIT), &err);
1632 } else {
1633 skb = NULL;
1f4c6eb2 1634 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1da177e4 1635 2 * sk->sk_sndbuf)
6d123b81 1636 skb = alloc_skb(alloclen,
1f4c6eb2 1637 sk->sk_allocation);
63159f29 1638 if (unlikely(!skb))
1da177e4
LT
1639 err = -ENOBUFS;
1640 }
63159f29 1641 if (!skb)
1da177e4
LT
1642 goto error;
1643 /*
1644 * Fill in the control structures
1645 */
9c9c9ad5 1646 skb->protocol = htons(ETH_P_IPV6);
32dce968 1647 skb->ip_summed = csummode;
1da177e4 1648 skb->csum = 0;
1f85851e
G
1649 /* reserve for fragmentation and ipsec header */
1650 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1651 dst_exthdrlen);
1da177e4
LT
1652
1653 /*
1654 * Find where to start putting bytes
1655 */
15e36f5b 1656 data = skb_put(skb, fraglen - pagedlen);
1f85851e
G
1657 skb_set_network_header(skb, exthdrlen);
1658 data += fragheaderlen;
b0e380b1
ACM
1659 skb->transport_header = (skb->network_header +
1660 fragheaderlen);
1da177e4
LT
1661 if (fraggap) {
1662 skb->csum = skb_copy_and_csum_bits(
1663 skb_prev, maxfraglen,
8d5930df 1664 data + transhdrlen, fraggap);
1da177e4
LT
1665 skb_prev->csum = csum_sub(skb_prev->csum,
1666 skb->csum);
1667 data += fraggap;
e9fa4f7b 1668 pskb_trim_unique(skb_prev, maxfraglen);
1da177e4 1669 }
232cd35d
ED
1670 if (copy > 0 &&
1671 getfrag(from, data + transhdrlen, offset,
1672 copy, fraggap, skb) < 0) {
1da177e4
LT
1673 err = -EFAULT;
1674 kfree_skb(skb);
1675 goto error;
1676 }
1677
1678 offset += copy;
15e36f5b 1679 length -= copy + transhdrlen;
1da177e4
LT
1680 transhdrlen = 0;
1681 exthdrlen = 0;
299b0767 1682 dst_exthdrlen = 0;
1da177e4 1683
52900d22
WB
1684 /* Only the initial fragment is time stamped */
1685 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1686 cork->tx_flags = 0;
1687 skb_shinfo(skb)->tskey = tskey;
1688 tskey = 0;
1689 skb_zcopy_set(skb, uarg, &extra_uref);
1690
0dec879f
JA
1691 if ((flags & MSG_CONFIRM) && !skb_prev)
1692 skb_set_dst_pending_confirm(skb, 1);
1693
1da177e4
LT
1694 /*
1695 * Put the packet on the pending queue
1696 */
1f4c6eb2
ED
1697 if (!skb->destructor) {
1698 skb->destructor = sock_wfree;
1699 skb->sk = sk;
1700 wmem_alloc_delta += skb->truesize;
1701 }
0bbe84a6 1702 __skb_queue_tail(queue, skb);
1da177e4
LT
1703 continue;
1704 }

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
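
/*
 * Editor's sketch: __ip6_append_data() above is driven by a getfrag()
 * callback that copies "len" bytes of payload to "to" and folds them
 * into the skb checksum when the hardware will not. A minimal callback,
 * modeled on ip_generic_getfrag() from net/ipv4/ip_output.c (shown here
 * only as an illustration, not as part of this file):
 */
static int example_getfrag(void *from, char *to, int offset, int len,
			   int odd, struct sk_buff *skb)
{
	struct msghdr *msg = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* hardware checksums later; a plain copy is enough */
		if (!copy_from_iter_full(to, len, &msg->msg_iter))
			return -EFAULT;
	} else {
		__wsum csum = 0;

		/* copy and checksum in one pass, then fold in at "odd" */
		if (!csum_and_copy_from_iter_full(to, len, &csum,
						  &msg->msg_iter))
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}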

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
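
/*
 * Editor's sketch: a typical corked-send sequence, loosely following
 * udpv6_sendmsg(). "example_getfrag" is the illustrative callback from
 * above; the real UDPv6 path pushes through udp_v6_push_pending_frames()
 * instead, so it can fill in the UDP header before transmit.
 */
	lock_sock(sk);
	err = ip6_append_data(sk, example_getfrag, msg, len,
			      sizeof(struct udphdr), &ipc6, &fl6,
			      (struct rt6_info *)dst, msg->msg_flags);
	if (err)
		ip6_flush_pending_frames(sk);	/* drop the partial queue */
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
	release_sock(sk);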

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}
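
/*
 * Note (editor): ip6_setup_cork() duplicates each extension header into
 * its own allocation, which is why ip6_cork_release() must free dst0opt,
 * dst1opt, hopopt and srcrt individually before freeing the
 * ipv6_txoptions struct itself, and must also drop the dst reference
 * taken when the cork was set up.
 */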

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
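
/*
 * Editor's note: callers rarely invoke __ip6_make_skb() directly for the
 * socket write queue; the ip6_finish_skb() helper in include/net/ipv6.h
 * wraps it, essentially as:
 */
static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
{
	return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork,
			      &inet6_sk(sk)->cork);
}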

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
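
/*
 * Note (editor): ip6_local_out() can return positive NET_XMIT_* codes
 * from the queueing layer; net_xmit_errno() maps NET_XMIT_CN
 * (congestion notification) to success and the other positive codes to
 * -ENOBUFS, so only real drops bump OUTDISCARDS here.
 */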

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
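
/*
 * Note (editor): ip6_flush_pending_frames() is the error-path twin of
 * ip6_push_pending_frames(): protocols call it to discard a partially
 * built write queue (for example when ip6_append_data() fails
 * mid-message) and to release the cork state.
 */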

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
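
/*
 * Editor's sketch: ip6_make_skb() is the uncorked fast path - it builds
 * the whole datagram on a private queue/cork and collapses it in one
 * call, so no socket-level cork state survives the send. Roughly how
 * the UDPv6 fast path consumes it (udp_v6_send_skb() shown as the
 * assumed next step, which fills in the UDP header before transmit):
 */
	skb = ip6_make_skb(sk, example_getfrag, msg, ulen,
			   sizeof(struct udphdr), &ipc6, &fl6,
			   (struct rt6_info *)dst, msg->msg_flags, &cork);
	err = PTR_ERR(skb);
	if (!IS_ERR_OR_NULL(skb))
		err = udp_v6_send_skb(skb, &fl6, &cork.base);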
6422398c 2001}