// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

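/* Final transmit step of the output path: resolve the neighbour entry for
 * the route's nexthop and hand the skb to the neighbour layer. Multicast
 * loopback and lwtunnel-redirected transmission are handled here as well.
 */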
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *daddr, *nexthop;
        struct ipv6hdr *hdr;
        struct neighbour *neigh;
        int ret;

        /* Be paranoid, rather than too clever. */
        if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOMEM;
                }
        }

        hdr = ipv6_hdr(skb);
        daddr = &hdr->daddr;
        if (ipv6_addr_is_multicast(daddr)) {
                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (hdr->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
                if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock();
        nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
        neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

        if (unlikely(IS_ERR_OR_NULL(neigh))) {
                if (unlikely(!neigh))
                        neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
                if (IS_ERR(neigh)) {
                        rcu_read_unlock();
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
                        kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
                        return -EINVAL;
                }
        }
        sock_confirm_neigh(skb, neigh);
        ret = neigh_output(neigh, skb, false);
        rcu_read_unlock();
        return ret;
}

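/* Software-segment a GSO packet whose segments do not fit the egress MTU
 * and push each resulting segment through ip6_fragment().
 */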
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

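/* Choose how to finish output: reroute if an xfrm policy was applied
 * after SNAT, take the GSO slow path when segments exceed the MTU,
 * fragment oversized packets, or transmit directly.
 */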
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb) &&
            !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
            !skb_gso_validate_network_len(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        if ((skb->len > mtu && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

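/* Run the cgroup BPF egress program before completing output; the packet
 * is dropped when the program rejects it.
 */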
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        switch (ret) {
        case NET_XMIT_SUCCESS:
        case NET_XMIT_CN:
                return __ip6_finish_output(net, sk, skb) ? : ret;
        default:
                kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
                return ret;
        }
}

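/* Output entry point after routing for locally generated packets: set up
 * device and protocol, honour disable_ipv6, and traverse the
 * NF_INET_POST_ROUTING hook unless the packet was rerouted.
 */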
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, indev, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

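/* Resolve the effective auto-flowlabel setting: the socket's own value if
 * it was explicitly set, otherwise the per-namespace default.
 */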
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        struct hop_jumbo_hdr *hop_jumbo;
        int hoplen = sizeof(*hop_jumbo);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8 proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(head_room > skb_headroom(skb))) {
                skb = skb_expand_head(skb, head_room);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOBUFS;
                }
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        if (unlikely(seg_len > IPV6_MAXPLEN)) {
                hop_jumbo = skb_push(skb, hoplen);

                hop_jumbo->nexthdr = proto;
                hop_jumbo->hdrlen = 0;
                hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
                hop_jumbo->tlv_len = 4;
                hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

                proto = IPPROTO_HOPOPTS;
                seg_len = 0;
                IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         * Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dev,
                               dst_output);
        }

        skb->dev = dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

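/* Deliver a Router Alert packet to all raw sockets registered for this
 * alert value. Returns 1 if at least one socket took the packet.
 */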
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        if (np && np->rtalert_isolate &&
                            !net_eq(sock_net(sk), dev_net(skb->dev))) {
                                continue;
                        }
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

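/* NDP proxy helper: returns 1 when a packet for a proxied address must be
 * delivered locally (unicast neighbour discovery), -1 when it must be
 * dropped (link-local destination), and 0 otherwise.
 */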
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For reaction involving unicast neighbor discovery
                         * message destined to the proxied address, pass it to
                         * the input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb_clear_tstamp(skb);
        return dst_output(net, sk, skb);
}

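/* Decide whether a forwarded packet is too big for the given MTU, taking
 * conntrack defragmentation (frag_max_size), ignore_df and GSO into
 * account.
 */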
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}

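/* Forwarding path: validate the packet, enforce policy and hop limit,
 * emit redirects or Packet Too Big errors where required, then pass the
 * packet through the NF_INET_FORWARD hook towards the output path.
 */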
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        SKB_DR(reason);
        u32 mtu;

        idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!net->ipv6.devconf_all->disable_policy &&
            (!idev || !idev->cnf.disable_policy) &&
            !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         * We DO NOT make any processing on
         * RA packets, pushing them to user level AS IS
         * without any WARRANTY that application will be able
         * to interpret them. The reason is that we
         * cannot make anything clever here.
         *
         * We are not end-node, so that if packet contains
         * AH/ESP, we cannot make anything.
         * Defragmentation also would be a mistake, RA packets
         * cannot be fragmented, because there is no warranty
         * that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         * check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0) {
                        /* It's tempting to decrease the hop limit
                         * here by 1, as we do at the end of the
                         * function too.
                         *
                         * But that would be incorrect, as proxying is
                         * not forwarding. The ip6_input function
                         * will handle this packet locally, and it
                         * depends on the hop limit being unchanged.
                         *
                         * One example is the NDP hop limit, that
                         * always has to stay 255, but others would be
                         * similar checks around RA packets, where the
                         * user can even change the desired limit.
                         */
                        return ip6_input(skb);
                } else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                SKB_DR_SET(reason, XFRM_POLICY);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
         * send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 * incoming and outgoing devices are the same
                 * send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_maybe_forward(dst, true);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
        SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
        kfree_skb_reason(skb, reason);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

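/* The helpers below implement the two fragmentation strategies used by
 * ip6_fragment(): ip6_fraglist_* walk an existing frag list (fast path),
 * while ip6_frag_* copy data into freshly allocated fragments (slow path).
 */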
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter)
{
        unsigned int first_len;
        struct frag_hdr *fh;

        /* BUILD HEADER */
        *prevhdr = NEXTHDR_FRAGMENT;
        iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
        if (!iter->tmp_hdr)
                return -ENOMEM;

        iter->frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        iter->offset = 0;
        iter->hlen = hlen;
        iter->frag_id = frag_id;
        iter->nexthdr = nexthdr;

        __skb_pull(skb, hlen);
        fh = __skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

        return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
                          struct ip6_fraglist_iter *iter)
{
        struct sk_buff *frag = iter->frag;
        unsigned int hlen = iter->hlen;
        struct frag_hdr *fh;

        frag->ip_summed = CHECKSUM_NONE;
        skb_reset_transport_header(frag);
        fh = __skb_push(frag, sizeof(struct frag_hdr));
        __skb_push(frag, hlen);
        skb_reset_network_header(frag);
        memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
        iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
        fh->nexthdr = iter->nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(iter->offset);
        if (frag->next)
                fh->frag_off |= htons(IP6_MF);
        fh->identification = iter->frag_id;
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
        ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
        state->prevhdr = prevhdr;
        state->nexthdr = nexthdr;
        state->frag_id = frag_id;

        state->hlen = hlen;
        state->mtu = mtu;

        state->left = skb->len - hlen;  /* Space per frame */
        state->ptr = hlen;              /* Where to start from */

        state->hroom = hdr_room;
        state->troom = needed_tailroom;

        state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
        u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
        struct sk_buff *frag;
        struct frag_hdr *fh;
        unsigned int len;

        len = state->left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > state->mtu)
                len = state->mtu;
        /* IF: we are not sending up to and including the packet end
         * then align the next start on an eight byte boundary
         */
        if (len < state->left)
                len &= ~7;

        /* Allocate buffer */
        frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
                         state->hroom + state->troom, GFP_ATOMIC);
        if (!frag)
                return ERR_PTR(-ENOMEM);

        /*
         * Set up data on packet
         */

        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, state->hroom);
        skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
        frag->transport_header = (frag->network_header + state->hlen +
                                  sizeof(struct frag_hdr));

        /*
         * Charge the memory for the fragment to any owner
         * it might possess
         */
        if (skb->sk)
                skb_set_owner_w(frag, skb->sk);

        /*
         * Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

        fragnexthdr_offset = skb_network_header(frag);
        fragnexthdr_offset += prevhdr - skb_network_header(skb);
        *fragnexthdr_offset = NEXTHDR_FRAGMENT;

        /*
         * Build fragment header.
         */
        fh->nexthdr = state->nexthdr;
        fh->reserved = 0;
        fh->identification = state->frag_id;

        /*
         * Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
                             len));
        state->left -= len;

        fh->frag_off = htons(state->offset);
        if (state->left > 0)
                fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

        state->ptr += len;
        state->offset += len;

        return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

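/* Fragment an IPv6 packet and send each fragment through @output, using
 * the frag-list fast path when the skb geometry permits and the copying
 * slow path otherwise.
 */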
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        bool mono_delivery_time = skb->mono_delivery_time;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                /* We prevent @rt from being freed. */
                rcu_read_lock();

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down.
                         */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb_set_delivery_time(skb, tstamp, mono_delivery_time);
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        rcu_read_unlock();
                        return 0;
                }

                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                rcu_read_unlock();
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         * Fragment the datagram.
         */

        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         * Keep copying data until we run out.
         */

        while (state.left > 0) {
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 * Put this fragment into the sending queue.
                 */
                skb_set_delivery_time(frag, tstamp, mono_delivery_time);
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_gso_disable(skb->sk);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE		--ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 *	ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *	@skb: Packet for which lookup is done
 *	@dev: Tunnel device
 *	@net: Network namespace of tunnel device
 *	@sock: Socket which provides route info
 *	@saddr: Memory to store the src ip address
 *	@info: Tunnel information
 *	@protocol: IP protocol
 *	@use_cache: Flag to enable cache usage
 *
 *	This function performs a route lookup on a tunnel.
 *
 *	It returns a valid dst pointer and stores src address to be used in
 *	tunnel in param saddr on success, else a pointer encoded error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
                                        struct net_device *dev,
                                        struct net *net,
                                        struct socket *sock,
                                        struct in6_addr *saddr,
                                        const struct ip_tunnel_info *info,
                                        u8 protocol,
                                        bool use_cache)
{
        struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
        struct dst_cache *dst_cache;
#endif
        struct flowi6 fl6;
        __u8 prio;

#ifdef CONFIG_DST_CACHE
        dst_cache = (struct dst_cache *)&info->dst_cache;
        if (use_cache) {
                dst = dst_cache_get_ip6(dst_cache, saddr);
                if (dst)
                        return dst;
        }
#endif
        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_mark = skb->mark;
        fl6.flowi6_proto = protocol;
        fl6.daddr = info->key.u.ipv6.dst;
        fl6.saddr = info->key.u.ipv6.src;
        prio = info->key.tos;
        fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);

        dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
                                              NULL);
        if (IS_ERR(dst)) {
                netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
                return ERR_PTR(-ENETUNREACH);
        }
        if (dst->dev == dev) { /* is this necessary? */
                netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
                dst_release(dst);
                return ERR_PTR(-ELOOP);
        }
#ifdef CONFIG_DST_CACHE
        if (use_cache)
                dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
        *saddr = fl6.saddr;
        return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

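/* Set up the cork state for ip6_append_data(): pin the route, duplicate
 * the tx options and record the MTU and fragmentation parameters.
 */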
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *nopt, *opt = ipc6->opt;

        /* callers pass dst together with a reference, set it first so
         * ip6_cork_release() can put it down even in case of an error.
         */
        cork->base.dst = &rt->dst;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!nopt))
                        return -ENOBUFS;

                nopt->tot_len = sizeof(*opt);
                nopt->opt_flen = opt->opt_flen;
                nopt->opt_nflen = opt->opt_nflen;

                nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
                if (opt->dst0opt && !nopt->dst0opt)
                        return -ENOBUFS;

                nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
                if (opt->dst1opt && !nopt->dst1opt)
                        return -ENOBUFS;

                nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
                if (opt->hopopt && !nopt->hopopt)
                        return -ENOBUFS;

                nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
                if (opt->srcrt && !nopt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        cork->base.mark = ipc6->sockc.mark;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}

1463
0bbe84a6 1464static int __ip6_append_data(struct sock *sk,
0bbe84a6 1465 struct sk_buff_head *queue,
f3b46a3e 1466 struct inet_cork_full *cork_full,
0bbe84a6
VY
1467 struct inet6_cork *v6_cork,
1468 struct page_frag *pfrag,
1469 int getfrag(void *from, char *to, int offset,
1470 int len, int odd, struct sk_buff *skb),
f93431c8 1471 void *from, size_t length, int transhdrlen,
5fdaa88d 1472 unsigned int flags, struct ipcm6_cookie *ipc6)
1da177e4 1473{
0c183379 1474 struct sk_buff *skb, *skb_prev = NULL;
f3b46a3e 1475 struct inet_cork *cork = &cork_full->base;
f37a4cc6 1476 struct flowi6 *fl6 = &cork_full->fl.u.ip6;
10b8a3de 1477 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
b5947e5d 1478 struct ubuf_info *uarg = NULL;
0bbe84a6
VY
1479 int exthdrlen = 0;
1480 int dst_exthdrlen = 0;
1da177e4 1481 int hh_len;
1da177e4
LT
1482 int copy;
1483 int err;
1484 int offset = 0;
773ba4fe 1485 bool zc = false;
09c2d251 1486 u32 tskey = 0;
0bbe84a6
VY
1487 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1488 struct ipv6_txoptions *opt = v6_cork->opt;
32dce968 1489 int csummode = CHECKSUM_NONE;
682b1a9d 1490 unsigned int maxnonfragsize, headersize;
1f4c6eb2 1491 unsigned int wmem_alloc_delta = 0;
100f6d8e 1492 bool paged, extra_uref = false;
1da177e4 1493
0bbe84a6
VY
1494 skb = skb_peek_tail(queue);
1495 if (!skb) {
1496 exthdrlen = opt ? opt->opt_flen : 0;
7efdba5b 1497 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1da177e4 1498 }
0bbe84a6 1499
15e36f5b 1500 paged = !!cork->gso_size;
bec1f6f6 1501 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
e367c2d0 1502 orig_mtu = mtu;
1da177e4 1503
8ca5a579 1504 if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
678ca42d 1505 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
a1cdec57 1506 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
678ca42d 1507
d8d1f30b 1508 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1da177e4 1509
a1b05140 1510 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
b4ce9277 1511 (opt ? opt->opt_nflen : 0);
1da177e4 1512
682b1a9d
HFS
1513 headersize = sizeof(struct ipv6hdr) +
1514 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1515 (dst_allfrag(&rt->dst) ?
1516 sizeof(struct frag_hdr) : 0) +
1517 rt->rt6i_nfheader_len;
1518
5e34af41
TS
1519 if (mtu <= fragheaderlen ||
1520 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
6596a022
JB
1521 goto emsgsize;
1522
1523 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1524 sizeof(struct frag_hdr);
1525
10b8a3de
PA
1526 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1527 * the first fragment
1528 */
1529 if (headersize + transhdrlen > mtu)
1530 goto emsgsize;
1531
26879da5 1532 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
682b1a9d 1533 (sk->sk_protocol == IPPROTO_UDP ||
13651224 1534 sk->sk_protocol == IPPROTO_ICMPV6 ||
682b1a9d
HFS
1535 sk->sk_protocol == IPPROTO_RAW)) {
1536 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1537 sizeof(struct ipv6hdr));
1538 goto emsgsize;
1539 }
4df98e76 1540
682b1a9d
HFS
1541 if (ip6_sk_ignore_df(sk))
1542 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1543 else
1544 maxnonfragsize = mtu;
4df98e76 1545
682b1a9d 1546 if (cork->length + length > maxnonfragsize - headersize) {
4df98e76 1547emsgsize:
10b8a3de
PA
1548 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1549 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
682b1a9d 1550 return -EMSGSIZE;
1da177e4
LT
1551 }
1552
682b1a9d
HFS
1553 /* CHECKSUM_PARTIAL only with no extension headers and when
1554 * we are not going to fragment
1555 */
1556 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1557 headersize == sizeof(struct ipv6hdr) &&
2b89ed65 1558 length <= mtu - headersize &&
bec1f6f6 1559 (!(flags & MSG_MORE) || cork->gso_size) &&
c8cd0989 1560 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
682b1a9d
HFS
1561 csummode = CHECKSUM_PARTIAL;
1562
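        /* Choose a zerocopy strategy: reuse a caller-supplied ubuf, attach a
         * MSG_ZEROCOPY ubuf, or fall back to copying when the device or
         * checksum setup cannot support zerocopy.
         */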
        if ((flags & MSG_ZEROCOPY) && length) {
                struct msghdr *msg = from;

                if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
                        if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
                                return -EINVAL;

                        /* Leave uarg NULL if can't zerocopy, callers should
                         * be able to handle it.
                         */
                        if ((rt->dst.dev->features & NETIF_F_SG) &&
                            csummode == CHECKSUM_PARTIAL) {
                                paged = true;
                                zc = true;
                                uarg = msg->msg_ubuf;
                        }
                } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
                        uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
                        if (!uarg)
                                return -ENOBUFS;
                        extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
                        if (rt->dst.dev->features & NETIF_F_SG &&
                            csummode == CHECKSUM_PARTIAL) {
                                paged = true;
                                zc = true;
                        } else {
                                uarg_to_msgzc(uarg)->zerocopy = 0;
                                skb_zcopy_set(skb, uarg, &extra_uref);
                        }
                }
        } else if ((flags & MSG_SPLICE_PAGES) && length) {
                if (inet_sk(sk)->hdrincl)
                        return -EPERM;
                if (rt->dst.dev->features & NETIF_F_SG)
                        /* We need an empty buffer to attach stuff to */
                        paged = true;
                else
                        flags &= ~MSG_SPLICE_PAGES;
        }

1da177e4
LT
1603 /*
1604 * Let's try using as much space as possible.
1605 * Use MTU if total length of the message fits into the MTU.
1606 * Otherwise, we need to reserve fragment header and
1607 * fragment alignment (= 8-15 octects, in total).
1608 *
634a63e7 1609 * Note that we may need to "move" the data from the tail
1ab1457c 1610 * of the buffer to the new fragment when we split
1da177e4
LT
1611 * the message.
1612 *
1ab1457c 1613 * FIXME: It may be fragmented into multiple chunks
1da177e4
LT
1614 * at once if non-fragmentable extension headers
1615 * are too large.
1ab1457c 1616 * --yoshfuji
1da177e4
LT
1617 */
1618
2811ebac 1619 cork->length += length;
2811ebac 1620 if (!skb)
1da177e4
LT
1621 goto alloc_new_skb;
1622
1623 while (length > 0) {
1624 /* Check if the remaining data fits into current packet. */
bdc712b4 1625 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1da177e4
LT
1626 if (copy < length)
1627 copy = maxfraglen - skb->len;
1628
1629 if (copy <= 0) {
1630 char *data;
1631 unsigned int datalen;
1632 unsigned int fraglen;
1633 unsigned int fraggap;
6d123b81 1634 unsigned int alloclen, alloc_extra;
aba36930 1635 unsigned int pagedlen;
1da177e4 1636alloc_new_skb:
1da177e4 1637 /* There's no room in the current skb */
0c183379
G
1638 if (skb)
1639 fraggap = skb->len - maxfraglen;
1da177e4
LT
1640 else
1641 fraggap = 0;
0c183379 1642 /* update mtu and maxfraglen if necessary */
63159f29 1643 if (!skb || !skb_prev)
0c183379 1644 ip6_append_data_mtu(&mtu, &maxfraglen,
75a493e6 1645 fragheaderlen, skb, rt,
e367c2d0 1646 orig_mtu);
0c183379
G
1647
1648 skb_prev = skb;
1da177e4
LT
1649
1650 /*
1651 * If remaining data exceeds the mtu,
1652 * we know we need more fragment(s).
1653 */
1654 datalen = length + fraggap;
1da177e4 1655
0c183379
G
1656 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1657 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
15e36f5b 1658 fraglen = datalen + fragheaderlen;
aba36930 1659 pagedlen = 0;
15e36f5b 1660
6d123b81
JK
1661 alloc_extra = hh_len;
1662 alloc_extra += dst_exthdrlen;
1663 alloc_extra += rt->dst.trailer_len;
1664
1665 /* We just reserve space for fragment header.
1666 * Note: this may be overallocation if the message
1667 * (without MSG_MORE) fits into the MTU.
1668 */
1669 alloc_extra += sizeof(struct frag_hdr);
1670
1da177e4 1671 if ((flags & MSG_MORE) &&
d8d1f30b 1672 !(rt->dst.dev->features&NETIF_F_SG))
1da177e4 1673 alloclen = mtu;
6d123b81
JK
1674 else if (!paged &&
1675 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1676 !(rt->dst.dev->features & NETIF_F_SG)))
15e36f5b 1677 alloclen = fraglen;
47cf8899 1678 else {
773ba4fe
PB
1679 alloclen = fragheaderlen + transhdrlen;
1680 pagedlen = datalen - transhdrlen;
15e36f5b 1681 }
6d123b81 1682 alloclen += alloc_extra;
299b0767 1683
0c183379
G
1684 if (datalen != length + fraggap) {
1685 /*
1686 * this is not the last fragment, the trailer
1687 * space is regarded as data space.
1688 */
1689 datalen += rt->dst.trailer_len;
1690 }
1691
0c183379 1692 fraglen = datalen + fragheaderlen;
1da177e4 1693
15e36f5b 1694 copy = datalen - transhdrlen - fraggap - pagedlen;
232cd35d
ED
1695 if (copy < 0) {
1696 err = -EINVAL;
1697 goto error;
1698 }
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

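			/* MSG_CONFIRM tells the neighbour layer the peer is
			 * still reachable; only the first fragment needs to
			 * carry the hint.
			 */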
			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

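		/* Four ways to add payload to the current skb:
		 *  1. copy into linear tailroom (no NETIF_F_SG);
		 *  2. MSG_SPLICE_PAGES: splice caller pages in by reference;
		 *  3. default: copy into the socket page frag and coalesce
		 *     into skb frags;
		 *  4. zerocopy: pin the user pages directly (zc set).
		 */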
		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
						   sk->sk_allocation);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
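
/*
 * Illustrative only, not part of the original file: a minimal getfrag()
 * callback in the style of ip_generic_getfrag(), copying from a msghdr
 * iterator and folding the bytes into skb->csum when the device does not
 * checksum.  The ex_ name is hypothetical; compiled out via #if 0.
 */
#if 0
static int ex_generic_getfrag(void *from, char *to, int offset, int len,
			      int odd, struct sk_buff *skb)
{
	struct msghdr *msg = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Device checksums; a plain copy is enough. */
		if (!copy_from_iter_full(to, len, &msg->msg_iter))
			return -EFAULT;
	} else {
		__wsum csum = 0;

		/* Copy and checksum in one pass, then fold into skb->csum
		 * at the right (odd/even) block offset.
		 */
		if (!csum_and_copy_from_iter_full(to, len, &csum,
						  &msg->msg_iter))
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
#endif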

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
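
/*
 * Usage sketch (illustrative only): the cork/append/push sequence as a
 * datagram protocol would drive it from sendmsg(), loosely following
 * udpv6_sendmsg().  Flow, routing and option setup are elided; ex_ names
 * are hypothetical.  Compiled out via #if 0.
 */
#if 0
static int ex_corked_send(struct sock *sk, struct msghdr *msg, size_t len,
			  struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			  struct rt6_info *rt)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, ex_generic_getfrag, msg, len,
			      sizeof(struct udphdr), ipc6, fl6, rt,
			      msg->msg_flags);
	if (err)
		ip6_flush_pending_frames(sk);	/* drop what was queued */
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);	/* build and send */
	release_sock(sk);
	return err;
}
#endif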

static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	cork->base.flags &= ~IPCORK_ALLFRAG;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}
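	/* All remaining fragments now hang off the first skb's frag_list;
	 * ip6_fragment() splits them back out on transmit if the packet
	 * exceeds the path MTU.
	 */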

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
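
/*
 * Usage sketch (illustrative only): the uncorked fast path, building a
 * single datagram without touching sk->sk_write_queue, roughly as
 * udpv6_sendmsg() does when the socket is not corked.  Filling the
 * reserved transport header is elided; ex_ names are hypothetical.
 * Compiled out via #if 0.
 */
#if 0
static int ex_oneshot_send(struct sock *sk, struct msghdr *msg, size_t len,
			   struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			   struct inet_cork_full *cork)
{
	struct sk_buff *skb;

	skb = ip6_make_skb(sk, ex_generic_getfrag, msg, len,
			   sizeof(struct udphdr), ipc6, rt,
			   msg->msg_flags, cork);
	if (IS_ERR_OR_NULL(skb))
		return PTR_ERR(skb);	/* NULL (MSG_PROBE) yields 0 */

	/* ... fill the transport header at skb_transport_header(skb) ... */
	return ip6_send_skb(skb);
}
#endif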