Merge tag 'spdx-5.2-rc3-1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh...
[linux-2.6-block.git] / net / ipv6 / ip6_output.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      IPv6 output functions
4  *      Linux INET6 implementation
5  *
6  *      Authors:
7  *      Pedro Roque             <roque@di.fc.ul.pt>
8  *
9  *      Based on linux/net/ipv4/ip_output.c
10  *
11  *      Changes:
12  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
13  *                              extension headers are implemented.
14  *                              route changes now work.
15  *                              ip6_forward does not confuse sniffers.
16  *                              etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *      Imran Patel     :       frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *                      :       add ip6_append_data and related functions
22  *                              for datagram xmit
23  */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57
/* Final output step: hand a fully routed IPv6 packet to the neighbour
 * layer for transmission on skb_dst(skb)->dev.
 *
 * For multicast destinations this additionally:
 *   - loops a clone back through NF_INET_POST_ROUTING to local listeners
 *     when the socket allows it (sk_mc_loop()) and a local receiver
 *     exists (mroute socket on a not-yet-forwarded skb, or a joined
 *     group on the device);
 *   - drops packets with hop_limit 0 (after the loopback copy is made);
 *   - accounts OUTMCAST and discards packets whose multicast scope is
 *     node-local or narrower unless sending on the loopback device.
 *
 * Returns the neigh_output()/lwtunnel_xmit() result, 0 on a local drop,
 * or -EINVAL when no neighbour entry could be created.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* If the route carries light-weight tunnel xmit state, let it
	 * transmit the packet itself; fall through only when it asks us
	 * to continue.
	 */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	/* Resolve (or create) the neighbour entry for the route's nexthop
	 * under rcu_read_lock_bh(), then queue the skb through it.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
126
127 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
128 {
129         int ret;
130
131         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
132         if (ret) {
133                 kfree_skb(skb);
134                 return ret;
135         }
136
137 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
138         /* Policy lookup after SNAT yielded a new policy */
139         if (skb_dst(skb)->xfrm) {
140                 IPCB(skb)->flags |= IPSKB_REROUTED;
141                 return dst_output(net, sk, skb);
142         }
143 #endif
144
145         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
146             dst_allfrag(skb_dst(skb)) ||
147             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
148                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
149         else
150                 return ip6_finish_output2(net, sk, skb);
151 }
152
153 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
154 {
155         struct net_device *dev = skb_dst(skb)->dev;
156         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
157
158         skb->protocol = htons(ETH_P_IPV6);
159         skb->dev = dev;
160
161         if (unlikely(idev->cnf.disable_ipv6)) {
162                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
163                 kfree_skb(skb);
164                 return 0;
165         }
166
167         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
168                             net, sk, skb, NULL, dev,
169                             ip6_finish_output,
170                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172
173 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
174 {
175         if (!np->autoflowlabel_set)
176                 return ip6_default_np_autolabel(net);
177         else
178                 return np->autoflowlabel;
179 }
180
/* xmit an sk_buff (used by TCP, SCTP and DCCP).
 *
 * Prepends any extension headers from @opt and the IPv6 header itself,
 * then sends the packet through NF_INET_LOCAL_OUT towards dst_output(),
 * provided it fits the route MTU (or has ignore_df set, or is GSO).
 * Oversized packets are reported to the socket via ipv6_local_error()
 * and dropped with -EMSGSIZE.
 *
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Headroom needed for the IPv6 header, the link-layer header and
	 * any extension headers carried in @opt.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		/* Not enough headroom: reallocate, preserving the socket
		 * ownership of the buffer.
		 */
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		/* Push extension headers in front of the payload; note
		 * ipv6_push_nfrag_opts() may rewrite first_hop (e.g. for a
		 * routing header).
		 */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Packet exceeds the MTU and cannot be sent: report EMSGSIZE to
	 * the owning socket and drop.
	 */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
287
/* Deliver a Router Alert packet to all interested raw sockets.
 *
 * Walks the global ip6_ra_chain under ip6_ra_lock and hands @skb to
 * every registered socket whose RA selector matches @sel and whose
 * device binding (if any) matches the ingress device.  Every match but
 * the last receives its own clone; the last consumes @skb itself.
 *
 * Returns 1 when the packet was delivered to at least one socket
 * (ownership of @skb transferred to rawv6_rcv()), 0 otherwise (the
 * caller still owns @skb).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			/* Skip sockets that asked for netns isolation of
			 * RA delivery when the packet arrived in a
			 * different namespace.
			 */
			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
322
/* Decide how a to-be-forwarded packet whose destination matched a proxy
 * neighbour entry should be handled.
 *
 * Return values:
 *   1  - packet is an NDISC message (RS/RA/NS/NA/redirect): pass it to
 *        local input so the proxy can react to unicast neighbour
 *        discovery destined to the proxied address;
 *   0  - forward normally;
 *  -1  - destination is link-local and cannot be proxy-forwarded:
 *        link failure has been signalled, caller should discard.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Skip any extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
374
/* Last step of the forward path (okfn of ip6_forward()'s
 * NF_INET_FORWARD hook): account the forwarded packet and push it to
 * the output path via dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	/* Packet was already forwarded in hardware (L3 offload mark set):
	 * nothing left to do in software, just release the skb.
	 */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	/* Clear any leftover receive timestamp before transmission. */
	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}
393
394 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
395 {
396         if (skb->len <= mtu)
397                 return false;
398
399         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
400         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
401                 return true;
402
403         if (skb->ignore_df)
404                 return false;
405
406         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
407                 return false;
408
409         return true;
410 }
411
/* Forward a received IPv6 packet along its route.
 *
 * Performs the standard forwarding checks — forwarding enabled, packet
 * unicast to our L2 address, hop limit, XFRM policy, proxy NDP,
 * source-address sanity, path MTU — and, when the packet would leave on
 * the interface it arrived on, may generate an NDISC redirect.  On
 * success the packet is queued through the NF_INET_FORWARD hook to
 * ip6_forward_finish().
 *
 * Consumes @skb on all paths; returns 0 or a negative errno.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only forward packets that were unicast to us at L2 ... */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* ... and that are not owned by a local socket. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have changed the dst. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Ensure a private, writable header before modifying hop_limit. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
570
571 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
572 {
573         to->pkt_type = from->pkt_type;
574         to->priority = from->priority;
575         to->protocol = from->protocol;
576         skb_dst_drop(to);
577         skb_dst_set(to, dst_clone(skb_dst(from)));
578         to->dev = from->dev;
579         to->mark = from->mark;
580
581         skb_copy_hash(to, from);
582
583 #ifdef CONFIG_NET_SCHED
584         to->tc_index = from->tc_index;
585 #endif
586         nf_copy(to, from);
587         skb_ext_copy(to, from);
588         skb_copy_secmark(to, from);
589 }
590
591 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
592                  int (*output)(struct net *, struct sock *, struct sk_buff *))
593 {
594         struct sk_buff *frag;
595         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
596         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
597                                 inet6_sk(skb->sk) : NULL;
598         struct ipv6hdr *tmp_hdr;
599         struct frag_hdr *fh;
600         unsigned int mtu, hlen, left, len, nexthdr_offset;
601         int hroom, troom;
602         __be32 frag_id;
603         int ptr, offset = 0, err = 0;
604         u8 *prevhdr, nexthdr = 0;
605
606         err = ip6_find_1stfragopt(skb, &prevhdr);
607         if (err < 0)
608                 goto fail;
609         hlen = err;
610         nexthdr = *prevhdr;
611         nexthdr_offset = prevhdr - skb_network_header(skb);
612
613         mtu = ip6_skb_dst_mtu(skb);
614
615         /* We must not fragment if the socket is set to force MTU discovery
616          * or if the skb it not generated by a local socket.
617          */
618         if (unlikely(!skb->ignore_df && skb->len > mtu))
619                 goto fail_toobig;
620
621         if (IP6CB(skb)->frag_max_size) {
622                 if (IP6CB(skb)->frag_max_size > mtu)
623                         goto fail_toobig;
624
625                 /* don't send fragments larger than what we received */
626                 mtu = IP6CB(skb)->frag_max_size;
627                 if (mtu < IPV6_MIN_MTU)
628                         mtu = IPV6_MIN_MTU;
629         }
630
631         if (np && np->frag_size < mtu) {
632                 if (np->frag_size)
633                         mtu = np->frag_size;
634         }
635         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
636                 goto fail_toobig;
637         mtu -= hlen + sizeof(struct frag_hdr);
638
639         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
640                                     &ipv6_hdr(skb)->saddr);
641
642         if (skb->ip_summed == CHECKSUM_PARTIAL &&
643             (err = skb_checksum_help(skb)))
644                 goto fail;
645
646         prevhdr = skb_network_header(skb) + nexthdr_offset;
647         hroom = LL_RESERVED_SPACE(rt->dst.dev);
648         if (skb_has_frag_list(skb)) {
649                 unsigned int first_len = skb_pagelen(skb);
650                 struct sk_buff *frag2;
651
652                 if (first_len - hlen > mtu ||
653                     ((first_len - hlen) & 7) ||
654                     skb_cloned(skb) ||
655                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
656                         goto slow_path;
657
658                 skb_walk_frags(skb, frag) {
659                         /* Correct geometry. */
660                         if (frag->len > mtu ||
661                             ((frag->len & 7) && frag->next) ||
662                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
663                                 goto slow_path_clean;
664
665                         /* Partially cloned skb? */
666                         if (skb_shared(frag))
667                                 goto slow_path_clean;
668
669                         BUG_ON(frag->sk);
670                         if (skb->sk) {
671                                 frag->sk = skb->sk;
672                                 frag->destructor = sock_wfree;
673                         }
674                         skb->truesize -= frag->truesize;
675                 }
676
677                 err = 0;
678                 offset = 0;
679                 /* BUILD HEADER */
680
681                 *prevhdr = NEXTHDR_FRAGMENT;
682                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
683                 if (!tmp_hdr) {
684                         err = -ENOMEM;
685                         goto fail;
686                 }
687                 frag = skb_shinfo(skb)->frag_list;
688                 skb_frag_list_init(skb);
689
690                 __skb_pull(skb, hlen);
691                 fh = __skb_push(skb, sizeof(struct frag_hdr));
692                 __skb_push(skb, hlen);
693                 skb_reset_network_header(skb);
694                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
695
696                 fh->nexthdr = nexthdr;
697                 fh->reserved = 0;
698                 fh->frag_off = htons(IP6_MF);
699                 fh->identification = frag_id;
700
701                 first_len = skb_pagelen(skb);
702                 skb->data_len = first_len - skb_headlen(skb);
703                 skb->len = first_len;
704                 ipv6_hdr(skb)->payload_len = htons(first_len -
705                                                    sizeof(struct ipv6hdr));
706
707                 for (;;) {
708                         /* Prepare header of the next frame,
709                          * before previous one went down. */
710                         if (frag) {
711                                 frag->ip_summed = CHECKSUM_NONE;
712                                 skb_reset_transport_header(frag);
713                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
714                                 __skb_push(frag, hlen);
715                                 skb_reset_network_header(frag);
716                                 memcpy(skb_network_header(frag), tmp_hdr,
717                                        hlen);
718                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
719                                 fh->nexthdr = nexthdr;
720                                 fh->reserved = 0;
721                                 fh->frag_off = htons(offset);
722                                 if (frag->next)
723                                         fh->frag_off |= htons(IP6_MF);
724                                 fh->identification = frag_id;
725                                 ipv6_hdr(frag)->payload_len =
726                                                 htons(frag->len -
727                                                       sizeof(struct ipv6hdr));
728                                 ip6_copy_metadata(frag, skb);
729                         }
730
731                         err = output(net, sk, skb);
732                         if (!err)
733                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
734                                               IPSTATS_MIB_FRAGCREATES);
735
736                         if (err || !frag)
737                                 break;
738
739                         skb = frag;
740                         frag = skb->next;
741                         skb_mark_not_on_list(skb);
742                 }
743
744                 kfree(tmp_hdr);
745
746                 if (err == 0) {
747                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
748                                       IPSTATS_MIB_FRAGOKS);
749                         return 0;
750                 }
751
752                 kfree_skb_list(frag);
753
754                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
755                               IPSTATS_MIB_FRAGFAILS);
756                 return err;
757
758 slow_path_clean:
759                 skb_walk_frags(skb, frag2) {
760                         if (frag2 == frag)
761                                 break;
762                         frag2->sk = NULL;
763                         frag2->destructor = NULL;
764                         skb->truesize += frag2->truesize;
765                 }
766         }
767
768 slow_path:
769         left = skb->len - hlen;         /* Space per frame */
770         ptr = hlen;                     /* Where to start from */
771
772         /*
773          *      Fragment the datagram.
774          */
775
776         troom = rt->dst.dev->needed_tailroom;
777
778         /*
779          *      Keep copying data until we run out.
780          */
781         while (left > 0)        {
782                 u8 *fragnexthdr_offset;
783
784                 len = left;
785                 /* IF: it doesn't fit, use 'mtu' - the data space left */
786                 if (len > mtu)
787                         len = mtu;
788                 /* IF: we are not sending up to and including the packet end
789                    then align the next start on an eight byte boundary */
790                 if (len < left) {
791                         len &= ~7;
792                 }
793
794                 /* Allocate buffer */
795                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
796                                  hroom + troom, GFP_ATOMIC);
797                 if (!frag) {
798                         err = -ENOMEM;
799                         goto fail;
800                 }
801
802                 /*
803                  *      Set up data on packet
804                  */
805
806                 ip6_copy_metadata(frag, skb);
807                 skb_reserve(frag, hroom);
808                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
809                 skb_reset_network_header(frag);
810                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
811                 frag->transport_header = (frag->network_header + hlen +
812                                           sizeof(struct frag_hdr));
813
814                 /*
815                  *      Charge the memory for the fragment to any owner
816                  *      it might possess
817                  */
818                 if (skb->sk)
819                         skb_set_owner_w(frag, skb->sk);
820
821                 /*
822                  *      Copy the packet header into the new buffer.
823                  */
824                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
825
826                 fragnexthdr_offset = skb_network_header(frag);
827                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
828                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
829
830                 /*
831                  *      Build fragment header.
832                  */
833                 fh->nexthdr = nexthdr;
834                 fh->reserved = 0;
835                 fh->identification = frag_id;
836
837                 /*
838                  *      Copy a block of the IP datagram.
839                  */
840                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
841                                      len));
842                 left -= len;
843
844                 fh->frag_off = htons(offset);
845                 if (left > 0)
846                         fh->frag_off |= htons(IP6_MF);
847                 ipv6_hdr(frag)->payload_len = htons(frag->len -
848                                                     sizeof(struct ipv6hdr));
849
850                 ptr += len;
851                 offset += len;
852
853                 /*
854                  *      Put this fragment into the sending queue.
855                  */
856                 err = output(net, sk, frag);
857                 if (err)
858                         goto fail;
859
860                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
861                               IPSTATS_MIB_FRAGCREATES);
862         }
863         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
864                       IPSTATS_MIB_FRAGOKS);
865         consume_skb(skb);
866         return err;
867
868 fail_toobig:
869         if (skb->sk && dst_allfrag(skb_dst(skb)))
870                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
871
872         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
873         err = -EMSGSIZE;
874
875 fail:
876         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
877                       IPSTATS_MIB_FRAGFAILS);
878         kfree_skb(skb);
879         return err;
880 }
881
882 static inline int ip6_rt_check(const struct rt6key *rt_key,
883                                const struct in6_addr *fl_addr,
884                                const struct in6_addr *addr_cache)
885 {
886         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
887                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
888 }
889
/* Validate a socket-cached dst against the flow @fl6.
 *
 * Returns @dst (reference kept) when it is still usable for this flow,
 * or NULL after dropping the reference, forcing a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached non-IPv6 dst (e.g. from a v4-mapped flow) can never
	 * satisfy an IPv6 lookup; drop it unconditionally.
	 */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
936
/* Core of the dst lookup: resolve a route for @fl6 into *@dst, selecting
 * a source address first when the flow left it unspecified.
 *
 * On success returns 0 with *@dst holding a referenced route; on error
 * the reference is dropped, *@dst is reset to NULL and a negative errno
 * is returned.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is RCU-protected; hold the read lock across
		 * the source-address selection that dereferences it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry the lookup now that a source address is known. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only acceptable towards a v4-mapped
	 * (or unspecified) destination; reject mixed-family flows.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1054
1055 /**
1056  *      ip6_dst_lookup - perform route lookup on flow
1057  *      @sk: socket which provides route info
1058  *      @dst: pointer to dst_entry * for result
1059  *      @fl6: flow to lookup
1060  *
1061  *      This function performs a route lookup on the given flow.
1062  *
1063  *      It returns zero on success, or a standard errno code on error.
1064  */
1065 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1066                    struct flowi6 *fl6)
1067 {
1068         *dst = NULL;
1069         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1072
1073 /**
1074  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1075  *      @sk: socket which provides route info
1076  *      @fl6: flow to lookup
1077  *      @final_dst: final destination address for ipsec lookup
1078  *
1079  *      This function performs a route lookup on the given flow.
1080  *
1081  *      It returns a valid dst pointer on success, or a pointer encoded
1082  *      error code.
1083  */
1084 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1085                                       const struct in6_addr *final_dst)
1086 {
1087         struct dst_entry *dst = NULL;
1088         int err;
1089
1090         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1091         if (err)
1092                 return ERR_PTR(err);
1093         if (final_dst)
1094                 fl6->daddr = *final_dst;
1095
1096         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1097 }
1098 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1099
1100 /**
1101  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1102  *      @sk: socket which provides the dst cache and route info
1103  *      @fl6: flow to lookup
1104  *      @final_dst: final destination address for ipsec lookup
1105  *      @connected: whether @sk is connected or not
1106  *
1107  *      This function performs a route lookup on the given flow with the
1108  *      possibility of using the cached route in the socket if it is valid.
1109  *      It will take the socket dst lock when operating on the dst cache.
1110  *      As a result, this function can only be used in process context.
1111  *
1112  *      In addition, for a connected socket, cache the dst in the socket
1113  *      if the current cache is not valid.
1114  *
1115  *      It returns a valid dst pointer on success, or a pointer encoded
1116  *      error code.
1117  */
1118 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1119                                          const struct in6_addr *final_dst,
1120                                          bool connected)
1121 {
1122         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1123
1124         dst = ip6_sk_dst_check(sk, dst, fl6);
1125         if (dst)
1126                 return dst;
1127
1128         dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1129         if (connected && !IS_ERR(dst))
1130                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1131
1132         return dst;
1133 }
1134 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1135
1136 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1137                                                gfp_t gfp)
1138 {
1139         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1143                                                 gfp_t gfp)
1144 {
1145         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1146 }
1147
1148 static void ip6_append_data_mtu(unsigned int *mtu,
1149                                 int *maxfraglen,
1150                                 unsigned int fragheaderlen,
1151                                 struct sk_buff *skb,
1152                                 struct rt6_info *rt,
1153                                 unsigned int orig_mtu)
1154 {
1155         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1156                 if (!skb) {
1157                         /* first fragment, reserve header_len */
1158                         *mtu = orig_mtu - rt->dst.header_len;
1159
1160                 } else {
1161                         /*
1162                          * this fragment is not first, the headers
1163                          * space is regarded as data space.
1164                          */
1165                         *mtu = orig_mtu;
1166                 }
1167                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1168                               + fragheaderlen - sizeof(struct frag_hdr);
1169         }
1170 }
1171
/* Initialise cork state for a corked transmit: duplicate the tx options
 * into @v6_cork, pin the route on @cork and derive the MTU that
 * fragmentation will use.
 *
 * Returns 0 on success, -EINVAL on a duplicate option set or an MTU
 * below IPV6_MIN_MTU, -ENOBUFS on allocation failure.
 * NOTE(review): on a mid-way -ENOBUFS, v6_cork->opt keeps the
 * partially-duplicated options — presumably the caller's cork release
 * path frees them; confirm against the caller.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* Choose the MTU source: PMTUDISC_PROBE uses the raw device mtu;
	 * otherwise the dst mtu (tunnel dst directly, else the
	 * underlying xfrm path dst).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A smaller per-socket IPV6_MTU (frag_size) overrides the path MTU. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	/* Path requires a fragment header on every packet (allfrag). */
	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1247
/* Append @length bytes (pulled through @getfrag) onto the cork @queue,
 * growing the tail skb or allocating new fragment-sized skbs as needed.
 * Called with cork state prepared by ip6_setup_cork(); the queued skbs
 * are assembled into packets later by the cork flush path.
 *
 * Returns 0 on success or a negative errno; on error the data queued so
 * far remains on @queue for the caller to flush or discard.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	/* Extension-header space is only reserved on the first skb of
	 * the cork (queue empty).
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/* With GSO the device segments for us, so build up to IP6_MAX_MTU. */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	/* Grab a timestamp key for OPT_ID; consumed by the first skb only. */
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest 8-byte-aligned fragment payload end within the mtu. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG: report the path MTU to the app instead of
	 * fragmenting (UDP/raw only).
	 */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* MSG_ZEROCOPY: pin user pages instead of copying, but only when
	 * the device can take SG frags with partial checksum; otherwise
	 * fall back to copying while keeping the completion notification.
	 */
	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb;	/* only extra ref if !MSG_MORE */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			/* Without SG and with more data to come, allocate a
			 * full-mtu skb so it can be filled in later rounds;
			 * with paged (gso/zerocopy) keep the linear part small.
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			/* First skb (transhdrlen set) may block for wmem;
			 * later skbs are best-effort within 2*sndbuf.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the unaligned tail of the previous skb into
			 * this one so fragments stay 8-byte aligned.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* No SG: append into the skb's linear tailroom. */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			/* SG without zerocopy: copy into the socket's
			 * page frag and attach it (or coalesce) as a frag.
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* True zerocopy: pin the user pages directly. */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge all newly queued truesize to the socket in one go. */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
1591
1592 int ip6_append_data(struct sock *sk,
1593                     int getfrag(void *from, char *to, int offset, int len,
1594                                 int odd, struct sk_buff *skb),
1595                     void *from, int length, int transhdrlen,
1596                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1597                     struct rt6_info *rt, unsigned int flags)
1598 {
1599         struct inet_sock *inet = inet_sk(sk);
1600         struct ipv6_pinfo *np = inet6_sk(sk);
1601         int exthdrlen;
1602         int err;
1603
1604         if (flags&MSG_PROBE)
1605                 return 0;
1606         if (skb_queue_empty(&sk->sk_write_queue)) {
1607                 /*
1608                  * setup for corking
1609                  */
1610                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1611                                      ipc6, rt, fl6);
1612                 if (err)
1613                         return err;
1614
1615                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1616                 length += exthdrlen;
1617                 transhdrlen += exthdrlen;
1618         } else {
1619                 fl6 = &inet->cork.fl.u.ip6;
1620                 transhdrlen = 0;
1621         }
1622
1623         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1624                                  &np->cork, sk_page_frag(sk), getfrag,
1625                                  from, length, transhdrlen, flags, ipc6);
1626 }
1627 EXPORT_SYMBOL_GPL(ip6_append_data);
1628
1629 static void ip6_cork_release(struct inet_cork_full *cork,
1630                              struct inet6_cork *v6_cork)
1631 {
1632         if (v6_cork->opt) {
1633                 kfree(v6_cork->opt->dst0opt);
1634                 kfree(v6_cork->opt->dst1opt);
1635                 kfree(v6_cork->opt->hopopt);
1636                 kfree(v6_cork->opt->srcrt);
1637                 kfree(v6_cork->opt);
1638                 v6_cork->opt = NULL;
1639         }
1640
1641         if (cork->base.dst) {
1642                 dst_release(cork->base.dst);
1643                 cork->base.dst = NULL;
1644                 cork->base.flags &= ~IPCORK_ALLFRAG;
1645         }
1646         memset(&cork->fl, 0, sizeof(cork->fl));
1647 }
1648
/* Collapse the queued fragments into a single skb (head plus frag_list),
 * push any extension headers and the fixed IPv6 header, and release the
 * cork state.  Returns the finished skb (caller owns it), or NULL if the
 * queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head's frag_list,
	 * stripping their network headers and folding their length and
	 * truesize into the head.  destructor/sk are cleared so only the
	 * head skb carries the socket ownership.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Keep the flow's daddr in a local: a routing header pushed by
	 * ipv6_push_nfrag_opts() may replace the on-wire destination.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	/* Prepend and populate the fixed IPv6 header. */
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	/* All cork state has been consumed into the skb; release it. */
	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1724
1725 int ip6_send_skb(struct sk_buff *skb)
1726 {
1727         struct net *net = sock_net(skb->sk);
1728         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1729         int err;
1730
1731         err = ip6_local_out(net, skb->sk, skb);
1732         if (err) {
1733                 if (err > 0)
1734                         err = net_xmit_errno(err);
1735                 if (err)
1736                         IP6_INC_STATS(net, rt->rt6i_idev,
1737                                       IPSTATS_MIB_OUTDISCARDS);
1738         }
1739
1740         return err;
1741 }
1742
/* Finalize whatever is corked on the socket's write queue and send it.
 * An empty queue is not an error: nothing to send, return 0.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
1753 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1754
1755 static void __ip6_flush_pending_frames(struct sock *sk,
1756                                        struct sk_buff_head *queue,
1757                                        struct inet_cork_full *cork,
1758                                        struct inet6_cork *v6_cork)
1759 {
1760         struct sk_buff *skb;
1761
1762         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1763                 if (skb_dst(skb))
1764                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1765                                       IPSTATS_MIB_OUTDISCARDS);
1766                 kfree_skb(skb);
1767         }
1768
1769         ip6_cork_release(cork, v6_cork);
1770 }
1771
1772 void ip6_flush_pending_frames(struct sock *sk)
1773 {
1774         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1775                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1776 }
1777 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1778
/* Build a complete, ready-to-send skb in one pass on a private queue
 * and caller-supplied cork, bypassing the socket's write queue.
 * Returns the finished skb, ERR_PTR(err) on failure (all cork state
 * released), or NULL for MSG_PROBE.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	/* Zero the caller's cork first so ip6_cork_release() is safe to
	 * call on every error path below.
	 */
	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	/* Negative dontfrag means "unset": fall back to the socket's
	 * IPV6_DONTFRAG setting.
	 */
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		/* Frees any partially built skbs and the cork state. */
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}