f2fs: Provide a splice-read wrapper
[linux-block.git] / net / ipv6 / ip6_output.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      IPv6 output functions
4  *      Linux INET6 implementation
5  *
6  *      Authors:
7  *      Pedro Roque             <roque@di.fc.ul.pt>
8  *
9  *      Based on linux/net/ipv4/ip_output.c
10  *
11  *      Changes:
12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
13  *                              extension headers are implemented.
14  *                              route changes now work.
15  *                              ip6_forward does not confuse sniffers.
16  *                              etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *      Imran Patel     :       frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *                      :       add ip6_append_data and related functions
22  *                              for datagram xmit
23  */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61         struct dst_entry *dst = skb_dst(skb);
62         struct net_device *dev = dst->dev;
63         struct inet6_dev *idev = ip6_dst_idev(dst);
64         unsigned int hh_len = LL_RESERVED_SPACE(dev);
65         const struct in6_addr *daddr, *nexthop;
66         struct ipv6hdr *hdr;
67         struct neighbour *neigh;
68         int ret;
69
70         /* Be paranoid, rather than too clever. */
71         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
72                 skb = skb_expand_head(skb, hh_len);
73                 if (!skb) {
74                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
75                         return -ENOMEM;
76                 }
77         }
78
79         hdr = ipv6_hdr(skb);
80         daddr = &hdr->daddr;
81         if (ipv6_addr_is_multicast(daddr)) {
82                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
83                     ((mroute6_is_socket(net, skb) &&
84                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
85                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
86                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
87
88                         /* Do not check for IFF_ALLMULTI; multicast routing
89                            is not supported in any case.
90                          */
91                         if (newskb)
92                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
93                                         net, sk, newskb, NULL, newskb->dev,
94                                         dev_loopback_xmit);
95
96                         if (hdr->hop_limit == 0) {
97                                 IP6_INC_STATS(net, idev,
98                                               IPSTATS_MIB_OUTDISCARDS);
99                                 kfree_skb(skb);
100                                 return 0;
101                         }
102                 }
103
104                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
105                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
106                     !(dev->flags & IFF_LOOPBACK)) {
107                         kfree_skb(skb);
108                         return 0;
109                 }
110         }
111
112         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
113                 int res = lwtunnel_xmit(skb);
114
115                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
116                         return res;
117         }
118
119         rcu_read_lock();
120         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
121         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
122
123         if (unlikely(IS_ERR_OR_NULL(neigh))) {
124                 if (unlikely(!neigh))
125                         neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
126                 if (IS_ERR(neigh)) {
127                         rcu_read_unlock();
128                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
129                         kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
130                         return -EINVAL;
131                 }
132         }
133         sock_confirm_neigh(skb, neigh);
134         ret = neigh_output(neigh, skb, false);
135         rcu_read_unlock();
136         return ret;
137 }
138
139 static int
140 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
141                                     struct sk_buff *skb, unsigned int mtu)
142 {
143         struct sk_buff *segs, *nskb;
144         netdev_features_t features;
145         int ret = 0;
146
147         /* Please see corresponding comment in ip_finish_output_gso
148          * describing the cases where GSO segment length exceeds the
149          * egress MTU.
150          */
151         features = netif_skb_features(skb);
152         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
153         if (IS_ERR_OR_NULL(segs)) {
154                 kfree_skb(skb);
155                 return -ENOMEM;
156         }
157
158         consume_skb(skb);
159
160         skb_list_walk_safe(segs, segs, nskb) {
161                 int err;
162
163                 skb_mark_not_on_list(segs);
164                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
165                 if (err && ret == 0)
166                         ret = err;
167         }
168
169         return ret;
170 }
171
172 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
173 {
174         unsigned int mtu;
175
176 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
177         /* Policy lookup after SNAT yielded a new policy */
178         if (skb_dst(skb)->xfrm) {
179                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
180                 return dst_output(net, sk, skb);
181         }
182 #endif
183
184         mtu = ip6_skb_dst_mtu(skb);
185         if (skb_is_gso(skb) &&
186             !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
187             !skb_gso_validate_network_len(skb, mtu))
188                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
189
190         if ((skb->len > mtu && !skb_is_gso(skb)) ||
191             dst_allfrag(skb_dst(skb)) ||
192             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
193                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
194         else
195                 return ip6_finish_output2(net, sk, skb);
196 }
197
198 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
199 {
200         int ret;
201
202         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
203         switch (ret) {
204         case NET_XMIT_SUCCESS:
205         case NET_XMIT_CN:
206                 return __ip6_finish_output(net, sk, skb) ? : ret;
207         default:
208                 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
209                 return ret;
210         }
211 }
212
213 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214 {
215         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
216         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
217
218         skb->protocol = htons(ETH_P_IPV6);
219         skb->dev = dev;
220
221         if (unlikely(idev->cnf.disable_ipv6)) {
222                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
223                 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
224                 return 0;
225         }
226
227         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
228                             net, sk, skb, indev, dev,
229                             ip6_finish_output,
230                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
231 }
232 EXPORT_SYMBOL(ip6_output);
233
234 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
235 {
236         if (!np->autoflowlabel_set)
237                 return ip6_default_np_autolabel(net);
238         else
239                 return np->autoflowlabel;
240 }
241
242 /*
243  * xmit an sk_buff (used by TCP, SCTP and DCCP)
244  * Note : socket lock is not held for SYNACK packets, but might be modified
245  * by calls to skb_set_owner_w() and ipv6_local_error(),
246  * which are using proper atomic operations or spinlocks.
247  */
248 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
249              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
250 {
251         struct net *net = sock_net(sk);
252         const struct ipv6_pinfo *np = inet6_sk(sk);
253         struct in6_addr *first_hop = &fl6->daddr;
254         struct dst_entry *dst = skb_dst(skb);
255         struct net_device *dev = dst->dev;
256         struct inet6_dev *idev = ip6_dst_idev(dst);
257         struct hop_jumbo_hdr *hop_jumbo;
258         int hoplen = sizeof(*hop_jumbo);
259         unsigned int head_room;
260         struct ipv6hdr *hdr;
261         u8  proto = fl6->flowi6_proto;
262         int seg_len = skb->len;
263         int hlimit = -1;
264         u32 mtu;
265
266         head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
267         if (opt)
268                 head_room += opt->opt_nflen + opt->opt_flen;
269
270         if (unlikely(head_room > skb_headroom(skb))) {
271                 skb = skb_expand_head(skb, head_room);
272                 if (!skb) {
273                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
274                         return -ENOBUFS;
275                 }
276         }
277
278         if (opt) {
279                 seg_len += opt->opt_nflen + opt->opt_flen;
280
281                 if (opt->opt_flen)
282                         ipv6_push_frag_opts(skb, opt, &proto);
283
284                 if (opt->opt_nflen)
285                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
286                                              &fl6->saddr);
287         }
288
289         if (unlikely(seg_len > IPV6_MAXPLEN)) {
290                 hop_jumbo = skb_push(skb, hoplen);
291
292                 hop_jumbo->nexthdr = proto;
293                 hop_jumbo->hdrlen = 0;
294                 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
295                 hop_jumbo->tlv_len = 4;
296                 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
297
298                 proto = IPPROTO_HOPOPTS;
299                 seg_len = 0;
300                 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
301         }
302
303         skb_push(skb, sizeof(struct ipv6hdr));
304         skb_reset_network_header(skb);
305         hdr = ipv6_hdr(skb);
306
307         /*
308          *      Fill in the IPv6 header
309          */
310         if (np)
311                 hlimit = np->hop_limit;
312         if (hlimit < 0)
313                 hlimit = ip6_dst_hoplimit(dst);
314
315         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
316                                 ip6_autoflowlabel(net, np), fl6));
317
318         hdr->payload_len = htons(seg_len);
319         hdr->nexthdr = proto;
320         hdr->hop_limit = hlimit;
321
322         hdr->saddr = fl6->saddr;
323         hdr->daddr = *first_hop;
324
325         skb->protocol = htons(ETH_P_IPV6);
326         skb->priority = priority;
327         skb->mark = mark;
328
329         mtu = dst_mtu(dst);
330         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
331                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
332
333                 /* if egress device is enslaved to an L3 master device pass the
334                  * skb to its handler for processing
335                  */
336                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
337                 if (unlikely(!skb))
338                         return 0;
339
340                 /* hooks should never assume socket lock is held.
341                  * we promote our socket to non const
342                  */
343                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
344                                net, (struct sock *)sk, skb, NULL, dev,
345                                dst_output);
346         }
347
348         skb->dev = dev;
349         /* ipv6_local_error() does not require socket lock,
350          * we promote our socket to non const
351          */
352         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
353
354         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
355         kfree_skb(skb);
356         return -EMSGSIZE;
357 }
358 EXPORT_SYMBOL(ip6_xmit);
359
360 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
361 {
362         struct ip6_ra_chain *ra;
363         struct sock *last = NULL;
364
365         read_lock(&ip6_ra_lock);
366         for (ra = ip6_ra_chain; ra; ra = ra->next) {
367                 struct sock *sk = ra->sk;
368                 if (sk && ra->sel == sel &&
369                     (!sk->sk_bound_dev_if ||
370                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
371                         struct ipv6_pinfo *np = inet6_sk(sk);
372
373                         if (np && np->rtalert_isolate &&
374                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
375                                 continue;
376                         }
377                         if (last) {
378                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
379                                 if (skb2)
380                                         rawv6_rcv(last, skb2);
381                         }
382                         last = sk;
383                 }
384         }
385
386         if (last) {
387                 rawv6_rcv(last, skb);
388                 read_unlock(&ip6_ra_lock);
389                 return 1;
390         }
391         read_unlock(&ip6_ra_lock);
392         return 0;
393 }
394
395 static int ip6_forward_proxy_check(struct sk_buff *skb)
396 {
397         struct ipv6hdr *hdr = ipv6_hdr(skb);
398         u8 nexthdr = hdr->nexthdr;
399         __be16 frag_off;
400         int offset;
401
402         if (ipv6_ext_hdr(nexthdr)) {
403                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
404                 if (offset < 0)
405                         return 0;
406         } else
407                 offset = sizeof(struct ipv6hdr);
408
409         if (nexthdr == IPPROTO_ICMPV6) {
410                 struct icmp6hdr *icmp6;
411
412                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
413                                          offset + 1 - skb->data)))
414                         return 0;
415
416                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
417
418                 switch (icmp6->icmp6_type) {
419                 case NDISC_ROUTER_SOLICITATION:
420                 case NDISC_ROUTER_ADVERTISEMENT:
421                 case NDISC_NEIGHBOUR_SOLICITATION:
422                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
423                 case NDISC_REDIRECT:
424                         /* For reaction involving unicast neighbor discovery
425                          * message destined to the proxied address, pass it to
426                          * input function.
427                          */
428                         return 1;
429                 default:
430                         break;
431                 }
432         }
433
434         /*
435          * The proxying router can't forward traffic sent to a link-local
436          * address, so signal the sender and discard the packet. This
437          * behavior is clarified by the MIPv6 specification.
438          */
439         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
440                 dst_link_failure(skb);
441                 return -1;
442         }
443
444         return 0;
445 }
446
447 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
448                                      struct sk_buff *skb)
449 {
450         struct dst_entry *dst = skb_dst(skb);
451
452         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
453         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
454
455 #ifdef CONFIG_NET_SWITCHDEV
456         if (skb->offload_l3_fwd_mark) {
457                 consume_skb(skb);
458                 return 0;
459         }
460 #endif
461
462         skb_clear_tstamp(skb);
463         return dst_output(net, sk, skb);
464 }
465
466 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
467 {
468         if (skb->len <= mtu)
469                 return false;
470
471         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
472         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
473                 return true;
474
475         if (skb->ignore_df)
476                 return false;
477
478         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
479                 return false;
480
481         return true;
482 }
483
484 int ip6_forward(struct sk_buff *skb)
485 {
486         struct dst_entry *dst = skb_dst(skb);
487         struct ipv6hdr *hdr = ipv6_hdr(skb);
488         struct inet6_skb_parm *opt = IP6CB(skb);
489         struct net *net = dev_net(dst->dev);
490         struct inet6_dev *idev;
491         SKB_DR(reason);
492         u32 mtu;
493
494         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
495         if (net->ipv6.devconf_all->forwarding == 0)
496                 goto error;
497
498         if (skb->pkt_type != PACKET_HOST)
499                 goto drop;
500
501         if (unlikely(skb->sk))
502                 goto drop;
503
504         if (skb_warn_if_lro(skb))
505                 goto drop;
506
507         if (!net->ipv6.devconf_all->disable_policy &&
508             (!idev || !idev->cnf.disable_policy) &&
509             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
510                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
511                 goto drop;
512         }
513
514         skb_forward_csum(skb);
515
516         /*
517          *      We DO NOT make any processing on
518          *      RA packets, pushing them to user level AS IS
519          *      without ane WARRANTY that application will be able
520          *      to interpret them. The reason is that we
521          *      cannot make anything clever here.
522          *
523          *      We are not end-node, so that if packet contains
524          *      AH/ESP, we cannot make anything.
525          *      Defragmentation also would be mistake, RA packets
526          *      cannot be fragmented, because there is no warranty
527          *      that different fragments will go along one path. --ANK
528          */
529         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
530                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
531                         return 0;
532         }
533
534         /*
535          *      check and decrement ttl
536          */
537         if (hdr->hop_limit <= 1) {
538                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
539                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
540
541                 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
542                 return -ETIMEDOUT;
543         }
544
545         /* XXX: idev->cnf.proxy_ndp? */
546         if (net->ipv6.devconf_all->proxy_ndp &&
547             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
548                 int proxied = ip6_forward_proxy_check(skb);
549                 if (proxied > 0) {
550                         /* It's tempting to decrease the hop limit
551                          * here by 1, as we do at the end of the
552                          * function too.
553                          *
554                          * But that would be incorrect, as proxying is
555                          * not forwarding.  The ip6_input function
556                          * will handle this packet locally, and it
557                          * depends on the hop limit being unchanged.
558                          *
559                          * One example is the NDP hop limit, that
560                          * always has to stay 255, but other would be
561                          * similar checks around RA packets, where the
562                          * user can even change the desired limit.
563                          */
564                         return ip6_input(skb);
565                 } else if (proxied < 0) {
566                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
567                         goto drop;
568                 }
569         }
570
571         if (!xfrm6_route_forward(skb)) {
572                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
573                 SKB_DR_SET(reason, XFRM_POLICY);
574                 goto drop;
575         }
576         dst = skb_dst(skb);
577
578         /* IPv6 specs say nothing about it, but it is clear that we cannot
579            send redirects to source routed frames.
580            We don't send redirects to frames decapsulated from IPsec.
581          */
582         if (IP6CB(skb)->iif == dst->dev->ifindex &&
583             opt->srcrt == 0 && !skb_sec_path(skb)) {
584                 struct in6_addr *target = NULL;
585                 struct inet_peer *peer;
586                 struct rt6_info *rt;
587
588                 /*
589                  *      incoming and outgoing devices are the same
590                  *      send a redirect.
591                  */
592
593                 rt = (struct rt6_info *) dst;
594                 if (rt->rt6i_flags & RTF_GATEWAY)
595                         target = &rt->rt6i_gateway;
596                 else
597                         target = &hdr->daddr;
598
599                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
600
601                 /* Limit redirects both by destination (here)
602                    and by source (inside ndisc_send_redirect)
603                  */
604                 if (inet_peer_xrlim_allow(peer, 1*HZ))
605                         ndisc_send_redirect(skb, target);
606                 if (peer)
607                         inet_putpeer(peer);
608         } else {
609                 int addrtype = ipv6_addr_type(&hdr->saddr);
610
611                 /* This check is security critical. */
612                 if (addrtype == IPV6_ADDR_ANY ||
613                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
614                         goto error;
615                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
616                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
617                                     ICMPV6_NOT_NEIGHBOUR, 0);
618                         goto error;
619                 }
620         }
621
622         mtu = ip6_dst_mtu_maybe_forward(dst, true);
623         if (mtu < IPV6_MIN_MTU)
624                 mtu = IPV6_MIN_MTU;
625
626         if (ip6_pkt_too_big(skb, mtu)) {
627                 /* Again, force OUTPUT device used as source address */
628                 skb->dev = dst->dev;
629                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
631                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
632                                 IPSTATS_MIB_FRAGFAILS);
633                 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
634                 return -EMSGSIZE;
635         }
636
637         if (skb_cow(skb, dst->dev->hard_header_len)) {
638                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
639                                 IPSTATS_MIB_OUTDISCARDS);
640                 goto drop;
641         }
642
643         hdr = ipv6_hdr(skb);
644
645         /* Mangling hops number delayed to point after skb COW */
646
647         hdr->hop_limit--;
648
649         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
650                        net, NULL, skb, skb->dev, dst->dev,
651                        ip6_forward_finish);
652
653 error:
654         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
655         SKB_DR_SET(reason, IP_INADDRERRORS);
656 drop:
657         kfree_skb_reason(skb, reason);
658         return -EINVAL;
659 }
660
661 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
662 {
663         to->pkt_type = from->pkt_type;
664         to->priority = from->priority;
665         to->protocol = from->protocol;
666         skb_dst_drop(to);
667         skb_dst_set(to, dst_clone(skb_dst(from)));
668         to->dev = from->dev;
669         to->mark = from->mark;
670
671         skb_copy_hash(to, from);
672
673 #ifdef CONFIG_NET_SCHED
674         to->tc_index = from->tc_index;
675 #endif
676         nf_copy(to, from);
677         skb_ext_copy(to, from);
678         skb_copy_secmark(to, from);
679 }
680
681 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
682                       u8 nexthdr, __be32 frag_id,
683                       struct ip6_fraglist_iter *iter)
684 {
685         unsigned int first_len;
686         struct frag_hdr *fh;
687
688         /* BUILD HEADER */
689         *prevhdr = NEXTHDR_FRAGMENT;
690         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691         if (!iter->tmp_hdr)
692                 return -ENOMEM;
693
694         iter->frag = skb_shinfo(skb)->frag_list;
695         skb_frag_list_init(skb);
696
697         iter->offset = 0;
698         iter->hlen = hlen;
699         iter->frag_id = frag_id;
700         iter->nexthdr = nexthdr;
701
702         __skb_pull(skb, hlen);
703         fh = __skb_push(skb, sizeof(struct frag_hdr));
704         __skb_push(skb, hlen);
705         skb_reset_network_header(skb);
706         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
707
708         fh->nexthdr = nexthdr;
709         fh->reserved = 0;
710         fh->frag_off = htons(IP6_MF);
711         fh->identification = frag_id;
712
713         first_len = skb_pagelen(skb);
714         skb->data_len = first_len - skb_headlen(skb);
715         skb->len = first_len;
716         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
717
718         return 0;
719 }
720 EXPORT_SYMBOL(ip6_fraglist_init);
721
722 void ip6_fraglist_prepare(struct sk_buff *skb,
723                           struct ip6_fraglist_iter *iter)
724 {
725         struct sk_buff *frag = iter->frag;
726         unsigned int hlen = iter->hlen;
727         struct frag_hdr *fh;
728
729         frag->ip_summed = CHECKSUM_NONE;
730         skb_reset_transport_header(frag);
731         fh = __skb_push(frag, sizeof(struct frag_hdr));
732         __skb_push(frag, hlen);
733         skb_reset_network_header(frag);
734         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
735         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
736         fh->nexthdr = iter->nexthdr;
737         fh->reserved = 0;
738         fh->frag_off = htons(iter->offset);
739         if (frag->next)
740                 fh->frag_off |= htons(IP6_MF);
741         fh->identification = iter->frag_id;
742         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
743         ip6_copy_metadata(frag, skb);
744 }
745 EXPORT_SYMBOL(ip6_fraglist_prepare);
746
747 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
748                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
749                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
750 {
751         state->prevhdr = prevhdr;
752         state->nexthdr = nexthdr;
753         state->frag_id = frag_id;
754
755         state->hlen = hlen;
756         state->mtu = mtu;
757
758         state->left = skb->len - hlen;  /* Space per frame */
759         state->ptr = hlen;              /* Where to start from */
760
761         state->hroom = hdr_room;
762         state->troom = needed_tailroom;
763
764         state->offset = 0;
765 }
766 EXPORT_SYMBOL(ip6_frag_init);
767
768 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
769 {
770         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
771         struct sk_buff *frag;
772         struct frag_hdr *fh;
773         unsigned int len;
774
775         len = state->left;
776         /* IF: it doesn't fit, use 'mtu' - the data space left */
777         if (len > state->mtu)
778                 len = state->mtu;
779         /* IF: we are not sending up to and including the packet end
780            then align the next start on an eight byte boundary */
781         if (len < state->left)
782                 len &= ~7;
783
784         /* Allocate buffer */
785         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
786                          state->hroom + state->troom, GFP_ATOMIC);
787         if (!frag)
788                 return ERR_PTR(-ENOMEM);
789
790         /*
791          *      Set up data on packet
792          */
793
794         ip6_copy_metadata(frag, skb);
795         skb_reserve(frag, state->hroom);
796         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
797         skb_reset_network_header(frag);
798         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
799         frag->transport_header = (frag->network_header + state->hlen +
800                                   sizeof(struct frag_hdr));
801
802         /*
803          *      Charge the memory for the fragment to any owner
804          *      it might possess
805          */
806         if (skb->sk)
807                 skb_set_owner_w(frag, skb->sk);
808
809         /*
810          *      Copy the packet header into the new buffer.
811          */
812         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
813
814         fragnexthdr_offset = skb_network_header(frag);
815         fragnexthdr_offset += prevhdr - skb_network_header(skb);
816         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
817
818         /*
819          *      Build fragment header.
820          */
821         fh->nexthdr = state->nexthdr;
822         fh->reserved = 0;
823         fh->identification = state->frag_id;
824
825         /*
826          *      Copy a block of the IP datagram.
827          */
828         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
829                              len));
830         state->left -= len;
831
832         fh->frag_off = htons(state->offset);
833         if (state->left > 0)
834                 fh->frag_off |= htons(IP6_MF);
835         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
836
837         state->ptr += len;
838         state->offset += len;
839
840         return frag;
841 }
842 EXPORT_SYMBOL(ip6_frag_next);
843
/*
 * ip6_fragment - split @skb into IPv6 fragments and hand each to @output.
 * @net:    network namespace, used for MIB accounting and fragment id choice
 * @sk:     socket passed through unchanged to @output
 * @skb:    packet to fragment
 * @output: transmit callback invoked once per fragment
 *
 * The per-fragment payload budget starts from ip6_skb_dst_mtu(), may be
 * lowered by IP6CB(skb)->frag_max_size and by the owning socket's frag_size,
 * and is finally reduced by the header length (hlen) plus the size of the
 * fragment header.
 *
 * Two strategies follow:
 *  - if @skb already has a frag list whose geometry passes the checks below,
 *    the list is walked with the ip6_fraglist_*() helpers and every buffer is
 *    passed to @output;
 *  - otherwise (slow_path) fragments are produced one at a time by
 *    ip6_frag_next() and passed to @output, and @skb is released with
 *    consume_skb() at the end.
 *
 * Returns 0 once every fragment has been accepted by @output, otherwise an
 * error code.  The "fail" exit frees @skb with kfree_skb(); "fail_toobig"
 * first sends an ICMPv6 Packet Too Big and sets the error to -EMSGSIZE.
 */
844 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
845                  int (*output)(struct net *, struct sock *, struct sk_buff *))
846 {
847         struct sk_buff *frag;
848         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
849         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
850                                 inet6_sk(skb->sk) : NULL;
851         bool mono_delivery_time = skb->mono_delivery_time;
852         struct ip6_frag_state state;
853         unsigned int mtu, hlen, nexthdr_offset;
854         ktime_t tstamp = skb->tstamp;
855         int hroom, err = 0;
856         __be32 frag_id;
857         u8 *prevhdr, nexthdr = 0;
858
            /* A non-negative result is used as hlen, the number of header
             * bytes placed in front of the fragment header; prevhdr points at
             * the next-header byte whose value is carried in nexthdr.
             */
859         err = ip6_find_1stfragopt(skb, &prevhdr);
860         if (err < 0)
861                 goto fail;
862         hlen = err;
863         nexthdr = *prevhdr;
            /* Keep prevhdr as an offset so it can be rebuilt further down. */
864         nexthdr_offset = prevhdr - skb_network_header(skb);
865
866         mtu = ip6_skb_dst_mtu(skb);
867
868         /* We must not fragment if the socket is set to force MTU discovery
869          * or if the skb is not generated by a local socket.
870          */
871         if (unlikely(!skb->ignore_df && skb->len > mtu))
872                 goto fail_toobig;
873
874         if (IP6CB(skb)->frag_max_size) {
875                 if (IP6CB(skb)->frag_max_size > mtu)
876                         goto fail_toobig;
877
878                 /* don't send fragments larger than what we received */
879                 mtu = IP6CB(skb)->frag_max_size;
880                 if (mtu < IPV6_MIN_MTU)
881                         mtu = IPV6_MIN_MTU;
882         }
883
            /* A non-zero socket frag_size smaller than mtu lowers the limit. */
884         if (np && np->frag_size < mtu) {
885                 if (np->frag_size)
886                         mtu = np->frag_size;
887         }
            /* Every fragment needs hlen bytes of headers, a fragment header
             * and at least 8 bytes of payload.  After the subtraction mtu is
             * the payload budget of a single fragment.
             */
888         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
889                 goto fail_toobig;
890         mtu -= hlen + sizeof(struct frag_hdr);
891
892         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
893                                     &ipv6_hdr(skb)->saddr);
894
895         if (skb->ip_summed == CHECKSUM_PARTIAL &&
896             (err = skb_checksum_help(skb)))
897                 goto fail;
898
            /* NOTE(review): prevhdr is rebuilt from the saved offset,
             * presumably because skb_checksum_help() may move the header
             * data - confirm.
             */
899         prevhdr = skb_network_header(skb) + nexthdr_offset;
900         hroom = LL_RESERVED_SPACE(rt->dst.dev);
901         if (skb_has_frag_list(skb)) {
902                 unsigned int first_len = skb_pagelen(skb);
903                 struct ip6_fraglist_iter iter;
904                 struct sk_buff *frag2;
905
                    /* The head qualifies for the frag-list path only if its
                     * payload fits the budget and is a multiple of 8, it is
                     * not cloned, and it has headroom for the link-layer
                     * reserve plus a fragment header.
                     */
906                 if (first_len - hlen > mtu ||
907                     ((first_len - hlen) & 7) ||
908                     skb_cloned(skb) ||
909                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
910                         goto slow_path;
911
912                 skb_walk_frags(skb, frag) {
913                         /* Correct geometry. */
914                         if (frag->len > mtu ||
915                             ((frag->len & 7) && frag->next) ||
916                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
917                                 goto slow_path_clean;
918
919                         /* Partially cloned skb? */
920                         if (skb_shared(frag))
921                                 goto slow_path_clean;
922
923                         BUG_ON(frag->sk);
                            /* Attach the list member to the head's socket and
                             * move its truesize out of the head skb.
                             */
924                         if (skb->sk) {
925                                 frag->sk = skb->sk;
926                                 frag->destructor = sock_wfree;
927                         }
928                         skb->truesize -= frag->truesize;
929                 }
930
931                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
932                                         &iter);
933                 if (err < 0)
934                         goto fail;
935
936                 /* We prevent @rt from being freed. */
937                 rcu_read_lock();
938
939                 for (;;) {
940                         /* Prepare header of the next frame,
941                          * before previous one went down. */
942                         if (iter.frag)
943                                 ip6_fraglist_prepare(skb, &iter);
944
945                         skb_set_delivery_time(skb, tstamp, mono_delivery_time);
946                         err = output(net, sk, skb);
947                         if (!err)
948                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
949                                               IPSTATS_MIB_FRAGCREATES);
950
                            /* Stop on the first error or when the list is
                             * exhausted.
                             */
951                         if (err || !iter.frag)
952                                 break;
953
954                         skb = ip6_fraglist_next(&iter);
955                 }
956
957                 kfree(iter.tmp_hdr);
958
959                 if (err == 0) {
960                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961                                       IPSTATS_MIB_FRAGOKS);
962                         rcu_read_unlock();
963                         return 0;
964                 }
965
                    /* Free the list members that were never passed to @output. */
966                 kfree_skb_list(iter.frag);
967
968                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969                               IPSTATS_MIB_FRAGFAILS);
970                 rcu_read_unlock();
971                 return err;
972
                    /* Undo the socket/destructor/truesize changes applied to
                     * the list members visited before the one that failed the
                     * checks, then fall through to the slow path.
                     */
973 slow_path_clean:
974                 skb_walk_frags(skb, frag2) {
975                         if (frag2 == frag)
976                                 break;
977                         frag2->sk = NULL;
978                         frag2->destructor = NULL;
979                         skb->truesize += frag2->truesize;
980                 }
981         }
982
983 slow_path:
984         /*
985          *      Fragment the datagram.
986          */
987
988         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
989                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
990                       &state);
991
992         /*
993          *      Keep copying data until we run out.
994          */
995
996         while (state.left > 0) {
997                 frag = ip6_frag_next(skb, &state);
998                 if (IS_ERR(frag)) {
999                         err = PTR_ERR(frag);
1000                         goto fail;
1001                 }
1002
1003                 /*
1004                  *      Put this fragment into the sending queue.
1005                  */
1006                 skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1007                 err = output(net, sk, frag);
1008                 if (err)
1009                         goto fail;
1010
1011                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1012                               IPSTATS_MIB_FRAGCREATES);
1013         }
1014         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1015                       IPSTATS_MIB_FRAGOKS);
             /* The original skb is no longer needed on the success path. */
1016         consume_skb(skb);
1017         return err;
1018
1019 fail_toobig:
1020         if (skb->sk && dst_allfrag(skb_dst(skb)))
1021                 sk_gso_disable(skb->sk);
1022
1023         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1024         err = -EMSGSIZE;
1025
1026 fail:
1027         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028                       IPSTATS_MIB_FRAGFAILS);
1029         kfree_skb(skb);
1030         return err;
1031 }
1032
1033 static inline int ip6_rt_check(const struct rt6key *rt_key,
1034                                const struct in6_addr *fl_addr,
1035                                const struct in6_addr *addr_cache)
1036 {
1037         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1038                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1039 }
1040
1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1042                                           struct dst_entry *dst,
1043                                           const struct flowi6 *fl6)
1044 {
1045         struct ipv6_pinfo *np = inet6_sk(sk);
1046         struct rt6_info *rt;
1047
1048         if (!dst)
1049                 goto out;
1050
1051         if (dst->ops->family != AF_INET6) {
1052                 dst_release(dst);
1053                 return NULL;
1054         }
1055
1056         rt = (struct rt6_info *)dst;
1057         /* Yes, checking route validity in not connected
1058          * case is not very simple. Take into account,
1059          * that we do not support routing by source, TOS,
1060          * and MSG_DONTROUTE            --ANK (980726)
1061          *
1062          * 1. ip6_rt_check(): If route was host route,
1063          *    check that cached destination is current.
1064          *    If it is network route, we still may
1065          *    check its validity using saved pointer
1066          *    to the last used address: daddr_cache.
1067          *    We do not want to save whole address now,
1068          *    (because main consumer of this service
1069          *    is tcp, which has not this problem),
1070          *    so that the last trick works only on connected
1071          *    sockets.
1072          * 2. oif also should be the same.
1073          */
1074         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1075 #ifdef CONFIG_IPV6_SUBTREES
1076             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1077 #endif
1078            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1079                 dst_release(dst);
1080                 dst = NULL;
1081         }
1082
1083 out:
1084         return dst;
1085 }
1086
/*
 * ip6_dst_lookup_tail - shared worker for the ip6_dst_lookup*() entry points.
 * @net: network namespace to look the route up in
 * @sk:  socket handed to the route lookup; may be NULL (its srcprefs are
 *       only read when it is non-NULL)
 * @dst: result slot.  When fl6->saddr is already set and *dst is non-NULL on
 *       entry, that route is used without a new lookup.
 * @fl6: flow to look up; an unspecified fl6->saddr is filled in via
 *       ip6_route_get_saddr()
 *
 * Returns 0 with *dst pointing at the route.  On any error the route is
 * released, *dst is set to NULL and the error code is returned.
 */
1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1088                                struct dst_entry **dst, struct flowi6 *fl6)
1089 {
1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1091         struct neighbour *n;
1092         struct rt6_info *rt;
1093 #endif
1094         int err;
1095         int flags = 0;
1096
1097         /* The correct way to handle this would be to do
1098          * ip6_route_get_saddr, and then ip6_route_output; however,
1099          * the route-specific preferred source forces the
1100          * ip6_route_output call _before_ ip6_route_get_saddr.
1101          *
1102          * In source specific routing (no src=any default route),
1103          * ip6_route_output will fail given src=any saddr, though, so
1104          * that's why we try it again later.
1105          */
1106         if (ipv6_addr_any(&fl6->saddr)) {
1107                 struct fib6_info *from;
1108                 struct rt6_info *rt;
1109
1110                 *dst = ip6_route_output(net, sk, fl6);
                     /* Only a route without an error is consulted for
                      * source address selection.
                      */
1111                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1112
1113                 rcu_read_lock();
1114                 from = rt ? rcu_dereference(rt->from) : NULL;
1115                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1116                                           sk ? inet6_sk(sk)->srcprefs : 0,
1117                                           &fl6->saddr);
1118                 rcu_read_unlock();
1119
1120                 if (err)
1121                         goto out_err_release;
1122
1123                 /* If we had an erroneous initial result, pretend it
1124                  * never existed and let the SA-enabled version take
1125                  * over.
1126                  */
1127                 if ((*dst)->error) {
1128                         dst_release(*dst);
1129                         *dst = NULL;
1130                 }
1131
1132                 if (fl6->flowi6_oif)
1133                         flags |= RT6_LOOKUP_F_IFACE;
1134         }
1135
             /* Look up now if no route was handed in or the first try was dropped. */
1136         if (!*dst)
1137                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1138
1139         err = (*dst)->error;
1140         if (err)
1141                 goto out_err_release;
1142
1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1144         /*
1145          * Here if the dst entry we've looked up
1146          * has a neighbour entry that is in the INCOMPLETE
1147          * state and the src address from the flow is
1148          * marked as OPTIMISTIC, we release the found
1149          * dst entry and replace it instead with the
1150          * dst entry of the nexthop router
1151          */
1152         rt = (struct rt6_info *) *dst;
1153         rcu_read_lock();
1154         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1155                                       rt6_nexthop(rt, &fl6->daddr));
             /* err is used as a flag only: it is non-zero when a neighbour
              * entry exists and none of its NUD_VALID state bits are set.
              */
1156         err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1157         rcu_read_unlock();
1158
1159         if (err) {
1160                 struct inet6_ifaddr *ifp;
1161                 struct flowi6 fl_gw6;
1162                 int redirect;
1163
1164                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1165                                       (*dst)->dev, 1);
1166
1167                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1168                 if (ifp)
1169                         in6_ifa_put(ifp);
1170
1171                 if (redirect) {
1172                         /*
1173                          * We need to get the dst entry for the
1174                          * default router instead
1175                          */
1176                         dst_release(*dst);
                             /* Same flow, but with the destination zeroed. */
1177                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1178                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1179                         *dst = ip6_route_output(net, sk, &fl_gw6);
1180                         err = (*dst)->error;
1181                         if (err)
1182                                 goto out_err_release;
1183                 }
1184         }
1185 #endif
             /* A v4-mapped source is only accepted together with a v4-mapped
              * or unspecified destination.
              */
1186         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1187             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1188                 err = -EAFNOSUPPORT;
1189                 goto out_err_release;
1190         }
1191
1192         return 0;
1193
             /* Error exit: drop the route reference and clear the result slot;
              * -ENETUNREACH is additionally counted as OUTNOROUTES.
              */
1194 out_err_release:
1195         dst_release(*dst);
1196         *dst = NULL;
1197
1198         if (err == -ENETUNREACH)
1199                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1200         return err;
1201 }
1202
1203 /**
1204  *      ip6_dst_lookup - perform route lookup on flow
1205  *      @net: Network namespace to perform lookup in
1206  *      @sk: socket which provides route info
1207  *      @dst: pointer to dst_entry * for result
1208  *      @fl6: flow to lookup
1209  *
1210  *      This function performs a route lookup on the given flow.
1211  *
1212  *      It returns zero on success, or a standard errno code on error.
1213  */
1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1215                    struct flowi6 *fl6)
1216 {
1217         *dst = NULL;
1218         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221
1222 /**
1223  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1224  *      @net: Network namespace to perform lookup in
1225  *      @sk: socket which provides route info
1226  *      @fl6: flow to lookup
1227  *      @final_dst: final destination address for ipsec lookup
1228  *
1229  *      This function performs a route lookup on the given flow.
1230  *
1231  *      It returns a valid dst pointer on success, or a pointer encoded
1232  *      error code.
1233  */
1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1235                                       const struct in6_addr *final_dst)
1236 {
1237         struct dst_entry *dst = NULL;
1238         int err;
1239
1240         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1241         if (err)
1242                 return ERR_PTR(err);
1243         if (final_dst)
1244                 fl6->daddr = *final_dst;
1245
1246         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1247 }
1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1249
1250 /**
1251  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1252  *      @sk: socket which provides the dst cache and route info
1253  *      @fl6: flow to lookup
1254  *      @final_dst: final destination address for ipsec lookup
1255  *      @connected: whether @sk is connected or not
1256  *
1257  *      This function performs a route lookup on the given flow with the
1258  *      possibility of using the cached route in the socket if it is valid.
1259  *      It will take the socket dst lock when operating on the dst cache.
1260  *      As a result, this function can only be used in process context.
1261  *
1262  *      In addition, for a connected socket, cache the dst in the socket
1263  *      if the current cache is not valid.
1264  *
1265  *      It returns a valid dst pointer on success, or a pointer encoded
1266  *      error code.
1267  */
1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1269                                          const struct in6_addr *final_dst,
1270                                          bool connected)
1271 {
1272         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1273
1274         dst = ip6_sk_dst_check(sk, dst, fl6);
1275         if (dst)
1276                 return dst;
1277
1278         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1279         if (connected && !IS_ERR(dst))
1280                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1281
1282         return dst;
1283 }
1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1285
1286 /**
1287  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1288  *      @skb: Packet for which lookup is done
1289  *      @dev: Tunnel device
1290  *      @net: Network namespace of tunnel device
1291  *      @sock: Socket which provides route info
1292  *      @saddr: Memory to store the src ip address
1293  *      @info: Tunnel information
1294  *      @protocol: IP protocol
1295  *      @use_cache: Flag to enable cache usage
1296  *      This function performs a route lookup on a tunnel
1297  *
1298  *      It returns a valid dst pointer and stores src address to be used in
1299  *      tunnel in param saddr on success, else a pointer encoded error code.
1300  */
1301
1302 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1303                                         struct net_device *dev,
1304                                         struct net *net,
1305                                         struct socket *sock,
1306                                         struct in6_addr *saddr,
1307                                         const struct ip_tunnel_info *info,
1308                                         u8 protocol,
1309                                         bool use_cache)
1310 {
1311         struct dst_entry *dst = NULL;
1312 #ifdef CONFIG_DST_CACHE
1313         struct dst_cache *dst_cache;
1314 #endif
1315         struct flowi6 fl6;
1316         __u8 prio;
1317
1318 #ifdef CONFIG_DST_CACHE
1319         dst_cache = (struct dst_cache *)&info->dst_cache;
1320         if (use_cache) {
1321                 dst = dst_cache_get_ip6(dst_cache, saddr);
1322                 if (dst)
1323                         return dst;
1324         }
1325 #endif
1326         memset(&fl6, 0, sizeof(fl6));
1327         fl6.flowi6_mark = skb->mark;
1328         fl6.flowi6_proto = protocol;
1329         fl6.daddr = info->key.u.ipv6.dst;
1330         fl6.saddr = info->key.u.ipv6.src;
1331         prio = info->key.tos;
1332         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1333
1334         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1335                                               NULL);
1336         if (IS_ERR(dst)) {
1337                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1338                 return ERR_PTR(-ENETUNREACH);
1339         }
1340         if (dst->dev == dev) { /* is this necessary? */
1341                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1342                 dst_release(dst);
1343                 return ERR_PTR(-ELOOP);
1344         }
1345 #ifdef CONFIG_DST_CACHE
1346         if (use_cache)
1347                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1348 #endif
1349         *saddr = fl6.saddr;
1350         return dst;
1351 }
1352 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1353
1354 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1355                                                gfp_t gfp)
1356 {
1357         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1358 }
1359
1360 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1361                                                 gfp_t gfp)
1362 {
1363         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364 }
1365
1366 static void ip6_append_data_mtu(unsigned int *mtu,
1367                                 int *maxfraglen,
1368                                 unsigned int fragheaderlen,
1369                                 struct sk_buff *skb,
1370                                 struct rt6_info *rt,
1371                                 unsigned int orig_mtu)
1372 {
1373         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1374                 if (!skb) {
1375                         /* first fragment, reserve header_len */
1376                         *mtu = orig_mtu - rt->dst.header_len;
1377
1378                 } else {
1379                         /*
1380                          * this fragment is not first, the headers
1381                          * space is regarded as data space.
1382                          */
1383                         *mtu = orig_mtu;
1384                 }
1385                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1386                               + fragheaderlen - sizeof(struct frag_hdr);
1387         }
1388 }
1389
1390 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1391                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1392                           struct rt6_info *rt)
1393 {
1394         struct ipv6_pinfo *np = inet6_sk(sk);
1395         unsigned int mtu;
1396         struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1397
1398         /* callers pass dst together with a reference, set it first so
1399          * ip6_cork_release() can put it down even in case of an error.
1400          */
1401         cork->base.dst = &rt->dst;
1402
1403         /*
1404          * setup for corking
1405          */
1406         if (opt) {
1407                 if (WARN_ON(v6_cork->opt))
1408                         return -EINVAL;
1409
1410                 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1411                 if (unlikely(!nopt))
1412                         return -ENOBUFS;
1413
1414                 nopt->tot_len = sizeof(*opt);
1415                 nopt->opt_flen = opt->opt_flen;
1416                 nopt->opt_nflen = opt->opt_nflen;
1417
1418                 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1419                 if (opt->dst0opt && !nopt->dst0opt)
1420                         return -ENOBUFS;
1421
1422                 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1423                 if (opt->dst1opt && !nopt->dst1opt)
1424                         return -ENOBUFS;
1425
1426                 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1427                 if (opt->hopopt && !nopt->hopopt)
1428                         return -ENOBUFS;
1429
1430                 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1431                 if (opt->srcrt && !nopt->srcrt)
1432                         return -ENOBUFS;
1433
1434                 /* need source address above miyazawa*/
1435         }
1436         v6_cork->hop_limit = ipc6->hlimit;
1437         v6_cork->tclass = ipc6->tclass;
1438         if (rt->dst.flags & DST_XFRM_TUNNEL)
1439                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1440                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1441         else
1442                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1443                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1444         if (np->frag_size < mtu) {
1445                 if (np->frag_size)
1446                         mtu = np->frag_size;
1447         }
1448         cork->base.fragsize = mtu;
1449         cork->base.gso_size = ipc6->gso_size;
1450         cork->base.tx_flags = 0;
1451         cork->base.mark = ipc6->sockc.mark;
1452         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1453
1454         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1455                 cork->base.flags |= IPCORK_ALLFRAG;
1456         cork->base.length = 0;
1457
1458         cork->base.transmit_time = ipc6->sockc.transmit_time;
1459
1460         return 0;
1461 }
1462
1463 static int __ip6_append_data(struct sock *sk,
1464                              struct sk_buff_head *queue,
1465                              struct inet_cork_full *cork_full,
1466                              struct inet6_cork *v6_cork,
1467                              struct page_frag *pfrag,
1468                              int getfrag(void *from, char *to, int offset,
1469                                          int len, int odd, struct sk_buff *skb),
1470                              void *from, size_t length, int transhdrlen,
1471                              unsigned int flags, struct ipcm6_cookie *ipc6)
1472 {
1473         struct sk_buff *skb, *skb_prev = NULL;
1474         struct inet_cork *cork = &cork_full->base;
1475         struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1476         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1477         struct ubuf_info *uarg = NULL;
1478         int exthdrlen = 0;
1479         int dst_exthdrlen = 0;
1480         int hh_len;
1481         int copy;
1482         int err;
1483         int offset = 0;
1484         bool zc = false;
1485         u32 tskey = 0;
1486         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1487         struct ipv6_txoptions *opt = v6_cork->opt;
1488         int csummode = CHECKSUM_NONE;
1489         unsigned int maxnonfragsize, headersize;
1490         unsigned int wmem_alloc_delta = 0;
1491         bool paged, extra_uref = false;
1492
1493         skb = skb_peek_tail(queue);
1494         if (!skb) {
1495                 exthdrlen = opt ? opt->opt_flen : 0;
1496                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1497         }
1498
1499         paged = !!cork->gso_size;
1500         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1501         orig_mtu = mtu;
1502
1503         if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1504             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1505                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1506
1507         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1508
1509         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1510                         (opt ? opt->opt_nflen : 0);
1511
1512         headersize = sizeof(struct ipv6hdr) +
1513                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1514                      (dst_allfrag(&rt->dst) ?
1515                       sizeof(struct frag_hdr) : 0) +
1516                      rt->rt6i_nfheader_len;
1517
1518         if (mtu <= fragheaderlen ||
1519             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1520                 goto emsgsize;
1521
1522         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1523                      sizeof(struct frag_hdr);
1524
1525         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1526          * the first fragment
1527          */
1528         if (headersize + transhdrlen > mtu)
1529                 goto emsgsize;
1530
1531         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1532             (sk->sk_protocol == IPPROTO_UDP ||
1533              sk->sk_protocol == IPPROTO_ICMPV6 ||
1534              sk->sk_protocol == IPPROTO_RAW)) {
1535                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1536                                 sizeof(struct ipv6hdr));
1537                 goto emsgsize;
1538         }
1539
1540         if (ip6_sk_ignore_df(sk))
1541                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1542         else
1543                 maxnonfragsize = mtu;
1544
1545         if (cork->length + length > maxnonfragsize - headersize) {
1546 emsgsize:
1547                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1548                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1549                 return -EMSGSIZE;
1550         }
1551
1552         /* CHECKSUM_PARTIAL only with no extension headers and when
1553          * we are not going to fragment
1554          */
1555         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1556             headersize == sizeof(struct ipv6hdr) &&
1557             length <= mtu - headersize &&
1558             (!(flags & MSG_MORE) || cork->gso_size) &&
1559             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1560                 csummode = CHECKSUM_PARTIAL;
1561
1562         if ((flags & MSG_ZEROCOPY) && length) {
1563                 struct msghdr *msg = from;
1564
1565                 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1566                         if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1567                                 return -EINVAL;
1568
1569                         /* Leave uarg NULL if can't zerocopy, callers should
1570                          * be able to handle it.
1571                          */
1572                         if ((rt->dst.dev->features & NETIF_F_SG) &&
1573                             csummode == CHECKSUM_PARTIAL) {
1574                                 paged = true;
1575                                 zc = true;
1576                                 uarg = msg->msg_ubuf;
1577                         }
1578                 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1579                         uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1580                         if (!uarg)
1581                                 return -ENOBUFS;
1582                         extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1583                         if (rt->dst.dev->features & NETIF_F_SG &&
1584                             csummode == CHECKSUM_PARTIAL) {
1585                                 paged = true;
1586                                 zc = true;
1587                         } else {
1588                                 uarg_to_msgzc(uarg)->zerocopy = 0;
1589                                 skb_zcopy_set(skb, uarg, &extra_uref);
1590                         }
1591                 }
1592         }
1593
1594         /*
1595          * Let's try using as much space as possible.
1596          * Use MTU if total length of the message fits into the MTU.
1597          * Otherwise, we need to reserve fragment header and
1598          * fragment alignment (= 8-15 octects, in total).
1599          *
1600          * Note that we may need to "move" the data from the tail
1601          * of the buffer to the new fragment when we split
1602          * the message.
1603          *
1604          * FIXME: It may be fragmented into multiple chunks
1605          *        at once if non-fragmentable extension headers
1606          *        are too large.
1607          * --yoshfuji
1608          */
1609
1610         cork->length += length;
1611         if (!skb)
1612                 goto alloc_new_skb;
1613
1614         while (length > 0) {
1615                 /* Check if the remaining data fits into current packet. */
1616                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1617                 if (copy < length)
1618                         copy = maxfraglen - skb->len;
1619
1620                 if (copy <= 0) {
1621                         char *data;
1622                         unsigned int datalen;
1623                         unsigned int fraglen;
1624                         unsigned int fraggap;
1625                         unsigned int alloclen, alloc_extra;
1626                         unsigned int pagedlen;
1627 alloc_new_skb:
1628                         /* There's no room in the current skb */
1629                         if (skb)
1630                                 fraggap = skb->len - maxfraglen;
1631                         else
1632                                 fraggap = 0;
1633                         /* update mtu and maxfraglen if necessary */
1634                         if (!skb || !skb_prev)
1635                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1636                                                     fragheaderlen, skb, rt,
1637                                                     orig_mtu);
1638
1639                         skb_prev = skb;
1640
1641                         /*
1642                          * If remaining data exceeds the mtu,
1643                          * we know we need more fragment(s).
1644                          */
1645                         datalen = length + fraggap;
1646
1647                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1648                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1649                         fraglen = datalen + fragheaderlen;
1650                         pagedlen = 0;
1651
1652                         alloc_extra = hh_len;
1653                         alloc_extra += dst_exthdrlen;
1654                         alloc_extra += rt->dst.trailer_len;
1655
1656                         /* We just reserve space for fragment header.
1657                          * Note: this may be overallocation if the message
1658                          * (without MSG_MORE) fits into the MTU.
1659                          */
1660                         alloc_extra += sizeof(struct frag_hdr);
1661
1662                         if ((flags & MSG_MORE) &&
1663                             !(rt->dst.dev->features&NETIF_F_SG))
1664                                 alloclen = mtu;
1665                         else if (!paged &&
1666                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1667                                   !(rt->dst.dev->features & NETIF_F_SG)))
1668                                 alloclen = fraglen;
1669                         else {
1670                                 alloclen = fragheaderlen + transhdrlen;
1671                                 pagedlen = datalen - transhdrlen;
1672                         }
1673                         alloclen += alloc_extra;
1674
1675                         if (datalen != length + fraggap) {
1676                                 /*
1677                                  * this is not the last fragment, the trailer
1678                                  * space is regarded as data space.
1679                                  */
1680                                 datalen += rt->dst.trailer_len;
1681                         }
1682
1683                         fraglen = datalen + fragheaderlen;
1684
1685                         copy = datalen - transhdrlen - fraggap - pagedlen;
1686                         if (copy < 0) {
1687                                 err = -EINVAL;
1688                                 goto error;
1689                         }
1690                         if (transhdrlen) {
1691                                 skb = sock_alloc_send_skb(sk, alloclen,
1692                                                 (flags & MSG_DONTWAIT), &err);
1693                         } else {
1694                                 skb = NULL;
1695                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1696                                     2 * sk->sk_sndbuf)
1697                                         skb = alloc_skb(alloclen,
1698                                                         sk->sk_allocation);
1699                                 if (unlikely(!skb))
1700                                         err = -ENOBUFS;
1701                         }
1702                         if (!skb)
1703                                 goto error;
1704                         /*
1705                          *      Fill in the control structures
1706                          */
1707                         skb->protocol = htons(ETH_P_IPV6);
1708                         skb->ip_summed = csummode;
1709                         skb->csum = 0;
1710                         /* reserve for fragmentation and ipsec header */
1711                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1712                                     dst_exthdrlen);
1713
1714                         /*
1715                          *      Find where to start putting bytes
1716                          */
1717                         data = skb_put(skb, fraglen - pagedlen);
1718                         skb_set_network_header(skb, exthdrlen);
1719                         data += fragheaderlen;
1720                         skb->transport_header = (skb->network_header +
1721                                                  fragheaderlen);
1722                         if (fraggap) {
1723                                 skb->csum = skb_copy_and_csum_bits(
1724                                         skb_prev, maxfraglen,
1725                                         data + transhdrlen, fraggap);
1726                                 skb_prev->csum = csum_sub(skb_prev->csum,
1727                                                           skb->csum);
1728                                 data += fraggap;
1729                                 pskb_trim_unique(skb_prev, maxfraglen);
1730                         }
1731                         if (copy > 0 &&
1732                             getfrag(from, data + transhdrlen, offset,
1733                                     copy, fraggap, skb) < 0) {
1734                                 err = -EFAULT;
1735                                 kfree_skb(skb);
1736                                 goto error;
1737                         }
1738
1739                         offset += copy;
1740                         length -= copy + transhdrlen;
1741                         transhdrlen = 0;
1742                         exthdrlen = 0;
1743                         dst_exthdrlen = 0;
1744
1745                         /* Only the initial fragment is time stamped */
1746                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1747                         cork->tx_flags = 0;
1748                         skb_shinfo(skb)->tskey = tskey;
1749                         tskey = 0;
1750                         skb_zcopy_set(skb, uarg, &extra_uref);
1751
1752                         if ((flags & MSG_CONFIRM) && !skb_prev)
1753                                 skb_set_dst_pending_confirm(skb, 1);
1754
1755                         /*
1756                          * Put the packet on the pending queue
1757                          */
1758                         if (!skb->destructor) {
1759                                 skb->destructor = sock_wfree;
1760                                 skb->sk = sk;
1761                                 wmem_alloc_delta += skb->truesize;
1762                         }
1763                         __skb_queue_tail(queue, skb);
1764                         continue;
1765                 }
1766
1767                 if (copy > length)
1768                         copy = length;
1769
1770                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1771                     skb_tailroom(skb) >= copy) {
1772                         unsigned int off;
1773
1774                         off = skb->len;
1775                         if (getfrag(from, skb_put(skb, copy),
1776                                                 offset, copy, off, skb) < 0) {
1777                                 __skb_trim(skb, off);
1778                                 err = -EFAULT;
1779                                 goto error;
1780                         }
1781                 } else if (!zc) {
1782                         int i = skb_shinfo(skb)->nr_frags;
1783
1784                         err = -ENOMEM;
1785                         if (!sk_page_frag_refill(sk, pfrag))
1786                                 goto error;
1787
1788                         skb_zcopy_downgrade_managed(skb);
1789                         if (!skb_can_coalesce(skb, i, pfrag->page,
1790                                               pfrag->offset)) {
1791                                 err = -EMSGSIZE;
1792                                 if (i == MAX_SKB_FRAGS)
1793                                         goto error;
1794
1795                                 __skb_fill_page_desc(skb, i, pfrag->page,
1796                                                      pfrag->offset, 0);
1797                                 skb_shinfo(skb)->nr_frags = ++i;
1798                                 get_page(pfrag->page);
1799                         }
1800                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1801                         if (getfrag(from,
1802                                     page_address(pfrag->page) + pfrag->offset,
1803                                     offset, copy, skb->len, skb) < 0)
1804                                 goto error_efault;
1805
1806                         pfrag->offset += copy;
1807                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1808                         skb->len += copy;
1809                         skb->data_len += copy;
1810                         skb->truesize += copy;
1811                         wmem_alloc_delta += copy;
1812                 } else {
1813                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1814                         if (err < 0)
1815                                 goto error;
1816                 }
1817                 offset += copy;
1818                 length -= copy;
1819         }
1820
1821         if (wmem_alloc_delta)
1822                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1823         return 0;
1824
1825 error_efault:
1826         err = -EFAULT;
1827 error:
1828         net_zcopy_put_abort(uarg, extra_uref);
1829         cork->length -= length;
1830         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1831         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1832         return err;
1833 }
1834
1835 int ip6_append_data(struct sock *sk,
1836                     int getfrag(void *from, char *to, int offset, int len,
1837                                 int odd, struct sk_buff *skb),
1838                     void *from, size_t length, int transhdrlen,
1839                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1840                     struct rt6_info *rt, unsigned int flags)
1841 {
1842         struct inet_sock *inet = inet_sk(sk);
1843         struct ipv6_pinfo *np = inet6_sk(sk);
1844         int exthdrlen;
1845         int err;
1846
1847         if (flags&MSG_PROBE)
1848                 return 0;
1849         if (skb_queue_empty(&sk->sk_write_queue)) {
1850                 /*
1851                  * setup for corking
1852                  */
1853                 dst_hold(&rt->dst);
1854                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1855                                      ipc6, rt);
1856                 if (err)
1857                         return err;
1858
1859                 inet->cork.fl.u.ip6 = *fl6;
1860                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1861                 length += exthdrlen;
1862                 transhdrlen += exthdrlen;
1863         } else {
1864                 transhdrlen = 0;
1865         }
1866
1867         return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1868                                  &np->cork, sk_page_frag(sk), getfrag,
1869                                  from, length, transhdrlen, flags, ipc6);
1870 }
1871 EXPORT_SYMBOL_GPL(ip6_append_data);
1872
1873 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1874 {
1875         struct dst_entry *dst = cork->base.dst;
1876
1877         cork->base.dst = NULL;
1878         cork->base.flags &= ~IPCORK_ALLFRAG;
1879         skb_dst_set(skb, dst);
1880 }
1881
1882 static void ip6_cork_release(struct inet_cork_full *cork,
1883                              struct inet6_cork *v6_cork)
1884 {
1885         if (v6_cork->opt) {
1886                 struct ipv6_txoptions *opt = v6_cork->opt;
1887
1888                 kfree(opt->dst0opt);
1889                 kfree(opt->dst1opt);
1890                 kfree(opt->hopopt);
1891                 kfree(opt->srcrt);
1892                 kfree(opt);
1893                 v6_cork->opt = NULL;
1894         }
1895
1896         if (cork->base.dst) {
1897                 dst_release(cork->base.dst);
1898                 cork->base.dst = NULL;
1899                 cork->base.flags &= ~IPCORK_ALLFRAG;
1900         }
1901 }
1902
1903 struct sk_buff *__ip6_make_skb(struct sock *sk,
1904                                struct sk_buff_head *queue,
1905                                struct inet_cork_full *cork,
1906                                struct inet6_cork *v6_cork)
1907 {
1908         struct sk_buff *skb, *tmp_skb;
1909         struct sk_buff **tail_skb;
1910         struct in6_addr *final_dst;
1911         struct ipv6_pinfo *np = inet6_sk(sk);
1912         struct net *net = sock_net(sk);
1913         struct ipv6hdr *hdr;
1914         struct ipv6_txoptions *opt = v6_cork->opt;
1915         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1916         struct flowi6 *fl6 = &cork->fl.u.ip6;
1917         unsigned char proto = fl6->flowi6_proto;
1918
1919         skb = __skb_dequeue(queue);
1920         if (!skb)
1921                 goto out;
1922         tail_skb = &(skb_shinfo(skb)->frag_list);
1923
1924         /* move skb->data to ip header from ext header */
1925         if (skb->data < skb_network_header(skb))
1926                 __skb_pull(skb, skb_network_offset(skb));
1927         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1928                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1929                 *tail_skb = tmp_skb;
1930                 tail_skb = &(tmp_skb->next);
1931                 skb->len += tmp_skb->len;
1932                 skb->data_len += tmp_skb->len;
1933                 skb->truesize += tmp_skb->truesize;
1934                 tmp_skb->destructor = NULL;
1935                 tmp_skb->sk = NULL;
1936         }
1937
1938         /* Allow local fragmentation. */
1939         skb->ignore_df = ip6_sk_ignore_df(sk);
1940         __skb_pull(skb, skb_network_header_len(skb));
1941
1942         final_dst = &fl6->daddr;
1943         if (opt && opt->opt_flen)
1944                 ipv6_push_frag_opts(skb, opt, &proto);
1945         if (opt && opt->opt_nflen)
1946                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1947
1948         skb_push(skb, sizeof(struct ipv6hdr));
1949         skb_reset_network_header(skb);
1950         hdr = ipv6_hdr(skb);
1951
1952         ip6_flow_hdr(hdr, v6_cork->tclass,
1953                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1954                                         ip6_autoflowlabel(net, np), fl6));
1955         hdr->hop_limit = v6_cork->hop_limit;
1956         hdr->nexthdr = proto;
1957         hdr->saddr = fl6->saddr;
1958         hdr->daddr = *final_dst;
1959
1960         skb->priority = sk->sk_priority;
1961         skb->mark = cork->base.mark;
1962         skb->tstamp = cork->base.transmit_time;
1963
1964         ip6_cork_steal_dst(skb, cork);
1965         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1966         if (proto == IPPROTO_ICMPV6) {
1967                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1968                 u8 icmp6_type;
1969
1970                 if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1971                         icmp6_type = fl6->fl6_icmp_type;
1972                 else
1973                         icmp6_type = icmp6_hdr(skb)->icmp6_type;
1974                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1975                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1976         }
1977
1978         ip6_cork_release(cork, v6_cork);
1979 out:
1980         return skb;
1981 }
1982
1983 int ip6_send_skb(struct sk_buff *skb)
1984 {
1985         struct net *net = sock_net(skb->sk);
1986         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1987         int err;
1988
1989         err = ip6_local_out(net, skb->sk, skb);
1990         if (err) {
1991                 if (err > 0)
1992                         err = net_xmit_errno(err);
1993                 if (err)
1994                         IP6_INC_STATS(net, rt->rt6i_idev,
1995                                       IPSTATS_MIB_OUTDISCARDS);
1996         }
1997
1998         return err;
1999 }
2000
/*
 * ip6_push_pending_frames - build and transmit whatever is pending on @sk
 *
 * Returns 0 when ip6_finish_skb() yields no packet, otherwise the result
 * of ip6_send_skb() for the packet it produced.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2012
2013 static void __ip6_flush_pending_frames(struct sock *sk,
2014                                        struct sk_buff_head *queue,
2015                                        struct inet_cork_full *cork,
2016                                        struct inet6_cork *v6_cork)
2017 {
2018         struct sk_buff *skb;
2019
2020         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2021                 if (skb_dst(skb))
2022                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2023                                       IPSTATS_MIB_OUTDISCARDS);
2024                 kfree_skb(skb);
2025         }
2026
2027         ip6_cork_release(cork, v6_cork);
2028 }
2029
2030 void ip6_flush_pending_frames(struct sock *sk)
2031 {
2032         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2033                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2034 }
2035 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2036
2037 struct sk_buff *ip6_make_skb(struct sock *sk,
2038                              int getfrag(void *from, char *to, int offset,
2039                                          int len, int odd, struct sk_buff *skb),
2040                              void *from, size_t length, int transhdrlen,
2041                              struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2042                              unsigned int flags, struct inet_cork_full *cork)
2043 {
2044         struct inet6_cork v6_cork;
2045         struct sk_buff_head queue;
2046         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2047         int err;
2048
2049         if (flags & MSG_PROBE) {
2050                 dst_release(&rt->dst);
2051                 return NULL;
2052         }
2053
2054         __skb_queue_head_init(&queue);
2055
2056         cork->base.flags = 0;
2057         cork->base.addr = 0;
2058         cork->base.opt = NULL;
2059         v6_cork.opt = NULL;
2060         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2061         if (err) {
2062                 ip6_cork_release(cork, &v6_cork);
2063                 return ERR_PTR(err);
2064         }
2065         if (ipc6->dontfrag < 0)
2066                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2067
2068         err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2069                                 &current->task_frag, getfrag, from,
2070                                 length + exthdrlen, transhdrlen + exthdrlen,
2071                                 flags, ipc6);
2072         if (err) {
2073                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2074                 return ERR_PTR(err);
2075         }
2076
2077         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2078 }