net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         struct inet6_dev *idev = ip6_dst_idev(dst);
  64         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  65         const struct in6_addr *daddr, *nexthop;
  66         struct ipv6hdr *hdr;
  67         struct neighbour *neigh;
  68         int ret;
  69
  70         /* Be paranoid, rather than too clever. */
  71         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  72                 skb = skb_expand_head(skb, hh_len);
  73                 if (!skb) {
  74                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  75                         return -ENOMEM;
  76                 }
  77         }
  78
  79         hdr = ipv6_hdr(skb);
  80         daddr = &hdr->daddr;
  81         if (ipv6_addr_is_multicast(daddr)) {
  82                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  83                     ((mroute6_is_socket(net, skb) &&
  84                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  85                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  86                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  87
  88                         /* Do not check for IFF_ALLMULTI; multicast routing
  89                            is not supported in any case.
  90                          */
  91                         if (newskb)
  92                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  93                                         net, sk, newskb, NULL, newskb->dev,
  94                                         dev_loopback_xmit);
  95
  96                         if (hdr->hop_limit == 0) {
  97                                 IP6_INC_STATS(net, idev,
  98                                               IPSTATS_MIB_OUTDISCARDS);
  99                                 kfree_skb(skb);
 100                                 return 0;
 101                         }
 102                 }
 103
 104                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 105                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 106                     !(dev->flags & IFF_LOOPBACK)) {
 107                         kfree_skb(skb);
 108                         return 0;
 109                 }
 110         }
 111
 112         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 113                 int res = lwtunnel_xmit(skb);
 114
 115                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 116                         return res;
 117         }
 118
 119         rcu_read_lock_bh();
 120         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 121         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 122         if (unlikely(!neigh))
 123                 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 124         if (!IS_ERR(neigh)) {
 125                 sock_confirm_neigh(skb, neigh);
 126                 ret = neigh_output(neigh, skb, false);
 127                 rcu_read_unlock_bh();
 128                 return ret;
 129         }
 130         rcu_read_unlock_bh();
 131
 132         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 133         kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
 134         return -EINVAL;
 135 }
 136
 137 static int
 138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 139                                     struct sk_buff *skb, unsigned int mtu)
 140 {
 141         struct sk_buff *segs, *nskb;
 142         netdev_features_t features;
 143         int ret = 0;
 144
 145         /* Please see corresponding comment in ip_finish_output_gso
 146          * describing the cases where GSO segment length exceeds the
 147          * egress MTU.
 148          */
 149         features = netif_skb_features(skb);
 150         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 151         if (IS_ERR_OR_NULL(segs)) {
 152                 kfree_skb(skb);
 153                 return -ENOMEM;
 154         }
 155
 156         consume_skb(skb);
 157
 158         skb_list_walk_safe(segs, segs, nskb) {
 159                 int err;
 160
 161                 skb_mark_not_on_list(segs);
 162                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 163                 if (err && ret == 0)
 164                         ret = err;
 165         }
 166
 167         return ret;
 168 }
 169
 170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 171 {
 172         unsigned int mtu;
 173
 174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 175         /* Policy lookup after SNAT yielded a new policy */
 176         if (skb_dst(skb)->xfrm) {
 177                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 178                 return dst_output(net, sk, skb);
 179         }
 180 #endif
 181
 182         mtu = ip6_skb_dst_mtu(skb);
 183         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 184                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 185
 186         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 187             dst_allfrag(skb_dst(skb)) ||
 188             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 189                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 190         else
 191                 return ip6_finish_output2(net, sk, skb);
 192 }
 193
 194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 195 {
 196         int ret;
 197
 198         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 199         switch (ret) {
 200         case NET_XMIT_SUCCESS:
 201                 return __ip6_finish_output(net, sk, skb);
 202         case NET_XMIT_CN:
 203                 return __ip6_finish_output(net, sk, skb) ? : ret;
 204         default:
 205                 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
 206                 return ret;
 207         }
 208 }
 209
 210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 211 {
 212         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 213         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 214
 215         skb->protocol = htons(ETH_P_IPV6);
 216         skb->dev = dev;
 217
 218         if (unlikely(idev->cnf.disable_ipv6)) {
 219                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 220                 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
 221                 return 0;
 222         }
 223
 224         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 225                             net, sk, skb, indev, dev,
 226                             ip6_finish_output,
 227                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 228 }
 229 EXPORT_SYMBOL(ip6_output);
 230
 231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 232 {
 233         if (!np->autoflowlabel_set)
 234                 return ip6_default_np_autolabel(net);
 235         else
 236                 return np->autoflowlabel;
 237 }
 238
 239 /*
 240  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 241  * Note : socket lock is not held for SYNACK packets, but might be modified
 242  * by calls to skb_set_owner_w() and ipv6_local_error(),
 243  * which are using proper atomic operations or spinlocks.
 244  */
 245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 246              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 247 {
 248         struct net *net = sock_net(sk);
 249         const struct ipv6_pinfo *np = inet6_sk(sk);
 250         struct in6_addr *first_hop = &fl6->daddr;
 251         struct dst_entry *dst = skb_dst(skb);
 252         struct net_device *dev = dst->dev;
 253         struct inet6_dev *idev = ip6_dst_idev(dst);
 254         unsigned int head_room;
 255         struct ipv6hdr *hdr;
 256         u8  proto = fl6->flowi6_proto;
 257         int seg_len = skb->len;
 258         int hlimit = -1;
 259         u32 mtu;
 260
 261         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
 262         if (opt)
 263                 head_room += opt->opt_nflen + opt->opt_flen;
 264
 265         if (unlikely(head_room > skb_headroom(skb))) {
 266                 skb = skb_expand_head(skb, head_room);
 267                 if (!skb) {
 268                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 269                         return -ENOBUFS;
 270                 }
 271         }
 272
 273         if (opt) {
 274                 seg_len += opt->opt_nflen + opt->opt_flen;
 275
 276                 if (opt->opt_flen)
 277                         ipv6_push_frag_opts(skb, opt, &proto);
 278
 279                 if (opt->opt_nflen)
 280                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 281                                              &fl6->saddr);
 282         }
 283
 284         skb_push(skb, sizeof(struct ipv6hdr));
 285         skb_reset_network_header(skb);
 286         hdr = ipv6_hdr(skb);
 287
 288         /*
 289          *      Fill in the IPv6 header
 290          */
 291         if (np)
 292                 hlimit = np->hop_limit;
 293         if (hlimit < 0)
 294                 hlimit = ip6_dst_hoplimit(dst);
 295
 296         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 297                                 ip6_autoflowlabel(net, np), fl6));
 298
 299         hdr->payload_len = htons(seg_len);
 300         hdr->nexthdr = proto;
 301         hdr->hop_limit = hlimit;
 302
 303         hdr->saddr = fl6->saddr;
 304         hdr->daddr = *first_hop;
 305
 306         skb->protocol = htons(ETH_P_IPV6);
 307         skb->priority = priority;
 308         skb->mark = mark;
 309
 310         mtu = dst_mtu(dst);
 311         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 312                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 313
 314                 /* if egress device is enslaved to an L3 master device pass the
 315                  * skb to its handler for processing
 316                  */
 317                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 318                 if (unlikely(!skb))
 319                         return 0;
 320
 321                 /* hooks should never assume socket lock is held.
 322                  * we promote our socket to non const
 323                  */
 324                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 325                                net, (struct sock *)sk, skb, NULL, dev,
 326                                dst_output);
 327         }
 328
 329         skb->dev = dev;
 330         /* ipv6_local_error() does not require socket lock,
 331          * we promote our socket to non const
 332          */
 333         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 334
 335         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 336         kfree_skb(skb);
 337         return -EMSGSIZE;
 338 }
 339 EXPORT_SYMBOL(ip6_xmit);
 340
 341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 342 {
 343         struct ip6_ra_chain *ra;
 344         struct sock *last = NULL;
 345
 346         read_lock(&ip6_ra_lock);
 347         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 348                 struct sock *sk = ra->sk;
 349                 if (sk && ra->sel == sel &&
 350                     (!sk->sk_bound_dev_if ||
 351                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 352                         struct ipv6_pinfo *np = inet6_sk(sk);
 353
 354                         if (np && np->rtalert_isolate &&
 355                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 356                                 continue;
 357                         }
 358                         if (last) {
 359                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 360                                 if (skb2)
 361                                         rawv6_rcv(last, skb2);
 362                         }
 363                         last = sk;
 364                 }
 365         }
 366
 367         if (last) {
 368                 rawv6_rcv(last, skb);
 369                 read_unlock(&ip6_ra_lock);
 370                 return 1;
 371         }
 372         read_unlock(&ip6_ra_lock);
 373         return 0;
 374 }
 375
 376 static int ip6_forward_proxy_check(struct sk_buff *skb)
 377 {
 378         struct ipv6hdr *hdr = ipv6_hdr(skb);
 379         u8 nexthdr = hdr->nexthdr;
 380         __be16 frag_off;
 381         int offset;
 382
 383         if (ipv6_ext_hdr(nexthdr)) {
 384                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 385                 if (offset < 0)
 386                         return 0;
 387         } else
 388                 offset = sizeof(struct ipv6hdr);
 389
 390         if (nexthdr == IPPROTO_ICMPV6) {
 391                 struct icmp6hdr *icmp6;
 392
 393                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 394                                          offset + 1 - skb->data)))
 395                         return 0;
 396
 397                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 398
 399                 switch (icmp6->icmp6_type) {
 400                 case NDISC_ROUTER_SOLICITATION:
 401                 case NDISC_ROUTER_ADVERTISEMENT:
 402                 case NDISC_NEIGHBOUR_SOLICITATION:
 403                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 404                 case NDISC_REDIRECT:
 405                         /* For reaction involving unicast neighbor discovery
 406                          * message destined to the proxied address, pass it to
 407                          * input function.
 408                          */
 409                         return 1;
 410                 default:
 411                         break;
 412                 }
 413         }
 414
 415         /*
 416          * The proxying router can't forward traffic sent to a link-local
 417          * address, so signal the sender and discard the packet. This
 418          * behavior is clarified by the MIPv6 specification.
 419          */
 420         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 421                 dst_link_failure(skb);
 422                 return -1;
 423         }
 424
 425         return 0;
 426 }
 427
 428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 429                                      struct sk_buff *skb)
 430 {
 431         struct dst_entry *dst = skb_dst(skb);
 432
 433         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 434         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 435
 436 #ifdef CONFIG_NET_SWITCHDEV
 437         if (skb->offload_l3_fwd_mark) {
 438                 consume_skb(skb);
 439                 return 0;
 440         }
 441 #endif
 442
 443         skb_clear_tstamp(skb);
 444         return dst_output(net, sk, skb);
 445 }
 446
 447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 448 {
 449         if (skb->len <= mtu)
 450                 return false;
 451
 452         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 453         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 454                 return true;
 455
 456         if (skb->ignore_df)
 457                 return false;
 458
 459         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 460                 return false;
 461
 462         return true;
 463 }
 464
 465 int ip6_forward(struct sk_buff *skb)
 466 {
 467         struct dst_entry *dst = skb_dst(skb);
 468         struct ipv6hdr *hdr = ipv6_hdr(skb);
 469         struct inet6_skb_parm *opt = IP6CB(skb);
 470         struct net *net = dev_net(dst->dev);
 471         struct inet6_dev *idev;
 472         u32 mtu;
 473
 474         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 475         if (net->ipv6.devconf_all->forwarding == 0)
 476                 goto error;
 477
 478         if (skb->pkt_type != PACKET_HOST)
 479                 goto drop;
 480
 481         if (unlikely(skb->sk))
 482                 goto drop;
 483
 484         if (skb_warn_if_lro(skb))
 485                 goto drop;
 486
 487         if (!net->ipv6.devconf_all->disable_policy &&
 488             (!idev || !idev->cnf.disable_policy) &&
 489             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 490                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 491                 goto drop;
 492         }
 493
 494         skb_forward_csum(skb);
 495
 496         /*
 497          *      We DO NOT make any processing on
 498          *      RA packets, pushing them to user level AS IS
 499          *      without ane WARRANTY that application will be able
 500          *      to interpret them. The reason is that we
 501          *      cannot make anything clever here.
 502          *
 503          *      We are not end-node, so that if packet contains
 504          *      AH/ESP, we cannot make anything.
 505          *      Defragmentation also would be mistake, RA packets
 506          *      cannot be fragmented, because there is no warranty
 507          *      that different fragments will go along one path. --ANK
 508          */
 509         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 510                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 511                         return 0;
 512         }
 513
 514         /*
 515          *      check and decrement ttl
 516          */
 517         if (hdr->hop_limit <= 1) {
 518                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 519                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 520
 521                 kfree_skb(skb);
 522                 return -ETIMEDOUT;
 523         }
 524
 525         /* XXX: idev->cnf.proxy_ndp? */
 526         if (net->ipv6.devconf_all->proxy_ndp &&
 527             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 528                 int proxied = ip6_forward_proxy_check(skb);
 529                 if (proxied > 0) {
 530                         hdr->hop_limit--;
 531                         return ip6_input(skb);
 532                 } else if (proxied < 0) {
 533                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 534                         goto drop;
 535                 }
 536         }
 537
 538         if (!xfrm6_route_forward(skb)) {
 539                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 540                 goto drop;
 541         }
 542         dst = skb_dst(skb);
 543
 544         /* IPv6 specs say nothing about it, but it is clear that we cannot
 545            send redirects to source routed frames.
 546            We don't send redirects to frames decapsulated from IPsec.
 547          */
 548         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 549             opt->srcrt == 0 && !skb_sec_path(skb)) {
 550                 struct in6_addr *target = NULL;
 551                 struct inet_peer *peer;
 552                 struct rt6_info *rt;
 553
 554                 /*
 555                  *      incoming and outgoing devices are the same
 556                  *      send a redirect.
 557                  */
 558
 559                 rt = (struct rt6_info *) dst;
 560                 if (rt->rt6i_flags & RTF_GATEWAY)
 561                         target = &rt->rt6i_gateway;
 562                 else
 563                         target = &hdr->daddr;
 564
 565                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 566
 567                 /* Limit redirects both by destination (here)
 568                    and by source (inside ndisc_send_redirect)
 569                  */
 570                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 571                         ndisc_send_redirect(skb, target);
 572                 if (peer)
 573                         inet_putpeer(peer);
 574         } else {
 575                 int addrtype = ipv6_addr_type(&hdr->saddr);
 576
 577                 /* This check is security critical. */
 578                 if (addrtype == IPV6_ADDR_ANY ||
 579                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 580                         goto error;
 581                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 582                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 583                                     ICMPV6_NOT_NEIGHBOUR, 0);
 584                         goto error;
 585                 }
 586         }
 587
 588         mtu = ip6_dst_mtu_maybe_forward(dst, true);
 589         if (mtu < IPV6_MIN_MTU)
 590                 mtu = IPV6_MIN_MTU;
 591
 592         if (ip6_pkt_too_big(skb, mtu)) {
 593                 /* Again, force OUTPUT device used as source address */
 594                 skb->dev = dst->dev;
 595                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 596                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 597                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 598                                 IPSTATS_MIB_FRAGFAILS);
 599                 kfree_skb(skb);
 600                 return -EMSGSIZE;
 601         }
 602
 603         if (skb_cow(skb, dst->dev->hard_header_len)) {
 604                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 605                                 IPSTATS_MIB_OUTDISCARDS);
 606                 goto drop;
 607         }
 608
 609         hdr = ipv6_hdr(skb);
 610
 611         /* Mangling hops number delayed to point after skb COW */
 612
 613         hdr->hop_limit--;
 614
 615         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 616                        net, NULL, skb, skb->dev, dst->dev,
 617                        ip6_forward_finish);
 618
 619 error:
 620         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 621 drop:
 622         kfree_skb(skb);
 623         return -EINVAL;
 624 }
 625
 626 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 627 {
 628         to->pkt_type = from->pkt_type;
 629         to->priority = from->priority;
 630         to->protocol = from->protocol;
 631         skb_dst_drop(to);
 632         skb_dst_set(to, dst_clone(skb_dst(from)));
 633         to->dev = from->dev;
 634         to->mark = from->mark;
 635
 636         skb_copy_hash(to, from);
 637
 638 #ifdef CONFIG_NET_SCHED
 639         to->tc_index = from->tc_index;
 640 #endif
 641         nf_copy(to, from);
 642         skb_ext_copy(to, from);
 643         skb_copy_secmark(to, from);
 644 }
 645
 646 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 647                       u8 nexthdr, __be32 frag_id,
 648                       struct ip6_fraglist_iter *iter)
 649 {
 650         unsigned int first_len;
 651         struct frag_hdr *fh;
 652
 653         /* BUILD HEADER */
 654         *prevhdr = NEXTHDR_FRAGMENT;
 655         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 656         if (!iter->tmp_hdr)
 657                 return -ENOMEM;
 658
 659         iter->frag = skb_shinfo(skb)->frag_list;
 660         skb_frag_list_init(skb);
 661
 662         iter->offset = 0;
 663         iter->hlen = hlen;
 664         iter->frag_id = frag_id;
 665         iter->nexthdr = nexthdr;
 666
 667         __skb_pull(skb, hlen);
 668         fh = __skb_push(skb, sizeof(struct frag_hdr));
 669         __skb_push(skb, hlen);
 670         skb_reset_network_header(skb);
 671         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 672
 673         fh->nexthdr = nexthdr;
 674         fh->reserved = 0;
 675         fh->frag_off = htons(IP6_MF);
 676         fh->identification = frag_id;
 677
 678         first_len = skb_pagelen(skb);
 679         skb->data_len = first_len - skb_headlen(skb);
 680         skb->len = first_len;
 681         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 682
 683         return 0;
 684 }
 685 EXPORT_SYMBOL(ip6_fraglist_init);
 686
 687 void ip6_fraglist_prepare(struct sk_buff *skb,
 688                           struct ip6_fraglist_iter *iter)
 689 {
 690         struct sk_buff *frag = iter->frag;
 691         unsigned int hlen = iter->hlen;
 692         struct frag_hdr *fh;
 693
 694         frag->ip_summed = CHECKSUM_NONE;
 695         skb_reset_transport_header(frag);
 696         fh = __skb_push(frag, sizeof(struct frag_hdr));
 697         __skb_push(frag, hlen);
 698         skb_reset_network_header(frag);
 699         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 700         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 701         fh->nexthdr = iter->nexthdr;
 702         fh->reserved = 0;
 703         fh->frag_off = htons(iter->offset);
 704         if (frag->next)
 705                 fh->frag_off |= htons(IP6_MF);
 706         fh->identification = iter->frag_id;
 707         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 708         ip6_copy_metadata(frag, skb);
 709 }
 710 EXPORT_SYMBOL(ip6_fraglist_prepare);
 711
 712 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 713                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 714                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 715 {
 716         state->prevhdr = prevhdr;
 717         state->nexthdr = nexthdr;
 718         state->frag_id = frag_id;
 719
 720         state->hlen = hlen;
 721         state->mtu = mtu;
 722
 723         state->left = skb->len - hlen;  /* Space per frame */
 724         state->ptr = hlen;              /* Where to start from */
 725
 726         state->hroom = hdr_room;
 727         state->troom = needed_tailroom;
 728
 729         state->offset = 0;
 730 }
 731 EXPORT_SYMBOL(ip6_frag_init);
 732
 733 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 734 {
 735         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 736         struct sk_buff *frag;
 737         struct frag_hdr *fh;
 738         unsigned int len;
 739
 740         len = state->left;
 741         /* IF: it doesn't fit, use 'mtu' - the data space left */
 742         if (len > state->mtu)
 743                 len = state->mtu;
 744         /* IF: we are not sending up to and including the packet end
 745            then align the next start on an eight byte boundary */
 746         if (len < state->left)
 747                 len &= ~7;
 748
 749         /* Allocate buffer */
 750         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 751                          state->hroom + state->troom, GFP_ATOMIC);
 752         if (!frag)
 753                 return ERR_PTR(-ENOMEM);
 754
 755         /*
 756          *      Set up data on packet
 757          */
 758
 759         ip6_copy_metadata(frag, skb);
 760         skb_reserve(frag, state->hroom);
 761         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 762         skb_reset_network_header(frag);
 763         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 764         frag->transport_header = (frag->network_header + state->hlen +
 765                                   sizeof(struct frag_hdr));
 766
 767         /*
 768          *      Charge the memory for the fragment to any owner
 769          *      it might possess
 770          */
 771         if (skb->sk)
 772                 skb_set_owner_w(frag, skb->sk);
 773
 774         /*
 775          *      Copy the packet header into the new buffer.
 776          */
 777         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 778
 779         fragnexthdr_offset = skb_network_header(frag);
 780         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 781         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 782
 783         /*
 784          *      Build fragment header.
 785          */
 786         fh->nexthdr = state->nexthdr;
 787         fh->reserved = 0;
 788         fh->identification = state->frag_id;
 789
 790         /*
 791          *      Copy a block of the IP datagram.
 792          */
 793         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 794                              len));
 795         state->left -= len;
 796
 797         fh->frag_off = htons(state->offset);
 798         if (state->left > 0)
 799                 fh->frag_off |= htons(IP6_MF);
 800         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 801
 802         state->ptr += len;
 803         state->offset += len;
 804
 805         return frag;
 806 }
 807 EXPORT_SYMBOL(ip6_frag_next);
 808
 809 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 810                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 811 {
             /*
              * Fragment @skb and pass every resulting fragment to @output().
              *
              * Two strategies are used:
              *  - fast path: when @skb already carries a frag_list whose pieces
              *    pass the geometry checks below, the existing buffers are sent
              *    as the fragments (ip6_fraglist_init/prepare/next);
              *  - slow path: new fragment skbs are produced with
              *    ip6_frag_init()/ip6_frag_next() and @skb is released with
              *    consume_skb() once all of them were accepted.
              *
              * Return: the fast path returns 0, or the first non-zero value
              * returned by @output(); the slow path returns err as left by the
              * last @output() call; -EMSGSIZE after fail_toobig; otherwise the
              * error that caused the jump to fail.  On the fail and fail_toobig
              * exits @skb is freed with kfree_skb() and FRAGFAILS is counted.
              */
 812         struct sk_buff *frag;
 813         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
             /* The socket's ipv6_pinfo is only consulted when the skb has a
              * socket and dev_recursion_level() is zero.
              */
 814         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 815                                 inet6_sk(skb->sk) : NULL;
             /* Delivery time of the original skb, re-applied to each fragment. */
 816         bool mono_delivery_time = skb->mono_delivery_time;
 817         struct ip6_frag_state state;
 818         unsigned int mtu, hlen, nexthdr_offset;
 819         ktime_t tstamp = skb->tstamp;
 820         int hroom, err = 0;
 821         __be32 frag_id;
 822         u8 *prevhdr, nexthdr = 0;
 823
             /* A negative result is an error; otherwise the result is used as
              * hlen and prevhdr points at the byte saved as nexthdr.
              */
 824         err = ip6_find_1stfragopt(skb, &prevhdr);
 825         if (err < 0)
 826                 goto fail;
 827         hlen = err;
 828         nexthdr = *prevhdr;
             /* Keep prevhdr as an offset: it is recomputed from the network
              * header after skb_checksum_help() below (presumably because that
              * call may move the header - confirm).
              */
 829         nexthdr_offset = prevhdr - skb_network_header(skb);
 830
 831         mtu = ip6_skb_dst_mtu(skb);
 832
 833         /* We must not fragment if the socket is set to force MTU discovery
 834          * or if the skb is not generated by a local socket.
 835          */
 836         if (unlikely(!skb->ignore_df && skb->len > mtu))
 837                 goto fail_toobig;
 838
 839         if (IP6CB(skb)->frag_max_size) {
 840                 if (IP6CB(skb)->frag_max_size > mtu)
 841                         goto fail_toobig;
 842
 843                 /* don't send fragments larger than what we received */
 844                 mtu = IP6CB(skb)->frag_max_size;
 845                 if (mtu < IPV6_MIN_MTU)
 846                         mtu = IPV6_MIN_MTU;
 847         }
 848
             /* A non-zero per-socket frag_size below the mtu lowers the mtu. */
 849         if (np && np->frag_size < mtu) {
 850                 if (np->frag_size)
 851                         mtu = np->frag_size;
 852         }
             /* Each fragment needs room for the hlen bytes of headers, the
              * fragment header and at least 8 bytes of payload.
              */
 853         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 854                 goto fail_toobig;
             /* From here on mtu is the payload room left in one fragment. */
 855         mtu -= hlen + sizeof(struct frag_hdr);
 856
             /* One identification value is shared by all fragments. */
 857         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 858                                     &ipv6_hdr(skb)->saddr);
 859
             /* Resolve a pending CHECKSUM_PARTIAL before the data is split. */
 860         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 861             (err = skb_checksum_help(skb)))
 862                 goto fail;
 863
 864         prevhdr = skb_network_header(skb) + nexthdr_offset;
 865         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 866         if (skb_has_frag_list(skb)) {
 867                 unsigned int first_len = skb_pagelen(skb);
 868                 struct ip6_fraglist_iter iter;
 869                 struct sk_buff *frag2;
 870
                     /* The head must fit into mtu, carry a payload that is a
                      * multiple of 8, not be cloned, and have headroom for
                      * hroom plus the fragment header.
                      */
 871                 if (first_len - hlen > mtu ||
 872                     ((first_len - hlen) & 7) ||
 873                     skb_cloned(skb) ||
 874                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 875                         goto slow_path;
 876
 877                 skb_walk_frags(skb, frag) {
 878                         /* Correct geometry. */
                             /* Each piece must fit into mtu, be a multiple of 8
                              * unless it is the last one, and have headroom for
                              * hlen + hroom + the fragment header.
                              */
 879                         if (frag->len > mtu ||
 880                             ((frag->len & 7) && frag->next) ||
 881                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 882                                 goto slow_path_clean;
 883
 884                         /* Partially cloned skb? */
 885                         if (skb_shared(frag))
 886                                 goto slow_path_clean;
 887
 888                         BUG_ON(frag->sk);
                             /* Attach the piece to the sending socket and move
                              * its truesize out of the head skb.
                              */
 889                         if (skb->sk) {
 890                                 frag->sk = skb->sk;
 891                                 frag->destructor = sock_wfree;
 892                         }
 893                         skb->truesize -= frag->truesize;
 894                 }
 895
 896                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 897                                         &iter);
 898                 if (err < 0)
 899                         goto fail;
 900
 901                 for (;;) {
 902                         /* Prepare header of the next frame,
 903                          * before previous one went down. */
 904                         if (iter.frag)
 905                                 ip6_fraglist_prepare(skb, &iter);
 906
 907                         skb_set_delivery_time(skb, tstamp, mono_delivery_time);
 908                         err = output(net, sk, skb);
 909                         if (!err)
 910                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 911                                               IPSTATS_MIB_FRAGCREATES);
 912
                             /* Stop on the first error or after the last piece. */
 913                         if (err || !iter.frag)
 914                                 break;
 915
 916                         skb = ip6_fraglist_next(&iter);
 917                 }
 918
 919                 kfree(iter.tmp_hdr);
 920
 921                 if (err == 0) {
 922                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 923                                       IPSTATS_MIB_FRAGOKS);
 924                         return 0;
 925                 }
 926
                     /* Drop the pieces that were not handed to output(). */
 927                 kfree_skb_list(iter.frag);
 928
 929                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 930                               IPSTATS_MIB_FRAGFAILS);
 931                 return err;
 932
 933 slow_path_clean:
                     /* Undo the sk/destructor/truesize changes made to the
                      * pieces visited before the one that failed the checks
                      * (frag), then continue with the slow path.
                      */
 934                 skb_walk_frags(skb, frag2) {
 935                         if (frag2 == frag)
 936                                 break;
 937                         frag2->sk = NULL;
 938                         frag2->destructor = NULL;
 939                         skb->truesize += frag2->truesize;
 940                 }
 941         }
 942
 943 slow_path:
 944         /*
 945          *      Fragment the datagram.
 946          */
 947
 948         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 949                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 950                       &state);
 951
 952         /*
 953          *      Keep copying data until we run out.
 954          */
 955
 956         while (state.left > 0) {
 957                 frag = ip6_frag_next(skb, &state);
 958                 if (IS_ERR(frag)) {
 959                         err = PTR_ERR(frag);
 960                         goto fail;
 961                 }
 962
 963                 /*
 964                  *      Put this fragment into the sending queue.
 965                  */
 966                 skb_set_delivery_time(frag, tstamp, mono_delivery_time);
 967                 err = output(net, sk, frag);
 968                 if (err)
 969                         goto fail;
 970
 971                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 972                               IPSTATS_MIB_FRAGCREATES);
 973         }
 974         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 975                       IPSTATS_MIB_FRAGOKS);
             /* The original skb is no longer needed once it has been split. */
 976         consume_skb(skb);
 977         return err;
 978
 979 fail_toobig:
 980         if (skb->sk && dst_allfrag(skb_dst(skb)))
 981                 sk_gso_disable(skb->sk);
 982
             /* Report the mtu value computed so far via Packet Too Big. */
 983         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 984         err = -EMSGSIZE;
 985
 986 fail:
 987         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 988                       IPSTATS_MIB_FRAGFAILS);
 989         kfree_skb(skb);
 990         return err;
 991 }
 992
 993 static inline int ip6_rt_check(const struct rt6key *rt_key,
 994                                const struct in6_addr *fl_addr,
 995                                const struct in6_addr *addr_cache)
 996 {
             /*
              * Returns 0 when @fl_addr is confirmed by either
              *  (a) @rt_key being a /128 key whose address equals @fl_addr, or
              *  (b) a non-NULL @addr_cache that equals @fl_addr;
              * returns 1 when neither holds.  The caller in this file,
              * ip6_sk_dst_check(), drops the cached dst on a non-zero result.
              */
 997         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 998                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 999 }
1000
1001 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1002                                           struct dst_entry *dst,
1003                                           const struct flowi6 *fl6)
1004 {
             /*
              * Decide whether the cached @dst may be used for @fl6.
              *
              * Returns @dst unchanged when it passes the checks below (a NULL
              * @dst is passed through as NULL).  Otherwise the reference is
              * dropped with dst_release() and NULL is returned.
              */
1005         struct ipv6_pinfo *np = inet6_sk(sk);
1006         struct rt6_info *rt;
1007
1008         if (!dst)
1009                 goto out;
1010
             /* Entries whose ops->family is not AF_INET6 are released and
              * rejected.
              */
1011         if (dst->ops->family != AF_INET6) {
1012                 dst_release(dst);
1013                 return NULL;
1014         }
1015
1016         rt = (struct rt6_info *)dst;
1017         /* Yes, checking route validity in not connected
1018          * case is not very simple. Take into account,
1019          * that we do not support routing by source, TOS,
1020          * and MSG_DONTROUTE            --ANK (980726)
1021          *
1022          * 1. ip6_rt_check(): If route was host route,
1023          *    check that cached destination is current.
1024          *    If it is network route, we still may
1025          *    check its validity using saved pointer
1026          *    to the last used address: daddr_cache.
1027          *    We do not want to save whole address now,
1028          *    (because main consumer of this service
1029          *    is tcp, which has not this problem),
1030          *    so that the last trick works only on connected
1031          *    sockets.
1032          * 2. oif also should be the same.
1033          */
             /* The source-key check is only compiled in with
              * CONFIG_IPV6_SUBTREES; the oif check only applies when the flow
              * specifies a non-zero flowi6_oif.
              */
1034         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1035 #ifdef CONFIG_IPV6_SUBTREES
1036             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1037 #endif
1038            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1039                 dst_release(dst);
1040                 dst = NULL;
1041         }
1042
1043 out:
1044         return dst;
1045 }
1046
1047 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1048                                struct dst_entry **dst, struct flowi6 *fl6)
1049 {
             /*
              * Common tail of the route lookup helpers in this file.
              *
              * Fills *@dst with a route for @fl6.  When @fl6->saddr is the
              * unspecified address a source address is chosen and written back
              * into @fl6->saddr.  If *@dst is already non-NULL when the second
              * lookup step is reached, it is used as is.
              *
              * Returns 0 on success.  On error the route is released, *@dst is
              * set to NULL and a negative errno is returned; -ENETUNREACH also
              * bumps IPSTATS_MIB_OUTNOROUTES.
              */
1050 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1051         struct neighbour *n;
1052         struct rt6_info *rt;
1053 #endif
1054         int err;
1055         int flags = 0;
1056
1057         /* The correct way to handle this would be to do
1058          * ip6_route_get_saddr, and then ip6_route_output; however,
1059          * the route-specific preferred source forces the
1060          * ip6_route_output call _before_ ip6_route_get_saddr.
1061          *
1062          * In source specific routing (no src=any default route),
1063          * ip6_route_output will fail given src=any saddr, though, so
1064          * that's why we try it again later.
1065          */
1066         if (ipv6_addr_any(&fl6->saddr)) {
1067                 struct fib6_info *from;
1068                 struct rt6_info *rt;
1069
1070                 *dst = ip6_route_output(net, sk, fl6);
                     /* A route carrying an error is not used for source
                      * selection.
                      */
1071                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1072
                     /* rt->from is read with rcu_dereference(), hence the RCU
                      * read-side section around the source address selection.
                      */
1073                 rcu_read_lock();
1074                 from = rt ? rcu_dereference(rt->from) : NULL;
1075                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1076                                           sk ? inet6_sk(sk)->srcprefs : 0,
1077                                           &fl6->saddr);
1078                 rcu_read_unlock();
1079
1080                 if (err)
1081                         goto out_err_release;
1082
1083                 /* If we had an erroneous initial result, pretend it
1084                  * never existed and let the SA-enabled version take
1085                  * over.
1086                  */
1087                 if ((*dst)->error) {
1088                         dst_release(*dst);
1089                         *dst = NULL;
1090                 }
1091
1092                 if (fl6->flowi6_oif)
1093                         flags |= RT6_LOOKUP_F_IFACE;
1094         }
1095
             /* Look up (again) only if there is no usable result yet. */
1096         if (!*dst)
1097                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1098
1099         err = (*dst)->error;
1100         if (err)
1101                 goto out_err_release;
1102
1103 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1104         /*
1105          * Here if the dst entry we've looked up
1106          * has a neighbour entry that is in the INCOMPLETE
1107          * state and the src address from the flow is
1108          * marked as OPTIMISTIC, we release the found
1109          * dst entry and replace it instead with the
1110          * dst entry of the nexthop router
1111          */
1112         rt = (struct rt6_info *) *dst;
1113         rcu_read_lock_bh();
1114         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1115                                       rt6_nexthop(rt, &fl6->daddr));
             /* err is only a local flag here: it is set when a neighbour entry
              * exists and its state has none of the NUD_VALID bits.
              */
1116         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1117         rcu_read_unlock_bh();
1118
1119         if (err) {
1120                 struct inet6_ifaddr *ifp;
1121                 struct flowi6 fl_gw6;
1122                 int redirect;
1123
1124                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1125                                       (*dst)->dev, 1);
1126
                     /* The flag is sampled before the address reference is
                      * dropped again.
                      */
1127                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1128                 if (ifp)
1129                         in6_ifa_put(ifp);
1130
1131                 if (redirect) {
1132                         /*
1133                          * We need to get the dst entry for the
1134                          * default router instead
1135                          */
1136                         dst_release(*dst);
                             /* Repeat the lookup on a copy of the flow whose
                              * destination is zeroed; @fl6 itself is not
                              * modified.
                              */
1137                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1138                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1139                         *dst = ip6_route_output(net, sk, &fl_gw6);
1140                         err = (*dst)->error;
1141                         if (err)
1142                                 goto out_err_release;
1143                 }
1144         }
1145 #endif
             /* A v4-mapped source is only accepted together with a v4-mapped
              * or unspecified destination.
              */
1146         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1147             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1148                 err = -EAFNOSUPPORT;
1149                 goto out_err_release;
1150         }
1151
1152         return 0;
1153
1154 out_err_release:
1155         dst_release(*dst);
1156         *dst = NULL;
1157
1158         if (err == -ENETUNREACH)
1159                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1160         return err;
1161 }
1162
1163 /**
1164  *      ip6_dst_lookup - perform route lookup on flow
1165  *      @net: Network namespace to perform lookup in
1166  *      @sk: socket which provides route info
1167  *      @dst: pointer to dst_entry * for result
1168  *      @fl6: flow to lookup
1169  *
1170  *      This function performs a route lookup on the given flow.
      *      Any value previously stored in *@dst is overwritten with NULL
      *      before the lookup, without being released.
1171  *
1172  *      It returns zero on success, or a standard errno code on error.
      *      On error *@dst is NULL.
1173  */
1174 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1175                    struct flowi6 *fl6)
1176 {
             /* Start without a candidate so the tail always does a lookup. */
1177         *dst = NULL;
1178         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1179 }
1180 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1181
1182 /**
1183  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1184  *      @net: Network namespace to perform lookup in
1185  *      @sk: socket which provides route info
1186  *      @fl6: flow to lookup
1187  *      @final_dst: final destination address for ipsec lookup
1188  *
1189  *      This function performs a route lookup on the given flow.
      *      The route lookup itself uses @fl6->daddr as passed in.  If
      *      @final_dst is non-NULL, @fl6->daddr is overwritten with it
      *      afterwards, before the xfrm lookup.
1190  *
1191  *      It returns a valid dst pointer on success, or a pointer encoded
1192  *      error code.
1193  */
1194 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1195                                       const struct in6_addr *final_dst)
1196 {
1197         struct dst_entry *dst = NULL;
1198         int err;
1199
1200         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1201         if (err)
1202                 return ERR_PTR(err);
             /* Note: this modifies the caller's flow. */
1203         if (final_dst)
1204                 fl6->daddr = *final_dst;
1205
             /* The result of the xfrm lookup is returned unchanged. */
1206         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1207 }
1208 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1209
1210 /**
1211  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1212  *      @sk: socket which provides the dst cache and route info
1213  *      @fl6: flow to lookup
1214  *      @final_dst: final destination address for ipsec lookup
1215  *      @connected: whether @sk is connected or not
1216  *
1217  *      This function performs a route lookup on the given flow with the
1218  *      possibility of using the cached route in the socket if it is valid.
1219  *      It will take the socket dst lock when operating on the dst cache.
1220  *      As a result, this function can only be used in process context.
1221  *
1222  *      In addition, for a connected socket, cache the dst in the socket
1223  *      if the current cache is not valid.
1224  *
1225  *      It returns a valid dst pointer on success, or a pointer encoded
1226  *      error code.
1227  */
1228 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1229                                          const struct in6_addr *final_dst,
1230                                          bool connected)
1231 {
             /* Candidate from the socket cache, checked against dst_cookie. */
1232         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1233
             /* ip6_sk_dst_check() returns NULL (after releasing the entry) when
              * the candidate does not fit @fl6.
              */
1234         dst = ip6_sk_dst_check(sk, dst, fl6);
1235         if (dst)
1236                 return dst;
1237
1238         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
             /* The cache is handed its own reference (dst_clone()); the one
              * obtained from the lookup is returned to the caller.
              */
1239         if (connected && !IS_ERR(dst))
1240                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1241
1242         return dst;
1243 }
1244 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1245
1246 /**
1247  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1248  *      @skb: Packet for which lookup is done
1249  *      @dev: Tunnel device
1250  *      @net: Network namespace of tunnel device
1251  *      @sock: Socket which provides route info
1252  *      @saddr: Memory to store the src ip address
1253  *      @info: Tunnel information
1254  *      @protocol: IP protocol
1255  *      @use_cache: Flag to enable cache usage
1256  *      This function performs a route lookup on a tunnel
1257  *
1258  *      It returns a valid dst pointer and stores src address to be used in
1259  *      tunnel in param saddr on success, else a pointer encoded error code.
      *      Any failure of the route lookup is reported as -ENETUNREACH; a
      *      route whose output device is @dev itself yields -ELOOP.
      *      @sock is dereferenced unconditionally and must not be NULL.
1260  */
1261
1262 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1263                                         struct net_device *dev,
1264                                         struct net *net,
1265                                         struct socket *sock,
1266                                         struct in6_addr *saddr,
1267                                         const struct ip_tunnel_info *info,
1268                                         u8 protocol,
1269                                         bool use_cache)
1270 {
1271         struct dst_entry *dst = NULL;
1272 #ifdef CONFIG_DST_CACHE
1273         struct dst_cache *dst_cache;
1274 #endif
1275         struct flowi6 fl6;
1276         __u8 prio;
1277
1278 #ifdef CONFIG_DST_CACHE
             /* The cast drops the const qualifier of @info for the cache. */
1279         dst_cache = (struct dst_cache *)&info->dst_cache;
1280         if (use_cache) {
                     /* Cache hit: return right away.  @saddr is passed to
                      * dst_cache_get_ip6(), presumably filled in there - confirm.
                      */
1281                 dst = dst_cache_get_ip6(dst_cache, saddr);
1282                 if (dst)
1283                         return dst;
1284         }
1285 #endif
             /* Build the flow from the packet mark and the tunnel key. */
1286         memset(&fl6, 0, sizeof(fl6));
1287         fl6.flowi6_mark = skb->mark;
1288         fl6.flowi6_proto = protocol;
1289         fl6.daddr = info->key.u.ipv6.dst;
1290         fl6.saddr = info->key.u.ipv6.src;
1291         prio = info->key.tos;
1292         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1293                                           info->key.label);
1294
1295         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1296                                               NULL);
             /* The specific error encoded in dst is not propagated. */
1297         if (IS_ERR(dst)) {
1298                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1299                 return ERR_PTR(-ENETUNREACH);
1300         }
1301         if (dst->dev == dev) { /* is this necessary? */
1302                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1303                 dst_release(dst);
1304                 return ERR_PTR(-ELOOP);
1305         }
1306 #ifdef CONFIG_DST_CACHE
1307         if (use_cache)
1308                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1309 #endif
             /* Report the source address held in the flow after the lookup. */
1310         *saddr = fl6.saddr;
1311         return dst;
1312 }
1313 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1314
1315 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1316                                                gfp_t gfp)
1317 {
             /*
              * Duplicate an option header with kmemdup().  The copied size is
              * (hdrlen + 1) * 8 bytes, taken from the header's own hdrlen field.
              * Returns NULL for a NULL @src; otherwise returns whatever
              * kmemdup() returns, so callers that need to tell the two cases
              * apart must check @src as well.
              */
1318         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1319 }
1320
1321 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1322                                                 gfp_t gfp)
1323 {
             /*
              * Duplicate a routing header with kmemdup().  The copied size is
              * (hdrlen + 1) * 8 bytes, taken from the header's own hdrlen field.
              * Returns NULL for a NULL @src; otherwise returns whatever
              * kmemdup() returns.
              */
1324         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1325 }
1326
1327 static void ip6_append_data_mtu(unsigned int *mtu,
1328                                 int *maxfraglen,
1329                                 unsigned int fragheaderlen,
1330                                 struct sk_buff *skb,
1331                                 struct rt6_info *rt,
1332                                 unsigned int orig_mtu)
1333 {
             /*
              * Recompute *@mtu and *@maxfraglen for the next skb to be
              * allocated.  Nothing is changed for routes that have
              * DST_XFRM_TUNNEL set.  @skb is only tested for NULL: NULL means
              * the first fragment is about to be built.
              */
1334         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1335                 if (!skb) {
1336                         /* first fragment, reserve header_len */
1337                         *mtu = orig_mtu - rt->dst.header_len;
1338
1339                 } else {
1340                         /*
1341                          * this fragment is not first, the headers
1342                          * space is regarded as data space.
1343                          */
1344                         *mtu = orig_mtu;
1345                 }
                     /* Round the space after the unfragmentable headers down
                      * to a multiple of 8, then subtract the fragment header.
                      */
1346                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1347                               + fragheaderlen - sizeof(struct frag_hdr);
1348         }
1349 }
1350
1351 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1352                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1353                           struct rt6_info *rt)
1354 {
             /*
              * Initialise the cork state (@cork, @v6_cork) from @ipc6 and @rt
              * before data is appended.
              *
              * Returns 0 on success, -EINVAL if @v6_cork already holds options,
              * or -ENOBUFS if copying the options fails.  On the -ENOBUFS exits
              * the partially built copy stays attached to v6_cork->opt
              * (presumably freed by ip6_cork_release() - confirm).
              */
1355         struct ipv6_pinfo *np = inet6_sk(sk);
1356         unsigned int mtu;
1357         struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1358
1359         /* callers pass dst together with a reference, set it first so
1360          * ip6_cork_release() can put it down even in case of an error.
1361          */
1362         cork->base.dst = &rt->dst;
1363
1364         /*
1365          * setup for corking
1366          */
1367         if (opt) {
1368                 if (WARN_ON(v6_cork->opt))
1369                         return -EINVAL;
1370
                     /* The container is zeroed, so sub-options that are not
                      * copied below stay NULL.
                      */
1371                 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1372                 if (unlikely(!nopt))
1373                         return -ENOBUFS;
1374
1375                 nopt->tot_len = sizeof(*opt);
1376                 nopt->opt_flen = opt->opt_flen;
1377                 nopt->opt_nflen = opt->opt_nflen;
1378
                     /* Each sub-header is copied separately; a NULL copy is
                      * only an error when the source header was present.
                      */
1379                 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1380                 if (opt->dst0opt && !nopt->dst0opt)
1381                         return -ENOBUFS;
1382
1383                 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1384                 if (opt->dst1opt && !nopt->dst1opt)
1385                         return -ENOBUFS;
1386
1387                 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1388                 if (opt->hopopt && !nopt->hopopt)
1389                         return -ENOBUFS;
1390
1391                 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1392                 if (opt->srcrt && !nopt->srcrt)
1393                         return -ENOBUFS;
1394
1395                 /* need source address above miyazawa*/
1396         }
1397         v6_cork->hop_limit = ipc6->hlimit;
1398         v6_cork->tclass = ipc6->tclass;
             /* With pmtudisc >= IPV6_PMTUDISC_PROBE the device mtu is used.
              * Otherwise the mtu comes from dst_mtu() of the route itself for
              * DST_XFRM_TUNNEL routes, or of xfrm_dst_path() for all others.
              */
1399         if (rt->dst.flags & DST_XFRM_TUNNEL)
1400                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1401                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1402         else
1403                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1404                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
             /* A non-zero per-socket frag_size below the mtu lowers the mtu. */
1405         if (np->frag_size < mtu) {
1406                 if (np->frag_size)
1407                         mtu = np->frag_size;
1408         }
1409         cork->base.fragsize = mtu;
1410         cork->base.gso_size = ipc6->gso_size;
1411         cork->base.tx_flags = 0;
1412         cork->base.mark = ipc6->sockc.mark;
1413         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1414
             /* The flag is OR-ed in; cork->base.flags is not reset here. */
1415         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1416                 cork->base.flags |= IPCORK_ALLFRAG;
1417         cork->base.length = 0;
1418
1419         cork->base.transmit_time = ipc6->sockc.transmit_time;
1420
1421         return 0;
1422 }
1423
1424 static int __ip6_append_data(struct sock *sk,
1425                              struct sk_buff_head *queue,
1426                              struct inet_cork_full *cork_full,
1427                              struct inet6_cork *v6_cork,
1428                              struct page_frag *pfrag,
1429                              int getfrag(void *from, char *to, int offset,
1430                                          int len, int odd, struct sk_buff *skb),
1431                              void *from, int length, int transhdrlen,
1432                              unsigned int flags, struct ipcm6_cookie *ipc6)
1433 {
1434         struct sk_buff *skb, *skb_prev = NULL;
1435         struct inet_cork *cork = &cork_full->base;
1436         struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1437         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1438         struct ubuf_info *uarg = NULL;
1439         int exthdrlen = 0;
1440         int dst_exthdrlen = 0;
1441         int hh_len;
1442         int copy;
1443         int err;
1444         int offset = 0;
1445         u32 tskey = 0;
1446         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1447         struct ipv6_txoptions *opt = v6_cork->opt;
1448         int csummode = CHECKSUM_NONE;
1449         unsigned int maxnonfragsize, headersize;
1450         unsigned int wmem_alloc_delta = 0;
1451         bool paged, extra_uref = false;
1452
1453         skb = skb_peek_tail(queue);
1454         if (!skb) {
1455                 exthdrlen = opt ? opt->opt_flen : 0;
1456                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1457         }
1458
1459         paged = !!cork->gso_size;
1460         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1461         orig_mtu = mtu;
1462
1463         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1464             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1465                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1466
1467         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1468
1469         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1470                         (opt ? opt->opt_nflen : 0);
1471
1472         headersize = sizeof(struct ipv6hdr) +
1473                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1474                      (dst_allfrag(&rt->dst) ?
1475                       sizeof(struct frag_hdr) : 0) +
1476                      rt->rt6i_nfheader_len;
1477
1478         if (mtu <= fragheaderlen ||
1479             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1480                 goto emsgsize;
1481
1482         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1483                      sizeof(struct frag_hdr);
1484
1485         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1486          * the first fragment
1487          */
1488         if (headersize + transhdrlen > mtu)
1489                 goto emsgsize;
1490
1491         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1492             (sk->sk_protocol == IPPROTO_UDP ||
1493              sk->sk_protocol == IPPROTO_ICMPV6 ||
1494              sk->sk_protocol == IPPROTO_RAW)) {
1495                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1496                                 sizeof(struct ipv6hdr));
1497                 goto emsgsize;
1498         }
1499
1500         if (ip6_sk_ignore_df(sk))
1501                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1502         else
1503                 maxnonfragsize = mtu;
1504
1505         if (cork->length + length > maxnonfragsize - headersize) {
1506 emsgsize:
1507                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1508                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1509                 return -EMSGSIZE;
1510         }
1511
1512         /* CHECKSUM_PARTIAL only with no extension headers and when
1513          * we are not going to fragment
1514          */
1515         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1516             headersize == sizeof(struct ipv6hdr) &&
1517             length <= mtu - headersize &&
1518             (!(flags & MSG_MORE) || cork->gso_size) &&
1519             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1520                 csummode = CHECKSUM_PARTIAL;
1521
1522         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1523                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1524                 if (!uarg)
1525                         return -ENOBUFS;
1526                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1527                 if (rt->dst.dev->features & NETIF_F_SG &&
1528                     csummode == CHECKSUM_PARTIAL) {
1529                         paged = true;
1530                 } else {
1531                         uarg->zerocopy = 0;
1532                         skb_zcopy_set(skb, uarg, &extra_uref);
1533                 }
1534         }
1535
1536         /*
1537          * Let's try using as much space as possible.
1538          * Use MTU if total length of the message fits into the MTU.
1539          * Otherwise, we need to reserve fragment header and
1540          * fragment alignment (= 8-15 octets, in total).
1541          *
1542          * Note that we may need to "move" the data from the tail
1543          * of the buffer to the new fragment when we split
1544          * the message.
1545          *
1546          * FIXME: It may be fragmented into multiple chunks
1547          *        at once if non-fragmentable extension headers
1548          *        are too large.
1549          * --yoshfuji
1550          */
1551
1552         cork->length += length;
1553         if (!skb)
1554                 goto alloc_new_skb;
1555
1556         while (length > 0) {
1557                 /* Check if the remaining data fits into current packet. */
1558                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1559                 if (copy < length)
1560                         copy = maxfraglen - skb->len;
1561
1562                 if (copy <= 0) {
1563                         char *data;
1564                         unsigned int datalen;
1565                         unsigned int fraglen;
1566                         unsigned int fraggap;
1567                         unsigned int alloclen, alloc_extra;
1568                         unsigned int pagedlen;
1569 alloc_new_skb:
1570                         /* There's no room in the current skb */
1571                         if (skb)
1572                                 fraggap = skb->len - maxfraglen;
1573                         else
1574                                 fraggap = 0;
1575                         /* update mtu and maxfraglen if necessary */
1576                         if (!skb || !skb_prev)
1577                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1578                                                     fragheaderlen, skb, rt,
1579                                                     orig_mtu);
1580
1581                         skb_prev = skb;
1582
1583                         /*
1584                          * If remaining data exceeds the mtu,
1585                          * we know we need more fragment(s).
1586                          */
1587                         datalen = length + fraggap;
1588
1589                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1590                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1591                         fraglen = datalen + fragheaderlen;
1592                         pagedlen = 0;
1593
1594                         alloc_extra = hh_len;
1595                         alloc_extra += dst_exthdrlen;
1596                         alloc_extra += rt->dst.trailer_len;
1597
1598                         /* We just reserve space for fragment header.
1599                          * Note: this may be overallocation if the message
1600                          * (without MSG_MORE) fits into the MTU.
1601                          */
1602                         alloc_extra += sizeof(struct frag_hdr);
1603
1604                         if ((flags & MSG_MORE) &&
1605                             !(rt->dst.dev->features&NETIF_F_SG))
1606                                 alloclen = mtu;
1607                         else if (!paged &&
1608                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1609                                   !(rt->dst.dev->features & NETIF_F_SG)))
1610                                 alloclen = fraglen;
1611                         else {
1612                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1613                                 pagedlen = fraglen - alloclen;
1614                         }
1615                         alloclen += alloc_extra;
1616
1617                         if (datalen != length + fraggap) {
1618                                 /*
1619                                  * this is not the last fragment, the trailer
1620                                  * space is regarded as data space.
1621                                  */
1622                                 datalen += rt->dst.trailer_len;
1623                         }
1624
1625                         fraglen = datalen + fragheaderlen;
1626
1627                         copy = datalen - transhdrlen - fraggap - pagedlen;
1628                         if (copy < 0) {
1629                                 err = -EINVAL;
1630                                 goto error;
1631                         }
1632                         if (transhdrlen) {
1633                                 skb = sock_alloc_send_skb(sk, alloclen,
1634                                                 (flags & MSG_DONTWAIT), &err);
1635                         } else {
1636                                 skb = NULL;
1637                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1638                                     2 * sk->sk_sndbuf)
1639                                         skb = alloc_skb(alloclen,
1640                                                         sk->sk_allocation);
1641                                 if (unlikely(!skb))
1642                                         err = -ENOBUFS;
1643                         }
1644                         if (!skb)
1645                                 goto error;
1646                         /*
1647                          *      Fill in the control structures
1648                          */
1649                         skb->protocol = htons(ETH_P_IPV6);
1650                         skb->ip_summed = csummode;
1651                         skb->csum = 0;
1652                         /* reserve for fragmentation and ipsec header */
1653                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1654                                     dst_exthdrlen);
1655
1656                         /*
1657                          *      Find where to start putting bytes
1658                          */
1659                         data = skb_put(skb, fraglen - pagedlen);
1660                         skb_set_network_header(skb, exthdrlen);
1661                         data += fragheaderlen;
1662                         skb->transport_header = (skb->network_header +
1663                                                  fragheaderlen);
1664                         if (fraggap) {
1665                                 skb->csum = skb_copy_and_csum_bits(
1666                                         skb_prev, maxfraglen,
1667                                         data + transhdrlen, fraggap);
1668                                 skb_prev->csum = csum_sub(skb_prev->csum,
1669                                                           skb->csum);
1670                                 data += fraggap;
1671                                 pskb_trim_unique(skb_prev, maxfraglen);
1672                         }
1673                         if (copy > 0 &&
1674                             getfrag(from, data + transhdrlen, offset,
1675                                     copy, fraggap, skb) < 0) {
1676                                 err = -EFAULT;
1677                                 kfree_skb(skb);
1678                                 goto error;
1679                         }
1680
1681                         offset += copy;
1682                         length -= copy + transhdrlen;
1683                         transhdrlen = 0;
1684                         exthdrlen = 0;
1685                         dst_exthdrlen = 0;
1686
1687                         /* Only the initial fragment is time stamped */
1688                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1689                         cork->tx_flags = 0;
1690                         skb_shinfo(skb)->tskey = tskey;
1691                         tskey = 0;
1692                         skb_zcopy_set(skb, uarg, &extra_uref);
1693
1694                         if ((flags & MSG_CONFIRM) && !skb_prev)
1695                                 skb_set_dst_pending_confirm(skb, 1);
1696
1697                         /*
1698                          * Put the packet on the pending queue
1699                          */
1700                         if (!skb->destructor) {
1701                                 skb->destructor = sock_wfree;
1702                                 skb->sk = sk;
1703                                 wmem_alloc_delta += skb->truesize;
1704                         }
1705                         __skb_queue_tail(queue, skb);
1706                         continue;
1707                 }
1708
1709                 if (copy > length)
1710                         copy = length;
1711
1712                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1713                     skb_tailroom(skb) >= copy) {
1714                         unsigned int off;
1715
1716                         off = skb->len;
1717                         if (getfrag(from, skb_put(skb, copy),
1718                                                 offset, copy, off, skb) < 0) {
1719                                 __skb_trim(skb, off);
1720                                 err = -EFAULT;
1721                                 goto error;
1722                         }
1723                 } else if (!uarg || !uarg->zerocopy) {
1724                         int i = skb_shinfo(skb)->nr_frags;
1725
1726                         err = -ENOMEM;
1727                         if (!sk_page_frag_refill(sk, pfrag))
1728                                 goto error;
1729
1730                         if (!skb_can_coalesce(skb, i, pfrag->page,
1731                                               pfrag->offset)) {
1732                                 err = -EMSGSIZE;
1733                                 if (i == MAX_SKB_FRAGS)
1734                                         goto error;
1735
1736                                 __skb_fill_page_desc(skb, i, pfrag->page,
1737                                                      pfrag->offset, 0);
1738                                 skb_shinfo(skb)->nr_frags = ++i;
1739                                 get_page(pfrag->page);
1740                         }
1741                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1742                         if (getfrag(from,
1743                                     page_address(pfrag->page) + pfrag->offset,
1744                                     offset, copy, skb->len, skb) < 0)
1745                                 goto error_efault;
1746
1747                         pfrag->offset += copy;
1748                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1749                         skb->len += copy;
1750                         skb->data_len += copy;
1751                         skb->truesize += copy;
1752                         wmem_alloc_delta += copy;
1753                 } else {
1754                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1755                         if (err < 0)
1756                                 goto error;
1757                 }
1758                 offset += copy;
1759                 length -= copy;
1760         }
1761
1762         if (wmem_alloc_delta)
1763                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1764         return 0;
1765
1766 error_efault:
1767         err = -EFAULT;
1768 error:
1769         net_zcopy_put_abort(uarg, extra_uref);
1770         cork->length -= length;
1771         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1772         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1773         return err;
1774 }
1775
1776 int ip6_append_data(struct sock *sk,
1777                     int getfrag(void *from, char *to, int offset, int len,
1778                                 int odd, struct sk_buff *skb),
1779                     void *from, int length, int transhdrlen,
1780                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1781                     struct rt6_info *rt, unsigned int flags)
1782 {
1783         struct inet_sock *inet = inet_sk(sk);
1784         struct ipv6_pinfo *np = inet6_sk(sk);
1785         int exthdrlen;
1786         int err;
1787
1788         if (flags&MSG_PROBE)
1789                 return 0;
1790         if (skb_queue_empty(&sk->sk_write_queue)) {
1791                 /*
1792                  * setup for corking
1793                  */
1794                 dst_hold(&rt->dst);
1795                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1796                                      ipc6, rt);
1797                 if (err)
1798                         return err;
1799
1800                 inet->cork.fl.u.ip6 = *fl6;
1801                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1802                 length += exthdrlen;
1803                 transhdrlen += exthdrlen;
1804         } else {
1805                 transhdrlen = 0;
1806         }
1807
1808         return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1809                                  &np->cork, sk_page_frag(sk), getfrag,
1810                                  from, length, transhdrlen, flags, ipc6);
1811 }
1812 EXPORT_SYMBOL_GPL(ip6_append_data);
1813
1814 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1815 {
1816         struct dst_entry *dst = cork->base.dst;
1817
1818         cork->base.dst = NULL;
1819         cork->base.flags &= ~IPCORK_ALLFRAG;
1820         skb_dst_set(skb, dst);
1821 }
1822
1823 static void ip6_cork_release(struct inet_cork_full *cork,
1824                              struct inet6_cork *v6_cork)
1825 {
1826         if (v6_cork->opt) {
1827                 struct ipv6_txoptions *opt = v6_cork->opt;
1828
1829                 kfree(opt->dst0opt);
1830                 kfree(opt->dst1opt);
1831                 kfree(opt->hopopt);
1832                 kfree(opt->srcrt);
1833                 kfree(opt);
1834                 v6_cork->opt = NULL;
1835         }
1836
1837         if (cork->base.dst) {
1838                 dst_release(cork->base.dst);
1839                 cork->base.dst = NULL;
1840                 cork->base.flags &= ~IPCORK_ALLFRAG;
1841         }
1842 }
1843
1844 struct sk_buff *__ip6_make_skb(struct sock *sk,
1845                                struct sk_buff_head *queue,
1846                                struct inet_cork_full *cork,
1847                                struct inet6_cork *v6_cork)
1848 {
1849         struct sk_buff *skb, *tmp_skb;
1850         struct sk_buff **tail_skb;
1851         struct in6_addr *final_dst;
1852         struct ipv6_pinfo *np = inet6_sk(sk);
1853         struct net *net = sock_net(sk);
1854         struct ipv6hdr *hdr;
1855         struct ipv6_txoptions *opt = v6_cork->opt;
1856         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1857         struct flowi6 *fl6 = &cork->fl.u.ip6;
1858         unsigned char proto = fl6->flowi6_proto;
1859
1860         skb = __skb_dequeue(queue);
1861         if (!skb)
1862                 goto out;
1863         tail_skb = &(skb_shinfo(skb)->frag_list);
1864
1865         /* move skb->data to ip header from ext header */
1866         if (skb->data < skb_network_header(skb))
1867                 __skb_pull(skb, skb_network_offset(skb));
1868         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1869                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1870                 *tail_skb = tmp_skb;
1871                 tail_skb = &(tmp_skb->next);
1872                 skb->len += tmp_skb->len;
1873                 skb->data_len += tmp_skb->len;
1874                 skb->truesize += tmp_skb->truesize;
1875                 tmp_skb->destructor = NULL;
1876                 tmp_skb->sk = NULL;
1877         }
1878
1879         /* Allow local fragmentation. */
1880         skb->ignore_df = ip6_sk_ignore_df(sk);
1881         __skb_pull(skb, skb_network_header_len(skb));
1882
1883         final_dst = &fl6->daddr;
1884         if (opt && opt->opt_flen)
1885                 ipv6_push_frag_opts(skb, opt, &proto);
1886         if (opt && opt->opt_nflen)
1887                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1888
1889         skb_push(skb, sizeof(struct ipv6hdr));
1890         skb_reset_network_header(skb);
1891         hdr = ipv6_hdr(skb);
1892
1893         ip6_flow_hdr(hdr, v6_cork->tclass,
1894                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1895                                         ip6_autoflowlabel(net, np), fl6));
1896         hdr->hop_limit = v6_cork->hop_limit;
1897         hdr->nexthdr = proto;
1898         hdr->saddr = fl6->saddr;
1899         hdr->daddr = *final_dst;
1900
1901         skb->priority = sk->sk_priority;
1902         skb->mark = cork->base.mark;
1903         skb->tstamp = cork->base.transmit_time;
1904
1905         ip6_cork_steal_dst(skb, cork);
1906         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1907         if (proto == IPPROTO_ICMPV6) {
1908                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1909
1910                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1911                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1912         }
1913
1914         ip6_cork_release(cork, v6_cork);
1915 out:
1916         return skb;
1917 }
1918
1919 int ip6_send_skb(struct sk_buff *skb)
1920 {
1921         struct net *net = sock_net(skb->sk);
1922         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1923         int err;
1924
1925         err = ip6_local_out(net, skb->sk, skb);
1926         if (err) {
1927                 if (err > 0)
1928                         err = net_xmit_errno(err);
1929                 if (err)
1930                         IP6_INC_STATS(net, rt->rt6i_idev,
1931                                       IPSTATS_MIB_OUTDISCARDS);
1932         }
1933
1934         return err;
1935 }
1936
/*
 *      Build one packet out of the socket's pending data with
 *      ip6_finish_skb() and send it with ip6_send_skb().
 *
 *      Returns 0 when there was nothing to send, otherwise the result
 *      of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pending = ip6_finish_skb(sk);

        if (pending)
                return ip6_send_skb(pending);

        return 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1948
1949 static void __ip6_flush_pending_frames(struct sock *sk,
1950                                        struct sk_buff_head *queue,
1951                                        struct inet_cork_full *cork,
1952                                        struct inet6_cork *v6_cork)
1953 {
1954         struct sk_buff *skb;
1955
1956         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1957                 if (skb_dst(skb))
1958                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1959                                       IPSTATS_MIB_OUTDISCARDS);
1960                 kfree_skb(skb);
1961         }
1962
1963         ip6_cork_release(cork, v6_cork);
1964 }
1965
1966 void ip6_flush_pending_frames(struct sock *sk)
1967 {
1968         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1969                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1970 }
1971 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1972
1973 struct sk_buff *ip6_make_skb(struct sock *sk,
1974                              int getfrag(void *from, char *to, int offset,
1975                                          int len, int odd, struct sk_buff *skb),
1976                              void *from, int length, int transhdrlen,
1977                              struct ipcm6_cookie *ipc6, struct rt6_info *rt,
1978                              unsigned int flags, struct inet_cork_full *cork)
1979 {
1980         struct inet6_cork v6_cork;
1981         struct sk_buff_head queue;
1982         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1983         int err;
1984
1985         if (flags & MSG_PROBE) {
1986                 dst_release(&rt->dst);
1987                 return NULL;
1988         }
1989
1990         __skb_queue_head_init(&queue);
1991
1992         cork->base.flags = 0;
1993         cork->base.addr = 0;
1994         cork->base.opt = NULL;
1995         v6_cork.opt = NULL;
1996         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
1997         if (err) {
1998                 ip6_cork_release(cork, &v6_cork);
1999                 return ERR_PTR(err);
2000         }
2001         if (ipc6->dontfrag < 0)
2002                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2003
2004         err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2005                                 &current->task_frag, getfrag, from,
2006                                 length + exthdrlen, transhdrlen + exthdrlen,
2007                                 flags, ipc6);
2008         if (err) {
2009                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2010                 return ERR_PTR(err);
2011         }
2012
2013         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2014 }