net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58 #include <net/l3mdev.h>
  59 #include <net/lwtunnel.h>
  60
  61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  62 {
  63         struct dst_entry *dst = skb_dst(skb);
  64         struct net_device *dev = dst->dev;
  65         struct neighbour *neigh;
  66         struct in6_addr *nexthop;
  67         int ret;
  68
  69         skb->protocol = htons(ETH_P_IPV6);
  70         skb->dev = dev;
  71
  72         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  73                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  74
  75                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  76                     ((mroute6_socket(net, skb) &&
  77                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  78                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  79                                          &ipv6_hdr(skb)->saddr))) {
  80                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  81
  82                         /* Do not check for IFF_ALLMULTI; multicast routing
  83                            is not supported in any case.
  84                          */
  85                         if (newskb)
  86                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  87                                         net, sk, newskb, NULL, newskb->dev,
  88                                         dev_loopback_xmit);
  89
  90                         if (ipv6_hdr(skb)->hop_limit == 0) {
  91                                 IP6_INC_STATS(net, idev,
  92                                               IPSTATS_MIB_OUTDISCARDS);
  93                                 kfree_skb(skb);
  94                                 return 0;
  95                         }
  96                 }
  97
  98                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  99
 100                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 101                     IPV6_ADDR_SCOPE_NODELOCAL &&
 102                     !(dev->flags & IFF_LOOPBACK)) {
 103                         kfree_skb(skb);
 104                         return 0;
 105                 }
 106         }
 107
 108         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 109                 int res = lwtunnel_xmit(skb);
 110
 111                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 112                         return res;
 113         }
 114
 115         rcu_read_lock_bh();
 116         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 117         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 118         if (unlikely(!neigh))
 119                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 120         if (!IS_ERR(neigh)) {
 121                 ret = dst_neigh_output(dst, neigh, skb);
 122                 rcu_read_unlock_bh();
 123                 return ret;
 124         }
 125         rcu_read_unlock_bh();
 126
 127         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 128         kfree_skb(skb);
 129         return -EINVAL;
 130 }
 131
 132 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 133 {
 134         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 135             dst_allfrag(skb_dst(skb)) ||
 136             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 137                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 138         else
 139                 return ip6_finish_output2(net, sk, skb);
 140 }
 141
 142 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 143 {
 144         struct net_device *dev = skb_dst(skb)->dev;
 145         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 146
 147         if (unlikely(idev->cnf.disable_ipv6)) {
 148                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 149                 kfree_skb(skb);
 150                 return 0;
 151         }
 152
 153         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 154                             net, sk, skb, NULL, dev,
 155                             ip6_finish_output,
 156                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 157 }
 158
 159 /*
 160  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 161  * Note : socket lock is not held for SYNACK packets, but might be modified
 162  * by calls to skb_set_owner_w() and ipv6_local_error(),
 163  * which are using proper atomic operations or spinlocks.
 164  */
 165 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 166              struct ipv6_txoptions *opt, int tclass)
 167 {
 168         struct net *net = sock_net(sk);
 169         const struct ipv6_pinfo *np = inet6_sk(sk);
 170         struct in6_addr *first_hop = &fl6->daddr;
 171         struct dst_entry *dst = skb_dst(skb);
 172         struct ipv6hdr *hdr;
 173         u8  proto = fl6->flowi6_proto;
 174         int seg_len = skb->len;
 175         int hlimit = -1;
 176         u32 mtu;
 177
 178         if (opt) {
 179                 unsigned int head_room;
 180
 181                 /* First: exthdrs may take lots of space (~8K for now)
 182                    MAX_HEADER is not enough.
 183                  */
 184                 head_room = opt->opt_nflen + opt->opt_flen;
 185                 seg_len += head_room;
 186                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 187
 188                 if (skb_headroom(skb) < head_room) {
 189                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 190                         if (!skb2) {
 191                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 192                                               IPSTATS_MIB_OUTDISCARDS);
 193                                 kfree_skb(skb);
 194                                 return -ENOBUFS;
 195                         }
 196                         consume_skb(skb);
 197                         skb = skb2;
 198                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 199                          * it is safe to call in our context (socket lock not held)
 200                          */
 201                         skb_set_owner_w(skb, (struct sock *)sk);
 202                 }
 203                 if (opt->opt_flen)
 204                         ipv6_push_frag_opts(skb, opt, &proto);
 205                 if (opt->opt_nflen)
 206                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 207                                              &fl6->saddr);
 208         }
 209
 210         skb_push(skb, sizeof(struct ipv6hdr));
 211         skb_reset_network_header(skb);
 212         hdr = ipv6_hdr(skb);
 213
 214         /*
 215          *      Fill in the IPv6 header
 216          */
 217         if (np)
 218                 hlimit = np->hop_limit;
 219         if (hlimit < 0)
 220                 hlimit = ip6_dst_hoplimit(dst);
 221
 222         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 223                                                      np->autoflowlabel, fl6));
 224
 225         hdr->payload_len = htons(seg_len);
 226         hdr->nexthdr = proto;
 227         hdr->hop_limit = hlimit;
 228
 229         hdr->saddr = fl6->saddr;
 230         hdr->daddr = *first_hop;
 231
 232         skb->protocol = htons(ETH_P_IPV6);
 233         skb->priority = sk->sk_priority;
 234         skb->mark = sk->sk_mark;
 235
 236         mtu = dst_mtu(dst);
 237         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 238                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 239                               IPSTATS_MIB_OUT, skb->len);
 240
 241                 /* if egress device is enslaved to an L3 master device pass the
 242                  * skb to its handler for processing
 243                  */
 244                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 245                 if (unlikely(!skb))
 246                         return 0;
 247
 248                 /* hooks should never assume socket lock is held.
 249                  * we promote our socket to non const
 250                  */
 251                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 252                                net, (struct sock *)sk, skb, NULL, dst->dev,
 253                                dst_output);
 254         }
 255
 256         skb->dev = dst->dev;
 257         /* ipv6_local_error() does not require socket lock,
 258          * we promote our socket to non const
 259          */
 260         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 261
 262         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 263         kfree_skb(skb);
 264         return -EMSGSIZE;
 265 }
 266 EXPORT_SYMBOL(ip6_xmit);
 267
 268 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 269 {
 270         struct ip6_ra_chain *ra;
 271         struct sock *last = NULL;
 272
 273         read_lock(&ip6_ra_lock);
 274         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 275                 struct sock *sk = ra->sk;
 276                 if (sk && ra->sel == sel &&
 277                     (!sk->sk_bound_dev_if ||
 278                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 279                         if (last) {
 280                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 281                                 if (skb2)
 282                                         rawv6_rcv(last, skb2);
 283                         }
 284                         last = sk;
 285                 }
 286         }
 287
 288         if (last) {
 289                 rawv6_rcv(last, skb);
 290                 read_unlock(&ip6_ra_lock);
 291                 return 1;
 292         }
 293         read_unlock(&ip6_ra_lock);
 294         return 0;
 295 }
 296
 297 static int ip6_forward_proxy_check(struct sk_buff *skb)
 298 {
 299         struct ipv6hdr *hdr = ipv6_hdr(skb);
 300         u8 nexthdr = hdr->nexthdr;
 301         __be16 frag_off;
 302         int offset;
 303
 304         if (ipv6_ext_hdr(nexthdr)) {
 305                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 306                 if (offset < 0)
 307                         return 0;
 308         } else
 309                 offset = sizeof(struct ipv6hdr);
 310
 311         if (nexthdr == IPPROTO_ICMPV6) {
 312                 struct icmp6hdr *icmp6;
 313
 314                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 315                                          offset + 1 - skb->data)))
 316                         return 0;
 317
 318                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 319
 320                 switch (icmp6->icmp6_type) {
 321                 case NDISC_ROUTER_SOLICITATION:
 322                 case NDISC_ROUTER_ADVERTISEMENT:
 323                 case NDISC_NEIGHBOUR_SOLICITATION:
 324                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 325                 case NDISC_REDIRECT:
 326                         /* For reaction involving unicast neighbor discovery
 327                          * message destined to the proxied address, pass it to
 328                          * input function.
 329                          */
 330                         return 1;
 331                 default:
 332                         break;
 333                 }
 334         }
 335
 336         /*
 337          * The proxying router can't forward traffic sent to a link-local
 338          * address, so signal the sender and discard the packet. This
 339          * behavior is clarified by the MIPv6 specification.
 340          */
 341         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 342                 dst_link_failure(skb);
 343                 return -1;
 344         }
 345
 346         return 0;
 347 }
 348
 349 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 350                                      struct sk_buff *skb)
 351 {
 352         return dst_output(net, sk, skb);
 353 }
 354
 355 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 356 {
 357         unsigned int mtu;
 358         struct inet6_dev *idev;
 359
 360         if (dst_metric_locked(dst, RTAX_MTU)) {
 361                 mtu = dst_metric_raw(dst, RTAX_MTU);
 362                 if (mtu)
 363                         return mtu;
 364         }
 365
 366         mtu = IPV6_MIN_MTU;
 367         rcu_read_lock();
 368         idev = __in6_dev_get(dst->dev);
 369         if (idev)
 370                 mtu = idev->cnf.mtu6;
 371         rcu_read_unlock();
 372
 373         return mtu;
 374 }
 375
 376 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 377 {
 378         if (skb->len <= mtu)
 379                 return false;
 380
 381         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 382         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 383                 return true;
 384
 385         if (skb->ignore_df)
 386                 return false;
 387
 388         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
 389                 return false;
 390
 391         return true;
 392 }
 393
 394 int ip6_forward(struct sk_buff *skb)
 395 {
 396         struct dst_entry *dst = skb_dst(skb);
 397         struct ipv6hdr *hdr = ipv6_hdr(skb);
 398         struct inet6_skb_parm *opt = IP6CB(skb);
 399         struct net *net = dev_net(dst->dev);
 400         u32 mtu;
 401
 402         if (net->ipv6.devconf_all->forwarding == 0)
 403                 goto error;
 404
 405         if (skb->pkt_type != PACKET_HOST)
 406                 goto drop;
 407
 408         if (unlikely(skb->sk))
 409                 goto drop;
 410
 411         if (skb_warn_if_lro(skb))
 412                 goto drop;
 413
 414         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 415                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 416                                 IPSTATS_MIB_INDISCARDS);
 417                 goto drop;
 418         }
 419
 420         skb_forward_csum(skb);
 421
 422         /*
 423          *      We DO NOT make any processing on
 424          *      RA packets, pushing them to user level AS IS
 425          *      without ane WARRANTY that application will be able
 426          *      to interpret them. The reason is that we
 427          *      cannot make anything clever here.
 428          *
 429          *      We are not end-node, so that if packet contains
 430          *      AH/ESP, we cannot make anything.
 431          *      Defragmentation also would be mistake, RA packets
 432          *      cannot be fragmented, because there is no warranty
 433          *      that different fragments will go along one path. --ANK
 434          */
 435         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 436                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 437                         return 0;
 438         }
 439
 440         /*
 441          *      check and decrement ttl
 442          */
 443         if (hdr->hop_limit <= 1) {
 444                 /* Force OUTPUT device used as source address */
 445                 skb->dev = dst->dev;
 446                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 447                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 448                                 IPSTATS_MIB_INHDRERRORS);
 449
 450                 kfree_skb(skb);
 451                 return -ETIMEDOUT;
 452         }
 453
 454         /* XXX: idev->cnf.proxy_ndp? */
 455         if (net->ipv6.devconf_all->proxy_ndp &&
 456             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 457                 int proxied = ip6_forward_proxy_check(skb);
 458                 if (proxied > 0)
 459                         return ip6_input(skb);
 460                 else if (proxied < 0) {
 461                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
 462                                         IPSTATS_MIB_INDISCARDS);
 463                         goto drop;
 464                 }
 465         }
 466
 467         if (!xfrm6_route_forward(skb)) {
 468                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 469                                 IPSTATS_MIB_INDISCARDS);
 470                 goto drop;
 471         }
 472         dst = skb_dst(skb);
 473
 474         /* IPv6 specs say nothing about it, but it is clear that we cannot
 475            send redirects to source routed frames.
 476            We don't send redirects to frames decapsulated from IPsec.
 477          */
 478         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 479                 struct in6_addr *target = NULL;
 480                 struct inet_peer *peer;
 481                 struct rt6_info *rt;
 482
 483                 /*
 484                  *      incoming and outgoing devices are the same
 485                  *      send a redirect.
 486                  */
 487
 488                 rt = (struct rt6_info *) dst;
 489                 if (rt->rt6i_flags & RTF_GATEWAY)
 490                         target = &rt->rt6i_gateway;
 491                 else
 492                         target = &hdr->daddr;
 493
 494                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 495
 496                 /* Limit redirects both by destination (here)
 497                    and by source (inside ndisc_send_redirect)
 498                  */
 499                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 500                         ndisc_send_redirect(skb, target);
 501                 if (peer)
 502                         inet_putpeer(peer);
 503         } else {
 504                 int addrtype = ipv6_addr_type(&hdr->saddr);
 505
 506                 /* This check is security critical. */
 507                 if (addrtype == IPV6_ADDR_ANY ||
 508                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 509                         goto error;
 510                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 511                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 512                                     ICMPV6_NOT_NEIGHBOUR, 0);
 513                         goto error;
 514                 }
 515         }
 516
 517         mtu = ip6_dst_mtu_forward(dst);
 518         if (mtu < IPV6_MIN_MTU)
 519                 mtu = IPV6_MIN_MTU;
 520
 521         if (ip6_pkt_too_big(skb, mtu)) {
 522                 /* Again, force OUTPUT device used as source address */
 523                 skb->dev = dst->dev;
 524                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 525                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 526                                 IPSTATS_MIB_INTOOBIGERRORS);
 527                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 528                                 IPSTATS_MIB_FRAGFAILS);
 529                 kfree_skb(skb);
 530                 return -EMSGSIZE;
 531         }
 532
 533         if (skb_cow(skb, dst->dev->hard_header_len)) {
 534                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 535                                 IPSTATS_MIB_OUTDISCARDS);
 536                 goto drop;
 537         }
 538
 539         hdr = ipv6_hdr(skb);
 540
 541         /* Mangling hops number delayed to point after skb COW */
 542
 543         hdr->hop_limit--;
 544
 545         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 546         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 547         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 548                        net, NULL, skb, skb->dev, dst->dev,
 549                        ip6_forward_finish);
 550
 551 error:
 552         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 553 drop:
 554         kfree_skb(skb);
 555         return -EINVAL;
 556 }
 557
 558 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 559 {
 560         to->pkt_type = from->pkt_type;
 561         to->priority = from->priority;
 562         to->protocol = from->protocol;
 563         skb_dst_drop(to);
 564         skb_dst_set(to, dst_clone(skb_dst(from)));
 565         to->dev = from->dev;
 566         to->mark = from->mark;
 567
 568 #ifdef CONFIG_NET_SCHED
 569         to->tc_index = from->tc_index;
 570 #endif
 571         nf_copy(to, from);
 572         skb_copy_secmark(to, from);
 573 }
 574
 575 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 576                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 577 {
 578         struct sk_buff *frag;
 579         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 580         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 581                                 inet6_sk(skb->sk) : NULL;
 582         struct ipv6hdr *tmp_hdr;
 583         struct frag_hdr *fh;
 584         unsigned int mtu, hlen, left, len;
 585         int hroom, troom;
 586         __be32 frag_id;
 587         int ptr, offset = 0, err = 0;
 588         u8 *prevhdr, nexthdr = 0;
 589
 590         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 591         nexthdr = *prevhdr;
 592
 593         mtu = ip6_skb_dst_mtu(skb);
 594
 595         /* We must not fragment if the socket is set to force MTU discovery
 596          * or if the skb is not generated by a local socket.
 597          */
 598         if (unlikely(!skb->ignore_df && skb->len > mtu))
 599                 goto fail_toobig;
 600
 601         if (IP6CB(skb)->frag_max_size) {
 602                 if (IP6CB(skb)->frag_max_size > mtu)
 603                         goto fail_toobig;
 604
 605                 /* don't send fragments larger than what we received */
 606                 mtu = IP6CB(skb)->frag_max_size;
 607                 if (mtu < IPV6_MIN_MTU)
 608                         mtu = IPV6_MIN_MTU;
 609         }
 610
 611         if (np && np->frag_size < mtu) {
 612                 if (np->frag_size)
 613                         mtu = np->frag_size;
 614         }
 615         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 616                 goto fail_toobig;
 617         mtu -= hlen + sizeof(struct frag_hdr);
 618
 619         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 620                                     &ipv6_hdr(skb)->saddr);
 621
 622         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 623             (err = skb_checksum_help(skb)))
 624                 goto fail;
 625
 626         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 627         if (skb_has_frag_list(skb)) {
 628                 int first_len = skb_pagelen(skb);
 629                 struct sk_buff *frag2;
 630
 631                 if (first_len - hlen > mtu ||
 632                     ((first_len - hlen) & 7) ||
 633                     skb_cloned(skb) ||
 634                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 635                         goto slow_path;
 636
 637                 skb_walk_frags(skb, frag) {
 638                         /* Correct geometry. */
 639                         if (frag->len > mtu ||
 640                             ((frag->len & 7) && frag->next) ||
 641                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 642                                 goto slow_path_clean;
 643
 644                         /* Partially cloned skb? */
 645                         if (skb_shared(frag))
 646                                 goto slow_path_clean;
 647
 648                         BUG_ON(frag->sk);
 649                         if (skb->sk) {
 650                                 frag->sk = skb->sk;
 651                                 frag->destructor = sock_wfree;
 652                         }
 653                         skb->truesize -= frag->truesize;
 654                 }
 655
 656                 err = 0;
 657                 offset = 0;
 658                 /* BUILD HEADER */
 659
 660                 *prevhdr = NEXTHDR_FRAGMENT;
 661                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 662                 if (!tmp_hdr) {
 663                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 664                                       IPSTATS_MIB_FRAGFAILS);
 665                         err = -ENOMEM;
 666                         goto fail;
 667                 }
 668                 frag = skb_shinfo(skb)->frag_list;
 669                 skb_frag_list_init(skb);
 670
 671                 __skb_pull(skb, hlen);
 672                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
 673                 __skb_push(skb, hlen);
 674                 skb_reset_network_header(skb);
 675                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 676
 677                 fh->nexthdr = nexthdr;
 678                 fh->reserved = 0;
 679                 fh->frag_off = htons(IP6_MF);
 680                 fh->identification = frag_id;
 681
 682                 first_len = skb_pagelen(skb);
 683                 skb->data_len = first_len - skb_headlen(skb);
 684                 skb->len = first_len;
 685                 ipv6_hdr(skb)->payload_len = htons(first_len -
 686                                                    sizeof(struct ipv6hdr));
 687
 688                 dst_hold(&rt->dst);
 689
 690                 for (;;) {
 691                         /* Prepare header of the next frame,
 692                          * before previous one went down. */
 693                         if (frag) {
 694                                 frag->ip_summed = CHECKSUM_NONE;
 695                                 skb_reset_transport_header(frag);
 696                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
 697                                 __skb_push(frag, hlen);
 698                                 skb_reset_network_header(frag);
 699                                 memcpy(skb_network_header(frag), tmp_hdr,
 700                                        hlen);
 701                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 702                                 fh->nexthdr = nexthdr;
 703                                 fh->reserved = 0;
 704                                 fh->frag_off = htons(offset);
 705                                 if (frag->next)
 706                                         fh->frag_off |= htons(IP6_MF);
 707                                 fh->identification = frag_id;
 708                                 ipv6_hdr(frag)->payload_len =
 709                                                 htons(frag->len -
 710                                                       sizeof(struct ipv6hdr));
 711                                 ip6_copy_metadata(frag, skb);
 712                         }
 713
 714                         err = output(net, sk, skb);
 715                         if (!err)
 716                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 717                                               IPSTATS_MIB_FRAGCREATES);
 718
 719                         if (err || !frag)
 720                                 break;
 721
 722                         skb = frag;
 723                         frag = skb->next;
 724                         skb->next = NULL;
 725                 }
 726
 727                 kfree(tmp_hdr);
 728
 729                 if (err == 0) {
 730                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 731                                       IPSTATS_MIB_FRAGOKS);
 732                         ip6_rt_put(rt);
 733                         return 0;
 734                 }
 735
 736                 kfree_skb_list(frag);
 737
 738                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 739                               IPSTATS_MIB_FRAGFAILS);
 740                 ip6_rt_put(rt);
 741                 return err;
 742
 743 slow_path_clean:
 744                 skb_walk_frags(skb, frag2) {
 745                         if (frag2 == frag)
 746                                 break;
 747                         frag2->sk = NULL;
 748                         frag2->destructor = NULL;
 749                         skb->truesize += frag2->truesize;
 750                 }
 751         }
 752
 753 slow_path:
 754         left = skb->len - hlen;         /* Space per frame */
 755         ptr = hlen;                     /* Where to start from */
 756
 757         /*
 758          *      Fragment the datagram.
 759          */
 760
 761         *prevhdr = NEXTHDR_FRAGMENT;
 762         troom = rt->dst.dev->needed_tailroom;
 763
 764         /*
 765          *      Keep copying data until we run out.
 766          */
 767         while (left > 0)        {
 768                 len = left;
 769                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 770                 if (len > mtu)
 771                         len = mtu;
 772                 /* IF: we are not sending up to and including the packet end
 773                    then align the next start on an eight byte boundary */
 774                 if (len < left) {
 775                         len &= ~7;
 776                 }
 777
 778                 /* Allocate buffer */
 779                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 780                                  hroom + troom, GFP_ATOMIC);
 781                 if (!frag) {
 782                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 783                                       IPSTATS_MIB_FRAGFAILS);
 784                         err = -ENOMEM;
 785                         goto fail;
 786                 }
 787
 788                 /*
 789                  *      Set up data on packet
 790                  */
 791
 792                 ip6_copy_metadata(frag, skb);
 793                 skb_reserve(frag, hroom);
 794                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 795                 skb_reset_network_header(frag);
 796                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 797                 frag->transport_header = (frag->network_header + hlen +
 798                                           sizeof(struct frag_hdr));
 799
 800                 /*
 801                  *      Charge the memory for the fragment to any owner
 802                  *      it might possess
 803                  */
 804                 if (skb->sk)
 805                         skb_set_owner_w(frag, skb->sk);
 806
 807                 /*
 808                  *      Copy the packet header into the new buffer.
 809                  */
 810                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 811
 812                 /*
 813                  *      Build fragment header.
 814                  */
 815                 fh->nexthdr = nexthdr;
 816                 fh->reserved = 0;
 817                 fh->identification = frag_id;
 818
 819                 /*
 820                  *      Copy a block of the IP datagram.
 821                  */
 822                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 823                                      len));
 824                 left -= len;
 825
 826                 fh->frag_off = htons(offset);
 827                 if (left > 0)
 828                         fh->frag_off |= htons(IP6_MF);
 829                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 830                                                     sizeof(struct ipv6hdr));
 831
 832                 ptr += len;
 833                 offset += len;
 834
 835                 /*
 836                  *      Put this fragment into the sending queue.
 837                  */
 838                 err = output(net, sk, frag);
 839                 if (err)
 840                         goto fail;
 841
 842                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 843                               IPSTATS_MIB_FRAGCREATES);
 844         }
 845         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 846                       IPSTATS_MIB_FRAGOKS);
 847         consume_skb(skb);
 848         return err;
 849
 850 fail_toobig:
 851         if (skb->sk && dst_allfrag(skb_dst(skb)))
 852                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 853
 854         skb->dev = skb_dst(skb)->dev;
 855         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 856         err = -EMSGSIZE;
 857
 858 fail:
 859         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 860                       IPSTATS_MIB_FRAGFAILS);
 861         kfree_skb(skb);
 862         return err;
 863 }
 864
 865 static inline int ip6_rt_check(const struct rt6key *rt_key,
 866                                const struct in6_addr *fl_addr,
 867                                const struct in6_addr *addr_cache)
 868 {
 869         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 870                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 871 }
 872
 873 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 874                                           struct dst_entry *dst,
 875                                           const struct flowi6 *fl6)
 876 {
 877         struct ipv6_pinfo *np = inet6_sk(sk);
 878         struct rt6_info *rt;
 879
 880         if (!dst)
 881                 goto out;
 882
 883         if (dst->ops->family != AF_INET6) {
 884                 dst_release(dst);
 885                 return NULL;
 886         }
 887
 888         rt = (struct rt6_info *)dst;
 889         /* Yes, checking route validity in not connected
 890          * case is not very simple. Take into account,
 891          * that we do not support routing by source, TOS,
 892          * and MSG_DONTROUTE            --ANK (980726)
 893          *
 894          * 1. ip6_rt_check(): If route was host route,
 895          *    check that cached destination is current.
 896          *    If it is network route, we still may
 897          *    check its validity using saved pointer
 898          *    to the last used address: daddr_cache.
 899          *    We do not want to save whole address now,
 900          *    (because main consumer of this service
 901          *    is tcp, which has not this problem),
 902          *    so that the last trick works only on connected
 903          *    sockets.
 904          * 2. oif also should be the same.
 905          */
 906         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 907 #ifdef CONFIG_IPV6_SUBTREES
 908             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 909 #endif
 910            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 911               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 912                 dst_release(dst);
 913                 dst = NULL;
 914         }
 915
 916 out:
 917         return dst;
 918 }
 919
 920 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 921                                struct dst_entry **dst, struct flowi6 *fl6)
 922 {
 923 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 924         struct neighbour *n;
 925         struct rt6_info *rt;
 926 #endif
 927         int err;
 928         int flags = 0;
 929
 930         /* The correct way to handle this would be to do
 931          * ip6_route_get_saddr, and then ip6_route_output; however,
 932          * the route-specific preferred source forces the
 933          * ip6_route_output call _before_ ip6_route_get_saddr.
 934          *
 935          * In source specific routing (no src=any default route),
 936          * ip6_route_output will fail given src=any saddr, though, so
 937          * that's why we try it again later.
 938          */
 939         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 940                 struct rt6_info *rt;
 941                 bool had_dst = *dst != NULL;
 942
 943                 if (!had_dst)
 944                         *dst = ip6_route_output(net, sk, fl6);
 945                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 946                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 947                                           sk ? inet6_sk(sk)->srcprefs : 0,
 948                                           &fl6->saddr);
 949                 if (err)
 950                         goto out_err_release;
 951
 952                 /* If we had an erroneous initial result, pretend it
 953                  * never existed and let the SA-enabled version take
 954                  * over.
 955                  */
 956                 if (!had_dst && (*dst)->error) {
 957                         dst_release(*dst);
 958                         *dst = NULL;
 959                 }
 960
 961                 if (fl6->flowi6_oif)
 962                         flags |= RT6_LOOKUP_F_IFACE;
 963         }
 964
 965         if (!*dst)
 966                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
 967
 968         err = (*dst)->error;
 969         if (err)
 970                 goto out_err_release;
 971
 972 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 973         /*
 974          * Here if the dst entry we've looked up
 975          * has a neighbour entry that is in the INCOMPLETE
 976          * state and the src address from the flow is
 977          * marked as OPTIMISTIC, we release the found
 978          * dst entry and replace it instead with the
 979          * dst entry of the nexthop router
 980          */
 981         rt = (struct rt6_info *) *dst;
 982         rcu_read_lock_bh();
 983         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 984                                       rt6_nexthop(rt, &fl6->daddr));
 985         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 986         rcu_read_unlock_bh();
 987
 988         if (err) {
 989                 struct inet6_ifaddr *ifp;
 990                 struct flowi6 fl_gw6;
 991                 int redirect;
 992
 993                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 994                                       (*dst)->dev, 1);
 995
 996                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 997                 if (ifp)
 998                         in6_ifa_put(ifp);
 999
1000                 if (redirect) {
1001                         /*
1002                          * We need to get the dst entry for the
1003                          * default router instead
1004                          */
1005                         dst_release(*dst);
1006                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1007                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1008                         *dst = ip6_route_output(net, sk, &fl_gw6);
1009                         err = (*dst)->error;
1010                         if (err)
1011                                 goto out_err_release;
1012                 }
1013         }
1014 #endif
1015
1016         return 0;
1017
1018 out_err_release:
1019         dst_release(*dst);
1020         *dst = NULL;
1021
1022         if (err == -ENETUNREACH)
1023                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1024         return err;
1025 }
1026
1027 /**
1028  *      ip6_dst_lookup - perform route lookup on flow
1029  *      @sk: socket which provides route info
1030  *      @dst: pointer to dst_entry * for result
1031  *      @fl6: flow to lookup
1032  *
1033  *      This function performs a route lookup on the given flow.
1034  *
1035  *      It returns zero on success, or a standard errno code on error.
1036  */
1037 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1038                    struct flowi6 *fl6)
1039 {
1040         *dst = NULL;
1041         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1042 }
1043 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1044
1045 /**
1046  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1047  *      @sk: socket which provides route info
1048  *      @fl6: flow to lookup
1049  *      @final_dst: final destination address for ipsec lookup
1050  *
1051  *      This function performs a route lookup on the given flow.
1052  *
1053  *      It returns a valid dst pointer on success, or a pointer encoded
1054  *      error code.
1055  */
1056 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1057                                       const struct in6_addr *final_dst)
1058 {
1059         struct dst_entry *dst = NULL;
1060         int err;
1061
1062         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1063         if (err)
1064                 return ERR_PTR(err);
1065         if (final_dst)
1066                 fl6->daddr = *final_dst;
1067
1068         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1069 }
1070 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1071
1072 /**
1073  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1074  *      @sk: socket which provides the dst cache and route info
1075  *      @fl6: flow to lookup
1076  *      @final_dst: final destination address for ipsec lookup
1077  *
1078  *      This function performs a route lookup on the given flow with the
1079  *      possibility of using the cached route in the socket if it is valid.
1080  *      It will take the socket dst lock when operating on the dst cache.
1081  *      As a result, this function can only be used in process context.
1082  *
1083  *      It returns a valid dst pointer on success, or a pointer encoded
1084  *      error code.
1085  */
1086 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1087                                          const struct in6_addr *final_dst)
1088 {
1089         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1090
1091         dst = ip6_sk_dst_check(sk, dst, fl6);
1092         if (!dst)
1093                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1094
1095         return dst;
1096 }
1097 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1098
1099 static inline int ip6_ufo_append_data(struct sock *sk,
1100                         struct sk_buff_head *queue,
1101                         int getfrag(void *from, char *to, int offset, int len,
1102                         int odd, struct sk_buff *skb),
1103                         void *from, int length, int hh_len, int fragheaderlen,
1104                         int exthdrlen, int transhdrlen, int mtu,
1105                         unsigned int flags, const struct flowi6 *fl6)
1106
1107 {
1108         struct sk_buff *skb;
1109         int err;
1110
1111         /* There is support for UDP large send offload by network
1112          * device, so create one single skb packet containing complete
1113          * udp datagram
1114          */
1115         skb = skb_peek_tail(queue);
1116         if (!skb) {
1117                 skb = sock_alloc_send_skb(sk,
1118                         hh_len + fragheaderlen + transhdrlen + 20,
1119                         (flags & MSG_DONTWAIT), &err);
1120                 if (!skb)
1121                         return err;
1122
1123                 /* reserve space for Hardware header */
1124                 skb_reserve(skb, hh_len);
1125
1126                 /* create space for UDP/IP header */
1127                 skb_put(skb, fragheaderlen + transhdrlen);
1128
1129                 /* initialize network header pointer */
1130                 skb_set_network_header(skb, exthdrlen);
1131
1132                 /* initialize protocol header pointer */
1133                 skb->transport_header = skb->network_header + fragheaderlen;
1134
1135                 skb->protocol = htons(ETH_P_IPV6);
1136                 skb->csum = 0;
1137
1138                 __skb_queue_tail(queue, skb);
1139         } else if (skb_is_gso(skb)) {
1140                 goto append;
1141         }
1142
1143         skb->ip_summed = CHECKSUM_PARTIAL;
1144         /* Specify the length of each IPv6 datagram fragment.
1145          * It has to be a multiple of 8.
1146          */
1147         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1148                                      sizeof(struct frag_hdr)) & ~7;
1149         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1150         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1151                                                          &fl6->daddr,
1152                                                          &fl6->saddr);
1153
1154 append:
1155         return skb_append_datato_frags(sk, skb, getfrag, from,
1156                                        (length - transhdrlen));
1157 }
1158
1159 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1160                                                gfp_t gfp)
1161 {
1162         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1163 }
1164
1165 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1166                                                 gfp_t gfp)
1167 {
1168         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1169 }
1170
1171 static void ip6_append_data_mtu(unsigned int *mtu,
1172                                 int *maxfraglen,
1173                                 unsigned int fragheaderlen,
1174                                 struct sk_buff *skb,
1175                                 struct rt6_info *rt,
1176                                 unsigned int orig_mtu)
1177 {
1178         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1179                 if (!skb) {
1180                         /* first fragment, reserve header_len */
1181                         *mtu = orig_mtu - rt->dst.header_len;
1182
1183                 } else {
1184                         /*
1185                          * this fragment is not first, the headers
1186                          * space is regarded as data space.
1187                          */
1188                         *mtu = orig_mtu;
1189                 }
1190                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1191                               + fragheaderlen - sizeof(struct frag_hdr);
1192         }
1193 }
1194
1195 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1196                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1197                           struct rt6_info *rt, struct flowi6 *fl6)
1198 {
1199         struct ipv6_pinfo *np = inet6_sk(sk);
1200         unsigned int mtu;
1201         struct ipv6_txoptions *opt = ipc6->opt;
1202
1203         /*
1204          * setup for corking
1205          */
1206         if (opt) {
1207                 if (WARN_ON(v6_cork->opt))
1208                         return -EINVAL;
1209
1210                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1211                 if (unlikely(!v6_cork->opt))
1212                         return -ENOBUFS;
1213
1214                 v6_cork->opt->tot_len = opt->tot_len;
1215                 v6_cork->opt->opt_flen = opt->opt_flen;
1216                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1217
1218                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1219                                                     sk->sk_allocation);
1220                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1221                         return -ENOBUFS;
1222
1223                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1224                                                     sk->sk_allocation);
1225                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1226                         return -ENOBUFS;
1227
1228                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1229                                                    sk->sk_allocation);
1230                 if (opt->hopopt && !v6_cork->opt->hopopt)
1231                         return -ENOBUFS;
1232
1233                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1234                                                     sk->sk_allocation);
1235                 if (opt->srcrt && !v6_cork->opt->srcrt)
1236                         return -ENOBUFS;
1237
1238                 /* need source address above miyazawa*/
1239         }
1240         dst_hold(&rt->dst);
1241         cork->base.dst = &rt->dst;
1242         cork->fl.u.ip6 = *fl6;
1243         v6_cork->hop_limit = ipc6->hlimit;
1244         v6_cork->tclass = ipc6->tclass;
1245         if (rt->dst.flags & DST_XFRM_TUNNEL)
1246                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1247                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1248         else
1249                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1250                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1251         if (np->frag_size < mtu) {
1252                 if (np->frag_size)
1253                         mtu = np->frag_size;
1254         }
1255         cork->base.fragsize = mtu;
1256         if (dst_allfrag(rt->dst.path))
1257                 cork->base.flags |= IPCORK_ALLFRAG;
1258         cork->base.length = 0;
1259
1260         return 0;
1261 }
1262
1263 static int __ip6_append_data(struct sock *sk,
1264                              struct flowi6 *fl6,
1265                              struct sk_buff_head *queue,
1266                              struct inet_cork *cork,
1267                              struct inet6_cork *v6_cork,
1268                              struct page_frag *pfrag,
1269                              int getfrag(void *from, char *to, int offset,
1270                                          int len, int odd, struct sk_buff *skb),
1271                              void *from, int length, int transhdrlen,
1272                              unsigned int flags, struct ipcm6_cookie *ipc6,
1273                              const struct sockcm_cookie *sockc)
1274 {
1275         struct sk_buff *skb, *skb_prev = NULL;
1276         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1277         int exthdrlen = 0;
1278         int dst_exthdrlen = 0;
1279         int hh_len;
1280         int copy;
1281         int err;
1282         int offset = 0;
1283         __u8 tx_flags = 0;
1284         u32 tskey = 0;
1285         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1286         struct ipv6_txoptions *opt = v6_cork->opt;
1287         int csummode = CHECKSUM_NONE;
1288         unsigned int maxnonfragsize, headersize;
1289
1290         skb = skb_peek_tail(queue);
1291         if (!skb) {
1292                 exthdrlen = opt ? opt->opt_flen : 0;
1293                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1294         }
1295
1296         mtu = cork->fragsize;
1297         orig_mtu = mtu;
1298
1299         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1300
1301         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1302                         (opt ? opt->opt_nflen : 0);
1303         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1304                      sizeof(struct frag_hdr);
1305
1306         headersize = sizeof(struct ipv6hdr) +
1307                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1308                      (dst_allfrag(&rt->dst) ?
1309                       sizeof(struct frag_hdr) : 0) +
1310                      rt->rt6i_nfheader_len;
1311
1312         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1313             (sk->sk_protocol == IPPROTO_UDP ||
1314              sk->sk_protocol == IPPROTO_RAW)) {
1315                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1316                                 sizeof(struct ipv6hdr));
1317                 goto emsgsize;
1318         }
1319
1320         if (ip6_sk_ignore_df(sk))
1321                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1322         else
1323                 maxnonfragsize = mtu;
1324
1325         if (cork->length + length > maxnonfragsize - headersize) {
1326 emsgsize:
1327                 ipv6_local_error(sk, EMSGSIZE, fl6,
1328                                  mtu - headersize +
1329                                  sizeof(struct ipv6hdr));
1330                 return -EMSGSIZE;
1331         }
1332
1333         /* CHECKSUM_PARTIAL only with no extension headers and when
1334          * we are not going to fragment
1335          */
1336         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1337             headersize == sizeof(struct ipv6hdr) &&
1338             length < mtu - headersize &&
1339             !(flags & MSG_MORE) &&
1340             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1341                 csummode = CHECKSUM_PARTIAL;
1342
1343         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1344                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1345                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1346                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1347                         tskey = sk->sk_tskey++;
1348         }
1349
1350         /*
1351          * Let's try using as much space as possible.
1352          * Use MTU if total length of the message fits into the MTU.
1353          * Otherwise, we need to reserve fragment header and
1354          * fragment alignment (= 8-15 octects, in total).
1355          *
1356          * Note that we may need to "move" the data from the tail of
1357          * of the buffer to the new fragment when we split
1358          * the message.
1359          *
1360          * FIXME: It may be fragmented into multiple chunks
1361          *        at once if non-fragmentable extension headers
1362          *        are too large.
1363          * --yoshfuji
1364          */
1365
1366         cork->length += length;
1367         if (((length > mtu) ||
1368              (skb && skb_is_gso(skb))) &&
1369             (sk->sk_protocol == IPPROTO_UDP) &&
1370             (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
1371             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1372                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1373                                           hh_len, fragheaderlen, exthdrlen,
1374                                           transhdrlen, mtu, flags, fl6);
1375                 if (err)
1376                         goto error;
1377                 return 0;
1378         }
1379
1380         if (!skb)
1381                 goto alloc_new_skb;
1382
1383         while (length > 0) {
1384                 /* Check if the remaining data fits into current packet. */
1385                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1386                 if (copy < length)
1387                         copy = maxfraglen - skb->len;
1388
1389                 if (copy <= 0) {
1390                         char *data;
1391                         unsigned int datalen;
1392                         unsigned int fraglen;
1393                         unsigned int fraggap;
1394                         unsigned int alloclen;
1395 alloc_new_skb:
1396                         /* There's no room in the current skb */
1397                         if (skb)
1398                                 fraggap = skb->len - maxfraglen;
1399                         else
1400                                 fraggap = 0;
1401                         /* update mtu and maxfraglen if necessary */
1402                         if (!skb || !skb_prev)
1403                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1404                                                     fragheaderlen, skb, rt,
1405                                                     orig_mtu);
1406
1407                         skb_prev = skb;
1408
1409                         /*
1410                          * If remaining data exceeds the mtu,
1411                          * we know we need more fragment(s).
1412                          */
1413                         datalen = length + fraggap;
1414
1415                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1416                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1417                         if ((flags & MSG_MORE) &&
1418                             !(rt->dst.dev->features&NETIF_F_SG))
1419                                 alloclen = mtu;
1420                         else
1421                                 alloclen = datalen + fragheaderlen;
1422
1423                         alloclen += dst_exthdrlen;
1424
1425                         if (datalen != length + fraggap) {
1426                                 /*
1427                                  * this is not the last fragment, the trailer
1428                                  * space is regarded as data space.
1429                                  */
1430                                 datalen += rt->dst.trailer_len;
1431                         }
1432
1433                         alloclen += rt->dst.trailer_len;
1434                         fraglen = datalen + fragheaderlen;
1435
1436                         /*
1437                          * We just reserve space for fragment header.
1438                          * Note: this may be overallocation if the message
1439                          * (without MSG_MORE) fits into the MTU.
1440                          */
1441                         alloclen += sizeof(struct frag_hdr);
1442
1443                         if (transhdrlen) {
1444                                 skb = sock_alloc_send_skb(sk,
1445                                                 alloclen + hh_len,
1446                                                 (flags & MSG_DONTWAIT), &err);
1447                         } else {
1448                                 skb = NULL;
1449                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1450                                     2 * sk->sk_sndbuf)
1451                                         skb = sock_wmalloc(sk,
1452                                                            alloclen + hh_len, 1,
1453                                                            sk->sk_allocation);
1454                                 if (unlikely(!skb))
1455                                         err = -ENOBUFS;
1456                         }
1457                         if (!skb)
1458                                 goto error;
1459                         /*
1460                          *      Fill in the control structures
1461                          */
1462                         skb->protocol = htons(ETH_P_IPV6);
1463                         skb->ip_summed = csummode;
1464                         skb->csum = 0;
1465                         /* reserve for fragmentation and ipsec header */
1466                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1467                                     dst_exthdrlen);
1468
1469                         /* Only the initial fragment is time stamped */
1470                         skb_shinfo(skb)->tx_flags = tx_flags;
1471                         tx_flags = 0;
1472                         skb_shinfo(skb)->tskey = tskey;
1473                         tskey = 0;
1474
1475                         /*
1476                          *      Find where to start putting bytes
1477                          */
1478                         data = skb_put(skb, fraglen);
1479                         skb_set_network_header(skb, exthdrlen);
1480                         data += fragheaderlen;
1481                         skb->transport_header = (skb->network_header +
1482                                                  fragheaderlen);
1483                         if (fraggap) {
1484                                 skb->csum = skb_copy_and_csum_bits(
1485                                         skb_prev, maxfraglen,
1486                                         data + transhdrlen, fraggap, 0);
1487                                 skb_prev->csum = csum_sub(skb_prev->csum,
1488                                                           skb->csum);
1489                                 data += fraggap;
1490                                 pskb_trim_unique(skb_prev, maxfraglen);
1491                         }
1492                         copy = datalen - transhdrlen - fraggap;
1493
1494                         if (copy < 0) {
1495                                 err = -EINVAL;
1496                                 kfree_skb(skb);
1497                                 goto error;
1498                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1499                                 err = -EFAULT;
1500                                 kfree_skb(skb);
1501                                 goto error;
1502                         }
1503
1504                         offset += copy;
1505                         length -= datalen - fraggap;
1506                         transhdrlen = 0;
1507                         exthdrlen = 0;
1508                         dst_exthdrlen = 0;
1509
1510                         /*
1511                          * Put the packet on the pending queue
1512                          */
1513                         __skb_queue_tail(queue, skb);
1514                         continue;
1515                 }
1516
1517                 if (copy > length)
1518                         copy = length;
1519
1520                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1521                         unsigned int off;
1522
1523                         off = skb->len;
1524                         if (getfrag(from, skb_put(skb, copy),
1525                                                 offset, copy, off, skb) < 0) {
1526                                 __skb_trim(skb, off);
1527                                 err = -EFAULT;
1528                                 goto error;
1529                         }
1530                 } else {
1531                         int i = skb_shinfo(skb)->nr_frags;
1532
1533                         err = -ENOMEM;
1534                         if (!sk_page_frag_refill(sk, pfrag))
1535                                 goto error;
1536
1537                         if (!skb_can_coalesce(skb, i, pfrag->page,
1538                                               pfrag->offset)) {
1539                                 err = -EMSGSIZE;
1540                                 if (i == MAX_SKB_FRAGS)
1541                                         goto error;
1542
1543                                 __skb_fill_page_desc(skb, i, pfrag->page,
1544                                                      pfrag->offset, 0);
1545                                 skb_shinfo(skb)->nr_frags = ++i;
1546                                 get_page(pfrag->page);
1547                         }
1548                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1549                         if (getfrag(from,
1550                                     page_address(pfrag->page) + pfrag->offset,
1551                                     offset, copy, skb->len, skb) < 0)
1552                                 goto error_efault;
1553
1554                         pfrag->offset += copy;
1555                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1556                         skb->len += copy;
1557                         skb->data_len += copy;
1558                         skb->truesize += copy;
1559                         atomic_add(copy, &sk->sk_wmem_alloc);
1560                 }
1561                 offset += copy;
1562                 length -= copy;
1563         }
1564
1565         return 0;
1566
1567 error_efault:
1568         err = -EFAULT;
1569 error:
1570         cork->length -= length;
1571         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1572         return err;
1573 }
1574
1575 int ip6_append_data(struct sock *sk,
1576                     int getfrag(void *from, char *to, int offset, int len,
1577                                 int odd, struct sk_buff *skb),
1578                     void *from, int length, int transhdrlen,
1579                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1580                     struct rt6_info *rt, unsigned int flags,
1581                     const struct sockcm_cookie *sockc)
1582 {
1583         struct inet_sock *inet = inet_sk(sk);
1584         struct ipv6_pinfo *np = inet6_sk(sk);
1585         int exthdrlen;
1586         int err;
1587
1588         if (flags&MSG_PROBE)
1589                 return 0;
1590         if (skb_queue_empty(&sk->sk_write_queue)) {
1591                 /*
1592                  * setup for corking
1593                  */
1594                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1595                                      ipc6, rt, fl6);
1596                 if (err)
1597                         return err;
1598
1599                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1600                 length += exthdrlen;
1601                 transhdrlen += exthdrlen;
1602         } else {
1603                 fl6 = &inet->cork.fl.u.ip6;
1604                 transhdrlen = 0;
1605         }
1606
1607         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1608                                  &np->cork, sk_page_frag(sk), getfrag,
1609                                  from, length, transhdrlen, flags, ipc6, sockc);
1610 }
1611 EXPORT_SYMBOL_GPL(ip6_append_data);
1612
1613 static void ip6_cork_release(struct inet_cork_full *cork,
1614                              struct inet6_cork *v6_cork)
1615 {
1616         if (v6_cork->opt) {
1617                 kfree(v6_cork->opt->dst0opt);
1618                 kfree(v6_cork->opt->dst1opt);
1619                 kfree(v6_cork->opt->hopopt);
1620                 kfree(v6_cork->opt->srcrt);
1621                 kfree(v6_cork->opt);
1622                 v6_cork->opt = NULL;
1623         }
1624
1625         if (cork->base.dst) {
1626                 dst_release(cork->base.dst);
1627                 cork->base.dst = NULL;
1628                 cork->base.flags &= ~IPCORK_ALLFRAG;
1629         }
1630         memset(&cork->fl, 0, sizeof(cork->fl));
1631 }
1632
1633 struct sk_buff *__ip6_make_skb(struct sock *sk,
1634                                struct sk_buff_head *queue,
1635                                struct inet_cork_full *cork,
1636                                struct inet6_cork *v6_cork)
1637 {
1638         struct sk_buff *skb, *tmp_skb;
1639         struct sk_buff **tail_skb;
1640         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1641         struct ipv6_pinfo *np = inet6_sk(sk);
1642         struct net *net = sock_net(sk);
1643         struct ipv6hdr *hdr;
1644         struct ipv6_txoptions *opt = v6_cork->opt;
1645         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1646         struct flowi6 *fl6 = &cork->fl.u.ip6;
1647         unsigned char proto = fl6->flowi6_proto;
1648
1649         skb = __skb_dequeue(queue);
1650         if (!skb)
1651                 goto out;
1652         tail_skb = &(skb_shinfo(skb)->frag_list);
1653
1654         /* move skb->data to ip header from ext header */
1655         if (skb->data < skb_network_header(skb))
1656                 __skb_pull(skb, skb_network_offset(skb));
1657         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1658                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1659                 *tail_skb = tmp_skb;
1660                 tail_skb = &(tmp_skb->next);
1661                 skb->len += tmp_skb->len;
1662                 skb->data_len += tmp_skb->len;
1663                 skb->truesize += tmp_skb->truesize;
1664                 tmp_skb->destructor = NULL;
1665                 tmp_skb->sk = NULL;
1666         }
1667
1668         /* Allow local fragmentation. */
1669         skb->ignore_df = ip6_sk_ignore_df(sk);
1670
1671         *final_dst = fl6->daddr;
1672         __skb_pull(skb, skb_network_header_len(skb));
1673         if (opt && opt->opt_flen)
1674                 ipv6_push_frag_opts(skb, opt, &proto);
1675         if (opt && opt->opt_nflen)
1676                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1677
1678         skb_push(skb, sizeof(struct ipv6hdr));
1679         skb_reset_network_header(skb);
1680         hdr = ipv6_hdr(skb);
1681
1682         ip6_flow_hdr(hdr, v6_cork->tclass,
1683                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1684                                         np->autoflowlabel, fl6));
1685         hdr->hop_limit = v6_cork->hop_limit;
1686         hdr->nexthdr = proto;
1687         hdr->saddr = fl6->saddr;
1688         hdr->daddr = *final_dst;
1689
1690         skb->priority = sk->sk_priority;
1691         skb->mark = sk->sk_mark;
1692
1693         skb_dst_set(skb, dst_clone(&rt->dst));
1694         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1695         if (proto == IPPROTO_ICMPV6) {
1696                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1697
1698                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1699                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1700         }
1701
1702         ip6_cork_release(cork, v6_cork);
1703 out:
1704         return skb;
1705 }
1706
1707 int ip6_send_skb(struct sk_buff *skb)
1708 {
1709         struct net *net = sock_net(skb->sk);
1710         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1711         int err;
1712
1713         err = ip6_local_out(net, skb->sk, skb);
1714         if (err) {
1715                 if (err > 0)
1716                         err = net_xmit_errno(err);
1717                 if (err)
1718                         IP6_INC_STATS(net, rt->rt6i_idev,
1719                                       IPSTATS_MIB_OUTDISCARDS);
1720         }
1721
1722         return err;
1723 }
1724
/*
 *	Build the packet from the socket's pending frames via
 *	ip6_finish_skb() and send it.
 *
 *	Returns 0 when ip6_finish_skb() produced no skb, otherwise the
 *	result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1736
1737 static void __ip6_flush_pending_frames(struct sock *sk,
1738                                        struct sk_buff_head *queue,
1739                                        struct inet_cork_full *cork,
1740                                        struct inet6_cork *v6_cork)
1741 {
1742         struct sk_buff *skb;
1743
1744         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1745                 if (skb_dst(skb))
1746                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1747                                       IPSTATS_MIB_OUTDISCARDS);
1748                 kfree_skb(skb);
1749         }
1750
1751         ip6_cork_release(cork, v6_cork);
1752 }
1753
1754 void ip6_flush_pending_frames(struct sock *sk)
1755 {
1756         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1757                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1758 }
1759 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1760
1761 struct sk_buff *ip6_make_skb(struct sock *sk,
1762                              int getfrag(void *from, char *to, int offset,
1763                                          int len, int odd, struct sk_buff *skb),
1764                              void *from, int length, int transhdrlen,
1765                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1766                              struct rt6_info *rt, unsigned int flags,
1767                              const struct sockcm_cookie *sockc)
1768 {
1769         struct inet_cork_full cork;
1770         struct inet6_cork v6_cork;
1771         struct sk_buff_head queue;
1772         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1773         int err;
1774
1775         if (flags & MSG_PROBE)
1776                 return NULL;
1777
1778         __skb_queue_head_init(&queue);
1779
1780         cork.base.flags = 0;
1781         cork.base.addr = 0;
1782         cork.base.opt = NULL;
1783         v6_cork.opt = NULL;
1784         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1785         if (err)
1786                 return ERR_PTR(err);
1787
1788         if (ipc6->dontfrag < 0)
1789                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1790
1791         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1792                                 &current->task_frag, getfrag, from,
1793                                 length + exthdrlen, transhdrlen + exthdrlen,
1794                                 flags, ipc6, sockc);
1795         if (err) {
1796                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1797                 return ERR_PTR(err);
1798         }
1799
1800         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1801 }