2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * Prepare a locally generated packet for output.  The payload length is
 * computed as skb->len minus the fixed IPv6 header and stored in the
 * header in network byte order; the packet is then handed to the
 * NF_INET_LOCAL_OUT netfilter hook with dst_output passed as the final
 * argument, and the hook's result is returned.
 * NOTE(review): the statement guarded by the IPV6_MAXPLEN test is not
 * visible in this listing - confirm how oversized payloads are encoded.
 */
60 int __ip6_local_out(struct sk_buff *skb)
64 len = skb->len - sizeof(struct ipv6hdr);
65 if (len > IPV6_MAXPLEN)
67 ipv6_hdr(skb)->payload_len = htons(len);
69 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
70 skb_dst(skb)->dev, dst_output);
/*
 * GPL-exported wrapper around __ip6_local_out(): runs it and, on the path
 * shown, follows up with dst_output().
 * NOTE(review): the condition selecting the dst_output() call and the
 * return statement are not visible in this listing - presumably
 * dst_output() runs only when the hook let the packet through; confirm.
 */
73 int ip6_local_out(struct sk_buff *skb)
77 err = __ip6_local_out(skb);
79 err = dst_output(skb);
83 EXPORT_SYMBOL_GPL(ip6_local_out);
85 /* dev_loopback_xmit for use with netfilter. */
/*
 * Prepare a looped-back copy of a packet for local delivery: the MAC
 * header is reset, the bytes in front of the network header are pulled
 * off, and the skb is marked PACKET_LOOPBACK with ip_summed set to
 * CHECKSUM_UNNECESSARY.  A missing dst triggers WARN_ON().
 * NOTE(review): the final delivery call and the return statement are not
 * visible in this listing.
 */
86 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 skb_reset_mac_header(newskb);
89 __skb_pull(newskb, skb_network_offset(newskb));
90 newskb->pkt_type = PACKET_LOOPBACK;
91 newskb->ip_summed = CHECKSUM_UNNECESSARY;
92 WARN_ON(!skb_dst(newskb));
/*
 * Last output step: tag the skb as ETH_P_IPV6 and transmit it through the
 * dst's cached hardware header (dst->hh) or, failing that, through the
 * neighbour's output method.  When neither is usable OUTNOROUTES is
 * incremented.
 *
 * For multicast destinations a clone may first be passed through the
 * NF_INET_POST_ROUTING hook into ip6_dev_loopback_xmit(); packets whose
 * hop_limit is 0 are counted as OUTDISCARDS, and the OUTMCAST counters
 * are updated.
 * NOTE(review): braces, the discard/return paths and the test in front of
 * the neigh_hh_output() call are missing from this listing - confirm the
 * control flow against the complete file.
 */
98 static int ip6_finish_output2(struct sk_buff *skb)
100 struct dst_entry *dst = skb_dst(skb);
101 struct net_device *dev = dst->dev;
103 skb->protocol = htons(ETH_P_IPV6);
106 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
107 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
109 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
110 ((mroute6_socket(dev_net(dev)) &&
111 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
112 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
113 &ipv6_hdr(skb)->saddr))) {
114 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
116 /* Do not check for IFF_ALLMULTI; multicast routing
117 is not supported in any case.
120 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
121 newskb, NULL, newskb->dev,
122 ip6_dev_loopback_xmit);
124 if (ipv6_hdr(skb)->hop_limit == 0) {
125 IP6_INC_STATS(dev_net(dev), idev,
126 IPSTATS_MIB_OUTDISCARDS);
132 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
137 return neigh_hh_output(dst->hh, skb);
138 else if (dst->neighbour)
139 return dst->neighbour->output(skb);
141 IP6_INC_STATS_BH(dev_net(dst->dev),
142 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * MTU to apply to @skb: the device MTU of the dst when the owning socket
 * has pmtudisc set to IPV6_PMTUDISC_PROBE, otherwise dst_mtu() of the
 * attached dst.  skb->sk may be NULL, in which case dst_mtu() is used.
 */
147 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
149 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
151 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
152 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
/*
 * Choose between fragmenting and direct transmission: ip6_fragment() is
 * used when the packet is longer than ip6_skb_dst_mtu() and is not GSO,
 * or when dst_allfrag() is true for the dst; otherwise the packet goes
 * straight to ip6_finish_output2().
 */
155 static int ip6_finish_output(struct sk_buff *skb)
157 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
158 dst_allfrag(skb_dst(skb)))
159 return ip6_fragment(skb, ip6_finish_output2);
161 return ip6_finish_output2(skb);
/*
 * GPL-exported output entry point.  If disable_ipv6 is set on the
 * outgoing inet6 device the packet is accounted as OUTDISCARDS;
 * otherwise it is passed through NF_INET_POST_ROUTING via NF_HOOK_COND,
 * the condition being that IP6SKB_REROUTED is not set in the skb's
 * control block.
 * NOTE(review): the okfn argument of NF_HOOK_COND and the rest of the
 * discard path are on lines not visible in this listing.
 */
164 int ip6_output(struct sk_buff *skb)
166 struct net_device *dev = skb_dst(skb)->dev;
167 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
168 if (unlikely(idev->cnf.disable_ipv6)) {
169 IP6_INC_STATS(dev_net(dev), idev,
170 IPSTATS_MIB_OUTDISCARDS);
175 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
177 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
179 EXPORT_SYMBOL_GPL(ip6_output);
182 * xmit an sk_buff (used by TCP)
/*
 * Build the IPv6 header (and any extension headers from @opt) in front of
 * the data in @skb and send it on behalf of socket @sk.
 *
 * @fl supplies the protocol, flow label, source and destination.  The
 * destination written to the header is first_hop, which starts as
 * fl->fl6_dst and is passed by reference to ipv6_push_nfrag_opts().
 * When options are present and the skb lacks headroom for them plus the
 * IPv6 and link-layer headers, skb_realloc_headroom() is used.
 *
 * If the finished packet fits the MTU, or skb->local_df is set, or the
 * skb is GSO, it is counted in the OUT statistics and passed through
 * NF_INET_LOCAL_OUT with dst_output; otherwise ICMPV6_PKT_TOOBIG is sent
 * and FRAGFAILS is counted.
 * NOTE(review): the use of @ipfragok, the mtu/tclass/hlimit set-up and
 * the error return values are on lines missing from this listing.
 */
185 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
186 struct ipv6_txoptions *opt, int ipfragok)
188 struct net *net = sock_net(sk);
189 struct ipv6_pinfo *np = inet6_sk(sk);
190 struct in6_addr *first_hop = &fl->fl6_dst;
191 struct dst_entry *dst = skb_dst(skb);
193 u8 proto = fl->proto;
194 int seg_len = skb->len;
200 unsigned int head_room;
202 /* First: exthdrs may take lots of space (~8K for now)
203 MAX_HEADER is not enough.
205 head_room = opt->opt_nflen + opt->opt_flen;
206 seg_len += head_room;
207 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
209 if (skb_headroom(skb) < head_room) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
212 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213 IPSTATS_MIB_OUTDISCARDS);
220 skb_set_owner_w(skb, sk);
223 ipv6_push_frag_opts(skb, opt, &proto);
225 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 skb_push(skb, sizeof(struct ipv6hdr));
229 skb_reset_network_header(skb);
232 /* Allow local fragmentation. */
237 * Fill in the IPv6 header
241 hlimit = np->hop_limit;
244 hlimit = ip6_dst_hoplimit(dst);
246 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
248 hdr->payload_len = htons(seg_len);
249 hdr->nexthdr = proto;
250 hdr->hop_limit = hlimit;
252 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
253 ipv6_addr_copy(&hdr->daddr, first_hop);
255 skb->priority = sk->sk_priority;
256 skb->mark = sk->sk_mark;
259 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
260 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
261 IPSTATS_MIB_OUT, skb->len);
262 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
263 dst->dev, dst_output);
267 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
269 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
270 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
275 EXPORT_SYMBOL(ip6_xmit);
278 * To avoid extra problems ND packets are sent through this
279 * routine. It's code duplication but I really want to avoid
280 * extra checks since ipv6_build_header is used by TCP (which
281 * is for us performance critical)
/*
 * Write a bare IPv6 header for a neighbour discovery packet.  The network
 * header is set at the current data position and the header space is
 * added with skb_put().  The first 32-bit word is 0x60000000 in network
 * byte order, payload_len is @len, nexthdr is proto, the hop limit comes
 * from @sk's IPv6 settings, and @saddr/@daddr are copied in.
 * NOTE(review): the tail of the parameter list (where proto and len are
 * declared), the use of totlen and the return statement are not visible
 * in this listing.
 */
284 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
285 const struct in6_addr *saddr, const struct in6_addr *daddr,
288 struct ipv6_pinfo *np = inet6_sk(sk);
292 skb->protocol = htons(ETH_P_IPV6);
295 totlen = len + sizeof(struct ipv6hdr);
297 skb_reset_network_header(skb);
298 skb_put(skb, sizeof(struct ipv6hdr));
301 *(__be32*)hdr = htonl(0x60000000);
303 hdr->payload_len = htons(len);
304 hdr->nexthdr = proto;
305 hdr->hop_limit = np->hop_limit;
307 ipv6_addr_copy(&hdr->saddr, saddr);
308 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * Hand @skb to the sockets on ip6_ra_chain whose selector equals @sel and
 * which are either not bound to a device or bound to the device the
 * packet arrived on.  The chain is walked under read_lock(&ip6_ra_lock).
 * Clones are passed to rawv6_rcv() inside the loop, and @skb itself is
 * passed to rawv6_rcv() for the socket remembered in 'last'.
 * NOTE(review): the bookkeeping of 'last' and the return values are on
 * lines missing from this listing.
 */
313 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
315 struct ip6_ra_chain *ra;
316 struct sock *last = NULL;
318 read_lock(&ip6_ra_lock);
319 for (ra = ip6_ra_chain; ra; ra = ra->next) {
320 struct sock *sk = ra->sk;
321 if (sk && ra->sel == sel &&
322 (!sk->sk_bound_dev_if ||
323 sk->sk_bound_dev_if == skb->dev->ifindex)) {
325 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
327 rawv6_rcv(last, skb2);
334 rawv6_rcv(last, skb);
335 read_unlock(&ip6_ra_lock);
338 read_unlock(&ip6_ra_lock);
/*
 * Classify a packet whose destination is a proxied neighbour.  Extension
 * headers are skipped with ipv6_skip_exthdr() to find the upper-layer
 * protocol.  For ICMPv6 the first byte of the ICMPv6 header is made
 * available with pskb_may_pull() and the type is examined; the four
 * router/neighbour solicitation and advertisement types are singled out.
 * Packets to a link-local destination trigger dst_link_failure().
 * NOTE(review): the return values are not visible in this listing.  The
 * caller in ip6_forward() tests the result for being negative and has a
 * path into ip6_input() - confirm the exact convention.
 */
342 static int ip6_forward_proxy_check(struct sk_buff *skb)
344 struct ipv6hdr *hdr = ipv6_hdr(skb);
345 u8 nexthdr = hdr->nexthdr;
348 if (ipv6_ext_hdr(nexthdr)) {
349 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
353 offset = sizeof(struct ipv6hdr);
355 if (nexthdr == IPPROTO_ICMPV6) {
356 struct icmp6hdr *icmp6;
358 if (!pskb_may_pull(skb, (skb_network_header(skb) +
359 offset + 1 - skb->data)))
362 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
364 switch (icmp6->icmp6_type) {
365 case NDISC_ROUTER_SOLICITATION:
366 case NDISC_ROUTER_ADVERTISEMENT:
367 case NDISC_NEIGHBOUR_SOLICITATION:
368 case NDISC_NEIGHBOUR_ADVERTISEMENT:
370 /* For reaction involving unicast neighbor discovery
371 * message destined to the proxied address, pass it to
381 * The proxying router can't forward traffic sent to a link-local
382 * address, so signal the sender and discard the packet. This
383 * behavior is clarified by the MIPv6 specification.
385 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
386 dst_link_failure(skb);
/*
 * Thin wrapper that returns dst_output(skb).
 * NOTE(review): presumably the okfn handed to the NF_INET_FORWARD hook in
 * ip6_forward() - that argument line is not visible in this listing.
 */
393 static inline int ip6_forward_finish(struct sk_buff *skb)
395 return dst_output(skb);
/*
 * Forward a packet that is routed through this host.
 *
 * Checks visible in this listing, in order:
 *  - forwarding enabled in devconf_all, LRO warning, xfrm forward policy
 *    (failure counts INDISCARDS);
 *  - Router Alert delivery through ip6_call_ra_chain(), using the 16-bit
 *    value at bytes 2..3 of the option located at opt->ra;
 *  - hop limit <= 1: ICMPV6_TIME_EXCEED is sent, INHDRERRORS counted;
 *  - proxy NDP: with a pneigh entry for the destination the result of
 *    ip6_forward_proxy_check() decides between ip6_input() and a discard;
 *  - xfrm6_route_forward();
 *  - same in/out device with a neighbour, no source route and no sec
 *    path: a rate-limited redirect via ndisc_send_redirect();
 *  - source address type checks, with ICMPV6_NOT_NEIGHBOUR for
 *    link-local sources;
 *  - MTU: packets longer than mtu get ICMPV6_PKT_TOOBIG and are counted
 *    as INTOOBIGERRORS and FRAGFAILS.
 * Remaining packets go through skb_cow() for hard_header_len bytes, are
 * counted as OUTFORWDATAGRAMS and passed to the NF_INET_FORWARD hook.
 * NOTE(review): goto labels, drop paths, the mtu computation and the
 * hop-limit decrement are on lines missing from this listing.
 */
398 int ip6_forward(struct sk_buff *skb)
400 struct dst_entry *dst = skb_dst(skb);
401 struct ipv6hdr *hdr = ipv6_hdr(skb);
402 struct inet6_skb_parm *opt = IP6CB(skb);
403 struct net *net = dev_net(dst->dev);
406 if (net->ipv6.devconf_all->forwarding == 0)
409 if (skb_warn_if_lro(skb))
412 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
413 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
417 skb_forward_csum(skb);
420 * We DO NOT make any processing on
421 * RA packets, pushing them to user level AS IS
422 * without ane WARRANTY that application will be able
423 * to interpret them. The reason is that we
424 * cannot make anything clever here.
426 * We are not end-node, so that if packet contains
427 * AH/ESP, we cannot make anything.
428 * Defragmentation also would be mistake, RA packets
429 * cannot be fragmented, because there is no warranty
430 * that different fragments will go along one path. --ANK
433 u8 *ptr = skb_network_header(skb) + opt->ra;
434 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
439 * check and decrement ttl
441 if (hdr->hop_limit <= 1) {
442 /* Force OUTPUT device used as source address */
444 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
445 IP6_INC_STATS_BH(net,
446 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
452 /* XXX: idev->cnf.proxy_ndp? */
453 if (net->ipv6.devconf_all->proxy_ndp &&
454 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
455 int proxied = ip6_forward_proxy_check(skb);
457 return ip6_input(skb);
458 else if (proxied < 0) {
459 IP6_INC_STATS(net, ip6_dst_idev(dst),
460 IPSTATS_MIB_INDISCARDS);
465 if (!xfrm6_route_forward(skb)) {
466 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
471 /* IPv6 specs say nothing about it, but it is clear that we cannot
472 send redirects to source routed frames.
473 We don't send redirects to frames decapsulated from IPsec.
475 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
476 !skb_sec_path(skb)) {
477 struct in6_addr *target = NULL;
479 struct neighbour *n = dst->neighbour;
482 * incoming and outgoing devices are the same
486 rt = (struct rt6_info *) dst;
487 if ((rt->rt6i_flags & RTF_GATEWAY))
488 target = (struct in6_addr*)&n->primary_key;
490 target = &hdr->daddr;
492 /* Limit redirects both by destination (here)
493 and by source (inside ndisc_send_redirect)
495 if (xrlim_allow(dst, 1*HZ))
496 ndisc_send_redirect(skb, n, target);
498 int addrtype = ipv6_addr_type(&hdr->saddr);
500 /* This check is security critical. */
501 if (addrtype == IPV6_ADDR_ANY ||
502 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
504 if (addrtype & IPV6_ADDR_LINKLOCAL) {
505 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
506 ICMPV6_NOT_NEIGHBOUR, 0);
512 if (mtu < IPV6_MIN_MTU)
515 if (skb->len > mtu) {
516 /* Again, force OUTPUT device used as source address */
518 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
519 IP6_INC_STATS_BH(net,
520 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
521 IP6_INC_STATS_BH(net,
522 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
527 if (skb_cow(skb, dst->dev->hard_header_len)) {
528 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
534 /* Mangling hops number delayed to point after skb COW */
538 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
539 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
543 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata from @from to @to: pkt_type, priority,
 * protocol, a dst_clone() of the dst, mark, the security mark and -
 * depending on kernel configuration - tc_index and nf_trace.
 * NOTE(review): some lines between those shown are missing from this
 * listing, so the list of copied fields may be incomplete.
 */
549 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
551 to->pkt_type = from->pkt_type;
552 to->priority = from->priority;
553 to->protocol = from->protocol;
555 skb_dst_set(to, dst_clone(skb_dst(from)));
557 to->mark = from->mark;
559 #ifdef CONFIG_NET_SCHED
560 to->tc_index = from->tc_index;
563 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
564 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
565 to->nf_trace = from->nf_trace;
567 skb_copy_secmark(to, from);
/*
 * Walk the extension header chain that starts right after the fixed IPv6
 * header.  *@nexthdr initially points at the nexthdr field of the IPv6
 * header and is moved to the nexthdr field of each extension header that
 * is stepped over; offset grows by ipv6_optlen() per header and the walk
 * is bounded by the bytes between the network header and skb->tail.
 * NOTE(review): the switch that decides where to stop and the return
 * statements are mostly missing from this listing.  ip6_fragment() uses
 * the returned value as hlen.
 */
570 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
572 u16 offset = sizeof(struct ipv6hdr);
573 struct ipv6_opt_hdr *exthdr =
574 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
575 unsigned int packet_len = skb->tail - skb->network_header;
577 *nexthdr = &ipv6_hdr(skb)->nexthdr;
579 while (offset + 1 <= packet_len) {
585 case NEXTHDR_ROUTING:
589 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
590 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
600 offset += ipv6_optlen(exthdr);
601 *nexthdr = &exthdr->nexthdr;
602 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * Split @skb into fragments that fit the MTU and pass them to @output.
 *
 * hlen is the value returned by ip6_find_1stfragopt(); the per-fragment
 * payload budget is the MTU minus hlen and the fragment header.  When
 * skb->local_df is clear no fragmentation is attempted:
 * ICMPV6_PKT_TOOBIG is sent and FRAGFAILS is counted.
 *
 * Two strategies are visible.  If the skb already carries a frag list
 * whose members have suitable sizes, headroom and are not shared, the
 * saved headers (tmp_hdr) and a fragment header are pushed in front of
 * each member in place.  Otherwise new skbs are allocated and the header
 * plus a slice of the data are copied into each one.  All fragments of
 * one packet share the identification chosen by ipv6_select_ident().
 * FRAGCREATES, FRAGOKS and FRAGFAILS are updated along the way.
 * NOTE(review): labels, loop headers, the calls to @output and several
 * branches are on lines missing from this listing - confirm the control
 * flow against the complete file.
 */
609 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
611 struct sk_buff *frag;
612 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
613 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
614 struct ipv6hdr *tmp_hdr;
616 unsigned int mtu, hlen, left, len;
618 int ptr, offset = 0, err=0;
619 u8 *prevhdr, nexthdr = 0;
620 struct net *net = dev_net(skb_dst(skb)->dev);
622 hlen = ip6_find_1stfragopt(skb, &prevhdr);
625 mtu = ip6_skb_dst_mtu(skb);
627 /* We must not fragment if the socket is set to force MTU discovery
628 * or if the skb is not generated by a local socket.
630 if (!skb->local_df) {
631 skb->dev = skb_dst(skb)->dev;
632 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
633 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
634 IPSTATS_MIB_FRAGFAILS);
639 if (np && np->frag_size < mtu) {
643 mtu -= hlen + sizeof(struct frag_hdr);
645 if (skb_has_frags(skb)) {
646 int first_len = skb_pagelen(skb);
649 if (first_len - hlen > mtu ||
650 ((first_len - hlen) & 7) ||
654 skb_walk_frags(skb, frag) {
655 /* Correct geometry. */
656 if (frag->len > mtu ||
657 ((frag->len & 7) && frag->next) ||
658 skb_headroom(frag) < hlen)
661 /* Partially cloned skb? */
662 if (skb_shared(frag))
668 frag->destructor = sock_wfree;
669 truesizes += frag->truesize;
675 frag = skb_shinfo(skb)->frag_list;
676 skb_frag_list_init(skb);
679 *prevhdr = NEXTHDR_FRAGMENT;
680 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
682 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
683 IPSTATS_MIB_FRAGFAILS);
687 __skb_pull(skb, hlen);
688 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
689 __skb_push(skb, hlen);
690 skb_reset_network_header(skb);
691 memcpy(skb_network_header(skb), tmp_hdr, hlen);
693 ipv6_select_ident(fh);
694 fh->nexthdr = nexthdr;
696 fh->frag_off = htons(IP6_MF);
697 frag_id = fh->identification;
699 first_len = skb_pagelen(skb);
700 skb->data_len = first_len - skb_headlen(skb);
701 skb->truesize -= truesizes;
702 skb->len = first_len;
703 ipv6_hdr(skb)->payload_len = htons(first_len -
704 sizeof(struct ipv6hdr));
706 dst_hold(&rt->u.dst);
709 /* Prepare header of the next frame,
710 * before previous one went down. */
712 frag->ip_summed = CHECKSUM_NONE;
713 skb_reset_transport_header(frag);
714 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
715 __skb_push(frag, hlen);
716 skb_reset_network_header(frag);
717 memcpy(skb_network_header(frag), tmp_hdr,
719 offset += skb->len - hlen - sizeof(struct frag_hdr);
720 fh->nexthdr = nexthdr;
722 fh->frag_off = htons(offset);
723 if (frag->next != NULL)
724 fh->frag_off |= htons(IP6_MF);
725 fh->identification = frag_id;
726 ipv6_hdr(frag)->payload_len =
728 sizeof(struct ipv6hdr));
729 ip6_copy_metadata(frag, skb);
734 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
735 IPSTATS_MIB_FRAGCREATES);
748 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
749 IPSTATS_MIB_FRAGOKS);
750 dst_release(&rt->u.dst);
760 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
761 IPSTATS_MIB_FRAGFAILS);
762 dst_release(&rt->u.dst);
767 left = skb->len - hlen; /* Space per frame */
768 ptr = hlen; /* Where to start from */
771 * Fragment the datagram.
774 *prevhdr = NEXTHDR_FRAGMENT;
777 * Keep copying data until we run out.
781 /* IF: it doesn't fit, use 'mtu' - the data space left */
784 /* IF: we are not sending up to and including the packet end
785 then align the next start on an eight byte boundary */
793 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
794 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
795 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
796 IPSTATS_MIB_FRAGFAILS);
802 * Set up data on packet
805 ip6_copy_metadata(frag, skb);
806 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
807 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
808 skb_reset_network_header(frag);
809 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
810 frag->transport_header = (frag->network_header + hlen +
811 sizeof(struct frag_hdr));
814 * Charge the memory for the fragment to any owner
818 skb_set_owner_w(frag, skb->sk);
821 * Copy the packet header into the new buffer.
823 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
826 * Build fragment header.
828 fh->nexthdr = nexthdr;
831 ipv6_select_ident(fh);
832 frag_id = fh->identification;
834 fh->identification = frag_id;
837 * Copy a block of the IP datagram.
839 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
843 fh->frag_off = htons(offset);
845 fh->frag_off |= htons(IP6_MF);
846 ipv6_hdr(frag)->payload_len = htons(frag->len -
847 sizeof(struct ipv6hdr));
853 * Put this fragment into the sending queue.
859 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
860 IPSTATS_MIB_FRAGCREATES);
862 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
863 IPSTATS_MIB_FRAGOKS);
868 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
869 IPSTATS_MIB_FRAGFAILS);
/*
 * Returns non-zero when a cached route cannot be confirmed for @fl_addr:
 * the route key is not a /128 equal to @fl_addr, and @addr_cache is
 * either NULL or differs from @fl_addr.  Returns zero when either the
 * host-route match or the cached-address match succeeds.
 */
874 static inline int ip6_rt_check(struct rt6key *rt_key,
875 struct in6_addr *fl_addr,
876 struct in6_addr *addr_cache)
878 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
879 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * Validate a socket's cached @dst against a flow.  The destination key
 * (and, with CONFIG_IPV6_SUBTREES, the source key) is tested with
 * ip6_rt_check() against the socket's cached addresses, and a non-zero
 * fl->oif must equal the ifindex of the dst device.
 * NOTE(review): the third parameter, the action taken on a mismatch and
 * the return statement are not visible in this listing; the caller in
 * ip6_sk_dst_lookup() stores the result back into its dst pointer.
 */
882 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
883 struct dst_entry *dst,
886 struct ipv6_pinfo *np = inet6_sk(sk);
887 struct rt6_info *rt = (struct rt6_info *)dst;
892 /* Yes, checking route validity in not connected
893 * case is not very simple. Take into account,
894 * that we do not support routing by source, TOS,
895 * and MSG_DONTROUTE --ANK (980726)
897 * 1. ip6_rt_check(): If route was host route,
898 * check that cached destination is current.
899 * If it is network route, we still may
900 * check its validity using saved pointer
901 * to the last used address: daddr_cache.
902 * We do not want to save whole address now,
903 * (because main consumer of this service
904 * is tcp, which has not this problem),
905 * so that the last trick works only on connected
907 * 2. oif also should be the same.
909 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
910 #ifdef CONFIG_IPV6_SUBTREES
911 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
913 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * Common tail of the dst lookup helpers.  *@dst is filled from
 * ip6_route_output() and its error field is checked.  If the flow has no
 * source address, one is selected with ipv6_dev_get_saddr().
 *
 * With CONFIG_IPV6_OPTIMISTIC_DAD, a dst whose neighbour is not NUD_VALID
 * while the flow's source address is flagged IFA_F_OPTIMISTIC is
 * replaced by the result of a second lookup on a copy of the flow with a
 * zeroed destination (fl_gw).
 *
 * An -ENETUNREACH error is accounted as OUTNOROUTES.
 * NOTE(review): the body behind out_err_release and the return statements
 * are on lines missing from this listing.
 */
922 static int ip6_dst_lookup_tail(struct sock *sk,
923 struct dst_entry **dst, struct flowi *fl)
926 struct net *net = sock_net(sk);
929 *dst = ip6_route_output(net, sk, fl);
931 if ((err = (*dst)->error))
932 goto out_err_release;
934 if (ipv6_addr_any(&fl->fl6_src)) {
935 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
937 sk ? inet6_sk(sk)->srcprefs : 0,
940 goto out_err_release;
943 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
945 * Here if the dst entry we've looked up
946 * has a neighbour entry that is in the INCOMPLETE
947 * state and the src address from the flow is
948 * marked as OPTIMISTIC, we release the found
949 * dst entry and replace it instead with the
950 * dst entry of the nexthop router
952 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
953 struct inet6_ifaddr *ifp;
957 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
960 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
966 * We need to get the dst entry for the
967 * default router instead
970 memcpy(&fl_gw, fl, sizeof(struct flowi));
971 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
972 *dst = ip6_route_output(net, sk, &fl_gw);
973 if ((err = (*dst)->error))
974 goto out_err_release;
982 if (err == -ENETUNREACH)
983 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
990 * ip6_dst_lookup - perform route lookup on flow
991 * @sk: socket which provides route info
992 * @dst: pointer to dst_entry * for result
993 * @fl: flow to lookup
995 * This function performs a route lookup on the given flow.
997 * It returns zero on success, or a standard errno code on error.
/*
 * GPL-exported entry that delegates the lookup to ip6_dst_lookup_tail()
 * and returns its result.
 * NOTE(review): any set-up done before the call is on lines missing from
 * this listing.
 */
999 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1002 return ip6_dst_lookup_tail(sk, dst, fl);
1004 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1007 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1008 * @sk: socket which provides the dst cache and route info
1009 * @dst: pointer to dst_entry * for result
1010 * @fl: flow to lookup
1012 * This function performs a route lookup on the given flow with the
1013 * possibility of using the cached route in the socket if it is valid.
1014 * It will take the socket dst lock when operating on the dst cache.
1015 * As a result, this function can only be used in process context.
1017 * It returns zero on success, or a standard errno code on error.
/*
 * The socket's cached dst is fetched with sk_dst_check() using the stored
 * dst_cookie and filtered through ip6_sk_dst_check(); the result is then
 * handed to ip6_dst_lookup_tail(), whose return value is returned.
 * GPL-exported.
 * NOTE(review): the guard around the cached-dst lines is not visible in
 * this listing.
 */
1019 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1023 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1024 *dst = ip6_sk_dst_check(sk, *dst, fl);
1027 return ip6_dst_lookup_tail(sk, dst, fl);
1029 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * Append path used when the device offers UDP fragmentation offload.
 *
 * If the write queue is empty a single skb is allocated with room for
 * hh_len + fragheaderlen + transhdrlen + 20 bytes; hh_len bytes are
 * reserved, the header area is added with skb_put(), the network and
 * transport header offsets are set and the skb is marked
 * CHECKSUM_PARTIAL.  The payload (length minus transhdrlen) is attached
 * with skb_append_datato_frags().
 *
 * gso_size is (mtu - fragheaderlen - sizeof(struct frag_hdr)) rounded
 * down to a multiple of 8, gso_type is SKB_GSO_UDP, and the fragment
 * identification is picked with ipv6_select_ident() and stored in
 * ip6_frag_id before the skb is queued.
 * NOTE(review): error paths and return statements are not visible in
 * this listing.
 */
1031 static inline int ip6_ufo_append_data(struct sock *sk,
1032 int getfrag(void *from, char *to, int offset, int len,
1033 int odd, struct sk_buff *skb),
1034 void *from, int length, int hh_len, int fragheaderlen,
1035 int transhdrlen, int mtu,unsigned int flags)
1038 struct sk_buff *skb;
1041 /* There is support for UDP large send offload by network
1042 * device, so create one single skb packet containing complete
1045 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1046 skb = sock_alloc_send_skb(sk,
1047 hh_len + fragheaderlen + transhdrlen + 20,
1048 (flags & MSG_DONTWAIT), &err);
1052 /* reserve space for Hardware header */
1053 skb_reserve(skb, hh_len);
1055 /* create space for UDP/IP header */
1056 skb_put(skb,fragheaderlen + transhdrlen);
1058 /* initialize network header pointer */
1059 skb_reset_network_header(skb);
1061 /* initialize protocol header pointer */
1062 skb->transport_header = skb->network_header + fragheaderlen;
1064 skb->ip_summed = CHECKSUM_PARTIAL;
1066 sk->sk_sndmsg_off = 0;
1069 err = skb_append_datato_frags(sk,skb, getfrag, from,
1070 (length - transhdrlen));
1072 struct frag_hdr fhdr;
1074 /* Specify the length of each IPv6 datagram fragment.
1075 * It has to be a multiple of 8.
1077 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1078 sizeof(struct frag_hdr)) & ~7;
1079 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1080 ipv6_select_ident(&fhdr);
1081 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1082 __skb_queue_tail(&sk->sk_write_queue, skb);
1086 /* There is not enough support do UPD LSO,
1087 * so follow normal path
/*
 * Duplicate an option header with kmemdup(); the copy size is
 * (hdrlen + 1) * 8 bytes.  A NULL @src yields NULL.
 * NOTE(review): the gfp parameter declaration is on a line missing from
 * this listing.
 */
1094 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1097 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Duplicate a routing header with kmemdup(); the copy size is
 * (hdrlen + 1) * 8 bytes.  A NULL @src yields NULL.
 * NOTE(review): the gfp parameter declaration is on a line missing from
 * this listing.
 */
1100 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1103 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Queue @length bytes, fetched through @getfrag from @from, on the
 * socket's write queue in skbs sized for later fragmentation.
 *
 * When the write queue is empty the cork state is set up: @opt and its
 * sub-headers are duplicated into np->cork.opt, a reference on @rt is
 * taken and stored in inet->cork.dst, and the flow, hop limit, traffic
 * class and fragment size (the MTU, capped by np->frag_size) are
 * recorded.  The visible non-empty-queue lines read rt, fl and mtu back
 * from the cork state.
 *
 * maxfraglen is the largest fragment length whose payload after
 * fragheaderlen is a multiple of 8, less the fragment header.  UDP
 * payloads longer than the MTU on devices with NETIF_F_UFO are handed to
 * ip6_ufo_append_data().  Otherwise data is copied into linear skb space
 * or, on NETIF_F_SG devices, into page fragments; bytes beyond
 * maxfraglen in the previous skb (fraggap) are moved into the new one
 * together with their checksum contribution.
 *
 * On the visible error path cork.length is reduced by the unsent length
 * and OUTDISCARDS is counted.
 * NOTE(review): many lines (labels, branches, returns, the else arms) are
 * missing from this listing - confirm details against the complete file.
 */
1106 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1107 int offset, int len, int odd, struct sk_buff *skb),
1108 void *from, int length, int transhdrlen,
1109 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1110 struct rt6_info *rt, unsigned int flags)
1112 struct inet_sock *inet = inet_sk(sk);
1113 struct ipv6_pinfo *np = inet6_sk(sk);
1114 struct sk_buff *skb;
1115 unsigned int maxfraglen, fragheaderlen;
1122 int csummode = CHECKSUM_NONE;
1124 if (flags&MSG_PROBE)
1126 if (skb_queue_empty(&sk->sk_write_queue)) {
1131 if (WARN_ON(np->cork.opt))
1134 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1135 if (unlikely(np->cork.opt == NULL))
1138 np->cork.opt->tot_len = opt->tot_len;
1139 np->cork.opt->opt_flen = opt->opt_flen;
1140 np->cork.opt->opt_nflen = opt->opt_nflen;
1142 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1144 if (opt->dst0opt && !np->cork.opt->dst0opt)
1147 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1149 if (opt->dst1opt && !np->cork.opt->dst1opt)
1152 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1154 if (opt->hopopt && !np->cork.opt->hopopt)
1157 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1159 if (opt->srcrt && !np->cork.opt->srcrt)
1162 /* need source address above miyazawa*/
1164 dst_hold(&rt->u.dst);
1165 inet->cork.dst = &rt->u.dst;
1166 inet->cork.fl = *fl;
1167 np->cork.hop_limit = hlimit;
1168 np->cork.tclass = tclass;
1169 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1170 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1171 if (np->frag_size < mtu) {
1173 mtu = np->frag_size;
1175 inet->cork.fragsize = mtu;
1176 if (dst_allfrag(rt->u.dst.path))
1177 inet->cork.flags |= IPCORK_ALLFRAG;
1178 inet->cork.length = 0;
1179 sk->sk_sndmsg_page = NULL;
1180 sk->sk_sndmsg_off = 0;
1181 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1182 rt->rt6i_nfheader_len;
1183 length += exthdrlen;
1184 transhdrlen += exthdrlen;
1186 rt = (struct rt6_info *)inet->cork.dst;
1187 fl = &inet->cork.fl;
1191 mtu = inet->cork.fragsize;
1194 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1196 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1197 (opt ? opt->opt_nflen : 0);
1198 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1200 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1201 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1202 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1208 * Let's try using as much space as possible.
1209 * Use MTU if total length of the message fits into the MTU.
1210 * Otherwise, we need to reserve fragment header and
1211 * fragment alignment (= 8-15 octects, in total).
1213 * Note that we may need to "move" the data from the tail of
1214 * of the buffer to the new fragment when we split
1217 * FIXME: It may be fragmented into multiple chunks
1218 * at once if non-fragmentable extension headers
1223 inet->cork.length += length;
1224 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1225 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1227 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1228 fragheaderlen, transhdrlen, mtu,
1235 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1238 while (length > 0) {
1239 /* Check if the remaining data fits into current packet. */
1240 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1242 copy = maxfraglen - skb->len;
1246 unsigned int datalen;
1247 unsigned int fraglen;
1248 unsigned int fraggap;
1249 unsigned int alloclen;
1250 struct sk_buff *skb_prev;
1254 /* There's no room in the current skb */
1256 fraggap = skb_prev->len - maxfraglen;
1261 * If remaining data exceeds the mtu,
1262 * we know we need more fragment(s).
1264 datalen = length + fraggap;
1265 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1266 datalen = maxfraglen - fragheaderlen;
1268 fraglen = datalen + fragheaderlen;
1269 if ((flags & MSG_MORE) &&
1270 !(rt->u.dst.dev->features&NETIF_F_SG))
1273 alloclen = datalen + fragheaderlen;
1276 * The last fragment gets additional space at tail.
1277 * Note: we overallocate on fragments with MSG_MODE
1278 * because we have no idea if we're the last one.
1280 if (datalen == length + fraggap)
1281 alloclen += rt->u.dst.trailer_len;
1284 * We just reserve space for fragment header.
1285 * Note: this may be overallocation if the message
1286 * (without MSG_MORE) fits into the MTU.
1288 alloclen += sizeof(struct frag_hdr);
1291 skb = sock_alloc_send_skb(sk,
1293 (flags & MSG_DONTWAIT), &err);
1296 if (atomic_read(&sk->sk_wmem_alloc) <=
1298 skb = sock_wmalloc(sk,
1299 alloclen + hh_len, 1,
1301 if (unlikely(skb == NULL))
1307 * Fill in the control structures
1309 skb->ip_summed = csummode;
1311 /* reserve for fragmentation */
1312 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1315 * Find where to start putting bytes
1317 data = skb_put(skb, fraglen);
1318 skb_set_network_header(skb, exthdrlen);
1319 data += fragheaderlen;
1320 skb->transport_header = (skb->network_header +
1323 skb->csum = skb_copy_and_csum_bits(
1324 skb_prev, maxfraglen,
1325 data + transhdrlen, fraggap, 0);
1326 skb_prev->csum = csum_sub(skb_prev->csum,
1329 pskb_trim_unique(skb_prev, maxfraglen);
1331 copy = datalen - transhdrlen - fraggap;
1336 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1343 length -= datalen - fraggap;
1346 csummode = CHECKSUM_NONE;
1349 * Put the packet on the pending queue
1351 __skb_queue_tail(&sk->sk_write_queue, skb);
1358 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1362 if (getfrag(from, skb_put(skb, copy),
1363 offset, copy, off, skb) < 0) {
1364 __skb_trim(skb, off);
1369 int i = skb_shinfo(skb)->nr_frags;
1370 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1371 struct page *page = sk->sk_sndmsg_page;
1372 int off = sk->sk_sndmsg_off;
1375 if (page && (left = PAGE_SIZE - off) > 0) {
1378 if (page != frag->page) {
1379 if (i == MAX_SKB_FRAGS) {
1384 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1385 frag = &skb_shinfo(skb)->frags[i];
1387 } else if(i < MAX_SKB_FRAGS) {
1388 if (copy > PAGE_SIZE)
1390 page = alloc_pages(sk->sk_allocation, 0);
1395 sk->sk_sndmsg_page = page;
1396 sk->sk_sndmsg_off = 0;
1398 skb_fill_page_desc(skb, i, page, 0, 0);
1399 frag = &skb_shinfo(skb)->frags[i];
1404 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1408 sk->sk_sndmsg_off += copy;
1411 skb->data_len += copy;
1412 skb->truesize += copy;
1413 atomic_add(copy, &sk->sk_wmem_alloc);
1420 inet->cork.length -= length;
1421 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * Tear down the cork state: free the duplicated option headers and the
 * option block itself and clear np->cork.opt; drop the cached dst
 * reference, clear IPCORK_ALLFRAG and zero the saved flow.
 * NOTE(review): the guard around the kfree() calls is on a line missing
 * from this listing.
 */
1425 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1428 kfree(np->cork.opt->dst0opt);
1429 kfree(np->cork.opt->dst1opt);
1430 kfree(np->cork.opt->hopopt);
1431 kfree(np->cork.opt->srcrt);
1432 kfree(np->cork.opt);
1433 np->cork.opt = NULL;
1436 if (inet->cork.dst) {
1437 dst_release(inet->cork.dst);
1438 inet->cork.dst = NULL;
1439 inet->cork.flags &= ~IPCORK_ALLFRAG;
1441 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * Combine the skbs queued on the socket into one datagram and send it.
 *
 * The first skb is dequeued; every following skb has its network header
 * pulled off, is chained onto the first skb's frag_list, and has its len
 * and truesize added to the first skb (its destructor is cleared).
 * Extension headers from the corked options are pushed, then the IPv6
 * header is filled in from the cork state and flow: traffic class and
 * flow label, hop limit, next header, source and final destination.
 * priority and mark come from the socket, and the corked route is
 * attached as dst.
 *
 * OUT statistics are updated (plus ICMPv6 message counters when the
 * protocol is ICMPv6), the packet is sent with ip6_local_out(), and the
 * cork state is released.  OUTDISCARDS is counted on the visible error
 * path.
 * NOTE(review): labels, the payload_len assignment and parts of the
 * error handling are on lines missing from this listing.
 */
1444 int ip6_push_pending_frames(struct sock *sk)
1446 struct sk_buff *skb, *tmp_skb;
1447 struct sk_buff **tail_skb;
1448 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1449 struct inet_sock *inet = inet_sk(sk);
1450 struct ipv6_pinfo *np = inet6_sk(sk);
1451 struct net *net = sock_net(sk);
1452 struct ipv6hdr *hdr;
1453 struct ipv6_txoptions *opt = np->cork.opt;
1454 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1455 struct flowi *fl = &inet->cork.fl;
1456 unsigned char proto = fl->proto;
1459 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1461 tail_skb = &(skb_shinfo(skb)->frag_list);
1463 /* move skb->data to ip header from ext header */
1464 if (skb->data < skb_network_header(skb))
1465 __skb_pull(skb, skb_network_offset(skb));
1466 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1467 __skb_pull(tmp_skb, skb_network_header_len(skb));
1468 *tail_skb = tmp_skb;
1469 tail_skb = &(tmp_skb->next);
1470 skb->len += tmp_skb->len;
1471 skb->data_len += tmp_skb->len;
1472 skb->truesize += tmp_skb->truesize;
1473 tmp_skb->destructor = NULL;
1477 /* Allow local fragmentation. */
1478 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1481 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1482 __skb_pull(skb, skb_network_header_len(skb));
1483 if (opt && opt->opt_flen)
1484 ipv6_push_frag_opts(skb, opt, &proto);
1485 if (opt && opt->opt_nflen)
1486 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1488 skb_push(skb, sizeof(struct ipv6hdr));
1489 skb_reset_network_header(skb);
1490 hdr = ipv6_hdr(skb);
1492 *(__be32*)hdr = fl->fl6_flowlabel |
1493 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1495 hdr->hop_limit = np->cork.hop_limit;
1496 hdr->nexthdr = proto;
1497 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1498 ipv6_addr_copy(&hdr->daddr, final_dst);
1500 skb->priority = sk->sk_priority;
1501 skb->mark = sk->sk_mark;
1503 skb_dst_set(skb, dst_clone(&rt->u.dst));
1504 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1505 if (proto == IPPROTO_ICMPV6) {
1506 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1508 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1509 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1512 err = ip6_local_out(skb);
1515 err = net_xmit_errno(err);
1521 ip6_cork_release(inet, np);
1524 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * Drop everything still queued on the socket's write queue, taking skbs
 * from the tail, with OUTDISCARDS accounting as shown, then release the
 * cork state with ip6_cork_release().
 * NOTE(review): the condition around the counter and the call that frees
 * each skb are on lines missing from this listing.
 */
1528 void ip6_flush_pending_frames(struct sock *sk)
1530 struct sk_buff *skb;
1532 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1534 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1535 IPSTATS_MIB_OUTDISCARDS);
1539 ip6_cork_release(inet_sk(sk), inet6_sk(sk));