net/core/skbuff.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      Routines having to do with the 'struct sk_buff' memory handlers.
   4  *
   5  *      Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
   6  *                      Florian La Roche <rzsfl@rz.uni-sb.de>
   7  *
   8  *      Fixes:
   9  *              Alan Cox        :       Fixed the worst of the load
  10  *                                      balancer bugs.
  11  *              Dave Platt      :       Interrupt stacking fix.
  12  *      Richard Kooijman        :       Timestamp fixes.
  13  *              Alan Cox        :       Changed buffer format.
  14  *              Alan Cox        :       destructor hook for AF_UNIX etc.
  15  *              Linus Torvalds  :       Better skb_clone.
  16  *              Alan Cox        :       Added skb_copy.
  17  *              Alan Cox        :       Added all the changed routines Linus
  18  *                                      only put in the headers
  19  *              Ray VanTassle   :       Fixed --skb->lock in free
  20  *              Alan Cox        :       skb_copy copy arp field
  21  *              Andi Kleen      :       slabified it.
  22  *              Robert Olsson   :       Removed skb_head_pool
  23  *
  24  *      NOTE:
  25  *              The __skb_ routines should be called with interrupts
  26  *      disabled, or you better be *real* sure that the operation is atomic
  27  *      with respect to whatever list is being frobbed (e.g. via lock_sock()
  28  *      or via disabling bottom half handlers, etc).
  29  */
  30
  31 /*
  32  *      The functions in this file will not compile correctly with gcc 2.4.x
  33  */
  34
  35 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  36
  37 #include <linux/module.h>
  38 #include <linux/types.h>
  39 #include <linux/kernel.h>
  40 #include <linux/mm.h>
  41 #include <linux/interrupt.h>
  42 #include <linux/in.h>
  43 #include <linux/inet.h>
  44 #include <linux/slab.h>
  45 #include <linux/tcp.h>
  46 #include <linux/udp.h>
  47 #include <linux/sctp.h>
  48 #include <linux/netdevice.h>
  49 #ifdef CONFIG_NET_CLS_ACT
  50 #include <net/pkt_sched.h>
  51 #endif
  52 #include <linux/string.h>
  53 #include <linux/skbuff.h>
  54 #include <linux/skbuff_ref.h>
  55 #include <linux/splice.h>
  56 #include <linux/cache.h>
  57 #include <linux/rtnetlink.h>
  58 #include <linux/init.h>
  59 #include <linux/scatterlist.h>
  60 #include <linux/errqueue.h>
  61 #include <linux/prefetch.h>
  62 #include <linux/bitfield.h>
  63 #include <linux/if_vlan.h>
  64 #include <linux/mpls.h>
  65 #include <linux/kcov.h>
  66 #include <linux/iov_iter.h>
  67
  68 #include <net/protocol.h>
  69 #include <net/dst.h>
  70 #include <net/sock.h>
  71 #include <net/checksum.h>
  72 #include <net/gso.h>
  73 #include <net/hotdata.h>
  74 #include <net/ip6_checksum.h>
  75 #include <net/xfrm.h>
  76 #include <net/mpls.h>
  77 #include <net/mptcp.h>
  78 #include <net/mctp.h>
  79 #include <net/page_pool/helpers.h>
  80 #include <net/dropreason.h>
  81
  82 #include <linux/uaccess.h>
  83 #include <trace/events/skb.h>
  84 #include <linux/highmem.h>
  85 #include <linux/capability.h>
  86 #include <linux/user_namespace.h>
  87 #include <linux/indirect_call_wrapper.h>
  88 #include <linux/textsearch.h>
  89
  90 #include "dev.h"
  91 #include "sock_destructor.h"
  92
  93 #ifdef CONFIG_SKB_EXTENSIONS
  94 static struct kmem_cache *skbuff_ext_cache __ro_after_init;
  95 #endif
  96
  97 #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
  98
  99 /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 100  * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 101  * size, and we can differentiate heads from skb_small_head_cache
 102  * vs system slabs by looking at their size (skb_end_offset()).
 103  */
 104 #define SKB_SMALL_HEAD_CACHE_SIZE                                       \
 105         (is_power_of_2(SKB_SMALL_HEAD_SIZE) ?                   \
 106                 (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :        \
 107                 SKB_SMALL_HEAD_SIZE)
 108
 109 #define SKB_SMALL_HEAD_HEADROOM                                         \
 110         SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
 111
 112 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 113 EXPORT_SYMBOL(sysctl_max_skb_frags);
 114
 115 /* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 116  * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the
 117  * netmem is a page.
 118  */
 119 static_assert(offsetof(struct bio_vec, bv_page) ==
 120               offsetof(skb_frag_t, netmem));
 121 static_assert(sizeof_field(struct bio_vec, bv_page) ==
 122               sizeof_field(skb_frag_t, netmem));
 123
 124 static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
 125 static_assert(sizeof_field(struct bio_vec, bv_len) ==
 126               sizeof_field(skb_frag_t, len));
 127
 128 static_assert(offsetof(struct bio_vec, bv_offset) ==
 129               offsetof(skb_frag_t, offset));
 130 static_assert(sizeof_field(struct bio_vec, bv_offset) ==
 131               sizeof_field(skb_frag_t, offset));
 132
 133 #undef FN
 134 #define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
 135 static const char * const drop_reasons[] = {
 136         [SKB_CONSUMED] = "CONSUMED",
 137         DEFINE_DROP_REASON(FN, FN)
 138 };
 139
 140 static const struct drop_reason_list drop_reasons_core = {
 141         .reasons = drop_reasons,
 142         .n_reasons = ARRAY_SIZE(drop_reasons),
 143 };
 144
 145 const struct drop_reason_list __rcu *
 146 drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
 147         [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
 148 };
 149 EXPORT_SYMBOL(drop_reasons_by_subsys);
 150
 151 /**
 152  * drop_reasons_register_subsys - register another drop reason subsystem
 153  * @subsys: the subsystem to register, must not be the core
 154  * @list: the list of drop reasons within the subsystem, must point to
 155  *      a statically initialized list
 156  */
 157 void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
 158                                   const struct drop_reason_list *list)
 159 {
 160         if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
 161                  subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
 162                  "invalid subsystem %d\n", subsys))
 163                 return;
 164
 165         /* must point to statically allocated memory, so INIT is OK */
 166         RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
 167 }
 168 EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
 169
 170 /**
 171  * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 172  * @subsys: the subsystem to remove, must not be the core
 173  *
 174  * Note: This will synchronize_rcu() to ensure no users when it returns.
 175  */
 176 void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
 177 {
 178         if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
 179                  subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
 180                  "invalid subsystem %d\n", subsys))
 181                 return;
 182
 183         RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
 184
 185         synchronize_rcu();
 186 }
 187 EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
 188
 189 /**
 190  *      skb_panic - private function for out-of-line support
 191  *      @skb:   buffer
 192  *      @sz:    size
 193  *      @addr:  address
 194  *      @msg:   skb_over_panic or skb_under_panic
 195  *
 196  *      Out-of-line support for skb_put() and skb_push().
 197  *      Called via the wrapper skb_over_panic() or skb_under_panic().
 198  *      Keep out of line to prevent kernel bloat.
 199  *      __builtin_return_address is not used because it is not always reliable.
 200  */
 201 static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
 202                       const char msg[])
 203 {
 204         pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
 205                  msg, addr, skb->len, sz, skb->head, skb->data,
 206                  (unsigned long)skb->tail, (unsigned long)skb->end,
 207                  skb->dev ? skb->dev->name : "<NULL>");
 208         BUG();
 209 }
 210
 211 static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 212 {
 213         skb_panic(skb, sz, addr, __func__);
 214 }
 215
 216 static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 217 {
 218         skb_panic(skb, sz, addr, __func__);
 219 }
 220
 221 #define NAPI_SKB_CACHE_SIZE     64
 222 #define NAPI_SKB_CACHE_BULK     16
 223 #define NAPI_SKB_CACHE_HALF     (NAPI_SKB_CACHE_SIZE / 2)
 224
 225 #if PAGE_SIZE == SZ_4K
 226
 227 #define NAPI_HAS_SMALL_PAGE_FRAG        1
 228 #define NAPI_SMALL_PAGE_PFMEMALLOC(nc)  ((nc).pfmemalloc)
 229
 230 /* specialized page frag allocator using a single order 0 page
 231  * and slicing it into 1K sized fragment. Constrained to systems
 232  * with a very limited amount of 1K fragments fitting a single
 233  * page - to avoid excessive truesize underestimation
 234  */
 235
 236 struct page_frag_1k {
 237         void *va;
 238         u16 offset;
 239         bool pfmemalloc;
 240 };
 241
 242 static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
 243 {
 244         struct page *page;
 245         int offset;
 246
 247         offset = nc->offset - SZ_1K;
 248         if (likely(offset >= 0))
 249                 goto use_frag;
 250
 251         page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 252         if (!page)
 253                 return NULL;
 254
 255         nc->va = page_address(page);
 256         nc->pfmemalloc = page_is_pfmemalloc(page);
 257         offset = PAGE_SIZE - SZ_1K;
 258         page_ref_add(page, offset / SZ_1K);
 259
 260 use_frag:
 261         nc->offset = offset;
 262         return nc->va + offset;
 263 }
 264 #else
 265
 266 /* the small page is actually unused in this build; add dummy helpers
 267  * to please the compiler and avoid later preprocessor's conditionals
 268  */
 269 #define NAPI_HAS_SMALL_PAGE_FRAG        0
 270 #define NAPI_SMALL_PAGE_PFMEMALLOC(nc)  false
 271
 272 struct page_frag_1k {
 273 };
 274
 275 static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
 276 {
 277         return NULL;
 278 }
 279
 280 #endif
 281
 282 struct napi_alloc_cache {
 283         struct page_frag_cache page;
 284         struct page_frag_1k page_small;
 285         unsigned int skb_count;
 286         void *skb_cache[NAPI_SKB_CACHE_SIZE];
 287 };
 288
 289 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 290 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 291
 292 /* Double check that napi_get_frags() allocates skbs with
 293  * skb->head being backed by slab, not a page fragment.
 294  * This is to make sure bug fixed in 3226b158e67c
 295  * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 296  * does not accidentally come back.
 297  */
 298 void napi_get_frags_check(struct napi_struct *napi)
 299 {
 300         struct sk_buff *skb;
 301
 302         local_bh_disable();
 303         skb = napi_get_frags(napi);
 304         WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
 305         napi_free_frags(napi);
 306         local_bh_enable();
 307 }
 308
 309 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 310 {
 311         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 312
 313         fragsz = SKB_DATA_ALIGN(fragsz);
 314
 315         return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
 316                                        align_mask);
 317 }
 318 EXPORT_SYMBOL(__napi_alloc_frag_align);
 319
 320 void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 321 {
 322         void *data;
 323
 324         fragsz = SKB_DATA_ALIGN(fragsz);
 325         if (in_hardirq() || irqs_disabled()) {
 326                 struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
 327
 328                 data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
 329                                                align_mask);
 330         } else {
 331                 struct napi_alloc_cache *nc;
 332
 333                 local_bh_disable();
 334                 nc = this_cpu_ptr(&napi_alloc_cache);
 335                 data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
 336                                                align_mask);
 337                 local_bh_enable();
 338         }
 339         return data;
 340 }
 341 EXPORT_SYMBOL(__netdev_alloc_frag_align);
 342
 343 static struct sk_buff *napi_skb_cache_get(void)
 344 {
 345         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 346         struct sk_buff *skb;
 347
 348         if (unlikely(!nc->skb_count)) {
 349                 nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
 350                                                       GFP_ATOMIC,
 351                                                       NAPI_SKB_CACHE_BULK,
 352                                                       nc->skb_cache);
 353                 if (unlikely(!nc->skb_count))
 354                         return NULL;
 355         }
 356
 357         skb = nc->skb_cache[--nc->skb_count];
 358         kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
 359
 360         return skb;
 361 }
 362
 363 static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
 364                                          unsigned int size)
 365 {
 366         struct skb_shared_info *shinfo;
 367
 368         size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 369
 370         /* Assumes caller memset cleared SKB */
 371         skb->truesize = SKB_TRUESIZE(size);
 372         refcount_set(&skb->users, 1);
 373         skb->head = data;
 374         skb->data = data;
 375         skb_reset_tail_pointer(skb);
 376         skb_set_end_offset(skb, size);
 377         skb->mac_header = (typeof(skb->mac_header))~0U;
 378         skb->transport_header = (typeof(skb->transport_header))~0U;
 379         skb->alloc_cpu = raw_smp_processor_id();
 380         /* make sure we initialize shinfo sequentially */
 381         shinfo = skb_shinfo(skb);
 382         memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 383         atomic_set(&shinfo->dataref, 1);
 384
 385         skb_set_kcov_handle(skb, kcov_common_handle());
 386 }
 387
 388 static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
 389                                      unsigned int *size)
 390 {
 391         void *resized;
 392
 393         /* Must find the allocation size (and grow it to match). */
 394         *size = ksize(data);
 395         /* krealloc() will immediately return "data" when
 396          * "ksize(data)" is requested: it is the existing upper
 397          * bounds. As a result, GFP_ATOMIC will be ignored. Note
 398          * that this "new" pointer needs to be passed back to the
 399          * caller for use so the __alloc_size hinting will be
 400          * tracked correctly.
 401          */
 402         resized = krealloc(data, *size, GFP_ATOMIC);
 403         WARN_ON_ONCE(resized != data);
 404         return resized;
 405 }
 406
 407 /* build_skb() variant which can operate on slab buffers.
 408  * Note that this should be used sparingly as slab buffers
 409  * cannot be combined efficiently by GRO!
 410  */
 411 struct sk_buff *slab_build_skb(void *data)
 412 {
 413         struct sk_buff *skb;
 414         unsigned int size;
 415
 416         skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
 417         if (unlikely(!skb))
 418                 return NULL;
 419
 420         memset(skb, 0, offsetof(struct sk_buff, tail));
 421         data = __slab_build_skb(skb, data, &size);
 422         __finalize_skb_around(skb, data, size);
 423
 424         return skb;
 425 }
 426 EXPORT_SYMBOL(slab_build_skb);
 427
 428 /* Caller must provide SKB that is memset cleared */
 429 static void __build_skb_around(struct sk_buff *skb, void *data,
 430                                unsigned int frag_size)
 431 {
 432         unsigned int size = frag_size;
 433
 434         /* frag_size == 0 is considered deprecated now. Callers
 435          * using slab buffer should use slab_build_skb() instead.
 436          */
 437         if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
 438                 data = __slab_build_skb(skb, data, &size);
 439
 440         __finalize_skb_around(skb, data, size);
 441 }
 442
 443 /**
 444  * __build_skb - build a network buffer
 445  * @data: data buffer provided by caller
 446  * @frag_size: size of data (must not be 0)
 447  *
 448  * Allocate a new &sk_buff. Caller provides space holding head and
 449  * skb_shared_info. @data must have been allocated from the page
 450  * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 451  * allocation is deprecated, and callers should use slab_build_skb()
 452  * instead.)
 453  * The return is the new skb buffer.
 454  * On a failure the return is %NULL, and @data is not freed.
 455  * Notes :
 456  *  Before IO, driver allocates only data buffer where NIC put incoming frame
 457  *  Driver should add room at head (NET_SKB_PAD) and
 458  *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 459  *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 460  *  before giving packet to stack.
 461  *  RX rings only contains data buffers, not full skbs.
 462  */
 463 struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 464 {
 465         struct sk_buff *skb;
 466
 467         skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
 468         if (unlikely(!skb))
 469                 return NULL;
 470
 471         memset(skb, 0, offsetof(struct sk_buff, tail));
 472         __build_skb_around(skb, data, frag_size);
 473
 474         return skb;
 475 }
 476
 477 /* build_skb() is wrapper over __build_skb(), that specifically
 478  * takes care of skb->head and skb->pfmemalloc
 479  */
 480 struct sk_buff *build_skb(void *data, unsigned int frag_size)
 481 {
 482         struct sk_buff *skb = __build_skb(data, frag_size);
 483
 484         if (likely(skb && frag_size)) {
 485                 skb->head_frag = 1;
 486                 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
 487         }
 488         return skb;
 489 }
 490 EXPORT_SYMBOL(build_skb);
 491
 492 /**
 493  * build_skb_around - build a network buffer around provided skb
 494  * @skb: sk_buff provide by caller, must be memset cleared
 495  * @data: data buffer provided by caller
 496  * @frag_size: size of data
 497  */
 498 struct sk_buff *build_skb_around(struct sk_buff *skb,
 499                                  void *data, unsigned int frag_size)
 500 {
 501         if (unlikely(!skb))
 502                 return NULL;
 503
 504         __build_skb_around(skb, data, frag_size);
 505
 506         if (frag_size) {
 507                 skb->head_frag = 1;
 508                 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
 509         }
 510         return skb;
 511 }
 512 EXPORT_SYMBOL(build_skb_around);
 513
 514 /**
 515  * __napi_build_skb - build a network buffer
 516  * @data: data buffer provided by caller
 517  * @frag_size: size of data
 518  *
 519  * Version of __build_skb() that uses NAPI percpu caches to obtain
 520  * skbuff_head instead of inplace allocation.
 521  *
 522  * Returns a new &sk_buff on success, %NULL on allocation failure.
 523  */
 524 static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
 525 {
 526         struct sk_buff *skb;
 527
 528         skb = napi_skb_cache_get();
 529         if (unlikely(!skb))
 530                 return NULL;
 531
 532         memset(skb, 0, offsetof(struct sk_buff, tail));
 533         __build_skb_around(skb, data, frag_size);
 534
 535         return skb;
 536 }
 537
 538 /**
 539  * napi_build_skb - build a network buffer
 540  * @data: data buffer provided by caller
 541  * @frag_size: size of data
 542  *
 543  * Version of __napi_build_skb() that takes care of skb->head_frag
 544  * and skb->pfmemalloc when the data is a page or page fragment.
 545  *
 546  * Returns a new &sk_buff on success, %NULL on allocation failure.
 547  */
 548 struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
 549 {
 550         struct sk_buff *skb = __napi_build_skb(data, frag_size);
 551
 552         if (likely(skb) && frag_size) {
 553                 skb->head_frag = 1;
 554                 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
 555         }
 556
 557         return skb;
 558 }
 559 EXPORT_SYMBOL(napi_build_skb);
 560
 561 /*
 562  * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 563  * the caller if emergency pfmemalloc reserves are being used. If it is and
 564  * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 565  * may be used. Otherwise, the packet data may be discarded until enough
 566  * memory is free
 567  */
 568 static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 569                              bool *pfmemalloc)
 570 {
 571         bool ret_pfmemalloc = false;
 572         size_t obj_size;
 573         void *obj;
 574
 575         obj_size = SKB_HEAD_ALIGN(*size);
 576         if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
 577             !(flags & KMALLOC_NOT_NORMAL_BITS)) {
 578                 obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
 579                                 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
 580                                 node);
 581                 *size = SKB_SMALL_HEAD_CACHE_SIZE;
 582                 if (obj || !(gfp_pfmemalloc_allowed(flags)))
 583                         goto out;
 584                 /* Try again but now we are using pfmemalloc reserves */
 585                 ret_pfmemalloc = true;
 586                 obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
 587                 goto out;
 588         }
 589
 590         obj_size = kmalloc_size_roundup(obj_size);
 591         /* The following cast might truncate high-order bits of obj_size, this
 592          * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
 593          */
 594         *size = (unsigned int)obj_size;
 595
 596         /*
 597          * Try a regular allocation, when that fails and we're not entitled
 598          * to the reserves, fail.
 599          */
 600         obj = kmalloc_node_track_caller(obj_size,
 601                                         flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
 602                                         node);
 603         if (obj || !(gfp_pfmemalloc_allowed(flags)))
 604                 goto out;
 605
 606         /* Try again but now we are using pfmemalloc reserves */
 607         ret_pfmemalloc = true;
 608         obj = kmalloc_node_track_caller(obj_size, flags, node);
 609
 610 out:
 611         if (pfmemalloc)
 612                 *pfmemalloc = ret_pfmemalloc;
 613
 614         return obj;
 615 }
 616
 617 /*      Allocate a new skbuff. We do this ourselves so we can fill in a few
 618  *      'private' fields and also do memory statistics to find all the
 619  *      [BEEP] leaks.
 620  *
 621  */
 622
 623 /**
 624  *      __alloc_skb     -       allocate a network buffer
 625  *      @size: size to allocate
 626  *      @gfp_mask: allocation mask
 627  *      @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 628  *              instead of head cache and allocate a cloned (child) skb.
 629  *              If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 630  *              allocations in case the data is required for writeback
 631  *      @node: numa node to allocate memory on
 632  *
 633  *      Allocate a new &sk_buff. The returned buffer has no headroom and a
 634  *      tail room of at least size bytes. The object has a reference count
 635  *      of one. The return is the buffer. On a failure the return is %NULL.
 636  *
 637  *      Buffers may only be allocated from interrupts using a @gfp_mask of
 638  *      %GFP_ATOMIC.
 639  */
 640 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 641                             int flags, int node)
 642 {
 643         struct kmem_cache *cache;
 644         struct sk_buff *skb;
 645         bool pfmemalloc;
 646         u8 *data;
 647
 648         cache = (flags & SKB_ALLOC_FCLONE)
 649                 ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;
 650
 651         if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
 652                 gfp_mask |= __GFP_MEMALLOC;
 653
 654         /* Get the HEAD */
 655         if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
 656             likely(node == NUMA_NO_NODE || node == numa_mem_id()))
 657                 skb = napi_skb_cache_get();
 658         else
 659                 skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
 660         if (unlikely(!skb))
 661                 return NULL;
 662         prefetchw(skb);
 663
 664         /* We do our best to align skb_shared_info on a separate cache
 665          * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
 666          * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 667          * Both skb->head and skb_shared_info are cache line aligned.
 668          */
 669         data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
 670         if (unlikely(!data))
 671                 goto nodata;
 672         /* kmalloc_size_roundup() might give us more room than requested.
 673          * Put skb_shared_info exactly at the end of allocated zone,
 674          * to allow max possible filling before reallocation.
 675          */
 676         prefetchw(data + SKB_WITH_OVERHEAD(size));
 677
 678         /*
 679          * Only clear those fields we need to clear, not those that we will
 680          * actually initialise below. Hence, don't put any more fields after
 681          * the tail pointer in struct sk_buff!
 682          */
 683         memset(skb, 0, offsetof(struct sk_buff, tail));
 684         __build_skb_around(skb, data, size);
 685         skb->pfmemalloc = pfmemalloc;
 686
 687         if (flags & SKB_ALLOC_FCLONE) {
 688                 struct sk_buff_fclones *fclones;
 689
 690                 fclones = container_of(skb, struct sk_buff_fclones, skb1);
 691
 692                 skb->fclone = SKB_FCLONE_ORIG;
 693                 refcount_set(&fclones->fclone_ref, 1);
 694         }
 695
 696         return skb;
 697
 698 nodata:
 699         kmem_cache_free(cache, skb);
 700         return NULL;
 701 }
 702 EXPORT_SYMBOL(__alloc_skb);
 703
 704 /**
 705  *      __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 706  *      @dev: network device to receive on
 707  *      @len: length to allocate
 708  *      @gfp_mask: get_free_pages mask, passed to alloc_skb
 709  *
 710  *      Allocate a new &sk_buff and assign it a usage count of one. The
 711  *      buffer has NET_SKB_PAD headroom built in. Users should allocate
 712  *      the headroom they think they need without accounting for the
 713  *      built in space. The built in space is used for optimisations.
 714  *
 715  *      %NULL is returned if there is no free memory.
 716  */
 717 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 718                                    gfp_t gfp_mask)
 719 {
 720         struct page_frag_cache *nc;
 721         struct sk_buff *skb;
 722         bool pfmemalloc;
 723         void *data;
 724
 725         len += NET_SKB_PAD;
 726
 727         /* If requested length is either too small or too big,
 728          * we use kmalloc() for skb->head allocation.
 729          */
 730         if (len <= SKB_WITH_OVERHEAD(1024) ||
 731             len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 732             (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 733                 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
 734                 if (!skb)
 735                         goto skb_fail;
 736                 goto skb_success;
 737         }
 738
 739         len = SKB_HEAD_ALIGN(len);
 740
 741         if (sk_memalloc_socks())
 742                 gfp_mask |= __GFP_MEMALLOC;
 743
 744         if (in_hardirq() || irqs_disabled()) {
 745                 nc = this_cpu_ptr(&netdev_alloc_cache);
 746                 data = page_frag_alloc(nc, len, gfp_mask);
 747                 pfmemalloc = nc->pfmemalloc;
 748         } else {
 749                 local_bh_disable();
 750                 nc = this_cpu_ptr(&napi_alloc_cache.page);
 751                 data = page_frag_alloc(nc, len, gfp_mask);
 752                 pfmemalloc = nc->pfmemalloc;
 753                 local_bh_enable();
 754         }
 755
 756         if (unlikely(!data))
 757                 return NULL;
 758
 759         skb = __build_skb(data, len);
 760         if (unlikely(!skb)) {
 761                 skb_free_frag(data);
 762                 return NULL;
 763         }
 764
 765         if (pfmemalloc)
 766                 skb->pfmemalloc = 1;
 767         skb->head_frag = 1;
 768
 769 skb_success:
 770         skb_reserve(skb, NET_SKB_PAD);
 771         skb->dev = dev;
 772
 773 skb_fail:
 774         return skb;
 775 }
 776 EXPORT_SYMBOL(__netdev_alloc_skb);
 777
 778 /**
 779  *      napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 780  *      @napi: napi instance this buffer was allocated for
 781  *      @len: length to allocate
 782  *
 783  *      Allocate a new sk_buff for use in NAPI receive.  This buffer will
 784  *      attempt to allocate the head from a special reserved region used
 785  *      only for NAPI Rx allocation.  By doing this we can save several
 786  *      CPU cycles by avoiding having to disable and re-enable IRQs.
 787  *
 788  *      %NULL is returned if there is no free memory.
 789  */
 790 struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 791 {
 792         gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
 793         struct napi_alloc_cache *nc;
 794         struct sk_buff *skb;
 795         bool pfmemalloc;
 796         void *data;
 797
 798         DEBUG_NET_WARN_ON_ONCE(!in_softirq());
 799         len += NET_SKB_PAD + NET_IP_ALIGN;
 800
 801         /* If requested length is either too small or too big,
 802          * we use kmalloc() for skb->head allocation.
 803          * When the small frag allocator is available, prefer it over kmalloc
 804          * for small fragments
 805          */
 806         if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
 807             len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 808             (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 809                 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
 810                                   NUMA_NO_NODE);
 811                 if (!skb)
 812                         goto skb_fail;
 813                 goto skb_success;
 814         }
 815
 816         nc = this_cpu_ptr(&napi_alloc_cache);
 817
 818         if (sk_memalloc_socks())
 819                 gfp_mask |= __GFP_MEMALLOC;
 820
 821         if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
 822                 /* we are artificially inflating the allocation size, but
 823                  * that is not as bad as it may look like, as:
 824                  * - 'len' less than GRO_MAX_HEAD makes little sense
 825                  * - On most systems, larger 'len' values lead to fragment
 826                  *   size above 512 bytes
 827                  * - kmalloc would use the kmalloc-1k slab for such values
 828                  * - Builds with smaller GRO_MAX_HEAD will very likely do
 829                  *   little networking, as that implies no WiFi and no
 830                  *   tunnels support, and 32 bits arches.
 831                  */
 832                 len = SZ_1K;
 833
 834                 data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
 835                 pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
 836         } else {
 837                 len = SKB_HEAD_ALIGN(len);
 838
 839                 data = page_frag_alloc(&nc->page, len, gfp_mask);
 840                 pfmemalloc = nc->page.pfmemalloc;
 841         }
 842
 843         if (unlikely(!data))
 844                 return NULL;
 845
 846         skb = __napi_build_skb(data, len);
 847         if (unlikely(!skb)) {
 848                 skb_free_frag(data);
 849                 return NULL;
 850         }
 851
 852         if (pfmemalloc)
 853                 skb->pfmemalloc = 1;
 854         skb->head_frag = 1;
 855
 856 skb_success:
 857         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
 858         skb->dev = napi->dev;
 859
 860 skb_fail:
 861         return skb;
 862 }
 863 EXPORT_SYMBOL(napi_alloc_skb);
 864
 865 void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
 866                             int off, int size, unsigned int truesize)
 867 {
 868         DEBUG_NET_WARN_ON_ONCE(size > truesize);
 869
 870         skb_fill_netmem_desc(skb, i, netmem, off, size);
 871         skb->len += size;
 872         skb->data_len += size;
 873         skb->truesize += truesize;
 874 }
 875 EXPORT_SYMBOL(skb_add_rx_frag_netmem);
 876
 877 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
 878                           unsigned int truesize)
 879 {
 880         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 881
 882         DEBUG_NET_WARN_ON_ONCE(size > truesize);
 883
 884         skb_frag_size_add(frag, size);
 885         skb->len += size;
 886         skb->data_len += size;
 887         skb->truesize += truesize;
 888 }
 889 EXPORT_SYMBOL(skb_coalesce_rx_frag);
 890
 891 static void skb_drop_list(struct sk_buff **listp)
 892 {
 893         kfree_skb_list(*listp);
 894         *listp = NULL;
 895 }
 896
 897 static inline void skb_drop_fraglist(struct sk_buff *skb)
 898 {
 899         skb_drop_list(&skb_shinfo(skb)->frag_list);
 900 }
 901
 902 static void skb_clone_fraglist(struct sk_buff *skb)
 903 {
 904         struct sk_buff *list;
 905
 906         skb_walk_frags(skb, list)
 907                 skb_get(list);
 908 }
 909
 910 static bool is_pp_page(struct page *page)
 911 {
 912         return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
 913 }
 914
 915 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 916                     unsigned int headroom)
 917 {
 918 #if IS_ENABLED(CONFIG_PAGE_POOL)
 919         u32 size, truesize, len, max_head_size, off;
 920         struct sk_buff *skb = *pskb, *nskb;
 921         int err, i, head_off;
 922         void *data;
 923
 924         /* XDP does not support fraglist so we need to linearize
 925          * the skb.
 926          */
 927         if (skb_has_frag_list(skb))
 928                 return -EOPNOTSUPP;
 929
 930         max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
 931         if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
 932                 return -ENOMEM;
 933
 934         size = min_t(u32, skb->len, max_head_size);
 935         truesize = SKB_HEAD_ALIGN(size) + headroom;
 936         data = page_pool_dev_alloc_va(pool, &truesize);
 937         if (!data)
 938                 return -ENOMEM;
 939
 940         nskb = napi_build_skb(data, truesize);
 941         if (!nskb) {
 942                 page_pool_free_va(pool, data, true);
 943                 return -ENOMEM;
 944         }
 945
 946         skb_reserve(nskb, headroom);
 947         skb_copy_header(nskb, skb);
 948         skb_mark_for_recycle(nskb);
 949
 950         err = skb_copy_bits(skb, 0, nskb->data, size);
 951         if (err) {
 952                 consume_skb(nskb);
 953                 return err;
 954         }
 955         skb_put(nskb, size);
 956
 957         head_off = skb_headroom(nskb) - skb_headroom(skb);
 958         skb_headers_offset_update(nskb, head_off);
 959
 960         off = size;
 961         len = skb->len - off;
 962         for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
 963                 struct page *page;
 964                 u32 page_off;
 965
 966                 size = min_t(u32, len, PAGE_SIZE);
 967                 truesize = size;
 968
 969                 page = page_pool_dev_alloc(pool, &page_off, &truesize);
 970                 if (!page) {
 971                         consume_skb(nskb);
 972                         return -ENOMEM;
 973                 }
 974
 975                 skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
 976                 err = skb_copy_bits(skb, off, page_address(page) + page_off,
 977                                     size);
 978                 if (err) {
 979                         consume_skb(nskb);
 980                         return err;
 981                 }
 982
 983                 len -= size;
 984                 off += size;
 985         }
 986
 987         consume_skb(skb);
 988         *pskb = nskb;
 989
 990         return 0;
 991 #else
 992         return -EOPNOTSUPP;
 993 #endif
 994 }
 995 EXPORT_SYMBOL(skb_pp_cow_data);
 996
 997 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 998                          struct bpf_prog *prog)
 999 {
1000         if (!prog->aux->xdp_has_frags)
1001                 return -EINVAL;
1002
1003         return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
1004 }
1005 EXPORT_SYMBOL(skb_cow_data_for_xdp);
1006
1007 #if IS_ENABLED(CONFIG_PAGE_POOL)
1008 bool napi_pp_put_page(struct page *page)
1009 {
1010         page = compound_head(page);
1011
1012         /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
1013          * in order to preserve any existing bits, such as bit 0 for the
1014          * head page of compound page and bit 1 for pfmemalloc page, so
1015          * mask those bits for freeing side when doing below checking,
1016          * and page_is_pfmemalloc() is checked in __page_pool_put_page()
1017          * to avoid recycling the pfmemalloc page.
1018          */
1019         if (unlikely(!is_pp_page(page)))
1020                 return false;
1021
1022         page_pool_put_full_page(page->pp, page, false);
1023
1024         return true;
1025 }
1026 EXPORT_SYMBOL(napi_pp_put_page);
1027 #endif
1028
1029 static bool skb_pp_recycle(struct sk_buff *skb, void *data)
1030 {
1031         if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
1032                 return false;
1033         return napi_pp_put_page(virt_to_page(data));
1034 }
1035
1036 /**
1037  * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
1038  * @skb:        page pool aware skb
1039  *
1040  * Increase the fragment reference count (pp_ref_count) of a skb. This is
1041  * intended to gain fragment references only for page pool aware skbs,
1042  * i.e. when skb->pp_recycle is true, and not for fragments in a
1043  * non-pp-recycling skb. It has a fallback to increase references on normal
1044  * pages, as page pool aware skbs may also have normal page fragments.
1045  */
1046 static int skb_pp_frag_ref(struct sk_buff *skb)
1047 {
1048         struct skb_shared_info *shinfo;
1049         struct page *head_page;
1050         int i;
1051
1052         if (!skb->pp_recycle)
1053                 return -EINVAL;
1054
1055         shinfo = skb_shinfo(skb);
1056
1057         for (i = 0; i < shinfo->nr_frags; i++) {
1058                 head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
1059                 if (likely(is_pp_page(head_page)))
1060                         page_pool_ref_page(head_page);
1061                 else
1062                         page_ref_inc(head_page);
1063         }
1064         return 0;
1065 }
1066
1067 static void skb_kfree_head(void *head, unsigned int end_offset)
1068 {
1069         if (end_offset == SKB_SMALL_HEAD_HEADROOM)
1070                 kmem_cache_free(net_hotdata.skb_small_head_cache, head);
1071         else
1072                 kfree(head);
1073 }
1074
1075 static void skb_free_head(struct sk_buff *skb)
1076 {
1077         unsigned char *head = skb->head;
1078
1079         if (skb->head_frag) {
1080                 if (skb_pp_recycle(skb, head))
1081                         return;
1082                 skb_free_frag(head);
1083         } else {
1084                 skb_kfree_head(head, skb_end_offset(skb));
1085         }
1086 }
1087
1088 static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
1089 {
1090         struct skb_shared_info *shinfo = skb_shinfo(skb);
1091         int i;
1092
1093         if (!skb_data_unref(skb, shinfo))
1094                 goto exit;
1095
1096         if (skb_zcopy(skb)) {
1097                 bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
1098
1099                 skb_zcopy_clear(skb, true);
1100                 if (skip_unref)
1101                         goto free_head;
1102         }
1103
1104         for (i = 0; i < shinfo->nr_frags; i++)
1105                 __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
1106
1107 free_head:
1108         if (shinfo->frag_list)
1109                 kfree_skb_list_reason(shinfo->frag_list, reason);
1110
1111         skb_free_head(skb);
1112 exit:
1113         /* When we clone an SKB we copy the reycling bit. The pp_recycle
1114          * bit is only set on the head though, so in order to avoid races
1115          * while trying to recycle fragments on __skb_frag_unref() we need
1116          * to make one SKB responsible for triggering the recycle path.
1117          * So disable the recycling bit if an SKB is cloned and we have
1118          * additional references to the fragmented part of the SKB.
1119          * Eventually the last SKB will have the recycling bit set and it's
1120          * dataref set to 0, which will trigger the recycling
1121          */
1122         skb->pp_recycle = 0;
1123 }
1124
1125 /*
1126  *      Free an skbuff by memory without cleaning the state.
1127  */
1128 static void kfree_skbmem(struct sk_buff *skb)
1129 {
1130         struct sk_buff_fclones *fclones;
1131
1132         switch (skb->fclone) {
1133         case SKB_FCLONE_UNAVAILABLE:
1134                 kmem_cache_free(net_hotdata.skbuff_cache, skb);
1135                 return;
1136
1137         case SKB_FCLONE_ORIG:
1138                 fclones = container_of(skb, struct sk_buff_fclones, skb1);
1139
1140                 /* We usually free the clone (TX completion) before original skb
1141                  * This test would have no chance to be true for the clone,
1142                  * while here, branch prediction will be good.
1143                  */
1144                 if (refcount_read(&fclones->fclone_ref) == 1)
1145                         goto fastpath;
1146                 break;
1147
1148         default: /* SKB_FCLONE_CLONE */
1149                 fclones = container_of(skb, struct sk_buff_fclones, skb2);
1150                 break;
1151         }
1152         if (!refcount_dec_and_test(&fclones->fclone_ref))
1153                 return;
1154 fastpath:
1155         kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
1156 }
1157
1158 void skb_release_head_state(struct sk_buff *skb)
1159 {
1160         skb_dst_drop(skb);
1161         if (skb->destructor) {
1162                 DEBUG_NET_WARN_ON_ONCE(in_hardirq());
1163                 skb->destructor(skb);
1164         }
1165 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1166         nf_conntrack_put(skb_nfct(skb));
1167 #endif
1168         skb_ext_put(skb);
1169 }
1170
1171 /* Free everything but the sk_buff shell. */
1172 static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
1173 {
1174         skb_release_head_state(skb);
1175         if (likely(skb->head))
1176                 skb_release_data(skb, reason);
1177 }
1178
1179 /**
1180  *      __kfree_skb - private function
1181  *      @skb: buffer
1182  *
1183  *      Free an sk_buff. Release anything attached to the buffer.
1184  *      Clean the state. This is an internal helper function. Users should
1185  *      always call kfree_skb
1186  */
1187
1188 void __kfree_skb(struct sk_buff *skb)
1189 {
1190         skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
1191         kfree_skbmem(skb);
1192 }
1193 EXPORT_SYMBOL(__kfree_skb);
1194
1195 static __always_inline
1196 bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
1197 {
1198         if (unlikely(!skb_unref(skb)))
1199                 return false;
1200
1201         DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
1202                                u32_get_bits(reason,
1203                                             SKB_DROP_REASON_SUBSYS_MASK) >=
1204                                 SKB_DROP_REASON_SUBSYS_NUM);
1205
1206         if (reason == SKB_CONSUMED)
1207                 trace_consume_skb(skb, __builtin_return_address(0));
1208         else
1209                 trace_kfree_skb(skb, __builtin_return_address(0), reason);
1210         return true;
1211 }
1212
1213 /**
1214  *      kfree_skb_reason - free an sk_buff with special reason
1215  *      @skb: buffer to free
1216  *      @reason: reason why this skb is dropped
1217  *
1218  *      Drop a reference to the buffer and free it if the usage count has
1219  *      hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
1220  *      tracepoint.
1221  */
1222 void __fix_address
1223 kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
1224 {
1225         if (__kfree_skb_reason(skb, reason))
1226                 __kfree_skb(skb);
1227 }
1228 EXPORT_SYMBOL(kfree_skb_reason);
1229
1230 #define KFREE_SKB_BULK_SIZE     16
1231
1232 struct skb_free_array {
1233         unsigned int skb_count;
1234         void *skb_array[KFREE_SKB_BULK_SIZE];
1235 };
1236
1237 static void kfree_skb_add_bulk(struct sk_buff *skb,
1238                                struct skb_free_array *sa,
1239                                enum skb_drop_reason reason)
1240 {
1241         /* if SKB is a clone, don't handle this case */
1242         if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
1243                 __kfree_skb(skb);
1244                 return;
1245         }
1246
1247         skb_release_all(skb, reason);
1248         sa->skb_array[sa->skb_count++] = skb;
1249
1250         if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
1251                 kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
1252                                      sa->skb_array);
1253                 sa->skb_count = 0;
1254         }
1255 }
1256
1257 void __fix_address
1258 kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
1259 {
1260         struct skb_free_array sa;
1261
1262         sa.skb_count = 0;
1263
1264         while (segs) {
1265                 struct sk_buff *next = segs->next;
1266
1267                 if (__kfree_skb_reason(segs, reason)) {
1268                         skb_poison_list(segs);
1269                         kfree_skb_add_bulk(segs, &sa, reason);
1270                 }
1271
1272                 segs = next;
1273         }
1274
1275         if (sa.skb_count)
1276                 kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
1277 }
1278 EXPORT_SYMBOL(kfree_skb_list_reason);
1279
1280 /* Dump skb information and contents.
1281  *
1282  * Must only be called from net_ratelimit()-ed paths.
1283  *
1284  * Dumps whole packets if full_pkt, only headers otherwise.
1285  */
1286 void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
1287 {
1288         struct skb_shared_info *sh = skb_shinfo(skb);
1289         struct net_device *dev = skb->dev;
1290         struct sock *sk = skb->sk;
1291         struct sk_buff *list_skb;
1292         bool has_mac, has_trans;
1293         int headroom, tailroom;
1294         int i, len, seg_len;
1295
1296         if (full_pkt)
1297                 len = skb->len;
1298         else
1299                 len = min_t(int, skb->len, MAX_HEADER + 128);
1300
1301         headroom = skb_headroom(skb);
1302         tailroom = skb_tailroom(skb);
1303
1304         has_mac = skb_mac_header_was_set(skb);
1305         has_trans = skb_transport_header_was_set(skb);
1306
1307         printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
1308                "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
1309                "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
1310                "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
1311                "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
1312                "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
1313                "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
1314                level, skb->len, headroom, skb_headlen(skb), tailroom,
1315                has_mac ? skb->mac_header : -1,
1316                has_mac ? skb_mac_header_len(skb) : -1,
1317                skb->mac_len,
1318                skb->network_header,
1319                has_trans ? skb_network_header_len(skb) : -1,
1320                has_trans ? skb->transport_header : -1,
1321                sh->tx_flags, sh->nr_frags,
1322                sh->gso_size, sh->gso_type, sh->gso_segs,
1323                skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
1324                skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
1325                skb->hash, skb->sw_hash, skb->l4_hash,
1326                ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
1327                skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
1328                skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
1329                skb->inner_network_header, skb->inner_transport_header);
1330
1331         if (dev)
1332                 printk("%sdev name=%s feat=%pNF\n",
1333                        level, dev->name, &dev->features);
1334         if (sk)
1335                 printk("%ssk family=%hu type=%u proto=%u\n",
1336                        level, sk->sk_family, sk->sk_type, sk->sk_protocol);
1337
1338         if (full_pkt && headroom)
1339                 print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
1340                                16, 1, skb->head, headroom, false);
1341
1342         seg_len = min_t(int, skb_headlen(skb), len);
1343         if (seg_len)
1344                 print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
1345                                16, 1, skb->data, seg_len, false);
1346         len -= seg_len;
1347
1348         if (full_pkt && tailroom)
1349                 print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
1350                                16, 1, skb_tail_pointer(skb), tailroom, false);
1351
1352         for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
1353                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1354                 u32 p_off, p_len, copied;
1355                 struct page *p;
1356                 u8 *vaddr;
1357
1358                 skb_frag_foreach_page(frag, skb_frag_off(frag),
1359                                       skb_frag_size(frag), p, p_off, p_len,
1360                                       copied) {
1361                         seg_len = min_t(int, p_len, len);
1362                         vaddr = kmap_atomic(p);
1363                         print_hex_dump(level, "skb frag:     ",
1364                                        DUMP_PREFIX_OFFSET,
1365                                        16, 1, vaddr + p_off, seg_len, false);
1366                         kunmap_atomic(vaddr);
1367                         len -= seg_len;
1368                         if (!len)
1369                                 break;
1370                 }
1371         }
1372
1373         if (full_pkt && skb_has_frag_list(skb)) {
1374                 printk("skb fraglist:\n");
1375                 skb_walk_frags(skb, list_skb)
1376                         skb_dump(level, list_skb, true);
1377         }
1378 }
1379 EXPORT_SYMBOL(skb_dump);
1380
1381 /**
1382  *      skb_tx_error - report an sk_buff xmit error
1383  *      @skb: buffer that triggered an error
1384  *
1385  *      Report xmit error if a device callback is tracking this skb.
1386  *      skb must be freed afterwards.
1387  */
1388 void skb_tx_error(struct sk_buff *skb)
1389 {
1390         if (skb) {
1391                 skb_zcopy_downgrade_managed(skb);
1392                 skb_zcopy_clear(skb, true);
1393         }
1394 }
1395 EXPORT_SYMBOL(skb_tx_error);
1396
1397 #ifdef CONFIG_TRACEPOINTS
1398 /**
1399  *      consume_skb - free an skbuff
1400  *      @skb: buffer to free
1401  *
1402  *      Drop a ref to the buffer and free it if the usage count has hit zero
1403  *      Functions identically to kfree_skb, but kfree_skb assumes that the frame
1404  *      is being dropped after a failure and notes that
1405  */
1406 void consume_skb(struct sk_buff *skb)
1407 {
1408         if (!skb_unref(skb))
1409                 return;
1410
1411         trace_consume_skb(skb, __builtin_return_address(0));
1412         __kfree_skb(skb);
1413 }
1414 EXPORT_SYMBOL(consume_skb);
1415 #endif
1416
1417 /**
1418  *      __consume_stateless_skb - free an skbuff, assuming it is stateless
1419  *      @skb: buffer to free
1420  *
1421  *      Alike consume_skb(), but this variant assumes that this is the last
1422  *      skb reference and all the head states have been already dropped
1423  */
1424 void __consume_stateless_skb(struct sk_buff *skb)
1425 {
1426         trace_consume_skb(skb, __builtin_return_address(0));
1427         skb_release_data(skb, SKB_CONSUMED);
1428         kfree_skbmem(skb);
1429 }
1430
1431 static void napi_skb_cache_put(struct sk_buff *skb)
1432 {
1433         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
1434         u32 i;
1435
1436         if (!kasan_mempool_poison_object(skb))
1437                 return;
1438
1439         nc->skb_cache[nc->skb_count++] = skb;
1440
1441         if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
1442                 for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
1443                         kasan_mempool_unpoison_object(nc->skb_cache[i],
1444                                                 kmem_cache_size(net_hotdata.skbuff_cache));
1445
1446                 kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
1447                                      nc->skb_cache + NAPI_SKB_CACHE_HALF);
1448                 nc->skb_count = NAPI_SKB_CACHE_HALF;
1449         }
1450 }
1451
1452 void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
1453 {
1454         skb_release_all(skb, reason);
1455         napi_skb_cache_put(skb);
1456 }
1457
1458 void napi_skb_free_stolen_head(struct sk_buff *skb)
1459 {
1460         if (unlikely(skb->slow_gro)) {
1461                 nf_reset_ct(skb);
1462                 skb_dst_drop(skb);
1463                 skb_ext_put(skb);
1464                 skb_orphan(skb);
1465                 skb->slow_gro = 0;
1466         }
1467         napi_skb_cache_put(skb);
1468 }
1469
1470 void napi_consume_skb(struct sk_buff *skb, int budget)
1471 {
1472         /* Zero budget indicate non-NAPI context called us, like netpoll */
1473         if (unlikely(!budget)) {
1474                 dev_consume_skb_any(skb);
1475                 return;
1476         }
1477
1478         DEBUG_NET_WARN_ON_ONCE(!in_softirq());
1479
1480         if (!skb_unref(skb))
1481                 return;
1482
1483         /* if reaching here SKB is ready to free */
1484         trace_consume_skb(skb, __builtin_return_address(0));
1485
1486         /* if SKB is a clone, don't handle this case */
1487         if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
1488                 __kfree_skb(skb);
1489                 return;
1490         }
1491
1492         skb_release_all(skb, SKB_CONSUMED);
1493         napi_skb_cache_put(skb);
1494 }
1495 EXPORT_SYMBOL(napi_consume_skb);
1496
1497 /* Make sure a field is contained by headers group */
1498 #define CHECK_SKB_FIELD(field) \
1499         BUILD_BUG_ON(offsetof(struct sk_buff, field) !=         \
1500                      offsetof(struct sk_buff, headers.field));  \
1501
1502 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
1503 {
1504         new->tstamp             = old->tstamp;
1505         /* We do not copy old->sk */
1506         new->dev                = old->dev;
1507         memcpy(new->cb, old->cb, sizeof(old->cb));
1508         skb_dst_copy(new, old);
1509         __skb_ext_copy(new, old);
1510         __nf_copy(new, old, false);
1511
1512         /* Note : this field could be in the headers group.
1513          * It is not yet because we do not want to have a 16 bit hole
1514          */
1515         new->queue_mapping = old->queue_mapping;
1516
1517         memcpy(&new->headers, &old->headers, sizeof(new->headers));
1518         CHECK_SKB_FIELD(protocol);
1519         CHECK_SKB_FIELD(csum);
1520         CHECK_SKB_FIELD(hash);
1521         CHECK_SKB_FIELD(priority);
1522         CHECK_SKB_FIELD(skb_iif);
1523         CHECK_SKB_FIELD(vlan_proto);
1524         CHECK_SKB_FIELD(vlan_tci);
1525         CHECK_SKB_FIELD(transport_header);
1526         CHECK_SKB_FIELD(network_header);
1527         CHECK_SKB_FIELD(mac_header);
1528         CHECK_SKB_FIELD(inner_protocol);
1529         CHECK_SKB_FIELD(inner_transport_header);
1530         CHECK_SKB_FIELD(inner_network_header);
1531         CHECK_SKB_FIELD(inner_mac_header);
1532         CHECK_SKB_FIELD(mark);
1533 #ifdef CONFIG_NETWORK_SECMARK
1534         CHECK_SKB_FIELD(secmark);
1535 #endif
1536 #ifdef CONFIG_NET_RX_BUSY_POLL
1537         CHECK_SKB_FIELD(napi_id);
1538 #endif
1539         CHECK_SKB_FIELD(alloc_cpu);
1540 #ifdef CONFIG_XPS
1541         CHECK_SKB_FIELD(sender_cpu);
1542 #endif
1543 #ifdef CONFIG_NET_SCHED
1544         CHECK_SKB_FIELD(tc_index);
1545 #endif
1546
1547 }
1548
1549 /*
1550  * You should not add any new code to this function.  Add it to
1551  * __copy_skb_header above instead.
1552  */
1553 static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
1554 {
1555 #define C(x) n->x = skb->x
1556
1557         n->next = n->prev = NULL;
1558         n->sk = NULL;
1559         __copy_skb_header(n, skb);
1560
1561         C(len);
1562         C(data_len);
1563         C(mac_len);
1564         n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
1565         n->cloned = 1;
1566         n->nohdr = 0;
1567         n->peeked = 0;
1568         C(pfmemalloc);
1569         C(pp_recycle);
1570         n->destructor = NULL;
1571         C(tail);
1572         C(end);
1573         C(head);
1574         C(head_frag);
1575         C(data);
1576         C(truesize);
1577         refcount_set(&n->users, 1);
1578
1579         atomic_inc(&(skb_shinfo(skb)->dataref));
1580         skb->cloned = 1;
1581
1582         return n;
1583 #undef C
1584 }
1585
1586 /**
1587  * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
1588  * @first: first sk_buff of the msg
1589  */
1590 struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
1591 {
1592         struct sk_buff *n;
1593
1594         n = alloc_skb(0, GFP_ATOMIC);
1595         if (!n)
1596                 return NULL;
1597
1598         n->len = first->len;
1599         n->data_len = first->len;
1600         n->truesize = first->truesize;
1601
1602         skb_shinfo(n)->frag_list = first;
1603
1604         __copy_skb_header(n, first);
1605         n->destructor = NULL;
1606
1607         return n;
1608 }
1609 EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
1610
1611 /**
1612  *      skb_morph       -       morph one skb into another
1613  *      @dst: the skb to receive the contents
1614  *      @src: the skb to supply the contents
1615  *
1616  *      This is identical to skb_clone except that the target skb is
1617  *      supplied by the user.
1618  *
1619  *      The target skb is returned upon exit.
1620  */
1621 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
1622 {
1623         skb_release_all(dst, SKB_CONSUMED);
1624         return __skb_clone(dst, src);
1625 }
1626 EXPORT_SYMBOL_GPL(skb_morph);
1627
1628 int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
1629 {
1630         unsigned long max_pg, num_pg, new_pg, old_pg, rlim;
1631         struct user_struct *user;
1632
1633         if (capable(CAP_IPC_LOCK) || !size)
1634                 return 0;
1635
1636         rlim = rlimit(RLIMIT_MEMLOCK);
1637         if (rlim == RLIM_INFINITY)
1638                 return 0;
1639
1640         num_pg = (size >> PAGE_SHIFT) + 2;      /* worst case */
1641         max_pg = rlim >> PAGE_SHIFT;
1642         user = mmp->user ? : current_user();
1643
1644         old_pg = atomic_long_read(&user->locked_vm);
1645         do {
1646                 new_pg = old_pg + num_pg;
1647                 if (new_pg > max_pg)
1648                         return -ENOBUFS;
1649         } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
1650
1651         if (!mmp->user) {
1652                 mmp->user = get_uid(user);
1653                 mmp->num_pg = num_pg;
1654         } else {
1655                 mmp->num_pg += num_pg;
1656         }
1657
1658         return 0;
1659 }
1660 EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
1661
1662 void mm_unaccount_pinned_pages(struct mmpin *mmp)
1663 {
1664         if (mmp->user) {
1665                 atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
1666                 free_uid(mmp->user);
1667         }
1668 }
1669 EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
1670
1671 static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
1672 {
1673         struct ubuf_info_msgzc *uarg;
1674         struct sk_buff *skb;
1675
1676         WARN_ON_ONCE(!in_task());
1677
1678         skb = sock_omalloc(sk, 0, GFP_KERNEL);
1679         if (!skb)
1680                 return NULL;
1681
1682         BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
1683         uarg = (void *)skb->cb;
1684         uarg->mmp.user = NULL;
1685
1686         if (mm_account_pinned_pages(&uarg->mmp, size)) {
1687                 kfree_skb(skb);
1688                 return NULL;
1689         }
1690
1691         uarg->ubuf.callback = msg_zerocopy_callback;
1692         uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
1693         uarg->len = 1;
1694         uarg->bytelen = size;
1695         uarg->zerocopy = 1;
1696         uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
1697         refcount_set(&uarg->ubuf.refcnt, 1);
1698         sock_hold(sk);
1699
1700         return &uarg->ubuf;
1701 }
1702
1703 static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
1704 {
1705         return container_of((void *)uarg, struct sk_buff, cb);
1706 }
1707
1708 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
1709                                        struct ubuf_info *uarg)
1710 {
1711         if (uarg) {
1712                 struct ubuf_info_msgzc *uarg_zc;
1713                 const u32 byte_limit = 1 << 19;         /* limit to a few TSO */
1714                 u32 bytelen, next;
1715
1716                 /* there might be non MSG_ZEROCOPY users */
1717                 if (uarg->callback != msg_zerocopy_callback)
1718                         return NULL;
1719
1720                 /* realloc only when socket is locked (TCP, UDP cork),
1721                  * so uarg->len and sk_zckey access is serialized
1722                  */
1723                 if (!sock_owned_by_user(sk)) {
1724                         WARN_ON_ONCE(1);
1725                         return NULL;
1726                 }
1727
1728                 uarg_zc = uarg_to_msgzc(uarg);
1729                 bytelen = uarg_zc->bytelen + size;
1730                 if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
1731                         /* TCP can create new skb to attach new uarg */
1732                         if (sk->sk_type == SOCK_STREAM)
1733                                 goto new_alloc;
1734                         return NULL;
1735                 }
1736
1737                 next = (u32)atomic_read(&sk->sk_zckey);
1738                 if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
1739                         if (mm_account_pinned_pages(&uarg_zc->mmp, size))
1740                                 return NULL;
1741                         uarg_zc->len++;
1742                         uarg_zc->bytelen = bytelen;
1743                         atomic_set(&sk->sk_zckey, ++next);
1744
1745                         /* no extra ref when appending to datagram (MSG_MORE) */
1746                         if (sk->sk_type == SOCK_STREAM)
1747                                 net_zcopy_get(uarg);
1748
1749                         return uarg;
1750                 }
1751         }
1752
1753 new_alloc:
1754         return msg_zerocopy_alloc(sk, size);
1755 }
1756 EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
1757
1758 static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
1759 {
1760         struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
1761         u32 old_lo, old_hi;
1762         u64 sum_len;
1763
1764         old_lo = serr->ee.ee_info;
1765         old_hi = serr->ee.ee_data;
1766         sum_len = old_hi - old_lo + 1ULL + len;
1767
1768         if (sum_len >= (1ULL << 32))
1769                 return false;
1770
1771         if (lo != old_hi + 1)
1772                 return false;
1773
1774         serr->ee.ee_data += len;
1775         return true;
1776 }
1777
1778 static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
1779 {
1780         struct sk_buff *tail, *skb = skb_from_uarg(uarg);
1781         struct sock_exterr_skb *serr;
1782         struct sock *sk = skb->sk;
1783         struct sk_buff_head *q;
1784         unsigned long flags;
1785         bool is_zerocopy;
1786         u32 lo, hi;
1787         u16 len;
1788
1789         mm_unaccount_pinned_pages(&uarg->mmp);
1790
1791         /* if !len, there was only 1 call, and it was aborted
1792          * so do not queue a completion notification
1793          */
1794         if (!uarg->len || sock_flag(sk, SOCK_DEAD))
1795                 goto release;
1796
1797         len = uarg->len;
1798         lo = uarg->id;
1799         hi = uarg->id + len - 1;
1800         is_zerocopy = uarg->zerocopy;
1801
1802         serr = SKB_EXT_ERR(skb);
1803         memset(serr, 0, sizeof(*serr));
1804         serr->ee.ee_errno = 0;
1805         serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
1806         serr->ee.ee_data = hi;
1807         serr->ee.ee_info = lo;
1808         if (!is_zerocopy)
1809                 serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
1810
1811         q = &sk->sk_error_queue;
1812         spin_lock_irqsave(&q->lock, flags);
1813         tail = skb_peek_tail(q);
1814         if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
1815             !skb_zerocopy_notify_extend(tail, lo, len)) {
1816                 __skb_queue_tail(q, skb);
1817                 skb = NULL;
1818         }
1819         spin_unlock_irqrestore(&q->lock, flags);
1820
1821         sk_error_report(sk);
1822
1823 release:
1824         consume_skb(skb);
1825         sock_put(sk);
1826 }
1827
1828 void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
1829                            bool success)
1830 {
1831         struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
1832
1833         uarg_zc->zerocopy = uarg_zc->zerocopy & success;
1834
1835         if (refcount_dec_and_test(&uarg->refcnt))
1836                 __msg_zerocopy_callback(uarg_zc);
1837 }
1838 EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
1839
1840 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
1841 {
1842         struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
1843
1844         atomic_dec(&sk->sk_zckey);
1845         uarg_to_msgzc(uarg)->len--;
1846
1847         if (have_uref)
1848                 msg_zerocopy_callback(NULL, uarg, true);
1849 }
1850 EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
1851
1852 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
1853                              struct msghdr *msg, int len,
1854                              struct ubuf_info *uarg)
1855 {
1856         struct ubuf_info *orig_uarg = skb_zcopy(skb);
1857         int err, orig_len = skb->len;
1858
1859         /* An skb can only point to one uarg. This edge case happens when
1860          * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
1861          */
1862         if (orig_uarg && uarg != orig_uarg)
1863                 return -EEXIST;
1864
1865         err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
1866         if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
1867                 struct sock *save_sk = skb->sk;
1868
1869                 /* Streams do not free skb on error. Reset to prev state. */
1870                 iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
1871                 skb->sk = sk;
1872                 ___pskb_trim(skb, orig_len);
1873                 skb->sk = save_sk;
1874                 return err;
1875         }
1876
1877         skb_zcopy_set(skb, uarg, NULL);
1878         return skb->len - orig_len;
1879 }
1880 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
1881
1882 void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
1883 {
1884         int i;
1885
1886         skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
1887         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1888                 skb_frag_ref(skb, i);
1889 }
1890 EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
1891
1892 static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
1893                               gfp_t gfp_mask)
1894 {
1895         if (skb_zcopy(orig)) {
1896                 if (skb_zcopy(nskb)) {
1897                         /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
1898                         if (!gfp_mask) {
1899                                 WARN_ON_ONCE(1);
1900                                 return -ENOMEM;
1901                         }
1902                         if (skb_uarg(nskb) == skb_uarg(orig))
1903                                 return 0;
1904                         if (skb_copy_ubufs(nskb, GFP_ATOMIC))
1905                                 return -EIO;
1906                 }
1907                 skb_zcopy_set(nskb, skb_uarg(orig), NULL);
1908         }
1909         return 0;
1910 }
1911
1912 /**
1913  *      skb_copy_ubufs  -       copy userspace skb frags buffers to kernel
1914  *      @skb: the skb to modify
1915  *      @gfp_mask: allocation priority
1916  *
1917  *      This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
1918  *      It will copy all frags into kernel and drop the reference
1919  *      to userspace pages.
1920  *
1921  *      If this function is called from an interrupt gfp_mask() must be
1922  *      %GFP_ATOMIC.
1923  *
1924  *      Returns 0 on success or a negative error code on failure
1925  *      to allocate kernel memory to copy to.
1926  */
1927 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
1928 {
1929         int num_frags = skb_shinfo(skb)->nr_frags;
1930         struct page *page, *head = NULL;
1931         int i, order, psize, new_frags;
1932         u32 d_off;
1933
1934         if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
1935                 return -EINVAL;
1936
1937         if (!num_frags)
1938                 goto release;
1939
1940         /* We might have to allocate high order pages, so compute what minimum
1941          * page order is needed.
1942          */
1943         order = 0;
1944         while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
1945                 order++;
1946         psize = (PAGE_SIZE << order);
1947
1948         new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
1949         for (i = 0; i < new_frags; i++) {
1950                 page = alloc_pages(gfp_mask | __GFP_COMP, order);
1951                 if (!page) {
1952                         while (head) {
1953                                 struct page *next = (struct page *)page_private(head);
1954                                 put_page(head);
1955                                 head = next;
1956                         }
1957                         return -ENOMEM;
1958                 }
1959                 set_page_private(page, (unsigned long)head);
1960                 head = page;
1961         }
1962
1963         page = head;
1964         d_off = 0;
1965         for (i = 0; i < num_frags; i++) {
1966                 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
1967                 u32 p_off, p_len, copied;
1968                 struct page *p;
1969                 u8 *vaddr;
1970
1971                 skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
1972                                       p, p_off, p_len, copied) {
1973                         u32 copy, done = 0;
1974                         vaddr = kmap_atomic(p);
1975
1976                         while (done < p_len) {
1977                                 if (d_off == psize) {
1978                                         d_off = 0;
1979                                         page = (struct page *)page_private(page);
1980                                 }
1981                                 copy = min_t(u32, psize - d_off, p_len - done);
1982                                 memcpy(page_address(page) + d_off,
1983                                        vaddr + p_off + done, copy);
1984                                 done += copy;
1985                                 d_off += copy;
1986                         }
1987                         kunmap_atomic(vaddr);
1988                 }
1989         }
1990
1991         /* skb frags release userspace buffers */
1992         for (i = 0; i < num_frags; i++)
1993                 skb_frag_unref(skb, i);
1994
1995         /* skb frags point to kernel buffers */
1996         for (i = 0; i < new_frags - 1; i++) {
1997                 __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
1998                 head = (struct page *)page_private(head);
1999         }
2000         __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
2001                                d_off);
2002         skb_shinfo(skb)->nr_frags = new_frags;
2003
2004 release:
2005         skb_zcopy_clear(skb, false);
2006         return 0;
2007 }
2008 EXPORT_SYMBOL_GPL(skb_copy_ubufs);
2009
2010 /**
2011  *      skb_clone       -       duplicate an sk_buff
2012  *      @skb: buffer to clone
2013  *      @gfp_mask: allocation priority
2014  *
2015  *      Duplicate an &sk_buff. The new one is not owned by a socket. Both
2016  *      copies share the same packet data but not structure. The new
2017  *      buffer has a reference count of 1. If the allocation fails the
2018  *      function returns %NULL otherwise the new buffer is returned.
2019  *
2020  *      If this function is called from an interrupt gfp_mask() must be
2021  *      %GFP_ATOMIC.
2022  */
2023
2024 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
2025 {
2026         struct sk_buff_fclones *fclones = container_of(skb,
2027                                                        struct sk_buff_fclones,
2028                                                        skb1);
2029         struct sk_buff *n;
2030
2031         if (skb_orphan_frags(skb, gfp_mask))
2032                 return NULL;
2033
2034         if (skb->fclone == SKB_FCLONE_ORIG &&
2035             refcount_read(&fclones->fclone_ref) == 1) {
2036                 n = &fclones->skb2;
2037                 refcount_set(&fclones->fclone_ref, 2);
2038                 n->fclone = SKB_FCLONE_CLONE;
2039         } else {
2040                 if (skb_pfmemalloc(skb))
2041                         gfp_mask |= __GFP_MEMALLOC;
2042
2043                 n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
2044                 if (!n)
2045                         return NULL;
2046
2047                 n->fclone = SKB_FCLONE_UNAVAILABLE;
2048         }
2049
2050         return __skb_clone(n, skb);
2051 }
2052 EXPORT_SYMBOL(skb_clone);
2053
2054 void skb_headers_offset_update(struct sk_buff *skb, int off)
2055 {
2056         /* Only adjust this if it actually is csum_start rather than csum */
2057         if (skb->ip_summed == CHECKSUM_PARTIAL)
2058                 skb->csum_start += off;
2059         /* {transport,network,mac}_header and tail are relative to skb->head */
2060         skb->transport_header += off;
2061         skb->network_header   += off;
2062         if (skb_mac_header_was_set(skb))
2063                 skb->mac_header += off;
2064         skb->inner_transport_header += off;
2065         skb->inner_network_header += off;
2066         skb->inner_mac_header += off;
2067 }
2068 EXPORT_SYMBOL(skb_headers_offset_update);
2069
2070 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
2071 {
2072         __copy_skb_header(new, old);
2073
2074         skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
2075         skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
2076         skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
2077 }
2078 EXPORT_SYMBOL(skb_copy_header);
2079
2080 static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
2081 {
2082         if (skb_pfmemalloc(skb))
2083                 return SKB_ALLOC_RX;
2084         return 0;
2085 }
2086
2087 /**
2088  *      skb_copy        -       create private copy of an sk_buff
2089  *      @skb: buffer to copy
2090  *      @gfp_mask: allocation priority
2091  *
2092  *      Make a copy of both an &sk_buff and its data. This is used when the
2093  *      caller wishes to modify the data and needs a private copy of the
2094  *      data to alter. Returns %NULL on failure or the pointer to the buffer
2095  *      on success. The returned buffer has a reference count of 1.
2096  *
2097  *      As by-product this function converts non-linear &sk_buff to linear
2098  *      one, so that &sk_buff becomes completely private and caller is allowed
2099  *      to modify all the data of returned buffer. This means that this
2100  *      function is not recommended for use in circumstances when only
2101  *      header is going to be modified. Use pskb_copy() instead.
2102  */
2103
2104 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
2105 {
2106         int headerlen = skb_headroom(skb);
2107         unsigned int size = skb_end_offset(skb) + skb->data_len;
2108         struct sk_buff *n = __alloc_skb(size, gfp_mask,
2109                                         skb_alloc_rx_flag(skb), NUMA_NO_NODE);
2110
2111         if (!n)
2112                 return NULL;
2113
2114         /* Set the data pointer */
2115         skb_reserve(n, headerlen);
2116         /* Set the tail pointer and length */
2117         skb_put(n, skb->len);
2118
2119         BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
2120
2121         skb_copy_header(n, skb);
2122         return n;
2123 }
2124 EXPORT_SYMBOL(skb_copy);
2125
2126 /**
2127  *      __pskb_copy_fclone      -  create copy of an sk_buff with private head.
2128  *      @skb: buffer to copy
2129  *      @headroom: headroom of new skb
2130  *      @gfp_mask: allocation priority
2131  *      @fclone: if true allocate the copy of the skb from the fclone
2132  *      cache instead of the head cache; it is recommended to set this
2133  *      to true for the cases where the copy will likely be cloned
2134  *
2135  *      Make a copy of both an &sk_buff and part of its data, located
2136  *      in header. Fragmented data remain shared. This is used when
2137  *      the caller wishes to modify only header of &sk_buff and needs
2138  *      private copy of the header to alter. Returns %NULL on failure
2139  *      or the pointer to the buffer on success.
2140  *      The returned buffer has a reference count of 1.
2141  */
2142
2143 struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
2144                                    gfp_t gfp_mask, bool fclone)
2145 {
2146         unsigned int size = skb_headlen(skb) + headroom;
2147         int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
2148         struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
2149
2150         if (!n)
2151                 goto out;
2152
2153         /* Set the data pointer */
2154         skb_reserve(n, headroom);
2155         /* Set the tail pointer and length */
2156         skb_put(n, skb_headlen(skb));
2157         /* Copy the bytes */
2158         skb_copy_from_linear_data(skb, n->data, n->len);
2159
2160         n->truesize += skb->data_len;
2161         n->data_len  = skb->data_len;
2162         n->len       = skb->len;
2163
2164         if (skb_shinfo(skb)->nr_frags) {
2165                 int i;
2166
2167                 if (skb_orphan_frags(skb, gfp_mask) ||
2168                     skb_zerocopy_clone(n, skb, gfp_mask)) {
2169                         kfree_skb(n);
2170                         n = NULL;
2171                         goto out;
2172                 }
2173                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2174                         skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
2175                         skb_frag_ref(skb, i);
2176                 }
2177                 skb_shinfo(n)->nr_frags = i;
2178         }
2179
2180         if (skb_has_frag_list(skb)) {
2181                 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
2182                 skb_clone_fraglist(n);
2183         }
2184
2185         skb_copy_header(n, skb);
2186 out:
2187         return n;
2188 }
2189 EXPORT_SYMBOL(__pskb_copy_fclone);
2190
2191 /**
2192  *      pskb_expand_head - reallocate header of &sk_buff
2193  *      @skb: buffer to reallocate
2194  *      @nhead: room to add at head
2195  *      @ntail: room to add at tail
2196  *      @gfp_mask: allocation priority
2197  *
2198  *      Expands (or creates identical copy, if @nhead and @ntail are zero)
2199  *      header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
2200  *      reference count of 1. Returns zero in the case of success or error,
2201  *      if expansion failed. In the last case, &sk_buff is not changed.
2202  *
2203  *      All the pointers pointing into skb header may change and must be
2204  *      reloaded after call to this function.
2205  */
2206
2207 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
2208                      gfp_t gfp_mask)
2209 {
2210         unsigned int osize = skb_end_offset(skb);
2211         unsigned int size = osize + nhead + ntail;
2212         long off;
2213         u8 *data;
2214         int i;
2215
2216         BUG_ON(nhead < 0);
2217
2218         BUG_ON(skb_shared(skb));
2219
2220         skb_zcopy_downgrade_managed(skb);
2221
2222         if (skb_pfmemalloc(skb))
2223                 gfp_mask |= __GFP_MEMALLOC;
2224
2225         data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
2226         if (!data)
2227                 goto nodata;
2228         size = SKB_WITH_OVERHEAD(size);
2229
2230         /* Copy only real data... and, alas, header. This should be
2231          * optimized for the cases when header is void.
2232          */
2233         memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
2234
2235         memcpy((struct skb_shared_info *)(data + size),
2236                skb_shinfo(skb),
2237                offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
2238
2239         /*
2240          * if shinfo is shared we must drop the old head gracefully, but if it
2241          * is not we can just drop the old head and let the existing refcount
2242          * be since all we did is relocate the values
2243          */
2244         if (skb_cloned(skb)) {
2245                 if (skb_orphan_frags(skb, gfp_mask))
2246                         goto nofrags;
2247                 if (skb_zcopy(skb))
2248                         refcount_inc(&skb_uarg(skb)->refcnt);
2249                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
2250                         skb_frag_ref(skb, i);
2251
2252                 if (skb_has_frag_list(skb))
2253                         skb_clone_fraglist(skb);
2254
2255                 skb_release_data(skb, SKB_CONSUMED);
2256         } else {
2257                 skb_free_head(skb);
2258         }
2259         off = (data + nhead) - skb->head;
2260
2261         skb->head     = data;
2262         skb->head_frag = 0;
2263         skb->data    += off;
2264
2265         skb_set_end_offset(skb, size);
2266 #ifdef NET_SKBUFF_DATA_USES_OFFSET
2267         off           = nhead;
2268 #endif
2269         skb->tail             += off;
2270         skb_headers_offset_update(skb, nhead);
2271         skb->cloned   = 0;
2272         skb->hdr_len  = 0;
2273         skb->nohdr    = 0;
2274         atomic_set(&skb_shinfo(skb)->dataref, 1);
2275
2276         skb_metadata_clear(skb);
2277
2278         /* It is not generally safe to change skb->truesize.
2279          * For the moment, we really care of rx path, or
2280          * when skb is orphaned (not attached to a socket).
2281          */
2282         if (!skb->sk || skb->destructor == sock_edemux)
2283                 skb->truesize += size - osize;
2284
2285         return 0;
2286
2287 nofrags:
2288         skb_kfree_head(data, size);
2289 nodata:
2290         return -ENOMEM;
2291 }
2292 EXPORT_SYMBOL(pskb_expand_head);
2293
2294 /* Make private copy of skb with writable head and some headroom */
2295
2296 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
2297 {
2298         struct sk_buff *skb2;
2299         int delta = headroom - skb_headroom(skb);
2300
2301         if (delta <= 0)
2302                 skb2 = pskb_copy(skb, GFP_ATOMIC);
2303         else {
2304                 skb2 = skb_clone(skb, GFP_ATOMIC);
2305                 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
2306                                              GFP_ATOMIC)) {
2307                         kfree_skb(skb2);
2308                         skb2 = NULL;
2309                 }
2310         }
2311         return skb2;
2312 }
2313 EXPORT_SYMBOL(skb_realloc_headroom);
2314
2315 /* Note: We plan to rework this in linux-6.4 */
2316 int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
2317 {
2318         unsigned int saved_end_offset, saved_truesize;
2319         struct skb_shared_info *shinfo;
2320         int res;
2321
2322         saved_end_offset = skb_end_offset(skb);
2323         saved_truesize = skb->truesize;
2324
2325         res = pskb_expand_head(skb, 0, 0, pri);
2326         if (res)
2327                 return res;
2328
2329         skb->truesize = saved_truesize;
2330
2331         if (likely(skb_end_offset(skb) == saved_end_offset))
2332                 return 0;
2333
2334         /* We can not change skb->end if the original or new value
2335          * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
2336          */
2337         if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM ||
2338             skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) {
2339                 /* We think this path should not be taken.
2340                  * Add a temporary trace to warn us just in case.
2341                  */
2342                 pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
2343                             saved_end_offset, skb_end_offset(skb));
2344                 WARN_ON_ONCE(1);
2345                 return 0;
2346         }
2347
2348         shinfo = skb_shinfo(skb);
2349
2350         /* We are about to change back skb->end,
2351          * we need to move skb_shinfo() to its new location.
2352          */
2353         memmove(skb->head + saved_end_offset,
2354                 shinfo,
2355                 offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
2356
2357         skb_set_end_offset(skb, saved_end_offset);
2358
2359         return 0;
2360 }
2361
2362 /**
2363  *      skb_expand_head - reallocate header of &sk_buff
2364  *      @skb: buffer to reallocate
2365  *      @headroom: needed headroom
2366  *
2367  *      Unlike skb_realloc_headroom, this one does not allocate a new skb
2368  *      if possible; copies skb->sk to new skb as needed
2369  *      and frees original skb in case of failures.
2370  *
2371  *      It expect increased headroom and generates warning otherwise.
2372  */
2373
2374 struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
2375 {
2376         int delta = headroom - skb_headroom(skb);
2377         int osize = skb_end_offset(skb);
2378         struct sock *sk = skb->sk;
2379
2380         if (WARN_ONCE(delta <= 0,
2381                       "%s is expecting an increase in the headroom", __func__))
2382                 return skb;
2383
2384         delta = SKB_DATA_ALIGN(delta);
2385         /* pskb_expand_head() might crash, if skb is shared. */
2386         if (skb_shared(skb) || !is_skb_wmem(skb)) {
2387                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2388
2389                 if (unlikely(!nskb))
2390                         goto fail;
2391
2392                 if (sk)
2393                         skb_set_owner_w(nskb, sk);
2394                 consume_skb(skb);
2395                 skb = nskb;
2396         }
2397         if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
2398                 goto fail;
2399
2400         if (sk && is_skb_wmem(skb)) {
2401                 delta = skb_end_offset(skb) - osize;
2402                 refcount_add(delta, &sk->sk_wmem_alloc);
2403                 skb->truesize += delta;
2404         }
2405         return skb;
2406
2407 fail:
2408         kfree_skb(skb);
2409         return NULL;
2410 }
2411 EXPORT_SYMBOL(skb_expand_head);
2412
2413 /**
2414  *      skb_copy_expand -       copy and expand sk_buff
2415  *      @skb: buffer to copy
2416  *      @newheadroom: new free bytes at head
2417  *      @newtailroom: new free bytes at tail
2418  *      @gfp_mask: allocation priority
2419  *
2420  *      Make a copy of both an &sk_buff and its data and while doing so
2421  *      allocate additional space.
2422  *
2423  *      This is used when the caller wishes to modify the data and needs a
2424  *      private copy of the data to alter as well as more space for new fields.
2425  *      Returns %NULL on failure or the pointer to the buffer
2426  *      on success. The returned buffer has a reference count of 1.
2427  *
2428  *      You must pass %GFP_ATOMIC as the allocation priority if this function
2429  *      is called from an interrupt.
2430  */
2431 struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
2432                                 int newheadroom, int newtailroom,
2433                                 gfp_t gfp_mask)
2434 {
2435         /*
2436          *      Allocate the copy buffer
2437          */
2438         struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
2439                                         gfp_mask, skb_alloc_rx_flag(skb),
2440                                         NUMA_NO_NODE);
2441         int oldheadroom = skb_headroom(skb);
2442         int head_copy_len, head_copy_off;
2443
2444         if (!n)
2445                 return NULL;
2446
2447         skb_reserve(n, newheadroom);
2448
2449         /* Set the tail pointer and length */
2450         skb_put(n, skb->len);
2451
2452         head_copy_len = oldheadroom;
2453         head_copy_off = 0;
2454         if (newheadroom <= head_copy_len)
2455                 head_copy_len = newheadroom;
2456         else
2457                 head_copy_off = newheadroom - head_copy_len;
2458
2459         /* Copy the linear header and data. */
2460         BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
2461                              skb->len + head_copy_len));
2462
2463         skb_copy_header(n, skb);
2464
2465         skb_headers_offset_update(n, newheadroom - oldheadroom);
2466
2467         return n;
2468 }
2469 EXPORT_SYMBOL(skb_copy_expand);
2470
2471 /**
2472  *      __skb_pad               -       zero pad the tail of an skb
2473  *      @skb: buffer to pad
2474  *      @pad: space to pad
2475  *      @free_on_error: free buffer on error
2476  *
2477  *      Ensure that a buffer is followed by a padding area that is zero
2478  *      filled. Used by network drivers which may DMA or transfer data
2479  *      beyond the buffer end onto the wire.
2480  *
2481  *      May return error in out of memory cases. The skb is freed on error
2482  *      if @free_on_error is true.
2483  */
2484
2485 int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
2486 {
2487         int err;
2488         int ntail;
2489
2490         /* If the skbuff is non linear tailroom is always zero.. */
2491         if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
2492                 memset(skb->data+skb->len, 0, pad);
2493                 return 0;
2494         }
2495
2496         ntail = skb->data_len + pad - (skb->end - skb->tail);
2497         if (likely(skb_cloned(skb) || ntail > 0)) {
2498                 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
2499                 if (unlikely(err))
2500                         goto free_skb;
2501         }
2502
2503         /* FIXME: The use of this function with non-linear skb's really needs
2504          * to be audited.
2505          */
2506         err = skb_linearize(skb);
2507         if (unlikely(err))
2508                 goto free_skb;
2509
2510         memset(skb->data + skb->len, 0, pad);
2511         return 0;
2512
2513 free_skb:
2514         if (free_on_error)
2515                 kfree_skb(skb);
2516         return err;
2517 }
2518 EXPORT_SYMBOL(__skb_pad);
2519
2520 /**
2521  *      pskb_put - add data to the tail of a potentially fragmented buffer
2522  *      @skb: start of the buffer to use
2523  *      @tail: tail fragment of the buffer to use
2524  *      @len: amount of data to add
2525  *
2526  *      This function extends the used data area of the potentially
2527  *      fragmented buffer. @tail must be the last fragment of @skb -- or
2528  *      @skb itself. If this would exceed the total buffer size the kernel
2529  *      will panic. A pointer to the first byte of the extra data is
2530  *      returned.
2531  */
2532
2533 void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
2534 {
2535         if (tail != skb) {
2536                 skb->data_len += len;
2537                 skb->len += len;
2538         }
2539         return skb_put(tail, len);
2540 }
2541 EXPORT_SYMBOL_GPL(pskb_put);
2542
2543 /**
2544  *      skb_put - add data to a buffer
2545  *      @skb: buffer to use
2546  *      @len: amount of data to add
2547  *
2548  *      This function extends the used data area of the buffer. If this would
2549  *      exceed the total buffer size the kernel will panic. A pointer to the
2550  *      first byte of the extra data is returned.
2551  */
2552 void *skb_put(struct sk_buff *skb, unsigned int len)
2553 {
2554         void *tmp = skb_tail_pointer(skb);
2555         SKB_LINEAR_ASSERT(skb);
2556         skb->tail += len;
2557         skb->len  += len;
2558         if (unlikely(skb->tail > skb->end))
2559                 skb_over_panic(skb, len, __builtin_return_address(0));
2560         return tmp;
2561 }
2562 EXPORT_SYMBOL(skb_put);
2563
2564 /**
2565  *      skb_push - add data to the start of a buffer
2566  *      @skb: buffer to use
2567  *      @len: amount of data to add
2568  *
2569  *      This function extends the used data area of the buffer at the buffer
2570  *      start. If this would exceed the total buffer headroom the kernel will
2571  *      panic. A pointer to the first byte of the extra data is returned.
2572  */
2573 void *skb_push(struct sk_buff *skb, unsigned int len)
2574 {
2575         skb->data -= len;
2576         skb->len  += len;
2577         if (unlikely(skb->data < skb->head))
2578                 skb_under_panic(skb, len, __builtin_return_address(0));
2579         return skb->data;
2580 }
2581 EXPORT_SYMBOL(skb_push);
2582
2583 /**
2584  *      skb_pull - remove data from the start of a buffer
2585  *      @skb: buffer to use
2586  *      @len: amount of data to remove
2587  *
2588  *      This function removes data from the start of a buffer, returning
2589  *      the memory to the headroom. A pointer to the next data in the buffer
2590  *      is returned. Once the data has been pulled future pushes will overwrite
2591  *      the old data.
2592  */
2593 void *skb_pull(struct sk_buff *skb, unsigned int len)
2594 {
2595         return skb_pull_inline(skb, len);
2596 }
2597 EXPORT_SYMBOL(skb_pull);
2598
2599 /**
2600  *      skb_pull_data - remove data from the start of a buffer returning its
2601  *      original position.
2602  *      @skb: buffer to use
2603  *      @len: amount of data to remove
2604  *
2605  *      This function removes data from the start of a buffer, returning
2606  *      the memory to the headroom. A pointer to the original data in the buffer
2607  *      is returned after checking if there is enough data to pull. Once the
2608  *      data has been pulled future pushes will overwrite the old data.
2609  */
2610 void *skb_pull_data(struct sk_buff *skb, size_t len)
2611 {
2612         void *data = skb->data;
2613
2614         if (skb->len < len)
2615                 return NULL;
2616
2617         skb_pull(skb, len);
2618
2619         return data;
2620 }
2621 EXPORT_SYMBOL(skb_pull_data);
2622
2623 /**
2624  *      skb_trim - remove end from a buffer
2625  *      @skb: buffer to alter
2626  *      @len: new length
2627  *
2628  *      Cut the length of a buffer down by removing data from the tail. If
2629  *      the buffer is already under the length specified it is not modified.
2630  *      The skb must be linear.
2631  */
2632 void skb_trim(struct sk_buff *skb, unsigned int len)
2633 {
2634         if (skb->len > len)
2635                 __skb_trim(skb, len);
2636 }
2637 EXPORT_SYMBOL(skb_trim);
2638
2639 /* Trims skb to length len. It can change skb pointers.
2640  */
2641
2642 int ___pskb_trim(struct sk_buff *skb, unsigned int len)
2643 {
2644         struct sk_buff **fragp;
2645         struct sk_buff *frag;
2646         int offset = skb_headlen(skb);
2647         int nfrags = skb_shinfo(skb)->nr_frags;
2648         int i;
2649         int err;
2650
2651         if (skb_cloned(skb) &&
2652             unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
2653                 return err;
2654
2655         i = 0;
2656         if (offset >= len)
2657                 goto drop_pages;
2658
2659         for (; i < nfrags; i++) {
2660                 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
2661
2662                 if (end < len) {
2663                         offset = end;
2664                         continue;
2665                 }
2666
2667                 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
2668
2669 drop_pages:
2670                 skb_shinfo(skb)->nr_frags = i;
2671
2672                 for (; i < nfrags; i++)
2673                         skb_frag_unref(skb, i);
2674
2675                 if (skb_has_frag_list(skb))
2676                         skb_drop_fraglist(skb);
2677                 goto done;
2678         }
2679
2680         for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
2681              fragp = &frag->next) {
2682                 int end = offset + frag->len;
2683
2684                 if (skb_shared(frag)) {
2685                         struct sk_buff *nfrag;
2686
2687                         nfrag = skb_clone(frag, GFP_ATOMIC);
2688                         if (unlikely(!nfrag))
2689                                 return -ENOMEM;
2690
2691                         nfrag->next = frag->next;
2692                         consume_skb(frag);
2693                         frag = nfrag;
2694                         *fragp = frag;
2695                 }
2696
2697                 if (end < len) {
2698                         offset = end;
2699                         continue;
2700                 }
2701
2702                 if (end > len &&
2703                     unlikely((err = pskb_trim(frag, len - offset))))
2704                         return err;
2705
2706                 if (frag->next)
2707                         skb_drop_list(&frag->next);
2708                 break;
2709         }
2710
2711 done:
2712         if (len > skb_headlen(skb)) {
2713                 skb->data_len -= skb->len - len;
2714                 skb->len       = len;
2715         } else {
2716                 skb->len       = len;
2717                 skb->data_len  = 0;
2718                 skb_set_tail_pointer(skb, len);
2719         }
2720
2721         if (!skb->sk || skb->destructor == sock_edemux)
2722                 skb_condense(skb);
2723         return 0;
2724 }
2725 EXPORT_SYMBOL(___pskb_trim);
2726
2727 /* Note : use pskb_trim_rcsum() instead of calling this directly
2728  */
2729 int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
2730 {
2731         if (skb->ip_summed == CHECKSUM_COMPLETE) {
2732                 int delta = skb->len - len;
2733
2734                 skb->csum = csum_block_sub(skb->csum,
2735                                            skb_checksum(skb, len, delta, 0),
2736                                            len);
2737         } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
2738                 int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
2739                 int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
2740
2741                 if (offset + sizeof(__sum16) > hdlen)
2742                         return -EINVAL;
2743         }
2744         return __pskb_trim(skb, len);
2745 }
2746 EXPORT_SYMBOL(pskb_trim_rcsum_slow);
2747
2748 /**
2749  *      __pskb_pull_tail - advance tail of skb header
2750  *      @skb: buffer to reallocate
2751  *      @delta: number of bytes to advance tail
2752  *
2753  *      The function makes a sense only on a fragmented &sk_buff,
2754  *      it expands header moving its tail forward and copying necessary
2755  *      data from fragmented part.
2756  *
2757  *      &sk_buff MUST have reference count of 1.
2758  *
2759  *      Returns %NULL (and &sk_buff does not change) if pull failed
2760  *      or value of new tail of skb in the case of success.
2761  *
2762  *      All the pointers pointing into skb header may change and must be
2763  *      reloaded after call to this function.
2764  */
2765
2766 /* Moves tail of skb head forward, copying data from fragmented part,
2767  * when it is necessary.
2768  * 1. It may fail due to malloc failure.
2769  * 2. It may change skb pointers.
2770  *
2771  * It is pretty complicated. Luckily, it is called only in exceptional cases.
2772  */
2773 void *__pskb_pull_tail(struct sk_buff *skb, int delta)
2774 {
2775         /* If skb has not enough free space at tail, get new one
2776          * plus 128 bytes for future expansions. If we have enough
2777          * room at tail, reallocate without expansion only if skb is cloned.
2778          */
2779         int i, k, eat = (skb->tail + delta) - skb->end;
2780
2781         if (eat > 0 || skb_cloned(skb)) {
2782                 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
2783                                      GFP_ATOMIC))
2784                         return NULL;
2785         }
2786
2787         BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
2788                              skb_tail_pointer(skb), delta));
2789
2790         /* Optimization: no fragments, no reasons to preestimate
2791          * size of pulled pages. Superb.
2792          */
2793         if (!skb_has_frag_list(skb))
2794                 goto pull_pages;
2795
2796         /* Estimate size of pulled pages. */
2797         eat = delta;
2798         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2799                 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2800
2801                 if (size >= eat)
2802                         goto pull_pages;
2803                 eat -= size;
2804         }
2805
2806         /* If we need update frag list, we are in troubles.
2807          * Certainly, it is possible to add an offset to skb data,
2808          * but taking into account that pulling is expected to
2809          * be very rare operation, it is worth to fight against
2810          * further bloating skb head and crucify ourselves here instead.
2811          * Pure masohism, indeed. 8)8)
2812          */
2813         if (eat) {
2814                 struct sk_buff *list = skb_shinfo(skb)->frag_list;
2815                 struct sk_buff *clone = NULL;
2816                 struct sk_buff *insp = NULL;
2817
2818                 do {
2819                         if (list->len <= eat) {
2820                                 /* Eaten as whole. */
2821                                 eat -= list->len;
2822                                 list = list->next;
2823                                 insp = list;
2824                         } else {
2825                                 /* Eaten partially. */
2826                                 if (skb_is_gso(skb) && !list->head_frag &&
2827                                     skb_headlen(list))
2828                                         skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2829
2830                                 if (skb_shared(list)) {
2831                                         /* Sucks! We need to fork list. :-( */
2832                                         clone = skb_clone(list, GFP_ATOMIC);
2833                                         if (!clone)
2834                                                 return NULL;
2835                                         insp = list->next;
2836                                         list = clone;
2837                                 } else {
2838                                         /* This may be pulled without
2839                                          * problems. */
2840                                         insp = list;
2841                                 }
2842                                 if (!pskb_pull(list, eat)) {
2843                                         kfree_skb(clone);
2844                                         return NULL;
2845                                 }
2846                                 break;
2847                         }
2848                 } while (eat);
2849
2850                 /* Free pulled out fragments. */
2851                 while ((list = skb_shinfo(skb)->frag_list) != insp) {
2852                         skb_shinfo(skb)->frag_list = list->next;
2853                         consume_skb(list);
2854                 }
2855                 /* And insert new clone at head. */
2856                 if (clone) {
2857                         clone->next = list;
2858                         skb_shinfo(skb)->frag_list = clone;
2859                 }
2860         }
2861         /* Success! Now we may commit changes to skb data. */
2862
2863 pull_pages:
2864         eat = delta;
2865         k = 0;
2866         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2867                 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2868
2869                 if (size <= eat) {
2870                         skb_frag_unref(skb, i);
2871                         eat -= size;
2872                 } else {
2873                         skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
2874
2875                         *frag = skb_shinfo(skb)->frags[i];
2876                         if (eat) {
2877                                 skb_frag_off_add(frag, eat);
2878                                 skb_frag_size_sub(frag, eat);
2879                                 if (!i)
2880                                         goto end;
2881                                 eat = 0;
2882                         }
2883                         k++;
2884                 }
2885         }
2886         skb_shinfo(skb)->nr_frags = k;
2887
2888 end:
2889         skb->tail     += delta;
2890         skb->data_len -= delta;
2891
2892         if (!skb->data_len)
2893                 skb_zcopy_clear(skb, false);
2894
2895         return skb_tail_pointer(skb);
2896 }
2897 EXPORT_SYMBOL(__pskb_pull_tail);
2898
2899 /**
2900  *      skb_copy_bits - copy bits from skb to kernel buffer
2901  *      @skb: source skb
2902  *      @offset: offset in source
2903  *      @to: destination buffer
2904  *      @len: number of bytes to copy
2905  *
2906  *      Copy the specified number of bytes from the source skb to the
2907  *      destination buffer.
2908  *
2909  *      CAUTION ! :
2910  *              If its prototype is ever changed,
2911  *              check arch/{*}/net/{*}.S files,
2912  *              since it is called from BPF assembly code.
2913  */
2914 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
2915 {
2916         int start = skb_headlen(skb);
2917         struct sk_buff *frag_iter;
2918         int i, copy;
2919
2920         if (offset > (int)skb->len - len)
2921                 goto fault;
2922
2923         /* Copy header. */
2924         if ((copy = start - offset) > 0) {
2925                 if (copy > len)
2926                         copy = len;
2927                 skb_copy_from_linear_data_offset(skb, offset, to, copy);
2928                 if ((len -= copy) == 0)
2929                         return 0;
2930                 offset += copy;
2931                 to     += copy;
2932         }
2933
2934         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2935                 int end;
2936                 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
2937
2938                 WARN_ON(start > offset + len);
2939
2940                 end = start + skb_frag_size(f);
2941                 if ((copy = end - offset) > 0) {
2942                         u32 p_off, p_len, copied;
2943                         struct page *p;
2944                         u8 *vaddr;
2945
2946                         if (copy > len)
2947                                 copy = len;
2948
2949                         skb_frag_foreach_page(f,
2950                                               skb_frag_off(f) + offset - start,
2951                                               copy, p, p_off, p_len, copied) {
2952                                 vaddr = kmap_atomic(p);
2953                                 memcpy(to + copied, vaddr + p_off, p_len);
2954                                 kunmap_atomic(vaddr);
2955                         }
2956
2957                         if ((len -= copy) == 0)
2958                                 return 0;
2959                         offset += copy;
2960                         to     += copy;
2961                 }
2962                 start = end;
2963         }
2964
2965         skb_walk_frags(skb, frag_iter) {
2966                 int end;
2967
2968                 WARN_ON(start > offset + len);
2969
2970                 end = start + frag_iter->len;
2971                 if ((copy = end - offset) > 0) {
2972                         if (copy > len)
2973                                 copy = len;
2974                         if (skb_copy_bits(frag_iter, offset - start, to, copy))
2975                                 goto fault;
2976                         if ((len -= copy) == 0)
2977                                 return 0;
2978                         offset += copy;
2979                         to     += copy;
2980                 }
2981                 start = end;
2982         }
2983
2984         if (!len)
2985                 return 0;
2986
2987 fault:
2988         return -EFAULT;
2989 }
2990 EXPORT_SYMBOL(skb_copy_bits);
2991
2992 /*
2993  * Callback from splice_to_pipe(), if we need to release some pages
2994  * at the end of the spd in case we error'ed out in filling the pipe.
2995  */
2996 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
2997 {
2998         put_page(spd->pages[i]);
2999 }
3000
3001 static struct page *linear_to_page(struct page *page, unsigned int *len,
3002                                    unsigned int *offset,
3003                                    struct sock *sk)
3004 {
3005         struct page_frag *pfrag = sk_page_frag(sk);
3006
3007         if (!sk_page_frag_refill(sk, pfrag))
3008                 return NULL;
3009
3010         *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
3011
3012         memcpy(page_address(pfrag->page) + pfrag->offset,
3013                page_address(page) + *offset, *len);
3014         *offset = pfrag->offset;
3015         pfrag->offset += *len;
3016
3017         return pfrag->page;
3018 }
3019
3020 static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
3021                              struct page *page,
3022                              unsigned int offset)
3023 {
3024         return  spd->nr_pages &&
3025                 spd->pages[spd->nr_pages - 1] == page &&
3026                 (spd->partial[spd->nr_pages - 1].offset +
3027                  spd->partial[spd->nr_pages - 1].len == offset);
3028 }
3029
3030 /*
3031  * Fill page/offset/length into spd, if it can hold more pages.
3032  */
3033 static bool spd_fill_page(struct splice_pipe_desc *spd,
3034                           struct pipe_inode_info *pipe, struct page *page,
3035                           unsigned int *len, unsigned int offset,
3036                           bool linear,
3037                           struct sock *sk)
3038 {
3039         if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
3040                 return true;
3041
3042         if (linear) {
3043                 page = linear_to_page(page, len, &offset, sk);
3044                 if (!page)
3045                         return true;
3046         }
3047         if (spd_can_coalesce(spd, page, offset)) {
3048                 spd->partial[spd->nr_pages - 1].len += *len;
3049                 return false;
3050         }
3051         get_page(page);
3052         spd->pages[spd->nr_pages] = page;
3053         spd->partial[spd->nr_pages].len = *len;
3054         spd->partial[spd->nr_pages].offset = offset;
3055         spd->nr_pages++;
3056
3057         return false;
3058 }
3059
3060 static bool __splice_segment(struct page *page, unsigned int poff,
3061                              unsigned int plen, unsigned int *off,
3062                              unsigned int *len,
3063                              struct splice_pipe_desc *spd, bool linear,
3064                              struct sock *sk,
3065                              struct pipe_inode_info *pipe)
3066 {
3067         if (!*len)
3068                 return true;
3069
3070         /* skip this segment if already processed */
3071         if (*off >= plen) {
3072                 *off -= plen;
3073                 return false;
3074         }
3075
3076         /* ignore any bits we already processed */
3077         poff += *off;
3078         plen -= *off;
3079         *off = 0;
3080
3081         do {
3082                 unsigned int flen = min(*len, plen);
3083
3084                 if (spd_fill_page(spd, pipe, page, &flen, poff,
3085                                   linear, sk))
3086                         return true;
3087                 poff += flen;
3088                 plen -= flen;
3089                 *len -= flen;
3090         } while (*len && plen);
3091
3092         return false;
3093 }
3094
3095 /*
3096  * Map linear and fragment data from the skb to spd. It reports true if the
3097  * pipe is full or if we already spliced the requested length.
3098  */
3099 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
3100                               unsigned int *offset, unsigned int *len,
3101                               struct splice_pipe_desc *spd, struct sock *sk)
3102 {
3103         int seg;
3104         struct sk_buff *iter;
3105
3106         /* map the linear part :
3107          * If skb->head_frag is set, this 'linear' part is backed by a
3108          * fragment, and if the head is not shared with any clones then
3109          * we can avoid a copy since we own the head portion of this page.
3110          */
3111         if (__splice_segment(virt_to_page(skb->data),
3112                              (unsigned long) skb->data & (PAGE_SIZE - 1),
3113                              skb_headlen(skb),
3114                              offset, len, spd,
3115                              skb_head_is_locked(skb),
3116                              sk, pipe))
3117                 return true;
3118
3119         /*
3120          * then map the fragments
3121          */
3122         for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
3123                 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
3124
3125                 if (__splice_segment(skb_frag_page(f),
3126                                      skb_frag_off(f), skb_frag_size(f),
3127                                      offset, len, spd, false, sk, pipe))
3128                         return true;
3129         }
3130
3131         skb_walk_frags(skb, iter) {
3132                 if (*offset >= iter->len) {
3133                         *offset -= iter->len;
3134                         continue;
3135                 }
3136                 /* __skb_splice_bits() only fails if the output has no room
3137                  * left, so no point in going over the frag_list for the error
3138                  * case.
3139                  */
3140                 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
3141                         return true;
3142         }
3143
3144         return false;
3145 }
3146
3147 /*
3148  * Map data from the skb to a pipe. Should handle both the linear part,
3149  * the fragments, and the frag list.
3150  */
3151 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
3152                     struct pipe_inode_info *pipe, unsigned int tlen,
3153                     unsigned int flags)
3154 {
3155         struct partial_page partial[MAX_SKB_FRAGS];
3156         struct page *pages[MAX_SKB_FRAGS];
3157         struct splice_pipe_desc spd = {
3158                 .pages = pages,
3159                 .partial = partial,
3160                 .nr_pages_max = MAX_SKB_FRAGS,
3161                 .ops = &nosteal_pipe_buf_ops,
3162                 .spd_release = sock_spd_release,
3163         };
3164         int ret = 0;
3165
3166         __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
3167
3168         if (spd.nr_pages)
3169                 ret = splice_to_pipe(pipe, &spd);
3170
3171         return ret;
3172 }
3173 EXPORT_SYMBOL_GPL(skb_splice_bits);
3174
3175 static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
3176 {
3177         struct socket *sock = sk->sk_socket;
3178         size_t size = msg_data_left(msg);
3179
3180         if (!sock)
3181                 return -EINVAL;
3182
3183         if (!sock->ops->sendmsg_locked)
3184                 return sock_no_sendmsg_locked(sk, msg, size);
3185
3186         return sock->ops->sendmsg_locked(sk, msg, size);
3187 }
3188
3189 static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
3190 {
3191         struct socket *sock = sk->sk_socket;
3192
3193         if (!sock)
3194                 return -EINVAL;
3195         return sock_sendmsg(sock, msg);
3196 }
3197
3198 typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
3199 static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
3200                            int len, sendmsg_func sendmsg)
3201 {
3202         unsigned int orig_len = len;
3203         struct sk_buff *head = skb;
3204         unsigned short fragidx;
3205         int slen, ret;
3206
3207 do_frag_list:
3208
3209         /* Deal with head data */
3210         while (offset < skb_headlen(skb) && len) {
3211                 struct kvec kv;
3212                 struct msghdr msg;
3213
3214                 slen = min_t(int, len, skb_headlen(skb) - offset);
3215                 kv.iov_base = skb->data + offset;
3216                 kv.iov_len = slen;
3217                 memset(&msg, 0, sizeof(msg));
3218                 msg.msg_flags = MSG_DONTWAIT;
3219
3220                 iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
3221                 ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
3222                                       sendmsg_unlocked, sk, &msg);
3223                 if (ret <= 0)
3224                         goto error;
3225
3226                 offset += ret;
3227                 len -= ret;
3228         }
3229
3230         /* All the data was skb head? */
3231         if (!len)
3232                 goto out;
3233
3234         /* Make offset relative to start of frags */
3235         offset -= skb_headlen(skb);
3236
3237         /* Find where we are in frag list */
3238         for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
3239                 skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
3240
3241                 if (offset < skb_frag_size(frag))
3242                         break;
3243
3244                 offset -= skb_frag_size(frag);
3245         }
3246
3247         for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
3248                 skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
3249
3250                 slen = min_t(size_t, len, skb_frag_size(frag) - offset);
3251
3252                 while (slen) {
3253                         struct bio_vec bvec;
3254                         struct msghdr msg = {
3255                                 .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
3256                         };
3257
3258                         bvec_set_page(&bvec, skb_frag_page(frag), slen,
3259                                       skb_frag_off(frag) + offset);
3260                         iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
3261                                       slen);
3262
3263                         ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
3264                                               sendmsg_unlocked, sk, &msg);
3265                         if (ret <= 0)
3266                                 goto error;
3267
3268                         len -= ret;
3269                         offset += ret;
3270                         slen -= ret;
3271                 }
3272
3273                 offset = 0;
3274         }
3275
3276         if (len) {
3277                 /* Process any frag lists */
3278
3279                 if (skb == head) {
3280                         if (skb_has_frag_list(skb)) {
3281                                 skb = skb_shinfo(skb)->frag_list;
3282                                 goto do_frag_list;
3283                         }
3284                 } else if (skb->next) {
3285                         skb = skb->next;
3286                         goto do_frag_list;
3287                 }
3288         }
3289
3290 out:
3291         return orig_len - len;
3292
3293 error:
3294         return orig_len == len ? ret : orig_len - len;
3295 }
3296
3297 /* Send skb data on a socket. Socket must be locked. */
3298 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
3299                          int len)
3300 {
3301         return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
3302 }
3303 EXPORT_SYMBOL_GPL(skb_send_sock_locked);
3304
3305 /* Send skb data on a socket. Socket must be unlocked. */
3306 int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
3307 {
3308         return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
3309 }
3310
3311 /**
3312  *      skb_store_bits - store bits from kernel buffer to skb
3313  *      @skb: destination buffer
3314  *      @offset: offset in destination
3315  *      @from: source buffer
3316  *      @len: number of bytes to copy
3317  *
3318  *      Copy the specified number of bytes from the source buffer to the
3319  *      destination skb.  This function handles all the messy bits of
3320  *      traversing fragment lists and such.
3321  */
3322
3323 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
3324 {
3325         int start = skb_headlen(skb);
3326         struct sk_buff *frag_iter;
3327         int i, copy;
3328
3329         if (offset > (int)skb->len - len)
3330                 goto fault;
3331
3332         if ((copy = start - offset) > 0) {
3333                 if (copy > len)
3334                         copy = len;
3335                 skb_copy_to_linear_data_offset(skb, offset, from, copy);
3336                 if ((len -= copy) == 0)
3337                         return 0;
3338                 offset += copy;
3339                 from += copy;
3340         }
3341
3342         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3343                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3344                 int end;
3345
3346                 WARN_ON(start > offset + len);
3347
3348                 end = start + skb_frag_size(frag);
3349                 if ((copy = end - offset) > 0) {
3350                         u32 p_off, p_len, copied;
3351                         struct page *p;
3352                         u8 *vaddr;
3353
3354                         if (copy > len)
3355                                 copy = len;
3356
3357                         skb_frag_foreach_page(frag,
3358                                               skb_frag_off(frag) + offset - start,
3359                                               copy, p, p_off, p_len, copied) {
3360                                 vaddr = kmap_atomic(p);
3361                                 memcpy(vaddr + p_off, from + copied, p_len);
3362                                 kunmap_atomic(vaddr);
3363                         }
3364
3365                         if ((len -= copy) == 0)
3366                                 return 0;
3367                         offset += copy;
3368                         from += copy;
3369                 }
3370                 start = end;
3371         }
3372
3373         skb_walk_frags(skb, frag_iter) {
3374                 int end;
3375
3376                 WARN_ON(start > offset + len);
3377
3378                 end = start + frag_iter->len;
3379                 if ((copy = end - offset) > 0) {
3380                         if (copy > len)
3381                                 copy = len;
3382                         if (skb_store_bits(frag_iter, offset - start,
3383                                            from, copy))
3384                                 goto fault;
3385                         if ((len -= copy) == 0)
3386                                 return 0;
3387                         offset += copy;
3388                         from += copy;
3389                 }
3390                 start = end;
3391         }
3392         if (!len)
3393                 return 0;
3394
3395 fault:
3396         return -EFAULT;
3397 }
3398 EXPORT_SYMBOL(skb_store_bits);
3399
3400 /* Checksum skb data. */
3401 __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
3402                       __wsum csum, const struct skb_checksum_ops *ops)
3403 {
3404         int start = skb_headlen(skb);
3405         int i, copy = start - offset;
3406         struct sk_buff *frag_iter;
3407         int pos = 0;
3408
3409         /* Checksum header. */
3410         if (copy > 0) {
3411                 if (copy > len)
3412                         copy = len;
3413                 csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
3414                                        skb->data + offset, copy, csum);
3415                 if ((len -= copy) == 0)
3416                         return csum;
3417                 offset += copy;
3418                 pos     = copy;
3419         }
3420
3421         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3422                 int end;
3423                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3424
3425                 WARN_ON(start > offset + len);
3426
3427                 end = start + skb_frag_size(frag);
3428                 if ((copy = end - offset) > 0) {
3429                         u32 p_off, p_len, copied;
3430                         struct page *p;
3431                         __wsum csum2;
3432                         u8 *vaddr;
3433
3434                         if (copy > len)
3435                                 copy = len;
3436
3437                         skb_frag_foreach_page(frag,
3438                                               skb_frag_off(frag) + offset - start,
3439                                               copy, p, p_off, p_len, copied) {
3440                                 vaddr = kmap_atomic(p);
3441                                 csum2 = INDIRECT_CALL_1(ops->update,
3442                                                         csum_partial_ext,
3443                                                         vaddr + p_off, p_len, 0);
3444                                 kunmap_atomic(vaddr);
3445                                 csum = INDIRECT_CALL_1(ops->combine,
3446                                                        csum_block_add_ext, csum,
3447                                                        csum2, pos, p_len);
3448                                 pos += p_len;
3449                         }
3450
3451                         if (!(len -= copy))
3452                                 return csum;
3453                         offset += copy;
3454                 }
3455                 start = end;
3456         }
3457
3458         skb_walk_frags(skb, frag_iter) {
3459                 int end;
3460
3461                 WARN_ON(start > offset + len);
3462
3463                 end = start + frag_iter->len;
3464                 if ((copy = end - offset) > 0) {
3465                         __wsum csum2;
3466                         if (copy > len)
3467                                 copy = len;
3468                         csum2 = __skb_checksum(frag_iter, offset - start,
3469                                                copy, 0, ops);
3470                         csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
3471                                                csum, csum2, pos, copy);
3472                         if ((len -= copy) == 0)
3473                                 return csum;
3474                         offset += copy;
3475                         pos    += copy;
3476                 }
3477                 start = end;
3478         }
3479         BUG_ON(len);
3480
3481         return csum;
3482 }
3483 EXPORT_SYMBOL(__skb_checksum);
3484
3485 __wsum skb_checksum(const struct sk_buff *skb, int offset,
3486                     int len, __wsum csum)
3487 {
3488         const struct skb_checksum_ops ops = {
3489                 .update  = csum_partial_ext,
3490                 .combine = csum_block_add_ext,
3491         };
3492
3493         return __skb_checksum(skb, offset, len, csum, &ops);
3494 }
3495 EXPORT_SYMBOL(skb_checksum);
3496
3497 /* Both of above in one bottle. */
3498
3499 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
3500                                     u8 *to, int len)
3501 {
3502         int start = skb_headlen(skb);
3503         int i, copy = start - offset;
3504         struct sk_buff *frag_iter;
3505         int pos = 0;
3506         __wsum csum = 0;
3507
3508         /* Copy header. */
3509         if (copy > 0) {
3510                 if (copy > len)
3511                         copy = len;
3512                 csum = csum_partial_copy_nocheck(skb->data + offset, to,
3513                                                  copy);
3514                 if ((len -= copy) == 0)
3515                         return csum;
3516                 offset += copy;
3517                 to     += copy;
3518                 pos     = copy;
3519         }
3520
3521         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3522                 int end;
3523
3524                 WARN_ON(start > offset + len);
3525
3526                 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
3527                 if ((copy = end - offset) > 0) {
3528                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3529                         u32 p_off, p_len, copied;
3530                         struct page *p;
3531                         __wsum csum2;
3532                         u8 *vaddr;
3533
3534                         if (copy > len)
3535                                 copy = len;
3536
3537                         skb_frag_foreach_page(frag,
3538                                               skb_frag_off(frag) + offset - start,
3539                                               copy, p, p_off, p_len, copied) {
3540                                 vaddr = kmap_atomic(p);
3541                                 csum2 = csum_partial_copy_nocheck(vaddr + p_off,
3542                                                                   to + copied,
3543                                                                   p_len);
3544                                 kunmap_atomic(vaddr);
3545                                 csum = csum_block_add(csum, csum2, pos);
3546                                 pos += p_len;
3547                         }
3548
3549                         if (!(len -= copy))
3550                                 return csum;
3551                         offset += copy;
3552                         to     += copy;
3553                 }
3554                 start = end;
3555         }
3556
3557         skb_walk_frags(skb, frag_iter) {
3558                 __wsum csum2;
3559                 int end;
3560
3561                 WARN_ON(start > offset + len);
3562
3563                 end = start + frag_iter->len;
3564                 if ((copy = end - offset) > 0) {
3565                         if (copy > len)
3566                                 copy = len;
3567                         csum2 = skb_copy_and_csum_bits(frag_iter,
3568                                                        offset - start,
3569                                                        to, copy);
3570                         csum = csum_block_add(csum, csum2, pos);
3571                         if ((len -= copy) == 0)
3572                                 return csum;
3573                         offset += copy;
3574                         to     += copy;
3575                         pos    += copy;
3576                 }
3577                 start = end;
3578         }
3579         BUG_ON(len);
3580         return csum;
3581 }
3582 EXPORT_SYMBOL(skb_copy_and_csum_bits);
3583
3584 __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
3585 {
3586         __sum16 sum;
3587
3588         sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
3589         /* See comments in __skb_checksum_complete(). */
3590         if (likely(!sum)) {
3591                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
3592                     !skb->csum_complete_sw)
3593                         netdev_rx_csum_fault(skb->dev, skb);
3594         }
3595         if (!skb_shared(skb))
3596                 skb->csum_valid = !sum;
3597         return sum;
3598 }
3599 EXPORT_SYMBOL(__skb_checksum_complete_head);
3600
3601 /* This function assumes skb->csum already holds pseudo header's checksum,
3602  * which has been changed from the hardware checksum, for example, by
3603  * __skb_checksum_validate_complete(). And, the original skb->csum must
3604  * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
3605  *
3606  * It returns non-zero if the recomputed checksum is still invalid, otherwise
3607  * zero. The new checksum is stored back into skb->csum unless the skb is
3608  * shared.
3609  */
3610 __sum16 __skb_checksum_complete(struct sk_buff *skb)
3611 {
3612         __wsum csum;
3613         __sum16 sum;
3614
3615         csum = skb_checksum(skb, 0, skb->len, 0);
3616
3617         sum = csum_fold(csum_add(skb->csum, csum));
3618         /* This check is inverted, because we already knew the hardware
3619          * checksum is invalid before calling this function. So, if the
3620          * re-computed checksum is valid instead, then we have a mismatch
3621          * between the original skb->csum and skb_checksum(). This means either
3622          * the original hardware checksum is incorrect or we screw up skb->csum
3623          * when moving skb->data around.
3624          */
3625         if (likely(!sum)) {
3626                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
3627                     !skb->csum_complete_sw)
3628                         netdev_rx_csum_fault(skb->dev, skb);
3629         }
3630
3631         if (!skb_shared(skb)) {
3632                 /* Save full packet checksum */
3633                 skb->csum = csum;
3634                 skb->ip_summed = CHECKSUM_COMPLETE;
3635                 skb->csum_complete_sw = 1;
3636                 skb->csum_valid = !sum;
3637         }
3638
3639         return sum;
3640 }
3641 EXPORT_SYMBOL(__skb_checksum_complete);
3642
3643 static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
3644 {
3645         net_warn_ratelimited(
3646                 "%s: attempt to compute crc32c without libcrc32c.ko\n",
3647                 __func__);
3648         return 0;
3649 }
3650
3651 static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
3652                                        int offset, int len)
3653 {
3654         net_warn_ratelimited(
3655                 "%s: attempt to compute crc32c without libcrc32c.ko\n",
3656                 __func__);
3657         return 0;
3658 }
3659
3660 static const struct skb_checksum_ops default_crc32c_ops = {
3661         .update  = warn_crc32c_csum_update,
3662         .combine = warn_crc32c_csum_combine,
3663 };
3664
3665 const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
3666         &default_crc32c_ops;
3667 EXPORT_SYMBOL(crc32c_csum_stub);
3668
3669  /**
3670  *      skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
3671  *      @from: source buffer
3672  *
3673  *      Calculates the amount of linear headroom needed in the 'to' skb passed
3674  *      into skb_zerocopy().
3675  */
3676 unsigned int
3677 skb_zerocopy_headlen(const struct sk_buff *from)
3678 {
3679         unsigned int hlen = 0;
3680
3681         if (!from->head_frag ||
3682             skb_headlen(from) < L1_CACHE_BYTES ||
3683             skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
3684                 hlen = skb_headlen(from);
3685                 if (!hlen)
3686                         hlen = from->len;
3687         }
3688
3689         if (skb_has_frag_list(from))
3690                 hlen = from->len;
3691
3692         return hlen;
3693 }
3694 EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
3695
3696 /**
3697  *      skb_zerocopy - Zero copy skb to skb
3698  *      @to: destination buffer
3699  *      @from: source buffer
3700  *      @len: number of bytes to copy from source buffer
3701  *      @hlen: size of linear headroom in destination buffer
3702  *
3703  *      Copies up to `len` bytes from `from` to `to` by creating references
3704  *      to the frags in the source buffer.
3705  *
3706  *      The `hlen` as calculated by skb_zerocopy_headlen() specifies the
3707  *      headroom in the `to` buffer.
3708  *
3709  *      Return value:
3710  *      0: everything is OK
3711  *      -ENOMEM: couldn't orphan frags of @from due to lack of memory
3712  *      -EFAULT: skb_copy_bits() found some problem with skb geometry
3713  */
3714 int
3715 skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
3716 {
3717         int i, j = 0;
3718         int plen = 0; /* length of skb->head fragment */
3719         int ret;
3720         struct page *page;
3721         unsigned int offset;
3722
3723         BUG_ON(!from->head_frag && !hlen);
3724
3725         /* dont bother with small payloads */
3726         if (len <= skb_tailroom(to))
3727                 return skb_copy_bits(from, 0, skb_put(to, len), len);
3728
3729         if (hlen) {
3730                 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
3731                 if (unlikely(ret))
3732                         return ret;
3733                 len -= hlen;
3734         } else {
3735                 plen = min_t(int, skb_headlen(from), len);
3736                 if (plen) {
3737                         page = virt_to_head_page(from->head);
3738                         offset = from->data - (unsigned char *)page_address(page);
3739                         __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
3740                                                offset, plen);
3741                         get_page(page);
3742                         j = 1;
3743                         len -= plen;
3744                 }
3745         }
3746
3747         skb_len_add(to, len + plen);
3748
3749         if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
3750                 skb_tx_error(from);
3751                 return -ENOMEM;
3752         }
3753         skb_zerocopy_clone(to, from, GFP_ATOMIC);
3754
3755         for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
3756                 int size;
3757
3758                 if (!len)
3759                         break;
3760                 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
3761                 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
3762                                         len);
3763                 skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
3764                 len -= size;
3765                 skb_frag_ref(to, j);
3766                 j++;
3767         }
3768         skb_shinfo(to)->nr_frags = j;
3769
3770         return 0;
3771 }
3772 EXPORT_SYMBOL_GPL(skb_zerocopy);
3773
3774 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
3775 {
3776         __wsum csum;
3777         long csstart;
3778
3779         if (skb->ip_summed == CHECKSUM_PARTIAL)
3780                 csstart = skb_checksum_start_offset(skb);
3781         else
3782                 csstart = skb_headlen(skb);
3783
3784         BUG_ON(csstart > skb_headlen(skb));
3785
3786         skb_copy_from_linear_data(skb, to, csstart);
3787
3788         csum = 0;
3789         if (csstart != skb->len)
3790                 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
3791                                               skb->len - csstart);
3792
3793         if (skb->ip_summed == CHECKSUM_PARTIAL) {
3794                 long csstuff = csstart + skb->csum_offset;
3795
3796                 *((__sum16 *)(to + csstuff)) = csum_fold(csum);
3797         }
3798 }
3799 EXPORT_SYMBOL(skb_copy_and_csum_dev);
3800
3801 /**
3802  *      skb_dequeue - remove from the head of the queue
3803  *      @list: list to dequeue from
3804  *
3805  *      Remove the head of the list. The list lock is taken so the function
3806  *      may be used safely with other locking list functions. The head item is
3807  *      returned or %NULL if the list is empty.
3808  */
3809
3810 struct sk_buff *skb_dequeue(struct sk_buff_head *list)
3811 {
3812         unsigned long flags;
3813         struct sk_buff *result;
3814
3815         spin_lock_irqsave(&list->lock, flags);
3816         result = __skb_dequeue(list);
3817         spin_unlock_irqrestore(&list->lock, flags);
3818         return result;
3819 }
3820 EXPORT_SYMBOL(skb_dequeue);
3821
3822 /**
3823  *      skb_dequeue_tail - remove from the tail of the queue
3824  *      @list: list to dequeue from
3825  *
3826  *      Remove the tail of the list. The list lock is taken so the function
3827  *      may be used safely with other locking list functions. The tail item is
3828  *      returned or %NULL if the list is empty.
3829  */
3830 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
3831 {
3832         unsigned long flags;
3833         struct sk_buff *result;
3834
3835         spin_lock_irqsave(&list->lock, flags);
3836         result = __skb_dequeue_tail(list);
3837         spin_unlock_irqrestore(&list->lock, flags);
3838         return result;
3839 }
3840 EXPORT_SYMBOL(skb_dequeue_tail);
3841
3842 /**
3843  *      skb_queue_purge_reason - empty a list
3844  *      @list: list to empty
3845  *      @reason: drop reason
3846  *
3847  *      Delete all buffers on an &sk_buff list. Each buffer is removed from
3848  *      the list and one reference dropped. This function takes the list
3849  *      lock and is atomic with respect to other list locking functions.
3850  */
3851 void skb_queue_purge_reason(struct sk_buff_head *list,
3852                             enum skb_drop_reason reason)
3853 {
3854         struct sk_buff_head tmp;
3855         unsigned long flags;
3856
3857         if (skb_queue_empty_lockless(list))
3858                 return;
3859
3860         __skb_queue_head_init(&tmp);
3861
3862         spin_lock_irqsave(&list->lock, flags);
3863         skb_queue_splice_init(list, &tmp);
3864         spin_unlock_irqrestore(&list->lock, flags);
3865
3866         __skb_queue_purge_reason(&tmp, reason);
3867 }
3868 EXPORT_SYMBOL(skb_queue_purge_reason);
3869
3870 /**
3871  *      skb_rbtree_purge - empty a skb rbtree
3872  *      @root: root of the rbtree to empty
3873  *      Return value: the sum of truesizes of all purged skbs.
3874  *
3875  *      Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
3876  *      the list and one reference dropped. This function does not take
3877  *      any lock. Synchronization should be handled by the caller (e.g., TCP
3878  *      out-of-order queue is protected by the socket lock).
3879  */
3880 unsigned int skb_rbtree_purge(struct rb_root *root)
3881 {
3882         struct rb_node *p = rb_first(root);
3883         unsigned int sum = 0;
3884
3885         while (p) {
3886                 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
3887
3888                 p = rb_next(p);
3889                 rb_erase(&skb->rbnode, root);
3890                 sum += skb->truesize;
3891                 kfree_skb(skb);
3892         }
3893         return sum;
3894 }
3895
3896 void skb_errqueue_purge(struct sk_buff_head *list)
3897 {
3898         struct sk_buff *skb, *next;
3899         struct sk_buff_head kill;
3900         unsigned long flags;
3901
3902         __skb_queue_head_init(&kill);
3903
3904         spin_lock_irqsave(&list->lock, flags);
3905         skb_queue_walk_safe(list, skb, next) {
3906                 if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
3907                     SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
3908                         continue;
3909                 __skb_unlink(skb, list);
3910                 __skb_queue_tail(&kill, skb);
3911         }
3912         spin_unlock_irqrestore(&list->lock, flags);
3913         __skb_queue_purge(&kill);
3914 }
3915 EXPORT_SYMBOL(skb_errqueue_purge);
3916
3917 /**
3918  *      skb_queue_head - queue a buffer at the list head
3919  *      @list: list to use
3920  *      @newsk: buffer to queue
3921  *
3922  *      Queue a buffer at the start of the list. This function takes the
3923  *      list lock and can be used safely with other locking &sk_buff functions
3924  *      safely.
3925  *
3926  *      A buffer cannot be placed on two lists at the same time.
3927  */
3928 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
3929 {
3930         unsigned long flags;
3931
3932         spin_lock_irqsave(&list->lock, flags);
3933         __skb_queue_head(list, newsk);
3934         spin_unlock_irqrestore(&list->lock, flags);
3935 }
3936 EXPORT_SYMBOL(skb_queue_head);
3937
3938 /**
3939  *      skb_queue_tail - queue a buffer at the list tail
3940  *      @list: list to use
3941  *      @newsk: buffer to queue
3942  *
3943  *      Queue a buffer at the tail of the list. This function takes the
3944  *      list lock and can be used safely with other locking &sk_buff functions
3945  *      safely.
3946  *
3947  *      A buffer cannot be placed on two lists at the same time.
3948  */
3949 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
3950 {
3951         unsigned long flags;
3952
3953         spin_lock_irqsave(&list->lock, flags);
3954         __skb_queue_tail(list, newsk);
3955         spin_unlock_irqrestore(&list->lock, flags);
3956 }
3957 EXPORT_SYMBOL(skb_queue_tail);
3958
3959 /**
3960  *      skb_unlink      -       remove a buffer from a list
3961  *      @skb: buffer to remove
3962  *      @list: list to use
3963  *
3964  *      Remove a packet from a list. The list locks are taken and this
3965  *      function is atomic with respect to other list locked calls
3966  *
3967  *      You must know what list the SKB is on.
3968  */
3969 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
3970 {
3971         unsigned long flags;
3972
3973         spin_lock_irqsave(&list->lock, flags);
3974         __skb_unlink(skb, list);
3975         spin_unlock_irqrestore(&list->lock, flags);
3976 }
3977 EXPORT_SYMBOL(skb_unlink);
3978
3979 /**
3980  *      skb_append      -       append a buffer
3981  *      @old: buffer to insert after
3982  *      @newsk: buffer to insert
3983  *      @list: list to use
3984  *
3985  *      Place a packet after a given packet in a list. The list locks are taken
3986  *      and this function is atomic with respect to other list locked calls.
3987  *      A buffer cannot be placed on two lists at the same time.
3988  */
3989 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
3990 {
3991         unsigned long flags;
3992
3993         spin_lock_irqsave(&list->lock, flags);
3994         __skb_queue_after(list, old, newsk);
3995         spin_unlock_irqrestore(&list->lock, flags);
3996 }
3997 EXPORT_SYMBOL(skb_append);
3998
3999 static inline void skb_split_inside_header(struct sk_buff *skb,
4000                                            struct sk_buff* skb1,
4001                                            const u32 len, const int pos)
4002 {
4003         int i;
4004
4005         skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
4006                                          pos - len);
4007         /* And move data appendix as is. */
4008         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
4009                 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
4010
4011         skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
4012         skb_shinfo(skb)->nr_frags  = 0;
4013         skb1->data_len             = skb->data_len;
4014         skb1->len                  += skb1->data_len;
4015         skb->data_len              = 0;
4016         skb->len                   = len;
4017         skb_set_tail_pointer(skb, len);
4018 }
4019
4020 static inline void skb_split_no_header(struct sk_buff *skb,
4021                                        struct sk_buff* skb1,
4022                                        const u32 len, int pos)
4023 {
4024         int i, k = 0;
4025         const int nfrags = skb_shinfo(skb)->nr_frags;
4026
4027         skb_shinfo(skb)->nr_frags = 0;
4028         skb1->len                 = skb1->data_len = skb->len - len;
4029         skb->len                  = len;
4030         skb->data_len             = len - pos;
4031
4032         for (i = 0; i < nfrags; i++) {
4033                 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
4034
4035                 if (pos + size > len) {
4036                         skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
4037
4038                         if (pos < len) {
4039                                 /* Split frag.
4040                                  * We have two variants in this case:
4041                                  * 1. Move all the frag to the second
4042                                  *    part, if it is possible. F.e.
4043                                  *    this approach is mandatory for TUX,
4044                                  *    where splitting is expensive.
4045                                  * 2. Split is accurately. We make this.
4046                                  */
4047                                 skb_frag_ref(skb, i);
4048                                 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
4049                                 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
4050                                 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
4051                                 skb_shinfo(skb)->nr_frags++;
4052                         }
4053                         k++;
4054                 } else
4055                         skb_shinfo(skb)->nr_frags++;
4056                 pos += size;
4057         }
4058         skb_shinfo(skb1)->nr_frags = k;
4059 }
4060
4061 /**
4062  * skb_split - Split fragmented skb to two parts at length len.
4063  * @skb: the buffer to split
4064  * @skb1: the buffer to receive the second part
4065  * @len: new length for skb
4066  */
4067 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
4068 {
4069         int pos = skb_headlen(skb);
4070         const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
4071
4072         skb_zcopy_downgrade_managed(skb);
4073
4074         skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
4075         skb_zerocopy_clone(skb1, skb, 0);
4076         if (len < pos)  /* Split line is inside header. */
4077                 skb_split_inside_header(skb, skb1, len, pos);
4078         else            /* Second chunk has no header, nothing to copy. */
4079                 skb_split_no_header(skb, skb1, len, pos);
4080 }
4081 EXPORT_SYMBOL(skb_split);
4082
4083 /* Shifting from/to a cloned skb is a no-go.
4084  *
4085  * Caller cannot keep skb_shinfo related pointers past calling here!
4086  */
4087 static int skb_prepare_for_shift(struct sk_buff *skb)
4088 {
4089         return skb_unclone_keeptruesize(skb, GFP_ATOMIC);
4090 }
4091
4092 /**
4093  * skb_shift - Shifts paged data partially from skb to another
4094  * @tgt: buffer into which tail data gets added
4095  * @skb: buffer from which the paged data comes from
4096  * @shiftlen: shift up to this many bytes
4097  *
4098  * Attempts to shift up to shiftlen worth of bytes, which may be less than
4099  * the length of the skb, from skb to tgt. Returns number bytes shifted.
4100  * It's up to caller to free skb if everything was shifted.
4101  *
4102  * If @tgt runs out of frags, the whole operation is aborted.
4103  *
4104  * Skb cannot include anything else but paged data while tgt is allowed
4105  * to have non-paged data as well.
4106  *
4107  * TODO: full sized shift could be optimized but that would need
4108  * specialized skb free'er to handle frags without up-to-date nr_frags.
4109  */
4110 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
4111 {
4112         int from, to, merge, todo;
4113         skb_frag_t *fragfrom, *fragto;
4114
4115         BUG_ON(shiftlen > skb->len);
4116
4117         if (skb_headlen(skb))
4118                 return 0;
4119         if (skb_zcopy(tgt) || skb_zcopy(skb))
4120                 return 0;
4121
4122         todo = shiftlen;
4123         from = 0;
4124         to = skb_shinfo(tgt)->nr_frags;
4125         fragfrom = &skb_shinfo(skb)->frags[from];
4126
4127         /* Actual merge is delayed until the point when we know we can
4128          * commit all, so that we don't have to undo partial changes
4129          */
4130         if (!to ||
4131             !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
4132                               skb_frag_off(fragfrom))) {
4133                 merge = -1;
4134         } else {
4135                 merge = to - 1;
4136
4137                 todo -= skb_frag_size(fragfrom);
4138                 if (todo < 0) {
4139                         if (skb_prepare_for_shift(skb) ||
4140                             skb_prepare_for_shift(tgt))
4141                                 return 0;
4142
4143                         /* All previous frag pointers might be stale! */
4144                         fragfrom = &skb_shinfo(skb)->frags[from];
4145                         fragto = &skb_shinfo(tgt)->frags[merge];
4146
4147                         skb_frag_size_add(fragto, shiftlen);
4148                         skb_frag_size_sub(fragfrom, shiftlen);
4149                         skb_frag_off_add(fragfrom, shiftlen);
4150
4151                         goto onlymerged;
4152                 }
4153
4154                 from++;
4155         }
4156
4157         /* Skip full, not-fitting skb to avoid expensive operations */
4158         if ((shiftlen == skb->len) &&
4159             (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
4160                 return 0;
4161
4162         if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
4163                 return 0;
4164
4165         while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
4166                 if (to == MAX_SKB_FRAGS)
4167                         return 0;
4168
4169                 fragfrom = &skb_shinfo(skb)->frags[from];
4170                 fragto = &skb_shinfo(tgt)->frags[to];
4171
4172                 if (todo >= skb_frag_size(fragfrom)) {
4173                         *fragto = *fragfrom;
4174                         todo -= skb_frag_size(fragfrom);
4175                         from++;
4176                         to++;
4177
4178                 } else {
4179                         __skb_frag_ref(fragfrom);
4180                         skb_frag_page_copy(fragto, fragfrom);
4181                         skb_frag_off_copy(fragto, fragfrom);
4182                         skb_frag_size_set(fragto, todo);
4183
4184                         skb_frag_off_add(fragfrom, todo);
4185                         skb_frag_size_sub(fragfrom, todo);
4186                         todo = 0;
4187
4188                         to++;
4189                         break;
4190                 }
4191         }
4192
4193         /* Ready to "commit" this state change to tgt */
4194         skb_shinfo(tgt)->nr_frags = to;
4195
4196         if (merge >= 0) {
4197                 fragfrom = &skb_shinfo(skb)->frags[0];
4198                 fragto = &skb_shinfo(tgt)->frags[merge];
4199
4200                 skb_frag_size_add(fragto, skb_frag_size(fragfrom));
4201                 __skb_frag_unref(fragfrom, skb->pp_recycle);
4202         }
4203
4204         /* Reposition in the original skb */
4205         to = 0;
4206         while (from < skb_shinfo(skb)->nr_frags)
4207                 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
4208         skb_shinfo(skb)->nr_frags = to;
4209
4210         BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
4211
4212 onlymerged:
4213         /* Most likely the tgt won't ever need its checksum anymore, skb on
4214          * the other hand might need it if it needs to be resent
4215          */
4216         tgt->ip_summed = CHECKSUM_PARTIAL;
4217         skb->ip_summed = CHECKSUM_PARTIAL;
4218
4219         skb_len_add(skb, -shiftlen);
4220         skb_len_add(tgt, shiftlen);
4221
4222         return shiftlen;
4223 }
4224
4225 /**
4226  * skb_prepare_seq_read - Prepare a sequential read of skb data
4227  * @skb: the buffer to read
4228  * @from: lower offset of data to be read
4229  * @to: upper offset of data to be read
4230  * @st: state variable
4231  *
4232  * Initializes the specified state variable. Must be called before
4233  * invoking skb_seq_read() for the first time.
4234  */
4235 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
4236                           unsigned int to, struct skb_seq_state *st)
4237 {
4238         st->lower_offset = from;
4239         st->upper_offset = to;
4240         st->root_skb = st->cur_skb = skb;
4241         st->frag_idx = st->stepped_offset = 0;
4242         st->frag_data = NULL;
4243         st->frag_off = 0;
4244 }
4245 EXPORT_SYMBOL(skb_prepare_seq_read);
4246
4247 /**
4248  * skb_seq_read - Sequentially read skb data
4249  * @consumed: number of bytes consumed by the caller so far
4250  * @data: destination pointer for data to be returned
4251  * @st: state variable
4252  *
4253  * Reads a block of skb data at @consumed relative to the
4254  * lower offset specified to skb_prepare_seq_read(). Assigns
4255  * the head of the data block to @data and returns the length
4256  * of the block or 0 if the end of the skb data or the upper
4257  * offset has been reached.
4258  *
4259  * The caller is not required to consume all of the data
4260  * returned, i.e. @consumed is typically set to the number
4261  * of bytes already consumed and the next call to
4262  * skb_seq_read() will return the remaining part of the block.
4263  *
4264  * Note 1: The size of each block of data returned can be arbitrary,
4265  *       this limitation is the cost for zerocopy sequential
4266  *       reads of potentially non linear data.
4267  *
4268  * Note 2: Fragment lists within fragments are not implemented
4269  *       at the moment, state->root_skb could be replaced with
4270  *       a stack for this purpose.
4271  */
4272 unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
4273                           struct skb_seq_state *st)
4274 {
4275         unsigned int block_limit, abs_offset = consumed + st->lower_offset;
4276         skb_frag_t *frag;
4277
4278         if (unlikely(abs_offset >= st->upper_offset)) {
4279                 if (st->frag_data) {
4280                         kunmap_atomic(st->frag_data);
4281                         st->frag_data = NULL;
4282                 }
4283                 return 0;
4284         }
4285
4286 next_skb:
4287         block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
4288
4289         if (abs_offset < block_limit && !st->frag_data) {
4290                 *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
4291                 return block_limit - abs_offset;
4292         }
4293
4294         if (st->frag_idx == 0 && !st->frag_data)
4295                 st->stepped_offset += skb_headlen(st->cur_skb);
4296
4297         while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
4298                 unsigned int pg_idx, pg_off, pg_sz;
4299
4300                 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
4301
4302                 pg_idx = 0;
4303                 pg_off = skb_frag_off(frag);
4304                 pg_sz = skb_frag_size(frag);
4305
4306                 if (skb_frag_must_loop(skb_frag_page(frag))) {
4307                         pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
4308                         pg_off = offset_in_page(pg_off + st->frag_off);
4309                         pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
4310                                                     PAGE_SIZE - pg_off);
4311                 }
4312
4313                 block_limit = pg_sz + st->stepped_offset;
4314                 if (abs_offset < block_limit) {
4315                         if (!st->frag_data)
4316                                 st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
4317
4318                         *data = (u8 *)st->frag_data + pg_off +
4319                                 (abs_offset - st->stepped_offset);
4320
4321                         return block_limit - abs_offset;
4322                 }
4323
4324                 if (st->frag_data) {
4325                         kunmap_atomic(st->frag_data);
4326                         st->frag_data = NULL;
4327                 }
4328
4329                 st->stepped_offset += pg_sz;
4330                 st->frag_off += pg_sz;
4331                 if (st->frag_off == skb_frag_size(frag)) {
4332                         st->frag_off = 0;
4333                         st->frag_idx++;
4334                 }
4335         }
4336
4337         if (st->frag_data) {
4338                 kunmap_atomic(st->frag_data);
4339                 st->frag_data = NULL;
4340         }
4341
4342         if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
4343                 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
4344                 st->frag_idx = 0;
4345                 goto next_skb;
4346         } else if (st->cur_skb->next) {
4347                 st->cur_skb = st->cur_skb->next;
4348                 st->frag_idx = 0;
4349                 goto next_skb;
4350         }
4351
4352         return 0;
4353 }
4354 EXPORT_SYMBOL(skb_seq_read);
4355
4356 /**
4357  * skb_abort_seq_read - Abort a sequential read of skb data
4358  * @st: state variable
4359  *
4360  * Must be called if skb_seq_read() was not called until it
4361  * returned 0.
4362  */
4363 void skb_abort_seq_read(struct skb_seq_state *st)
4364 {
4365         if (st->frag_data)
4366                 kunmap_atomic(st->frag_data);
4367 }
4368 EXPORT_SYMBOL(skb_abort_seq_read);
4369
4370 #define TS_SKB_CB(state)        ((struct skb_seq_state *) &((state)->cb))
4371
4372 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
4373                                           struct ts_config *conf,
4374                                           struct ts_state *state)
4375 {
4376         return skb_seq_read(offset, text, TS_SKB_CB(state));
4377 }
4378
4379 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
4380 {
4381         skb_abort_seq_read(TS_SKB_CB(state));
4382 }
4383
4384 /**
4385  * skb_find_text - Find a text pattern in skb data
4386  * @skb: the buffer to look in
4387  * @from: search offset
4388  * @to: search limit
4389  * @config: textsearch configuration
4390  *
4391  * Finds a pattern in the skb data according to the specified
4392  * textsearch configuration. Use textsearch_next() to retrieve
4393  * subsequent occurrences of the pattern. Returns the offset
4394  * to the first occurrence or UINT_MAX if no match was found.
4395  */
4396 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
4397                            unsigned int to, struct ts_config *config)
4398 {
4399         unsigned int patlen = config->ops->get_pattern_len(config);
4400         struct ts_state state;
4401         unsigned int ret;
4402
4403         BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
4404
4405         config->get_next_block = skb_ts_get_next_block;
4406         config->finish = skb_ts_finish;
4407
4408         skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
4409
4410         ret = textsearch_find(config, &state);
4411         return (ret + patlen <= to - from ? ret : UINT_MAX);
4412 }
4413 EXPORT_SYMBOL(skb_find_text);
4414
4415 int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
4416                          int offset, size_t size, size_t max_frags)
4417 {
4418         int i = skb_shinfo(skb)->nr_frags;
4419
4420         if (skb_can_coalesce(skb, i, page, offset)) {
4421                 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
4422         } else if (i < max_frags) {
4423                 skb_zcopy_downgrade_managed(skb);
4424                 get_page(page);
4425                 skb_fill_page_desc_noacc(skb, i, page, offset, size);
4426         } else {
4427                 return -EMSGSIZE;
4428         }
4429
4430         return 0;
4431 }
4432 EXPORT_SYMBOL_GPL(skb_append_pagefrags);
4433
4434 /**
4435  *      skb_pull_rcsum - pull skb and update receive checksum
4436  *      @skb: buffer to update
4437  *      @len: length of data pulled
4438  *
4439  *      This function performs an skb_pull on the packet and updates
4440  *      the CHECKSUM_COMPLETE checksum.  It should be used on
4441  *      receive path processing instead of skb_pull unless you know
4442  *      that the checksum difference is zero (e.g., a valid IP header)
4443  *      or you are setting ip_summed to CHECKSUM_NONE.
4444  */
4445 void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
4446 {
4447         unsigned char *data = skb->data;
4448
4449         BUG_ON(len > skb->len);
4450         __skb_pull(skb, len);
4451         skb_postpull_rcsum(skb, data, len);
4452         return skb->data;
4453 }
4454 EXPORT_SYMBOL_GPL(skb_pull_rcsum);
4455
4456 static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
4457 {
4458         skb_frag_t head_frag;
4459         struct page *page;
4460
4461         page = virt_to_head_page(frag_skb->head);
4462         skb_frag_fill_page_desc(&head_frag, page, frag_skb->data -
4463                                 (unsigned char *)page_address(page),
4464                                 skb_headlen(frag_skb));
4465         return head_frag;
4466 }
4467
4468 struct sk_buff *skb_segment_list(struct sk_buff *skb,
4469                                  netdev_features_t features,
4470                                  unsigned int offset)
4471 {
4472         struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
4473         unsigned int tnl_hlen = skb_tnl_header_len(skb);
4474         unsigned int delta_truesize = 0;
4475         unsigned int delta_len = 0;
4476         struct sk_buff *tail = NULL;
4477         struct sk_buff *nskb, *tmp;
4478         int len_diff, err;
4479
4480         skb_push(skb, -skb_network_offset(skb) + offset);
4481
4482         /* Ensure the head is writeable before touching the shared info */
4483         err = skb_unclone(skb, GFP_ATOMIC);
4484         if (err)
4485                 goto err_linearize;
4486
4487         skb_shinfo(skb)->frag_list = NULL;
4488
4489         while (list_skb) {
4490                 nskb = list_skb;
4491                 list_skb = list_skb->next;
4492
4493                 err = 0;
4494                 delta_truesize += nskb->truesize;
4495                 if (skb_shared(nskb)) {
4496                         tmp = skb_clone(nskb, GFP_ATOMIC);
4497                         if (tmp) {
4498                                 consume_skb(nskb);
4499                                 nskb = tmp;
4500                                 err = skb_unclone(nskb, GFP_ATOMIC);
4501                         } else {
4502                                 err = -ENOMEM;
4503                         }
4504                 }
4505
4506                 if (!tail)
4507                         skb->next = nskb;
4508                 else
4509                         tail->next = nskb;
4510
4511                 if (unlikely(err)) {
4512                         nskb->next = list_skb;
4513                         goto err_linearize;
4514                 }
4515
4516                 tail = nskb;
4517
4518                 delta_len += nskb->len;
4519
4520                 skb_push(nskb, -skb_network_offset(nskb) + offset);
4521
4522                 skb_release_head_state(nskb);
4523                 len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
4524                 __copy_skb_header(nskb, skb);
4525
4526                 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
4527                 nskb->transport_header += len_diff;
4528                 skb_copy_from_linear_data_offset(skb, -tnl_hlen,
4529                                                  nskb->data - tnl_hlen,
4530                                                  offset + tnl_hlen);
4531
4532                 if (skb_needs_linearize(nskb, features) &&
4533                     __skb_linearize(nskb))
4534                         goto err_linearize;
4535         }
4536
4537         skb->truesize = skb->truesize - delta_truesize;
4538         skb->data_len = skb->data_len - delta_len;
4539         skb->len = skb->len - delta_len;
4540
4541         skb_gso_reset(skb);
4542
4543         skb->prev = tail;
4544
4545         if (skb_needs_linearize(skb, features) &&
4546             __skb_linearize(skb))
4547                 goto err_linearize;
4548
4549         skb_get(skb);
4550
4551         return skb;
4552
4553 err_linearize:
4554         kfree_skb_list(skb->next);
4555         skb->next = NULL;
4556         return ERR_PTR(-ENOMEM);
4557 }
4558 EXPORT_SYMBOL_GPL(skb_segment_list);
4559
4560 /**
4561  *      skb_segment - Perform protocol segmentation on skb.
4562  *      @head_skb: buffer to segment
4563  *      @features: features for the output path (see dev->features)
4564  *
4565  *      This function performs segmentation on the given skb.  It returns
4566  *      a pointer to the first in a list of new skbs for the segments.
4567  *      In case of error it returns ERR_PTR(err).
4568  */
4569 struct sk_buff *skb_segment(struct sk_buff *head_skb,
4570                             netdev_features_t features)
4571 {
4572         struct sk_buff *segs = NULL;
4573         struct sk_buff *tail = NULL;
4574         struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
4575         unsigned int mss = skb_shinfo(head_skb)->gso_size;
4576         unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
4577         unsigned int offset = doffset;
4578         unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
4579         unsigned int partial_segs = 0;
4580         unsigned int headroom;
4581         unsigned int len = head_skb->len;
4582         struct sk_buff *frag_skb;
4583         skb_frag_t *frag;
4584         __be16 proto;
4585         bool csum, sg;
4586         int err = -ENOMEM;
4587         int i = 0;
4588         int nfrags, pos;
4589
4590         if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
4591             mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
4592                 struct sk_buff *check_skb;
4593
4594                 for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
4595                         if (skb_headlen(check_skb) && !check_skb->head_frag) {
4596                                 /* gso_size is untrusted, and we have a frag_list with
4597                                  * a linear non head_frag item.
4598                                  *
4599                                  * If head_skb's headlen does not fit requested gso_size,
4600                                  * it means that the frag_list members do NOT terminate
4601                                  * on exact gso_size boundaries. Hence we cannot perform
4602                                  * skb_frag_t page sharing. Therefore we must fallback to
4603                                  * copying the frag_list skbs; we do so by disabling SG.
4604                                  */
4605                                 features &= ~NETIF_F_SG;
4606                                 break;
4607                         }
4608                 }
4609         }
4610
4611         __skb_push(head_skb, doffset);
4612         proto = skb_network_protocol(head_skb, NULL);
4613         if (unlikely(!proto))
4614                 return ERR_PTR(-EINVAL);
4615
4616         sg = !!(features & NETIF_F_SG);
4617         csum = !!can_checksum_protocol(features, proto);
4618
4619         if (sg && csum && (mss != GSO_BY_FRAGS))  {
4620                 if (!(features & NETIF_F_GSO_PARTIAL)) {
4621                         struct sk_buff *iter;
4622                         unsigned int frag_len;
4623
4624                         if (!list_skb ||
4625                             !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
4626                                 goto normal;
4627
4628                         /* If we get here then all the required
4629                          * GSO features except frag_list are supported.
4630                          * Try to split the SKB to multiple GSO SKBs
4631                          * with no frag_list.
4632                          * Currently we can do that only when the buffers don't
4633                          * have a linear part and all the buffers except
4634                          * the last are of the same length.
4635                          */
4636                         frag_len = list_skb->len;
4637                         skb_walk_frags(head_skb, iter) {
4638                                 if (frag_len != iter->len && iter->next)
4639                                         goto normal;
4640                                 if (skb_headlen(iter) && !iter->head_frag)
4641                                         goto normal;
4642
4643                                 len -= iter->len;
4644                         }
4645
4646                         if (len != frag_len)
4647                                 goto normal;
4648                 }
4649
4650                 /* GSO partial only requires that we trim off any excess that
4651                  * doesn't fit into an MSS sized block, so take care of that
4652                  * now.
4653                  * Cap len to not accidentally hit GSO_BY_FRAGS.
4654                  */
4655                 partial_segs = min(len, GSO_BY_FRAGS - 1) / mss;
4656                 if (partial_segs > 1)
4657                         mss *= partial_segs;
4658                 else
4659                         partial_segs = 0;
4660         }
4661
4662 normal:
4663         headroom = skb_headroom(head_skb);
4664         pos = skb_headlen(head_skb);
4665
4666         if (skb_orphan_frags(head_skb, GFP_ATOMIC))
4667                 return ERR_PTR(-ENOMEM);
4668
4669         nfrags = skb_shinfo(head_skb)->nr_frags;
4670         frag = skb_shinfo(head_skb)->frags;
4671         frag_skb = head_skb;
4672
4673         do {
4674                 struct sk_buff *nskb;
4675                 skb_frag_t *nskb_frag;
4676                 int hsize;
4677                 int size;
4678
4679                 if (unlikely(mss == GSO_BY_FRAGS)) {
4680                         len = list_skb->len;
4681                 } else {
4682                         len = head_skb->len - offset;
4683                         if (len > mss)
4684                                 len = mss;
4685                 }
4686
4687                 hsize = skb_headlen(head_skb) - offset;
4688
4689                 if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
4690                     (skb_headlen(list_skb) == len || sg)) {
4691                         BUG_ON(skb_headlen(list_skb) > len);
4692
4693                         nskb = skb_clone(list_skb, GFP_ATOMIC);
4694                         if (unlikely(!nskb))
4695                                 goto err;
4696
4697                         i = 0;
4698                         nfrags = skb_shinfo(list_skb)->nr_frags;
4699                         frag = skb_shinfo(list_skb)->frags;
4700                         frag_skb = list_skb;
4701                         pos += skb_headlen(list_skb);
4702
4703                         while (pos < offset + len) {
4704                                 BUG_ON(i >= nfrags);
4705
4706                                 size = skb_frag_size(frag);
4707                                 if (pos + size > offset + len)
4708                                         break;
4709
4710                                 i++;
4711                                 pos += size;
4712                                 frag++;
4713                         }
4714
4715                         list_skb = list_skb->next;
4716
4717                         if (unlikely(pskb_trim(nskb, len))) {
4718                                 kfree_skb(nskb);
4719                                 goto err;
4720                         }
4721
4722                         hsize = skb_end_offset(nskb);
4723                         if (skb_cow_head(nskb, doffset + headroom)) {
4724                                 kfree_skb(nskb);
4725                                 goto err;
4726                         }
4727
4728                         nskb->truesize += skb_end_offset(nskb) - hsize;
4729                         skb_release_head_state(nskb);
4730                         __skb_push(nskb, doffset);
4731                 } else {
4732                         if (hsize < 0)
4733                                 hsize = 0;
4734                         if (hsize > len || !sg)
4735                                 hsize = len;
4736
4737                         nskb = __alloc_skb(hsize + doffset + headroom,
4738                                            GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
4739                                            NUMA_NO_NODE);
4740
4741                         if (unlikely(!nskb))
4742                                 goto err;
4743
4744                         skb_reserve(nskb, headroom);
4745                         __skb_put(nskb, doffset);
4746                 }
4747
4748                 if (segs)
4749                         tail->next = nskb;
4750                 else
4751                         segs = nskb;
4752                 tail = nskb;
4753
4754                 __copy_skb_header(nskb, head_skb);
4755
4756                 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
4757                 skb_reset_mac_len(nskb);
4758
4759                 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
4760                                                  nskb->data - tnl_hlen,
4761                                                  doffset + tnl_hlen);
4762
4763                 if (nskb->len == len + doffset)
4764                         goto perform_csum_check;
4765
4766                 if (!sg) {
4767                         if (!csum) {
4768                                 if (!nskb->remcsum_offload)
4769                                         nskb->ip_summed = CHECKSUM_NONE;
4770                                 SKB_GSO_CB(nskb)->csum =
4771                                         skb_copy_and_csum_bits(head_skb, offset,
4772                                                                skb_put(nskb,
4773                                                                        len),
4774                                                                len);
4775                                 SKB_GSO_CB(nskb)->csum_start =
4776                                         skb_headroom(nskb) + doffset;
4777                         } else {
4778                                 if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
4779                                         goto err;
4780                         }
4781                         continue;
4782                 }
4783
4784                 nskb_frag = skb_shinfo(nskb)->frags;
4785
4786                 skb_copy_from_linear_data_offset(head_skb, offset,
4787                                                  skb_put(nskb, hsize), hsize);
4788
4789                 skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
4790                                            SKBFL_SHARED_FRAG;
4791
4792                 if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
4793                         goto err;
4794
4795                 while (pos < offset + len) {
4796                         if (i >= nfrags) {
4797                                 if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
4798                                     skb_zerocopy_clone(nskb, list_skb,
4799                                                        GFP_ATOMIC))
4800                                         goto err;
4801
4802                                 i = 0;
4803                                 nfrags = skb_shinfo(list_skb)->nr_frags;
4804                                 frag = skb_shinfo(list_skb)->frags;
4805                                 frag_skb = list_skb;
4806                                 if (!skb_headlen(list_skb)) {
4807                                         BUG_ON(!nfrags);
4808                                 } else {
4809                                         BUG_ON(!list_skb->head_frag);
4810
4811                                         /* to make room for head_frag. */
4812                                         i--;
4813                                         frag--;
4814                                 }
4815
4816                                 list_skb = list_skb->next;
4817                         }
4818
4819                         if (unlikely(skb_shinfo(nskb)->nr_frags >=
4820                                      MAX_SKB_FRAGS)) {
4821                                 net_warn_ratelimited(
4822                                         "skb_segment: too many frags: %u %u\n",
4823                                         pos, mss);
4824                                 err = -EINVAL;
4825                                 goto err;
4826                         }
4827
4828                         *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
4829                         __skb_frag_ref(nskb_frag);
4830                         size = skb_frag_size(nskb_frag);
4831
4832                         if (pos < offset) {
4833                                 skb_frag_off_add(nskb_frag, offset - pos);
4834                                 skb_frag_size_sub(nskb_frag, offset - pos);
4835                         }
4836
4837                         skb_shinfo(nskb)->nr_frags++;
4838
4839                         if (pos + size <= offset + len) {
4840                                 i++;
4841                                 frag++;
4842                                 pos += size;
4843                         } else {
4844                                 skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
4845                                 goto skip_fraglist;
4846                         }
4847
4848                         nskb_frag++;
4849                 }
4850
4851 skip_fraglist:
4852                 nskb->data_len = len - hsize;
4853                 nskb->len += nskb->data_len;
4854                 nskb->truesize += nskb->data_len;
4855
4856 perform_csum_check:
4857                 if (!csum) {
4858                         if (skb_has_shared_frag(nskb) &&
4859                             __skb_linearize(nskb))
4860                                 goto err;
4861
4862                         if (!nskb->remcsum_offload)
4863                                 nskb->ip_summed = CHECKSUM_NONE;
4864                         SKB_GSO_CB(nskb)->csum =
4865                                 skb_checksum(nskb, doffset,
4866                                              nskb->len - doffset, 0);
4867                         SKB_GSO_CB(nskb)->csum_start =
4868                                 skb_headroom(nskb) + doffset;
4869                 }
4870         } while ((offset += len) < head_skb->len);
4871
4872         /* Some callers want to get the end of the list.
4873          * Put it in segs->prev to avoid walking the list.
4874          * (see validate_xmit_skb_list() for example)
4875          */
4876         segs->prev = tail;
4877
4878         if (partial_segs) {
4879                 struct sk_buff *iter;
4880                 int type = skb_shinfo(head_skb)->gso_type;
4881                 unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
4882
4883                 /* Update type to add partial and then remove dodgy if set */
4884                 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
4885                 type &= ~SKB_GSO_DODGY;
4886
4887                 /* Update GSO info and prepare to start updating headers on
4888                  * our way back down the stack of protocols.
4889                  */
4890                 for (iter = segs; iter; iter = iter->next) {
4891                         skb_shinfo(iter)->gso_size = gso_size;
4892                         skb_shinfo(iter)->gso_segs = partial_segs;
4893                         skb_shinfo(iter)->gso_type = type;
4894                         SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
4895                 }
4896
4897                 if (tail->len - doffset <= gso_size)
4898                         skb_shinfo(tail)->gso_size = 0;
4899                 else if (tail != segs)
4900                         skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
4901         }
4902
4903         /* Following permits correct backpressure, for protocols
4904          * using skb_set_owner_w().
4905          * Idea is to tranfert ownership from head_skb to last segment.
4906          */
4907         if (head_skb->destructor == sock_wfree) {
4908                 swap(tail->truesize, head_skb->truesize);
4909                 swap(tail->destructor, head_skb->destructor);
4910                 swap(tail->sk, head_skb->sk);
4911         }
4912         return segs;
4913
4914 err:
4915         kfree_skb_list(segs);
4916         return ERR_PTR(err);
4917 }
4918 EXPORT_SYMBOL_GPL(skb_segment);
4919
4920 #ifdef CONFIG_SKB_EXTENSIONS
4921 #define SKB_EXT_ALIGN_VALUE     8
4922 #define SKB_EXT_CHUNKSIZEOF(x)  (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
4923
4924 static const u8 skb_ext_type_len[] = {
4925 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
4926         [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
4927 #endif
4928 #ifdef CONFIG_XFRM
4929         [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
4930 #endif
4931 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
4932         [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
4933 #endif
4934 #if IS_ENABLED(CONFIG_MPTCP)
4935         [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
4936 #endif
4937 #if IS_ENABLED(CONFIG_MCTP_FLOWS)
4938         [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
4939 #endif
4940 };
4941
4942 static __always_inline unsigned int skb_ext_total_length(void)
4943 {
4944         unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
4945         int i;
4946
4947         for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
4948                 l += skb_ext_type_len[i];
4949
4950         return l;
4951 }
4952
4953 static void skb_extensions_init(void)
4954 {
4955         BUILD_BUG_ON(SKB_EXT_NUM >= 8);
4956 #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
4957         BUILD_BUG_ON(skb_ext_total_length() > 255);
4958 #endif
4959
4960         skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
4961                                              SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
4962                                              0,
4963                                              SLAB_HWCACHE_ALIGN|SLAB_PANIC,
4964                                              NULL);
4965 }
4966 #else
4967 static void skb_extensions_init(void) {}
4968 #endif
4969
4970 /* The SKB kmem_cache slab is critical for network performance.  Never
4971  * merge/alias the slab with similar sized objects.  This avoids fragmentation
4972  * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
4973  */
4974 #ifndef CONFIG_SLUB_TINY
4975 #define FLAG_SKB_NO_MERGE       SLAB_NO_MERGE
4976 #else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
4977 #define FLAG_SKB_NO_MERGE       0
4978 #endif
4979
4980 void __init skb_init(void)
4981 {
4982         net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
4983                                               sizeof(struct sk_buff),
4984                                               0,
4985                                               SLAB_HWCACHE_ALIGN|SLAB_PANIC|
4986                                                 FLAG_SKB_NO_MERGE,
4987                                               offsetof(struct sk_buff, cb),
4988                                               sizeof_field(struct sk_buff, cb),
4989                                               NULL);
4990         net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
4991                                                 sizeof(struct sk_buff_fclones),
4992                                                 0,
4993                                                 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
4994                                                 NULL);
4995         /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
4996          * struct skb_shared_info is located at the end of skb->head,
4997          * and should not be copied to/from user.
4998          */
4999         net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
5000                                                 SKB_SMALL_HEAD_CACHE_SIZE,
5001                                                 0,
5002                                                 SLAB_HWCACHE_ALIGN | SLAB_PANIC,
5003                                                 0,
5004                                                 SKB_SMALL_HEAD_HEADROOM,
5005                                                 NULL);
5006         skb_extensions_init();
5007 }
5008
5009 static int
5010 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
5011                unsigned int recursion_level)
5012 {
5013         int start = skb_headlen(skb);
5014         int i, copy = start - offset;
5015         struct sk_buff *frag_iter;
5016         int elt = 0;
5017
5018         if (unlikely(recursion_level >= 24))
5019                 return -EMSGSIZE;
5020
5021         if (copy > 0) {
5022                 if (copy > len)
5023                         copy = len;
5024                 sg_set_buf(sg, skb->data + offset, copy);
5025                 elt++;
5026                 if ((len -= copy) == 0)
5027                         return elt;
5028                 offset += copy;
5029         }
5030
5031         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
5032                 int end;
5033
5034                 WARN_ON(start > offset + len);
5035
5036                 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
5037                 if ((copy = end - offset) > 0) {
5038                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
5039                         if (unlikely(elt && sg_is_last(&sg[elt - 1])))
5040                                 return -EMSGSIZE;
5041
5042                         if (copy > len)
5043                                 copy = len;
5044                         sg_set_page(&sg[elt], skb_frag_page(frag), copy,
5045                                     skb_frag_off(frag) + offset - start);
5046                         elt++;
5047                         if (!(len -= copy))
5048                                 return elt;
5049                         offset += copy;
5050                 }
5051                 start = end;
5052         }
5053
5054         skb_walk_frags(skb, frag_iter) {
5055                 int end, ret;
5056
5057                 WARN_ON(start > offset + len);
5058
5059                 end = start + frag_iter->len;
5060                 if ((copy = end - offset) > 0) {
5061                         if (unlikely(elt && sg_is_last(&sg[elt - 1])))
5062                                 return -EMSGSIZE;
5063
5064                         if (copy > len)
5065                                 copy = len;
5066                         ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
5067                                               copy, recursion_level + 1);
5068                         if (unlikely(ret < 0))
5069                                 return ret;
5070                         elt += ret;
5071                         if ((len -= copy) == 0)
5072                                 return elt;
5073                         offset += copy;
5074                 }
5075                 start = end;
5076         }
5077         BUG_ON(len);
5078         return elt;
5079 }
5080
5081 /**
5082  *      skb_to_sgvec - Fill a scatter-gather list from a socket buffer
5083  *      @skb: Socket buffer containing the buffers to be mapped
5084  *      @sg: The scatter-gather list to map into
5085  *      @offset: The offset into the buffer's contents to start mapping
5086  *      @len: Length of buffer space to be mapped
5087  *
5088  *      Fill the specified scatter-gather list with mappings/pointers into a
5089  *      region of the buffer space attached to a socket buffer. Returns either
5090  *      the number of scatterlist items used, or -EMSGSIZE if the contents
5091  *      could not fit.
5092  */
5093 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
5094 {
5095         int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
5096
5097         if (nsg <= 0)
5098                 return nsg;
5099
5100         sg_mark_end(&sg[nsg - 1]);
5101
5102         return nsg;
5103 }
5104 EXPORT_SYMBOL_GPL(skb_to_sgvec);
5105
5106 /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
5107  * sglist without mark the sg which contain last skb data as the end.
5108  * So the caller can mannipulate sg list as will when padding new data after
5109  * the first call without calling sg_unmark_end to expend sg list.
5110  *
5111  * Scenario to use skb_to_sgvec_nomark:
5112  * 1. sg_init_table
5113  * 2. skb_to_sgvec_nomark(payload1)
5114  * 3. skb_to_sgvec_nomark(payload2)
5115  *
5116  * This is equivalent to:
5117  * 1. sg_init_table
5118  * 2. skb_to_sgvec(payload1)
5119  * 3. sg_unmark_end
5120  * 4. skb_to_sgvec(payload2)
5121  *
5122  * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
5123  * is more preferable.
5124  */
5125 int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
5126                         int offset, int len)
5127 {
5128         return __skb_to_sgvec(skb, sg, offset, len, 0);
5129 }
5130 EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
5131
5132
5133
5134 /**
5135  *      skb_cow_data - Check that a socket buffer's data buffers are writable
5136  *      @skb: The socket buffer to check.
5137  *      @tailbits: Amount of trailing space to be added
5138  *      @trailer: Returned pointer to the skb where the @tailbits space begins
5139  *
5140  *      Make sure that the data buffers attached to a socket buffer are
5141  *      writable. If they are not, private copies are made of the data buffers
5142  *      and the socket buffer is set to use these instead.
5143  *
5144  *      If @tailbits is given, make sure that there is space to write @tailbits
5145  *      bytes of data beyond current end of socket buffer.  @trailer will be
5146  *      set to point to the skb in which this space begins.
5147  *
5148  *      The number of scatterlist elements required to completely map the
5149  *      COW'd and extended socket buffer will be returned.
5150  */
5151 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
5152 {
5153         int copyflag;
5154         int elt;
5155         struct sk_buff *skb1, **skb_p;
5156
5157         /* If skb is cloned or its head is paged, reallocate
5158          * head pulling out all the pages (pages are considered not writable
5159          * at the moment even if they are anonymous).
5160          */
5161         if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
5162             !__pskb_pull_tail(skb, __skb_pagelen(skb)))
5163                 return -ENOMEM;
5164
5165         /* Easy case. Most of packets will go this way. */
5166         if (!skb_has_frag_list(skb)) {
5167                 /* A little of trouble, not enough of space for trailer.
5168                  * This should not happen, when stack is tuned to generate
5169                  * good frames. OK, on miss we reallocate and reserve even more
5170                  * space, 128 bytes is fair. */
5171
5172                 if (skb_tailroom(skb) < tailbits &&
5173                     pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
5174                         return -ENOMEM;
5175
5176                 /* Voila! */
5177                 *trailer = skb;
5178                 return 1;
5179         }
5180
5181         /* Misery. We are in troubles, going to mincer fragments... */
5182
5183         elt = 1;
5184         skb_p = &skb_shinfo(skb)->frag_list;
5185         copyflag = 0;
5186
5187         while ((skb1 = *skb_p) != NULL) {
5188                 int ntail = 0;
5189
5190                 /* The fragment is partially pulled by someone,
5191                  * this can happen on input. Copy it and everything
5192                  * after it. */
5193
5194                 if (skb_shared(skb1))
5195                         copyflag = 1;
5196
5197                 /* If the skb is the last, worry about trailer. */
5198
5199                 if (skb1->next == NULL && tailbits) {
5200                         if (skb_shinfo(skb1)->nr_frags ||
5201                             skb_has_frag_list(skb1) ||
5202                             skb_tailroom(skb1) < tailbits)
5203                                 ntail = tailbits + 128;
5204                 }
5205
5206                 if (copyflag ||
5207                     skb_cloned(skb1) ||
5208                     ntail ||
5209                     skb_shinfo(skb1)->nr_frags ||
5210                     skb_has_frag_list(skb1)) {
5211                         struct sk_buff *skb2;
5212
5213                         /* Fuck, we are miserable poor guys... */
5214                         if (ntail == 0)
5215                                 skb2 = skb_copy(skb1, GFP_ATOMIC);
5216                         else
5217                                 skb2 = skb_copy_expand(skb1,
5218                                                        skb_headroom(skb1),
5219                                                        ntail,
5220                                                        GFP_ATOMIC);
5221                         if (unlikely(skb2 == NULL))
5222                                 return -ENOMEM;
5223
5224                         if (skb1->sk)
5225                                 skb_set_owner_w(skb2, skb1->sk);
5226
5227                         /* Looking around. Are we still alive?
5228                          * OK, link new skb, drop old one */
5229
5230                         skb2->next = skb1->next;
5231                         *skb_p = skb2;
5232                         kfree_skb(skb1);
5233                         skb1 = skb2;
5234                 }
5235                 elt++;
5236                 *trailer = skb1;
5237                 skb_p = &skb1->next;
5238         }
5239
5240         return elt;
5241 }
5242 EXPORT_SYMBOL_GPL(skb_cow_data);
5243
5244 static void sock_rmem_free(struct sk_buff *skb)
5245 {
5246         struct sock *sk = skb->sk;
5247
5248         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
5249 }
5250
5251 static void skb_set_err_queue(struct sk_buff *skb)
5252 {
5253         /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
5254          * So, it is safe to (mis)use it to mark skbs on the error queue.
5255          */
5256         skb->pkt_type = PACKET_OUTGOING;
5257         BUILD_BUG_ON(PACKET_OUTGOING == 0);
5258 }
5259
5260 /*
5261  * Note: We dont mem charge error packets (no sk_forward_alloc changes)
5262  */
5263 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
5264 {
5265         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
5266             (unsigned int)READ_ONCE(sk->sk_rcvbuf))
5267                 return -ENOMEM;
5268
5269         skb_orphan(skb);
5270         skb->sk = sk;
5271         skb->destructor = sock_rmem_free;
5272         atomic_add(skb->truesize, &sk->sk_rmem_alloc);
5273         skb_set_err_queue(skb);
5274
5275         /* before exiting rcu section, make sure dst is refcounted */
5276         skb_dst_force(skb);
5277
5278         skb_queue_tail(&sk->sk_error_queue, skb);
5279         if (!sock_flag(sk, SOCK_DEAD))
5280                 sk_error_report(sk);
5281         return 0;
5282 }
5283 EXPORT_SYMBOL(sock_queue_err_skb);
5284
5285 static bool is_icmp_err_skb(const struct sk_buff *skb)
5286 {
5287         return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
5288                        SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
5289 }
5290
5291 struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
5292 {
5293         struct sk_buff_head *q = &sk->sk_error_queue;
5294         struct sk_buff *skb, *skb_next = NULL;
5295         bool icmp_next = false;
5296         unsigned long flags;
5297
5298         if (skb_queue_empty_lockless(q))
5299                 return NULL;
5300
5301         spin_lock_irqsave(&q->lock, flags);
5302         skb = __skb_dequeue(q);
5303         if (skb && (skb_next = skb_peek(q))) {
5304                 icmp_next = is_icmp_err_skb(skb_next);
5305                 if (icmp_next)
5306                         sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
5307         }
5308         spin_unlock_irqrestore(&q->lock, flags);
5309
5310         if (is_icmp_err_skb(skb) && !icmp_next)
5311                 sk->sk_err = 0;
5312
5313         if (skb_next)
5314                 sk_error_report(sk);
5315
5316         return skb;
5317 }
5318 EXPORT_SYMBOL(sock_dequeue_err_skb);
5319
5320 /**
5321  * skb_clone_sk - create clone of skb, and take reference to socket
5322  * @skb: the skb to clone
5323  *
5324  * This function creates a clone of a buffer that holds a reference on
5325  * sk_refcnt.  Buffers created via this function are meant to be
5326  * returned using sock_queue_err_skb, or free via kfree_skb.
5327  *
5328  * When passing buffers allocated with this function to sock_queue_err_skb
5329  * it is necessary to wrap the call with sock_hold/sock_put in order to
5330  * prevent the socket from being released prior to being enqueued on
5331  * the sk_error_queue.
5332  */
5333 struct sk_buff *skb_clone_sk(struct sk_buff *skb)
5334 {
5335         struct sock *sk = skb->sk;
5336         struct sk_buff *clone;
5337
5338         if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
5339                 return NULL;
5340
5341         clone = skb_clone(skb, GFP_ATOMIC);
5342         if (!clone) {
5343                 sock_put(sk);
5344                 return NULL;
5345         }
5346
5347         clone->sk = sk;
5348         clone->destructor = sock_efree;
5349
5350         return clone;
5351 }
5352 EXPORT_SYMBOL(skb_clone_sk);
5353
5354 static void __skb_complete_tx_timestamp(struct sk_buff *skb,
5355                                         struct sock *sk,
5356                                         int tstype,
5357                                         bool opt_stats)
5358 {
5359         struct sock_exterr_skb *serr;
5360         int err;
5361
5362         BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
5363
5364         serr = SKB_EXT_ERR(skb);
5365         memset(serr, 0, sizeof(*serr));
5366         serr->ee.ee_errno = ENOMSG;
5367         serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
5368         serr->ee.ee_info = tstype;
5369         serr->opt_stats = opt_stats;
5370         serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
5371         if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
5372                 serr->ee.ee_data = skb_shinfo(skb)->tskey;
5373                 if (sk_is_tcp(sk))
5374                         serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
5375         }
5376
5377         err = sock_queue_err_skb(sk, skb);
5378
5379         if (err)
5380                 kfree_skb(skb);
5381 }
5382
5383 static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
5384 {
5385         bool ret;
5386
5387         if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
5388                 return true;
5389
5390         read_lock_bh(&sk->sk_callback_lock);
5391         ret = sk->sk_socket && sk->sk_socket->file &&
5392               file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
5393         read_unlock_bh(&sk->sk_callback_lock);
5394         return ret;
5395 }
5396
5397 void skb_complete_tx_timestamp(struct sk_buff *skb,
5398                                struct skb_shared_hwtstamps *hwtstamps)
5399 {
5400         struct sock *sk = skb->sk;
5401
5402         if (!skb_may_tx_timestamp(sk, false))
5403                 goto err;
5404
5405         /* Take a reference to prevent skb_orphan() from freeing the socket,
5406          * but only if the socket refcount is not zero.
5407          */
5408         if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
5409                 *skb_hwtstamps(skb) = *hwtstamps;
5410                 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
5411                 sock_put(sk);
5412                 return;
5413         }
5414
5415 err:
5416         kfree_skb(skb);
5417 }
5418 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
5419
5420 void __skb_tstamp_tx(struct sk_buff *orig_skb,
5421                      const struct sk_buff *ack_skb,
5422                      struct skb_shared_hwtstamps *hwtstamps,
5423                      struct sock *sk, int tstype)
5424 {
5425         struct sk_buff *skb;
5426         bool tsonly, opt_stats = false;
5427         u32 tsflags;
5428
5429         if (!sk)
5430                 return;
5431
5432         tsflags = READ_ONCE(sk->sk_tsflags);
5433         if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
5434             skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
5435                 return;
5436
5437         tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
5438         if (!skb_may_tx_timestamp(sk, tsonly))
5439                 return;
5440
5441         if (tsonly) {
5442 #ifdef CONFIG_INET
5443                 if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
5444                     sk_is_tcp(sk)) {
5445                         skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
5446                                                              ack_skb);
5447                         opt_stats = true;
5448                 } else
5449 #endif
5450                         skb = alloc_skb(0, GFP_ATOMIC);
5451         } else {
5452                 skb = skb_clone(orig_skb, GFP_ATOMIC);
5453
5454                 if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
5455                         kfree_skb(skb);
5456                         return;
5457                 }
5458         }
5459         if (!skb)
5460                 return;
5461
5462         if (tsonly) {
5463                 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
5464                                              SKBTX_ANY_TSTAMP;
5465                 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
5466         }
5467
5468         if (hwtstamps)
5469                 *skb_hwtstamps(skb) = *hwtstamps;
5470         else
5471                 __net_timestamp(skb);
5472
5473         __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
5474 }
5475 EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
5476
5477 void skb_tstamp_tx(struct sk_buff *orig_skb,
5478                    struct skb_shared_hwtstamps *hwtstamps)
5479 {
5480         return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
5481                                SCM_TSTAMP_SND);
5482 }
5483 EXPORT_SYMBOL_GPL(skb_tstamp_tx);
5484
5485 #ifdef CONFIG_WIRELESS
5486 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
5487 {
5488         struct sock *sk = skb->sk;
5489         struct sock_exterr_skb *serr;
5490         int err = 1;
5491
5492         skb->wifi_acked_valid = 1;
5493         skb->wifi_acked = acked;
5494
5495         serr = SKB_EXT_ERR(skb);
5496         memset(serr, 0, sizeof(*serr));
5497         serr->ee.ee_errno = ENOMSG;
5498         serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
5499
5500         /* Take a reference to prevent skb_orphan() from freeing the socket,
5501          * but only if the socket refcount is not zero.
5502          */
5503         if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
5504                 err = sock_queue_err_skb(sk, skb);
5505                 sock_put(sk);
5506         }
5507         if (err)
5508                 kfree_skb(skb);
5509 }
5510 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
5511 #endif /* CONFIG_WIRELESS */
5512
5513 /**
5514  * skb_partial_csum_set - set up and verify partial csum values for packet
5515  * @skb: the skb to set
5516  * @start: the number of bytes after skb->data to start checksumming.
5517  * @off: the offset from start to place the checksum.
5518  *
5519  * For untrusted partially-checksummed packets, we need to make sure the values
5520  * for skb->csum_start and skb->csum_offset are valid so we don't oops.
5521  *
5522  * This function checks and sets those values and skb->ip_summed: if this
5523  * returns false you should drop the packet.
5524  */
5525 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
5526 {
5527         u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
5528         u32 csum_start = skb_headroom(skb) + (u32)start;
5529
5530         if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) {
5531                 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
5532                                      start, off, skb_headroom(skb), skb_headlen(skb));
5533                 return false;
5534         }
5535         skb->ip_summed = CHECKSUM_PARTIAL;
5536         skb->csum_start = csum_start;
5537         skb->csum_offset = off;
5538         skb->transport_header = csum_start;
5539         return true;
5540 }
5541 EXPORT_SYMBOL_GPL(skb_partial_csum_set);
5542
5543 static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
5544                                unsigned int max)
5545 {
5546         if (skb_headlen(skb) >= len)
5547                 return 0;
5548
5549         /* If we need to pullup then pullup to the max, so we
5550          * won't need to do it again.
5551          */
5552         if (max > skb->len)
5553                 max = skb->len;
5554
5555         if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
5556                 return -ENOMEM;
5557
5558         if (skb_headlen(skb) < len)
5559                 return -EPROTO;
5560
5561         return 0;
5562 }
5563
5564 #define MAX_TCP_HDR_LEN (15 * 4)
5565
5566 static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
5567                                       typeof(IPPROTO_IP) proto,
5568                                       unsigned int off)
5569 {
5570         int err;
5571
5572         switch (proto) {
5573         case IPPROTO_TCP:
5574                 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
5575                                           off + MAX_TCP_HDR_LEN);
5576                 if (!err && !skb_partial_csum_set(skb, off,
5577                                                   offsetof(struct tcphdr,
5578                                                            check)))
5579                         err = -EPROTO;
5580                 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
5581
5582         case IPPROTO_UDP:
5583                 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
5584                                           off + sizeof(struct udphdr));
5585                 if (!err && !skb_partial_csum_set(skb, off,
5586                                                   offsetof(struct udphdr,
5587                                                            check)))
5588                         err = -EPROTO;
5589                 return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
5590         }
5591
5592         return ERR_PTR(-EPROTO);
5593 }
5594
5595 /* This value should be large enough to cover a tagged ethernet header plus
5596  * maximally sized IP and TCP or UDP headers.
5597  */
5598 #define MAX_IP_HDR_LEN 128
5599
5600 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
5601 {
5602         unsigned int off;
5603         bool fragment;
5604         __sum16 *csum;
5605         int err;
5606
5607         fragment = false;
5608
5609         err = skb_maybe_pull_tail(skb,
5610                                   sizeof(struct iphdr),
5611                                   MAX_IP_HDR_LEN);
5612         if (err < 0)
5613                 goto out;
5614
5615         if (ip_is_fragment(ip_hdr(skb)))
5616                 fragment = true;
5617
5618         off = ip_hdrlen(skb);
5619
5620         err = -EPROTO;
5621
5622         if (fragment)
5623                 goto out;
5624
5625         csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
5626         if (IS_ERR(csum))
5627                 return PTR_ERR(csum);
5628
5629         if (recalculate)
5630                 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
5631                                            ip_hdr(skb)->daddr,
5632                                            skb->len - off,
5633                                            ip_hdr(skb)->protocol, 0);
5634         err = 0;
5635
5636 out:
5637         return err;
5638 }
5639
5640 /* This value should be large enough to cover a tagged ethernet header plus
5641  * an IPv6 header, all options, and a maximal TCP or UDP header.
5642  */
5643 #define MAX_IPV6_HDR_LEN 256
5644
5645 #define OPT_HDR(type, skb, off) \
5646         (type *)(skb_network_header(skb) + (off))
5647
5648 static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
5649 {
5650         int err;
5651         u8 nexthdr;
5652         unsigned int off;
5653         unsigned int len;
5654         bool fragment;
5655         bool done;
5656         __sum16 *csum;
5657
5658         fragment = false;
5659         done = false;
5660
5661         off = sizeof(struct ipv6hdr);
5662
5663         err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
5664         if (err < 0)
5665                 goto out;
5666
5667         nexthdr = ipv6_hdr(skb)->nexthdr;
5668
5669         len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
5670         while (off <= len && !done) {
5671                 switch (nexthdr) {
5672                 case IPPROTO_DSTOPTS:
5673                 case IPPROTO_HOPOPTS:
5674                 case IPPROTO_ROUTING: {
5675                         struct ipv6_opt_hdr *hp;
5676
5677                         err = skb_maybe_pull_tail(skb,
5678                                                   off +
5679                                                   sizeof(struct ipv6_opt_hdr),
5680                                                   MAX_IPV6_HDR_LEN);
5681                         if (err < 0)
5682                                 goto out;
5683
5684                         hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
5685                         nexthdr = hp->nexthdr;
5686                         off += ipv6_optlen(hp);
5687                         break;
5688                 }
5689                 case IPPROTO_AH: {
5690                         struct ip_auth_hdr *hp;
5691
5692                         err = skb_maybe_pull_tail(skb,
5693                                                   off +
5694                                                   sizeof(struct ip_auth_hdr),
5695                                                   MAX_IPV6_HDR_LEN);
5696                         if (err < 0)
5697                                 goto out;
5698
5699                         hp = OPT_HDR(struct ip_auth_hdr, skb, off);
5700                         nexthdr = hp->nexthdr;
5701                         off += ipv6_authlen(hp);
5702                         break;
5703                 }
5704                 case IPPROTO_FRAGMENT: {
5705                         struct frag_hdr *hp;
5706
5707                         err = skb_maybe_pull_tail(skb,
5708                                                   off +
5709                                                   sizeof(struct frag_hdr),
5710                                                   MAX_IPV6_HDR_LEN);
5711                         if (err < 0)
5712                                 goto out;
5713
5714                         hp = OPT_HDR(struct frag_hdr, skb, off);
5715
5716                         if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
5717                                 fragment = true;
5718
5719                         nexthdr = hp->nexthdr;
5720                         off += sizeof(struct frag_hdr);
5721                         break;
5722                 }
5723                 default:
5724                         done = true;
5725                         break;
5726                 }
5727         }
5728
5729         err = -EPROTO;
5730
5731         if (!done || fragment)
5732                 goto out;
5733
5734         csum = skb_checksum_setup_ip(skb, nexthdr, off);
5735         if (IS_ERR(csum))
5736                 return PTR_ERR(csum);
5737
5738         if (recalculate)
5739                 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
5740                                          &ipv6_hdr(skb)->daddr,
5741                                          skb->len - off, nexthdr, 0);
5742         err = 0;
5743
5744 out:
5745         return err;
5746 }
5747
5748 /**
5749  * skb_checksum_setup - set up partial checksum offset
5750  * @skb: the skb to set up
5751  * @recalculate: if true the pseudo-header checksum will be recalculated
5752  */
5753 int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
5754 {
5755         int err;
5756
5757         switch (skb->protocol) {
5758         case htons(ETH_P_IP):
5759                 err = skb_checksum_setup_ipv4(skb, recalculate);
5760                 break;
5761
5762         case htons(ETH_P_IPV6):
5763                 err = skb_checksum_setup_ipv6(skb, recalculate);
5764                 break;
5765
5766         default:
5767                 err = -EPROTO;
5768                 break;
5769         }
5770
5771         return err;
5772 }
5773 EXPORT_SYMBOL(skb_checksum_setup);
5774
5775 /**
5776  * skb_checksum_maybe_trim - maybe trims the given skb
5777  * @skb: the skb to check
5778  * @transport_len: the data length beyond the network header
5779  *
5780  * Checks whether the given skb has data beyond the given transport length.
5781  * If so, returns a cloned skb trimmed to this transport length.
5782  * Otherwise returns the provided skb. Returns NULL in error cases
5783  * (e.g. transport_len exceeds skb length or out-of-memory).
5784  *
5785  * Caller needs to set the skb transport header and free any returned skb if it
5786  * differs from the provided skb.
5787  */
5788 static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
5789                                                unsigned int transport_len)
5790 {
5791         struct sk_buff *skb_chk;
5792         unsigned int len = skb_transport_offset(skb) + transport_len;
5793         int ret;
5794
5795         if (skb->len < len)
5796                 return NULL;
5797         else if (skb->len == len)
5798                 return skb;
5799
5800         skb_chk = skb_clone(skb, GFP_ATOMIC);
5801         if (!skb_chk)
5802                 return NULL;
5803
5804         ret = pskb_trim_rcsum(skb_chk, len);
5805         if (ret) {
5806                 kfree_skb(skb_chk);
5807                 return NULL;
5808         }
5809
5810         return skb_chk;
5811 }
5812
5813 /**
5814  * skb_checksum_trimmed - validate checksum of an skb
5815  * @skb: the skb to check
5816  * @transport_len: the data length beyond the network header
5817  * @skb_chkf: checksum function to use
5818  *
5819  * Applies the given checksum function skb_chkf to the provided skb.
5820  * Returns a checked and maybe trimmed skb. Returns NULL on error.
5821  *
5822  * If the skb has data beyond the given transport length, then a
5823  * trimmed & cloned skb is checked and returned.
5824  *
5825  * Caller needs to set the skb transport header and free any returned skb if it
5826  * differs from the provided skb.
5827  */
5828 struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
5829                                      unsigned int transport_len,
5830                                      __sum16(*skb_chkf)(struct sk_buff *skb))
5831 {
5832         struct sk_buff *skb_chk;
5833         unsigned int offset = skb_transport_offset(skb);
5834         __sum16 ret;
5835
5836         skb_chk = skb_checksum_maybe_trim(skb, transport_len);
5837         if (!skb_chk)
5838                 goto err;
5839
5840         if (!pskb_may_pull(skb_chk, offset))
5841                 goto err;
5842
5843         skb_pull_rcsum(skb_chk, offset);
5844         ret = skb_chkf(skb_chk);
5845         skb_push_rcsum(skb_chk, offset);
5846
5847         if (ret)
5848                 goto err;
5849
5850         return skb_chk;
5851
5852 err:
5853         if (skb_chk && skb_chk != skb)
5854                 kfree_skb(skb_chk);
5855
5856         return NULL;
5857
5858 }
5859 EXPORT_SYMBOL(skb_checksum_trimmed);
5860
5861 void __skb_warn_lro_forwarding(const struct sk_buff *skb)
5862 {
5863         net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
5864                              skb->dev->name);
5865 }
5866 EXPORT_SYMBOL(__skb_warn_lro_forwarding);
5867
5868 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
5869 {
5870         if (head_stolen) {
5871                 skb_release_head_state(skb);
5872                 kmem_cache_free(net_hotdata.skbuff_cache, skb);
5873         } else {
5874                 __kfree_skb(skb);
5875         }
5876 }
5877 EXPORT_SYMBOL(kfree_skb_partial);
5878
5879 /**
5880  * skb_try_coalesce - try to merge skb to prior one
5881  * @to: prior buffer
5882  * @from: buffer to add
5883  * @fragstolen: pointer to boolean
5884  * @delta_truesize: how much more was allocated than was requested
5885  */
5886 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
5887                       bool *fragstolen, int *delta_truesize)
5888 {
5889         struct skb_shared_info *to_shinfo, *from_shinfo;
5890         int i, delta, len = from->len;
5891
5892         *fragstolen = false;
5893
5894         if (skb_cloned(to))
5895                 return false;
5896
5897         /* In general, avoid mixing page_pool and non-page_pool allocated
5898          * pages within the same SKB. In theory we could take full
5899          * references if @from is cloned and !@to->pp_recycle but its
5900          * tricky (due to potential race with the clone disappearing) and
5901          * rare, so not worth dealing with.
5902          */
5903         if (to->pp_recycle != from->pp_recycle)
5904                 return false;
5905
5906         if (len <= skb_tailroom(to)) {
5907                 if (len)
5908                         BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
5909                 *delta_truesize = 0;
5910                 return true;
5911         }
5912
5913         to_shinfo = skb_shinfo(to);
5914         from_shinfo = skb_shinfo(from);
5915         if (to_shinfo->frag_list || from_shinfo->frag_list)
5916                 return false;
5917         if (skb_zcopy(to) || skb_zcopy(from))
5918                 return false;
5919
5920         if (skb_headlen(from) != 0) {
5921                 struct page *page;
5922                 unsigned int offset;
5923
5924                 if (to_shinfo->nr_frags +
5925                     from_shinfo->nr_frags >= MAX_SKB_FRAGS)
5926                         return false;
5927
5928                 if (skb_head_is_locked(from))
5929                         return false;
5930
5931                 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
5932
5933                 page = virt_to_head_page(from->head);
5934                 offset = from->data - (unsigned char *)page_address(page);
5935
5936                 skb_fill_page_desc(to, to_shinfo->nr_frags,
5937                                    page, offset, skb_headlen(from));
5938                 *fragstolen = true;
5939         } else {
5940                 if (to_shinfo->nr_frags +
5941                     from_shinfo->nr_frags > MAX_SKB_FRAGS)
5942                         return false;
5943
5944                 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
5945         }
5946
5947         WARN_ON_ONCE(delta < len);
5948
5949         memcpy(to_shinfo->frags + to_shinfo->nr_frags,
5950                from_shinfo->frags,
5951                from_shinfo->nr_frags * sizeof(skb_frag_t));
5952         to_shinfo->nr_frags += from_shinfo->nr_frags;
5953
5954         if (!skb_cloned(from))
5955                 from_shinfo->nr_frags = 0;
5956
5957         /* if the skb is not cloned this does nothing
5958          * since we set nr_frags to 0.
5959          */
5960         if (skb_pp_frag_ref(from)) {
5961                 for (i = 0; i < from_shinfo->nr_frags; i++)
5962                         __skb_frag_ref(&from_shinfo->frags[i]);
5963         }
5964
5965         to->truesize += delta;
5966         to->len += len;
5967         to->data_len += len;
5968
5969         *delta_truesize = delta;
5970         return true;
5971 }
5972 EXPORT_SYMBOL(skb_try_coalesce);
5973
5974 /**
5975  * skb_scrub_packet - scrub an skb
5976  *
5977  * @skb: buffer to clean
5978  * @xnet: packet is crossing netns
5979  *
5980  * skb_scrub_packet can be used after encapsulating or decapsulting a packet
5981  * into/from a tunnel. Some information have to be cleared during these
5982  * operations.
5983  * skb_scrub_packet can also be used to clean a skb before injecting it in
5984  * another namespace (@xnet == true). We have to clear all information in the
5985  * skb that could impact namespace isolation.
5986  */
5987 void skb_scrub_packet(struct sk_buff *skb, bool xnet)
5988 {
5989         skb->pkt_type = PACKET_HOST;
5990         skb->skb_iif = 0;
5991         skb->ignore_df = 0;
5992         skb_dst_drop(skb);
5993         skb_ext_reset(skb);
5994         nf_reset_ct(skb);
5995         nf_reset_trace(skb);
5996
5997 #ifdef CONFIG_NET_SWITCHDEV
5998         skb->offload_fwd_mark = 0;
5999         skb->offload_l3_fwd_mark = 0;
6000 #endif
6001
6002         if (!xnet)
6003                 return;
6004
6005         ipvs_reset(skb);
6006         skb->mark = 0;
6007         skb_clear_tstamp(skb);
6008 }
6009 EXPORT_SYMBOL_GPL(skb_scrub_packet);
6010
6011 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
6012 {
6013         int mac_len, meta_len;
6014         void *meta;
6015
6016         if (skb_cow(skb, skb_headroom(skb)) < 0) {
6017                 kfree_skb(skb);
6018                 return NULL;
6019         }
6020
6021         mac_len = skb->data - skb_mac_header(skb);
6022         if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
6023                 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
6024                         mac_len - VLAN_HLEN - ETH_TLEN);
6025         }
6026
6027         meta_len = skb_metadata_len(skb);
6028         if (meta_len) {
6029                 meta = skb_metadata_end(skb) - meta_len;
6030                 memmove(meta + VLAN_HLEN, meta, meta_len);
6031         }
6032
6033         skb->mac_header += VLAN_HLEN;
6034         return skb;
6035 }
6036
6037 struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
6038 {
6039         struct vlan_hdr *vhdr;
6040         u16 vlan_tci;
6041
6042         if (unlikely(skb_vlan_tag_present(skb))) {
6043                 /* vlan_tci is already set-up so leave this for another time */
6044                 return skb;
6045         }
6046
6047         skb = skb_share_check(skb, GFP_ATOMIC);
6048         if (unlikely(!skb))
6049                 goto err_free;
6050         /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
6051         if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
6052                 goto err_free;
6053
6054         vhdr = (struct vlan_hdr *)skb->data;
6055         vlan_tci = ntohs(vhdr->h_vlan_TCI);
6056         __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
6057
6058         skb_pull_rcsum(skb, VLAN_HLEN);
6059         vlan_set_encap_proto(skb, vhdr);
6060
6061         skb = skb_reorder_vlan_header(skb);
6062         if (unlikely(!skb))
6063                 goto err_free;
6064
6065         skb_reset_network_header(skb);
6066         if (!skb_transport_header_was_set(skb))
6067                 skb_reset_transport_header(skb);
6068         skb_reset_mac_len(skb);
6069
6070         return skb;
6071
6072 err_free:
6073         kfree_skb(skb);
6074         return NULL;
6075 }
6076 EXPORT_SYMBOL(skb_vlan_untag);
6077
6078 int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
6079 {
6080         if (!pskb_may_pull(skb, write_len))
6081                 return -ENOMEM;
6082
6083         if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
6084                 return 0;
6085
6086         return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
6087 }
6088 EXPORT_SYMBOL(skb_ensure_writable);
6089
6090 int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
6091 {
6092         int needed_headroom = dev->needed_headroom;
6093         int needed_tailroom = dev->needed_tailroom;
6094
6095         /* For tail taggers, we need to pad short frames ourselves, to ensure
6096          * that the tail tag does not fail at its role of being at the end of
6097          * the packet, once the conduit interface pads the frame. Account for
6098          * that pad length here, and pad later.
6099          */
6100         if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
6101                 needed_tailroom += ETH_ZLEN - skb->len;
6102         /* skb_headroom() returns unsigned int... */
6103         needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
6104         needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
6105
6106         if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
6107                 /* No reallocation needed, yay! */
6108                 return 0;
6109
6110         return pskb_expand_head(skb, needed_headroom, needed_tailroom,
6111                                 GFP_ATOMIC);
6112 }
6113 EXPORT_SYMBOL(skb_ensure_writable_head_tail);
6114
6115 /* remove VLAN header from packet and update csum accordingly.
6116  * expects a non skb_vlan_tag_present skb with a vlan tag payload
6117  */
6118 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
6119 {
6120         int offset = skb->data - skb_mac_header(skb);
6121         int err;
6122
6123         if (WARN_ONCE(offset,
6124                       "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
6125                       offset)) {
6126                 return -EINVAL;
6127         }
6128
6129         err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
6130         if (unlikely(err))
6131                 return err;
6132
6133         skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
6134
6135         vlan_remove_tag(skb, vlan_tci);
6136
6137         skb->mac_header += VLAN_HLEN;
6138
6139         if (skb_network_offset(skb) < ETH_HLEN)
6140                 skb_set_network_header(skb, ETH_HLEN);
6141
6142         skb_reset_mac_len(skb);
6143
6144         return err;
6145 }
6146 EXPORT_SYMBOL(__skb_vlan_pop);
6147
6148 /* Pop a vlan tag either from hwaccel or from payload.
6149  * Expects skb->data at mac header.
6150  */
6151 int skb_vlan_pop(struct sk_buff *skb)
6152 {
6153         u16 vlan_tci;
6154         __be16 vlan_proto;
6155         int err;
6156
6157         if (likely(skb_vlan_tag_present(skb))) {
6158                 __vlan_hwaccel_clear_tag(skb);
6159         } else {
6160                 if (unlikely(!eth_type_vlan(skb->protocol)))
6161                         return 0;
6162
6163                 err = __skb_vlan_pop(skb, &vlan_tci);
6164                 if (err)
6165                         return err;
6166         }
6167         /* move next vlan tag to hw accel tag */
6168         if (likely(!eth_type_vlan(skb->protocol)))
6169                 return 0;
6170
6171         vlan_proto = skb->protocol;
6172         err = __skb_vlan_pop(skb, &vlan_tci);
6173         if (unlikely(err))
6174                 return err;
6175
6176         __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
6177         return 0;
6178 }
6179 EXPORT_SYMBOL(skb_vlan_pop);
6180
6181 /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
6182  * Expects skb->data at mac header.
6183  */
6184 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
6185 {
6186         if (skb_vlan_tag_present(skb)) {
6187                 int offset = skb->data - skb_mac_header(skb);
6188                 int err;
6189
6190                 if (WARN_ONCE(offset,
6191                               "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
6192                               offset)) {
6193                         return -EINVAL;
6194                 }
6195
6196                 err = __vlan_insert_tag(skb, skb->vlan_proto,
6197                                         skb_vlan_tag_get(skb));
6198                 if (err)
6199                         return err;
6200
6201                 skb->protocol = skb->vlan_proto;
6202                 skb->mac_len += VLAN_HLEN;
6203
6204                 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
6205         }
6206         __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
6207         return 0;
6208 }
6209 EXPORT_SYMBOL(skb_vlan_push);
6210
6211 /**
6212  * skb_eth_pop() - Drop the Ethernet header at the head of a packet
6213  *
6214  * @skb: Socket buffer to modify
6215  *
6216  * Drop the Ethernet header of @skb.
6217  *
6218  * Expects that skb->data points to the mac header and that no VLAN tags are
6219  * present.
6220  *
6221  * Returns 0 on success, -errno otherwise.
6222  */
6223 int skb_eth_pop(struct sk_buff *skb)
6224 {
6225         if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
6226             skb_network_offset(skb) < ETH_HLEN)
6227                 return -EPROTO;
6228
6229         skb_pull_rcsum(skb, ETH_HLEN);
6230         skb_reset_mac_header(skb);
6231         skb_reset_mac_len(skb);
6232
6233         return 0;
6234 }
6235 EXPORT_SYMBOL(skb_eth_pop);
6236
6237 /**
6238  * skb_eth_push() - Add a new Ethernet header at the head of a packet
6239  *
6240  * @skb: Socket buffer to modify
6241  * @dst: Destination MAC address of the new header
6242  * @src: Source MAC address of the new header
6243  *
6244  * Prepend @skb with a new Ethernet header.
6245  *
6246  * Expects that skb->data points to the mac header, which must be empty.
6247  *
6248  * Returns 0 on success, -errno otherwise.
6249  */
6250 int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
6251                  const unsigned char *src)
6252 {
6253         struct ethhdr *eth;
6254         int err;
6255
6256         if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
6257                 return -EPROTO;
6258
6259         err = skb_cow_head(skb, sizeof(*eth));
6260         if (err < 0)
6261                 return err;
6262
6263         skb_push(skb, sizeof(*eth));
6264         skb_reset_mac_header(skb);
6265         skb_reset_mac_len(skb);
6266
6267         eth = eth_hdr(skb);
6268         ether_addr_copy(eth->h_dest, dst);
6269         ether_addr_copy(eth->h_source, src);
6270         eth->h_proto = skb->protocol;
6271
6272         skb_postpush_rcsum(skb, eth, sizeof(*eth));
6273
6274         return 0;
6275 }
6276 EXPORT_SYMBOL(skb_eth_push);
6277
6278 /* Update the ethertype of hdr and the skb csum value if required. */
6279 static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
6280                              __be16 ethertype)
6281 {
6282         if (skb->ip_summed == CHECKSUM_COMPLETE) {
6283                 __be16 diff[] = { ~hdr->h_proto, ethertype };
6284
6285                 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
6286         }
6287
6288         hdr->h_proto = ethertype;
6289 }
6290
6291 /**
6292  * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
6293  *                   the packet
6294  *
6295  * @skb: buffer
6296  * @mpls_lse: MPLS label stack entry to push
6297  * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
6298  * @mac_len: length of the MAC header
6299  * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
6300  *            ethernet
6301  *
6302  * Expects skb->data at mac header.
6303  *
6304  * Returns 0 on success, -errno otherwise.
6305  */
6306 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
6307                   int mac_len, bool ethernet)
6308 {
6309         struct mpls_shim_hdr *lse;
6310         int err;
6311
6312         if (unlikely(!eth_p_mpls(mpls_proto)))
6313                 return -EINVAL;
6314
6315         /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
6316         if (skb->encapsulation)
6317                 return -EINVAL;
6318
6319         err = skb_cow_head(skb, MPLS_HLEN);
6320         if (unlikely(err))
6321                 return err;
6322
6323         if (!skb->inner_protocol) {
6324                 skb_set_inner_network_header(skb, skb_network_offset(skb));
6325                 skb_set_inner_protocol(skb, skb->protocol);
6326         }
6327
6328         skb_push(skb, MPLS_HLEN);
6329         memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
6330                 mac_len);
6331         skb_reset_mac_header(skb);
6332         skb_set_network_header(skb, mac_len);
6333         skb_reset_mac_len(skb);
6334
6335         lse = mpls_hdr(skb);
6336         lse->label_stack_entry = mpls_lse;
6337         skb_postpush_rcsum(skb, lse, MPLS_HLEN);
6338
6339         if (ethernet && mac_len >= ETH_HLEN)
6340                 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
6341         skb->protocol = mpls_proto;
6342
6343         return 0;
6344 }
6345 EXPORT_SYMBOL_GPL(skb_mpls_push);
6346
6347 /**
6348  * skb_mpls_pop() - pop the outermost MPLS header
6349  *
6350  * @skb: buffer
6351  * @next_proto: ethertype of header after popped MPLS header
6352  * @mac_len: length of the MAC header
6353  * @ethernet: flag to indicate if the packet is ethernet
6354  *
6355  * Expects skb->data at mac header.
6356  *
6357  * Returns 0 on success, -errno otherwise.
6358  */
6359 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
6360                  bool ethernet)
6361 {
6362         int err;
6363
6364         if (unlikely(!eth_p_mpls(skb->protocol)))
6365                 return 0;
6366
6367         err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
6368         if (unlikely(err))
6369                 return err;
6370
6371         skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
6372         memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
6373                 mac_len);
6374
6375         __skb_pull(skb, MPLS_HLEN);
6376         skb_reset_mac_header(skb);
6377         skb_set_network_header(skb, mac_len);
6378
6379         if (ethernet && mac_len >= ETH_HLEN) {
6380                 struct ethhdr *hdr;
6381
6382                 /* use mpls_hdr() to get ethertype to account for VLANs. */
6383                 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
6384                 skb_mod_eth_type(skb, hdr, next_proto);
6385         }
6386         skb->protocol = next_proto;
6387
6388         return 0;
6389 }
6390 EXPORT_SYMBOL_GPL(skb_mpls_pop);
6391
6392 /**
6393  * skb_mpls_update_lse() - modify outermost MPLS header and update csum
6394  *
6395  * @skb: buffer
6396  * @mpls_lse: new MPLS label stack entry to update to
6397  *
6398  * Expects skb->data at mac header.
6399  *
6400  * Returns 0 on success, -errno otherwise.
6401  */
6402 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
6403 {
6404         int err;
6405
6406         if (unlikely(!eth_p_mpls(skb->protocol)))
6407                 return -EINVAL;
6408
6409         err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
6410         if (unlikely(err))
6411                 return err;
6412
6413         if (skb->ip_summed == CHECKSUM_COMPLETE) {
6414                 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
6415
6416                 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
6417         }
6418
6419         mpls_hdr(skb)->label_stack_entry = mpls_lse;
6420
6421         return 0;
6422 }
6423 EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
6424
6425 /**
6426  * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
6427  *
6428  * @skb: buffer
6429  *
6430  * Expects skb->data at mac header.
6431  *
6432  * Returns 0 on success, -errno otherwise.
6433  */
6434 int skb_mpls_dec_ttl(struct sk_buff *skb)
6435 {
6436         u32 lse;
6437         u8 ttl;
6438
6439         if (unlikely(!eth_p_mpls(skb->protocol)))
6440                 return -EINVAL;
6441
6442         if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
6443                 return -ENOMEM;
6444
6445         lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
6446         ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
6447         if (!--ttl)
6448                 return -EINVAL;
6449
6450         lse &= ~MPLS_LS_TTL_MASK;
6451         lse |= ttl << MPLS_LS_TTL_SHIFT;
6452
6453         return skb_mpls_update_lse(skb, cpu_to_be32(lse));
6454 }
6455 EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
6456
6457 /**
6458  * alloc_skb_with_frags - allocate skb with page frags
6459  *
6460  * @header_len: size of linear part
6461  * @data_len: needed length in frags
6462  * @order: max page order desired.
6463  * @errcode: pointer to error code if any
6464  * @gfp_mask: allocation mask
6465  *
6466  * This can be used to allocate a paged skb, given a maximal order for frags.
6467  */
6468 struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
6469                                      unsigned long data_len,
6470                                      int order,
6471                                      int *errcode,
6472                                      gfp_t gfp_mask)
6473 {
6474         unsigned long chunk;
6475         struct sk_buff *skb;
6476         struct page *page;
6477         int nr_frags = 0;
6478
6479         *errcode = -EMSGSIZE;
6480         if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
6481                 return NULL;
6482
6483         *errcode = -ENOBUFS;
6484         skb = alloc_skb(header_len, gfp_mask);
6485         if (!skb)
6486                 return NULL;
6487
6488         while (data_len) {
6489                 if (nr_frags == MAX_SKB_FRAGS - 1)
6490                         goto failure;
6491                 while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
6492                         order--;
6493
6494                 if (order) {
6495                         page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
6496                                            __GFP_COMP |
6497                                            __GFP_NOWARN,
6498                                            order);
6499                         if (!page) {
6500                                 order--;
6501                                 continue;
6502                         }
6503                 } else {
6504                         page = alloc_page(gfp_mask);
6505                         if (!page)
6506                                 goto failure;
6507                 }
6508                 chunk = min_t(unsigned long, data_len,
6509                               PAGE_SIZE << order);
6510                 skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
6511                 nr_frags++;
6512                 skb->truesize += (PAGE_SIZE << order);
6513                 data_len -= chunk;
6514         }
6515         return skb;
6516
6517 failure:
6518         kfree_skb(skb);
6519         return NULL;
6520 }
6521 EXPORT_SYMBOL(alloc_skb_with_frags);
6522
6523 /* carve out the first off bytes from skb when off < headlen */
6524 static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
6525                                     const int headlen, gfp_t gfp_mask)
6526 {
6527         int i;
6528         unsigned int size = skb_end_offset(skb);
6529         int new_hlen = headlen - off;
6530         u8 *data;
6531
6532         if (skb_pfmemalloc(skb))
6533                 gfp_mask |= __GFP_MEMALLOC;
6534
6535         data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
6536         if (!data)
6537                 return -ENOMEM;
6538         size = SKB_WITH_OVERHEAD(size);
6539
6540         /* Copy real data, and all frags */
6541         skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
6542         skb->len -= off;
6543
6544         memcpy((struct skb_shared_info *)(data + size),
6545                skb_shinfo(skb),
6546                offsetof(struct skb_shared_info,
6547                         frags[skb_shinfo(skb)->nr_frags]));
6548         if (skb_cloned(skb)) {
6549                 /* drop the old head gracefully */
6550                 if (skb_orphan_frags(skb, gfp_mask)) {
6551                         skb_kfree_head(data, size);
6552                         return -ENOMEM;
6553                 }
6554                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
6555                         skb_frag_ref(skb, i);
6556                 if (skb_has_frag_list(skb))
6557                         skb_clone_fraglist(skb);
6558                 skb_release_data(skb, SKB_CONSUMED);
6559         } else {
6560                 /* we can reuse existing recount- all we did was
6561                  * relocate values
6562                  */
6563                 skb_free_head(skb);
6564         }
6565
6566         skb->head = data;
6567         skb->data = data;
6568         skb->head_frag = 0;
6569         skb_set_end_offset(skb, size);
6570         skb_set_tail_pointer(skb, skb_headlen(skb));
6571         skb_headers_offset_update(skb, 0);
6572         skb->cloned = 0;
6573         skb->hdr_len = 0;
6574         skb->nohdr = 0;
6575         atomic_set(&skb_shinfo(skb)->dataref, 1);
6576
6577         return 0;
6578 }
6579
6580 static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
6581
6582 /* carve out the first eat bytes from skb's frag_list. May recurse into
6583  * pskb_carve()
6584  */
6585 static int pskb_carve_frag_list(struct sk_buff *skb,
6586                                 struct skb_shared_info *shinfo, int eat,
6587                                 gfp_t gfp_mask)
6588 {
6589         struct sk_buff *list = shinfo->frag_list;
6590         struct sk_buff *clone = NULL;
6591         struct sk_buff *insp = NULL;
6592
6593         do {
6594                 if (!list) {
6595                         pr_err("Not enough bytes to eat. Want %d\n", eat);
6596                         return -EFAULT;
6597                 }
6598                 if (list->len <= eat) {
6599                         /* Eaten as whole. */
6600                         eat -= list->len;
6601                         list = list->next;
6602                         insp = list;
6603                 } else {
6604                         /* Eaten partially. */
6605                         if (skb_shared(list)) {
6606                                 clone = skb_clone(list, gfp_mask);
6607                                 if (!clone)
6608                                         return -ENOMEM;
6609                                 insp = list->next;
6610                                 list = clone;
6611                         } else {
6612                                 /* This may be pulled without problems. */
6613                                 insp = list;
6614                         }
6615                         if (pskb_carve(list, eat, gfp_mask) < 0) {
6616                                 kfree_skb(clone);
6617                                 return -ENOMEM;
6618                         }
6619                         break;
6620                 }
6621         } while (eat);
6622
6623         /* Free pulled out fragments. */
6624         while ((list = shinfo->frag_list) != insp) {
6625                 shinfo->frag_list = list->next;
6626                 consume_skb(list);
6627         }
6628         /* And insert new clone at head. */
6629         if (clone) {
6630                 clone->next = list;
6631                 shinfo->frag_list = clone;
6632         }
6633         return 0;
6634 }
6635
6636 /* carve off first len bytes from skb. Split line (off) is in the
6637  * non-linear part of skb
6638  */
6639 static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
6640                                        int pos, gfp_t gfp_mask)
6641 {
6642         int i, k = 0;
6643         unsigned int size = skb_end_offset(skb);
6644         u8 *data;
6645         const int nfrags = skb_shinfo(skb)->nr_frags;
6646         struct skb_shared_info *shinfo;
6647
6648         if (skb_pfmemalloc(skb))
6649                 gfp_mask |= __GFP_MEMALLOC;
6650
6651         data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
6652         if (!data)
6653                 return -ENOMEM;
6654         size = SKB_WITH_OVERHEAD(size);
6655
6656         memcpy((struct skb_shared_info *)(data + size),
6657                skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
6658         if (skb_orphan_frags(skb, gfp_mask)) {
6659                 skb_kfree_head(data, size);
6660                 return -ENOMEM;
6661         }
6662         shinfo = (struct skb_shared_info *)(data + size);
6663         for (i = 0; i < nfrags; i++) {
6664                 int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
6665
6666                 if (pos + fsize > off) {
6667                         shinfo->frags[k] = skb_shinfo(skb)->frags[i];
6668
6669                         if (pos < off) {
6670                                 /* Split frag.
6671                                  * We have two variants in this case:
6672                                  * 1. Move all the frag to the second
6673                                  *    part, if it is possible. F.e.
6674                                  *    this approach is mandatory for TUX,
6675                                  *    where splitting is expensive.
6676                                  * 2. Split is accurately. We make this.
6677                                  */
6678                                 skb_frag_off_add(&shinfo->frags[0], off - pos);
6679                                 skb_frag_size_sub(&shinfo->frags[0], off - pos);
6680                         }
6681                         skb_frag_ref(skb, i);
6682                         k++;
6683                 }
6684                 pos += fsize;
6685         }
6686         shinfo->nr_frags = k;
6687         if (skb_has_frag_list(skb))
6688                 skb_clone_fraglist(skb);
6689
6690         /* split line is in frag list */
6691         if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
6692                 /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
6693                 if (skb_has_frag_list(skb))
6694                         kfree_skb_list(skb_shinfo(skb)->frag_list);
6695                 skb_kfree_head(data, size);
6696                 return -ENOMEM;
6697         }
6698         skb_release_data(skb, SKB_CONSUMED);
6699
6700         skb->head = data;
6701         skb->head_frag = 0;
6702         skb->data = data;
6703         skb_set_end_offset(skb, size);
6704         skb_reset_tail_pointer(skb);
6705         skb_headers_offset_update(skb, 0);
6706         skb->cloned   = 0;
6707         skb->hdr_len  = 0;
6708         skb->nohdr    = 0;
6709         skb->len -= off;
6710         skb->data_len = skb->len;
6711         atomic_set(&skb_shinfo(skb)->dataref, 1);
6712         return 0;
6713 }
6714
6715 /* remove len bytes from the beginning of the skb */
6716 static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
6717 {
6718         int headlen = skb_headlen(skb);
6719
6720         if (len < headlen)
6721                 return pskb_carve_inside_header(skb, len, headlen, gfp);
6722         else
6723                 return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
6724 }
6725
6726 /* Extract to_copy bytes starting at off from skb, and return this in
6727  * a new skb
6728  */
6729 struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
6730                              int to_copy, gfp_t gfp)
6731 {
6732         struct sk_buff  *clone = skb_clone(skb, gfp);
6733
6734         if (!clone)
6735                 return NULL;
6736
6737         if (pskb_carve(clone, off, gfp) < 0 ||
6738             pskb_trim(clone, to_copy)) {
6739                 kfree_skb(clone);
6740                 return NULL;
6741         }
6742         return clone;
6743 }
6744 EXPORT_SYMBOL(pskb_extract);
6745
6746 /**
6747  * skb_condense - try to get rid of fragments/frag_list if possible
6748  * @skb: buffer
6749  *
6750  * Can be used to save memory before skb is added to a busy queue.
6751  * If packet has bytes in frags and enough tail room in skb->head,
6752  * pull all of them, so that we can free the frags right now and adjust
6753  * truesize.
6754  * Notes:
6755  *      We do not reallocate skb->head thus can not fail.
6756  *      Caller must re-evaluate skb->truesize if needed.
6757  */
6758 void skb_condense(struct sk_buff *skb)
6759 {
6760         if (skb->data_len) {
6761                 if (skb->data_len > skb->end - skb->tail ||
6762                     skb_cloned(skb))
6763                         return;
6764
6765                 /* Nice, we can free page frag(s) right now */
6766                 __pskb_pull_tail(skb, skb->data_len);
6767         }
6768         /* At this point, skb->truesize might be over estimated,
6769          * because skb had a fragment, and fragments do not tell
6770          * their truesize.
6771          * When we pulled its content into skb->head, fragment
6772          * was freed, but __pskb_pull_tail() could not possibly
6773          * adjust skb->truesize, not knowing the frag truesize.
6774          */
6775         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6776 }
6777 EXPORT_SYMBOL(skb_condense);
6778
6779 #ifdef CONFIG_SKB_EXTENSIONS
6780 static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
6781 {
6782         return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
6783 }
6784
6785 /**
6786  * __skb_ext_alloc - allocate a new skb extensions storage
6787  *
6788  * @flags: See kmalloc().
6789  *
6790  * Returns the newly allocated pointer. The pointer can later attached to a
6791  * skb via __skb_ext_set().
6792  * Note: caller must handle the skb_ext as an opaque data.
6793  */
6794 struct skb_ext *__skb_ext_alloc(gfp_t flags)
6795 {
6796         struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
6797
6798         if (new) {
6799                 memset(new->offset, 0, sizeof(new->offset));
6800                 refcount_set(&new->refcnt, 1);
6801         }
6802
6803         return new;
6804 }
6805
6806 static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
6807                                          unsigned int old_active)
6808 {
6809         struct skb_ext *new;
6810
6811         if (refcount_read(&old->refcnt) == 1)
6812                 return old;
6813
6814         new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
6815         if (!new)
6816                 return NULL;
6817
6818         memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
6819         refcount_set(&new->refcnt, 1);
6820
6821 #ifdef CONFIG_XFRM
6822         if (old_active & (1 << SKB_EXT_SEC_PATH)) {
6823                 struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
6824                 unsigned int i;
6825
6826                 for (i = 0; i < sp->len; i++)
6827                         xfrm_state_hold(sp->xvec[i]);
6828         }
6829 #endif
6830 #ifdef CONFIG_MCTP_FLOWS
6831         if (old_active & (1 << SKB_EXT_MCTP)) {
6832                 struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);
6833
6834                 if (flow->key)
6835                         refcount_inc(&flow->key->refs);
6836         }
6837 #endif
6838         __skb_ext_put(old);
6839         return new;
6840 }
6841
6842 /**
6843  * __skb_ext_set - attach the specified extension storage to this skb
6844  * @skb: buffer
6845  * @id: extension id
6846  * @ext: extension storage previously allocated via __skb_ext_alloc()
6847  *
6848  * Existing extensions, if any, are cleared.
6849  *
6850  * Returns the pointer to the extension.
6851  */
6852 void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
6853                     struct skb_ext *ext)
6854 {
6855         unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
6856
6857         skb_ext_put(skb);
6858         newlen = newoff + skb_ext_type_len[id];
6859         ext->chunks = newlen;
6860         ext->offset[id] = newoff;
6861         skb->extensions = ext;
6862         skb->active_extensions = 1 << id;
6863         return skb_ext_get_ptr(ext, id);
6864 }
6865
6866 /**
6867  * skb_ext_add - allocate space for given extension, COW if needed
6868  * @skb: buffer
6869  * @id: extension to allocate space for
6870  *
6871  * Allocates enough space for the given extension.
6872  * If the extension is already present, a pointer to that extension
6873  * is returned.
6874  *
6875  * If the skb was cloned, COW applies and the returned memory can be
6876  * modified without changing the extension space of clones buffers.
6877  *
6878  * Returns pointer to the extension or NULL on allocation failure.
6879  */
6880 void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
6881 {
6882         struct skb_ext *new, *old = NULL;
6883         unsigned int newlen, newoff;
6884
6885         if (skb->active_extensions) {
6886                 old = skb->extensions;
6887
6888                 new = skb_ext_maybe_cow(old, skb->active_extensions);
6889                 if (!new)
6890                         return NULL;
6891
6892                 if (__skb_ext_exist(new, id))
6893                         goto set_active;
6894
6895                 newoff = new->chunks;
6896         } else {
6897                 newoff = SKB_EXT_CHUNKSIZEOF(*new);
6898
6899                 new = __skb_ext_alloc(GFP_ATOMIC);
6900                 if (!new)
6901                         return NULL;
6902         }
6903
6904         newlen = newoff + skb_ext_type_len[id];
6905         new->chunks = newlen;
6906         new->offset[id] = newoff;
6907 set_active:
6908         skb->slow_gro = 1;
6909         skb->extensions = new;
6910         skb->active_extensions |= 1 << id;
6911         return skb_ext_get_ptr(new, id);
6912 }
6913 EXPORT_SYMBOL(skb_ext_add);
6914
6915 #ifdef CONFIG_XFRM
6916 static void skb_ext_put_sp(struct sec_path *sp)
6917 {
6918         unsigned int i;
6919
6920         for (i = 0; i < sp->len; i++)
6921                 xfrm_state_put(sp->xvec[i]);
6922 }
6923 #endif
6924
6925 #ifdef CONFIG_MCTP_FLOWS
6926 static void skb_ext_put_mctp(struct mctp_flow *flow)
6927 {
6928         if (flow->key)
6929                 mctp_key_unref(flow->key);
6930 }
6931 #endif
6932
6933 void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
6934 {
6935         struct skb_ext *ext = skb->extensions;
6936
6937         skb->active_extensions &= ~(1 << id);
6938         if (skb->active_extensions == 0) {
6939                 skb->extensions = NULL;
6940                 __skb_ext_put(ext);
6941 #ifdef CONFIG_XFRM
6942         } else if (id == SKB_EXT_SEC_PATH &&
6943                    refcount_read(&ext->refcnt) == 1) {
6944                 struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
6945
6946                 skb_ext_put_sp(sp);
6947                 sp->len = 0;
6948 #endif
6949         }
6950 }
6951 EXPORT_SYMBOL(__skb_ext_del);
6952
6953 void __skb_ext_put(struct skb_ext *ext)
6954 {
6955         /* If this is last clone, nothing can increment
6956          * it after check passes.  Avoids one atomic op.
6957          */
6958         if (refcount_read(&ext->refcnt) == 1)
6959                 goto free_now;
6960
6961         if (!refcount_dec_and_test(&ext->refcnt))
6962                 return;
6963 free_now:
6964 #ifdef CONFIG_XFRM
6965         if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
6966                 skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
6967 #endif
6968 #ifdef CONFIG_MCTP_FLOWS
6969         if (__skb_ext_exist(ext, SKB_EXT_MCTP))
6970                 skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
6971 #endif
6972
6973         kmem_cache_free(skbuff_ext_cache, ext);
6974 }
6975 EXPORT_SYMBOL(__skb_ext_put);
6976 #endif /* CONFIG_SKB_EXTENSIONS */
6977
6978 static void kfree_skb_napi_cache(struct sk_buff *skb)
6979 {
6980         /* if SKB is a clone, don't handle this case */
6981         if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
6982                 __kfree_skb(skb);
6983                 return;
6984         }
6985
6986         local_bh_disable();
6987         __napi_kfree_skb(skb, SKB_CONSUMED);
6988         local_bh_enable();
6989 }
6990
6991 /**
6992  * skb_attempt_defer_free - queue skb for remote freeing
6993  * @skb: buffer
6994  *
6995  * Put @skb in a per-cpu list, using the cpu which
6996  * allocated the skb/pages to reduce false sharing
6997  * and memory zone spinlock contention.
6998  */
6999 void skb_attempt_defer_free(struct sk_buff *skb)
7000 {
7001         int cpu = skb->alloc_cpu;
7002         struct softnet_data *sd;
7003         unsigned int defer_max;
7004         bool kick;
7005
7006         if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
7007             !cpu_online(cpu) ||
7008             cpu == raw_smp_processor_id()) {
7009 nodefer:        kfree_skb_napi_cache(skb);
7010                 return;
7011         }
7012
7013         DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
7014         DEBUG_NET_WARN_ON_ONCE(skb->destructor);
7015
7016         sd = &per_cpu(softnet_data, cpu);
7017         defer_max = READ_ONCE(sysctl_skb_defer_max);
7018         if (READ_ONCE(sd->defer_count) >= defer_max)
7019                 goto nodefer;
7020
7021         spin_lock_bh(&sd->defer_lock);
7022         /* Send an IPI every time queue reaches half capacity. */
7023         kick = sd->defer_count == (defer_max >> 1);
7024         /* Paired with the READ_ONCE() few lines above */
7025         WRITE_ONCE(sd->defer_count, sd->defer_count + 1);
7026
7027         skb->next = sd->defer_list;
7028         /* Paired with READ_ONCE() in skb_defer_free_flush() */
7029         WRITE_ONCE(sd->defer_list, skb);
7030         spin_unlock_bh(&sd->defer_lock);
7031
7032         /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
7033          * if we are unlucky enough (this seems very unlikely).
7034          */
7035         if (unlikely(kick))
7036                 kick_defer_list_purge(sd, cpu);
7037 }
7038
7039 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
7040                                  size_t offset, size_t len)
7041 {
7042         const char *kaddr;
7043         __wsum csum;
7044
7045         kaddr = kmap_local_page(page);
7046         csum = csum_partial(kaddr + offset, len, 0);
7047         kunmap_local(kaddr);
7048         skb->csum = csum_block_add(skb->csum, csum, skb->len);
7049 }
7050
7051 /**
7052  * skb_splice_from_iter - Splice (or copy) pages to skbuff
7053  * @skb: The buffer to add pages to
7054  * @iter: Iterator representing the pages to be added
7055  * @maxsize: Maximum amount of pages to be added
7056  * @gfp: Allocation flags
7057  *
7058  * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
7059  * extracts pages from an iterator and adds them to the socket buffer if
7060  * possible, copying them to fragments if not possible (such as if they're slab
7061  * pages).
7062  *
7063  * Returns the amount of data spliced/copied or -EMSGSIZE if there's
7064  * insufficient space in the buffer to transfer anything.
7065  */
7066 ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
7067                              ssize_t maxsize, gfp_t gfp)
7068 {
7069         size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
7070         struct page *pages[8], **ppages = pages;
7071         ssize_t spliced = 0, ret = 0;
7072         unsigned int i;
7073
7074         while (iter->count > 0) {
7075                 ssize_t space, nr, len;
7076                 size_t off;
7077
7078                 ret = -EMSGSIZE;
7079                 space = frag_limit - skb_shinfo(skb)->nr_frags;
7080                 if (space < 0)
7081                         break;
7082
7083                 /* We might be able to coalesce without increasing nr_frags */
7084                 nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
7085
7086                 len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
7087                 if (len <= 0) {
7088                         ret = len ?: -EIO;
7089                         break;
7090                 }
7091
7092                 i = 0;
7093                 do {
7094                         struct page *page = pages[i++];
7095                         size_t part = min_t(size_t, PAGE_SIZE - off, len);
7096
7097                         ret = -EIO;
7098                         if (WARN_ON_ONCE(!sendpage_ok(page)))
7099                                 goto out;
7100
7101                         ret = skb_append_pagefrags(skb, page, off, part,
7102                                                    frag_limit);
7103                         if (ret < 0) {
7104                                 iov_iter_revert(iter, len);
7105                                 goto out;
7106                         }
7107
7108                         if (skb->ip_summed == CHECKSUM_NONE)
7109                                 skb_splice_csum_page(skb, page, off, part);
7110
7111                         off = 0;
7112                         spliced += part;
7113                         maxsize -= part;
7114                         len -= part;
7115                 } while (len > 0);
7116
7117                 if (maxsize <= 0)
7118                         break;
7119         }
7120
7121 out:
7122         skb_len_add(skb, spliced);
7123         return spliced ?: ret;
7124 }
7125 EXPORT_SYMBOL(skb_splice_from_iter);
7126
7127 static __always_inline
7128 size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
7129                              size_t len, void *to, void *priv2)
7130 {
7131         __wsum *csum = priv2;
7132         __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
7133
7134         *csum = csum_block_add(*csum, next, progress);
7135         return 0;
7136 }
7137
7138 static __always_inline
7139 size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
7140                                 size_t len, void *to, void *priv2)
7141 {
7142         __wsum next, *csum = priv2;
7143
7144         next = csum_and_copy_from_user(iter_from, to + progress, len);
7145         *csum = csum_block_add(*csum, next, progress);
7146         return next ? 0 : len;
7147 }
7148
7149 bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
7150                                   __wsum *csum, struct iov_iter *i)
7151 {
7152         size_t copied;
7153
7154         if (WARN_ON_ONCE(!i->data_source))
7155                 return false;
7156         copied = iterate_and_advance2(i, bytes, addr, csum,
7157                                       copy_from_user_iter_csum,
7158                                       memcpy_from_iter_csum);
7159         if (likely(copied == bytes))
7160                 return true;
7161         iov_iter_revert(i, copied);
7162         return false;
7163 }
7164 EXPORT_SYMBOL(csum_and_copy_from_iter_full);