// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "sock_destructor.h"

struct kmem_cache *skbuff_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

/* skb_small_head_cache and related code is only supported
 * for CONFIG_SLAB and CONFIG_SLUB.
 * As soon as SLOB is removed from the kernel, we can clean this up.
 */
#if !defined(CONFIG_SLOB)
# define HAVE_SKB_SMALL_HEAD_CACHE 1
#endif

#ifdef HAVE_SKB_SMALL_HEAD_CACHE
static struct kmem_cache *skb_small_head_cache __ro_after_init;

#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE					\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM						\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
#endif /* HAVE_SKB_SMALL_HEAD_CACHE */

int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};
EXPORT_SYMBOL(drop_reasons);

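/* Illustrative note (not part of the original source): FN() above is an
 * X-macro. DEFINE_DROP_REASON(FN, FN) applies it to every reason name, and
 * each expansion becomes a designated initializer, e.g.
 *
 *	FN(NO_SOCKET)  =>  [SKB_DROP_REASON_NO_SOCKET] = "NO_SOCKET",
 *
 * so drop_reasons[] maps each enum skb_drop_reason value to its printable
 * name.
 */
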
/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
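
/* Illustrative note (not part of the original file): a freshly allocated
 * order-0 page starts with one page reference, and page_ref_add() above
 * adds (PAGE_SIZE - SZ_1K) / SZ_1K = 3 more, i.e. one reference per 1K
 * slice handed out. Each skb built on a slice drops one reference when it
 * is freed, and the page itself is released with the last one.
 */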
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid later preprocessor conditionals
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure the bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);
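
/* Illustrative usage sketch (not part of the original file): drivers
 * normally reach these helpers through the netdev_alloc_frag() /
 * napi_alloc_frag() wrappers and pair the fragment with build_skb() or
 * napi_build_skb(), roughly:
 *
 *	unsigned int truesize = SKB_DATA_ALIGN(NET_SKB_PAD + buf_len) +
 *				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	void *buf = napi_alloc_frag(truesize);
 *
 *	if (buf)
 *		skb = napi_build_skb(buf, truesize);
 *
 * "buf_len", "buf" and "skb" are placeholder names used only in this
 * sketch.
 */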

static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_unpoison_object_data(skbuff_cache, skb);

	return skb;
}

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
				     unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(skb, data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);
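
/* Illustrative usage sketch (not part of the original file): a caller that
 * already holds a kmalloc()ed buffer containing a frame can avoid a copy:
 *
 *	buf = kmalloc(buf_len, GFP_ATOMIC);
 *	... receive frame_len bytes into buf ...
 *	skb = slab_build_skb(buf);
 *	if (skb)
 *		skb_put(skb, frame_len);
 *
 * The tail of the allocation is reused for struct skb_shared_info, so
 * buf_len must include SKB_DATA_ALIGN(sizeof(struct skb_shared_info)).
 * "buf", "buf_len" and "frame_len" are placeholder names for this sketch.
 */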

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using a slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes:
 * Before IO, the driver allocates only the data buffer, where the NIC
 * puts the incoming frame.
 * The driver should add room at the head (NET_SKB_PAD) and
 * MUST add room at the tail (SKB_DATA_ALIGN(skb_shared_info)).
 * After IO, the driver calls build_skb() to allocate the sk_buff and
 * populate it before handing the packet to the stack.
 * RX rings contain only data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);
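
/* Illustrative usage sketch (not part of the original file): a driver that
 * DMAed a frame into a page fragment "buf" of "truesize" bytes (already
 * including NET_SKB_PAD headroom and the skb_shared_info tailroom) can
 * convert it without copying:
 *
 *	skb = build_skb(buf, truesize);
 *	if (!skb)
 *		goto drop;	(buf is not freed on failure)
 *	skb_reserve(skb, NET_SKB_PAD);
 *	skb_put(skb, frame_len);
 *
 * "buf", "truesize" and "frame_len" are placeholder names for this sketch.
 */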

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of in-place allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	unsigned int obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
#ifdef HAVE_SKB_SMALL_HEAD_CACHE
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {

		/* skb_small_head_cache has non power of two size,
		 * likely forcing SLUB to use order-3 pages.
		 * We deliberately attempt a NOMEMALLOC allocation only.
		 */
		obj = kmem_cache_alloc_node(skb_small_head_cache,
					    flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					    node);
		if (obj) {
			*size = SKB_SMALL_HEAD_CACHE_SIZE;
			goto out;
		}
	}
#endif
	*size = obj_size = kmalloc_size_roundup(obj_size);
	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
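
/* Illustrative usage sketch (not part of the original file): most callers
 * use the alloc_skb() wrapper, then reserve headroom and append payload:
 *
 *	skb = alloc_skb(hlen + dlen, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hlen);
 *	skb_put_data(skb, payload, dlen);
 *
 * "hlen", "dlen" and "payload" are placeholder names for this sketch.
 */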

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
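
/* Illustrative note (not part of the original file): drivers typically call
 * this through netdev_alloc_skb(dev, len), which uses GFP_ATOMIC. The
 * NET_SKB_PAD headroom is already reserved on return, so the caller only
 * needs skb_put() (or DMA into skb->data) before passing the skb up with
 * netif_rx() or napi_gro_receive().
 */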

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look like, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnels support, and 32 bits arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len = SKB_HEAD_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
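
/* Illustrative note (not part of the original file): NAPI poll handlers
 * normally call this through napi_alloc_skb(napi, len), which adds
 * GFP_ATOMIC; the returned skb already has NET_SKB_PAD + NET_IP_ALIGN
 * headroom reserved and is typically handed to napi_gro_receive().
 */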

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);
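
/* Illustrative usage sketch (not part of the original file): after building
 * a small header skb, a driver can attach a received page as a fragment:
 *
 *	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
 *			buf_truesize);
 *
 * where "buf_truesize" is the real size of the buffer backing the fragment
 * (e.g. PAGE_SIZE or the rx buffer size), so skb->truesize stays honest.
 */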

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return page_pool_return_skb_page(virt_to_page(data));
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
#ifdef HAVE_SKB_SMALL_HEAD_CACHE
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(skb_small_head_cache, head);
	else
#endif
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling
	 */
	skb->pp_recycle = 0;
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

static __always_inline
bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

	DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason);
	return true;
}
