// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE				\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM					\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as
 * long as the netmem is a page.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));

static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
	      sizeof_field(skb_frag_t, len));

static_assert(offsetof(struct bio_vec, bv_offset) ==
	      offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
	      sizeof_field(skb_frag_t, offset));

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - register another drop reason subsystem
 * @subsys: the subsystem to register, must not be the core
 * @list: the list of drop reasons within the subsystem, must point to
 *	a statically initialized list
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
				  const struct drop_reason_list *list)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	/* must point to statically allocated memory, so INIT is OK */
	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 * @subsys: the subsystem to remove, must not be the core
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);

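/* Illustrative sketch (not an in-tree user): a subsystem with its own drop
 * reason space would typically keep a statically initialized list and
 * register it at init time. SKB_DROP_REASON_SUBSYS_FOO and the foo_* names
 * below are made up for the example; real users such as mac80211 follow the
 * same pattern:
 *
 *	static const char * const foo_drop_reasons[] = {
 *		"FOO_QUEUE_FULL",
 *		"FOO_BAD_HEADER",
 *	};
 *
 *	static const struct drop_reason_list foo_drop_reason_list = {
 *		.reasons = foo_drop_reasons,
 *		.n_reasons = ARRAY_SIZE(foo_drop_reasons),
 *	};
 *
 *	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
 *				     &foo_drop_reason_list);
 *
 * and, on teardown, undo it with
 * drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_FOO).
 */
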
/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid preprocessor conditionals later on
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
				       align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
					       align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
					       align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

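/* Illustrative sketch: a driver refilling its RX ring from the per-cpu frag
 * caches above, via the napi_alloc_frag()/netdev_alloc_frag() wrappers from
 * skbuff.h. RX_BUF_LEN is a made-up driver constant; the extra tailroom lets
 * build_skb() place struct skb_shared_info at the end of the fragment later:
 *
 *	unsigned int bufsz = SKB_DATA_ALIGN(NET_SKB_PAD + RX_BUF_LEN) +
 *			     SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	void *buf = napi_alloc_frag(bufsz);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	... map buf for DMA and post it to the RX ring ...
 *
 * napi_alloc_frag() may only be used with BHs disabled (NAPI context);
 * netdev_alloc_frag() also works from hard-irq context or with IRQs
 * disabled, as it picks the right per-cpu cache above.
 */
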
static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));

	return skb;
}

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
				     unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bound. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(skb, data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);

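/* Illustrative sketch: turning a kmalloc()ed receive buffer into an skb.
 * The pointer must come straight from kmalloc(), since slab_build_skb()
 * rediscovers the allocation size with ksize(); buf_len, hdr_pad and
 * frame_len are made-up driver values:
 *
 *	void *buf = kmalloc(buf_len, GFP_ATOMIC);
 *
 *	if (!buf)
 *		return NULL;
 *	... copy the received frame into buf + hdr_pad ...
 *	skb = slab_build_skb(buf);
 *	if (!skb) {
 *		kfree(buf);
 *		return NULL;
 *	}
 *	skb_reserve(skb, hdr_pad);
 *	skb_put(skb, frame_len);
 */
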
/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes:
 * Before IO, the driver allocates only the data buffer where the NIC
 * puts the incoming frame.
 * The driver should add room at head (NET_SKB_PAD) and
 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)).
 * After IO, the driver calls build_skb() to allocate the sk_buff and
 * populate it before giving the packet to the stack.
 * RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head_frag and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (likely(skb && frag_size)) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

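/* Illustrative sketch of the RX flow described in the comments above: the
 * driver posts a raw page-fragment buffer to the NIC and only wraps it in
 * an skb after DMA completion. rx_buf, rx_buf_size, frame_len, netdev and
 * napi are made-up driver state; rx_buf_size is the full fragment size,
 * including headroom and the skb_shared_info tailroom, as computed when
 * the buffer was allocated:
 *
 *	skb = build_skb(rx_buf, rx_buf_size);
 *	if (unlikely(!skb)) {
 *		skb_free_frag(rx_buf);
 *		return;
 *	}
 *	skb_reserve(skb, NET_SKB_PAD);
 *	skb_put(skb, frame_len);
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(napi, skb);
 */
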
/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	size_t obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
					    flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					    node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
		goto out;
	}

	obj_size = kmalloc_size_roundup(obj_size);
	/* The following cast might truncate high-order bits of obj_size, this
	 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
	 */
	*size = (unsigned int)obj_size;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *		instead of head cache and allocate a cloned (child) skb.
 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *		allocations in case the data is required for writeback
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of at least size bytes. The object has a reference count
 *	of one. The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

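/* Illustrative sketch: most callers reach __alloc_skb() through the
 * alloc_skb() wrapper in skbuff.h. Because the returned buffer has no
 * headroom, protocol code reserves header space explicitly before
 * appending payload and pushing headers; hlen, dlen and payload are
 * made-up values:
 *
 *	skb = alloc_skb(hlen + dlen, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hlen);
 *	skb_put_data(skb, payload, dlen);
 *	... later, skb_push(skb, ...) to prepend each protocol header ...
 */
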
/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

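/* Illustrative sketch: an RX path that copies small frames into a freshly
 * allocated skb. NET_SKB_PAD headroom is already reserved by the time the
 * skb is returned, so the driver only reserves what it additionally needs
 * (NET_IP_ALIGN here); pkt_len and pkt_data are made-up driver state. The
 * netdev_alloc_skb() wrapper from skbuff.h supplies GFP_ATOMIC:
 *
 *	skb = netdev_alloc_skb(dev, pkt_len + NET_IP_ALIGN);
 *	if (unlikely(!skb)) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	skb_reserve(skb, NET_IP_ALIGN);
 *	skb_put_data(skb, pkt_data, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */
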
/**
 *	napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *	@napi: napi instance this buffer was allocated for
 *	@len: length to allocate
 *
 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
 *	attempt to allocate the head from a special reserved region used
 *	only for NAPI Rx allocation.  By doing this we can save several
 *	CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look like, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnels support, and 32 bits arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len = SKB_HEAD_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(napi_alloc_skb);

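/* Illustrative sketch: inside a NAPI poll loop (BH context, as asserted
 * above) a driver copying a small frame would do roughly the following;
 * pkt_len and pkt_data are made-up driver state, and NET_SKB_PAD +
 * NET_IP_ALIGN headroom is already reserved on the returned skb:
 *
 *	skb = napi_alloc_skb(napi, pkt_len);
 *	if (unlikely(!skb))
 *		break;
 *	skb_put_data(skb, pkt_data, pkt_len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 */
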
void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
			    int off, int size, unsigned int truesize)
{
	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_fill_netmem_desc(skb, i, netmem, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

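/* Illustrative sketch: a multi-buffer RX path attaching page fragments to a
 * previously built head skb via the skb_add_rx_frag() wrapper from skbuff.h.
 * page, offset, frag_len and rx_buf_truesize come from a made-up RX
 * descriptor; truesize is the buffer space the fragment really consumes:
 *
 *	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
 *			offset, frag_len, rx_buf_truesize);
 *
 * When the hardware instead appends more data to the last fragment,
 * skb_coalesce_rx_frag() grows that fragment in place rather than consuming
 * another slot in the frags[] array.
 */
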
static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static bool is_pp_page(struct page *page)
{
	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
		    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	u32 size, truesize, len, max_head_size, off;
	struct sk_buff *skb = *pskb, *nskb;
	int err, i, head_off;
	void *data;

	/* XDP does not support fraglist so we need to linearize
	 * the skb.
	 */
	if (skb_has_frag_list(skb))
		return -EOPNOTSUPP;

	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
		return -ENOMEM;

	size = min_t(u32, skb->len, max_head_size);
	truesize = SKB_HEAD_ALIGN(size) + headroom;
	data = page_pool_dev_alloc_va(pool, &truesize);
	if (!data)
		return -ENOMEM;

	nskb = napi_build_skb(data, truesize);
	if (!nskb) {
		page_pool_free_va(pool, data, true);
		return -ENOMEM;
	}

	skb_reserve(nskb, headroom);
	skb_copy_header(nskb, skb);
	skb_mark_for_recycle(nskb);

	err = skb_copy_bits(skb, 0, nskb->data, size);
	if (err) {
		consume_skb(nskb);
		return err;
	}
	skb_put(nskb, size);

	head_off = skb_headroom(nskb) - skb_headroom(skb);
	skb_headers_offset_update(nskb, head_off);

	off = size;
	len = skb->len - off;
	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
		struct page *page;
		u32 page_off;

		size = min_t(u32, len, PAGE_SIZE);
		truesize = size;

		page = page_pool_dev_alloc(pool, &page_off, &truesize);
		if (!page) {
			consume_skb(nskb);
			return -ENOMEM;
		}

		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
		err = skb_copy_bits(skb, off, page_address(page) + page_off,
				    size);
		if (err) {
			consume_skb(nskb);
			return err;
		}

		len -= size;
		off += size;
	}

	consume_skb(skb);
	*pskb = nskb;

	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
			 struct bpf_prog *prog)
{
	if (!prog->aux->xdp_has_frags)
		return -EINVAL;

	return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

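/* Illustrative sketch: a caller holding an skb whose buffers are not
 * page_pool backed can copy it into pool memory before running a
 * frag-aware XDP program; "pool" and "prog" are the caller's page_pool
 * and attached bpf_prog:
 *
 *	err = skb_cow_data_for_xdp(pool, &skb, prog);
 *	if (err)
 *		goto drop;
 *	... build an xdp_buff around the new skb and run the program ...
 *
 * On success the original skb has been consumed and the pointer now refers
 * to a page_pool backed copy with XDP_PACKET_HEADROOM reserved.
 */
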
#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(struct page *page)
{
	page = compound_head(page);

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
	 * in order to preserve any existing bits, such as bit 0 for the
	 * head page of compound page and bit 1 for pfmemalloc page, so
	 * mask those bits for freeing side when doing below checking,
	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
	 * to avoid recycling the pfmemalloc page.
	 */
	if (unlikely(!is_pp_page(page)))
		return false;

	page_pool_put_full_page(page->pp, page, false);

	return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

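/* Illustrative sketch: napi_pp_put_page() is what ultimately lets a
 * page_pool page return to its pool when an skb is freed. A driver opts in
 * by building the skb from page_pool memory and marking it; rxq->page_pool
 * is made-up driver state and error handling is omitted:
 *
 *	page = page_pool_dev_alloc_pages(rxq->page_pool);
 *	... DMA the frame into the page, then ...
 *	skb = build_skb(page_address(page), PAGE_SIZE);
 *	skb_mark_for_recycle(skb);
 *
 * skb_mark_for_recycle() sets skb->pp_recycle, so skb_free_head() and the
 * frag unref helpers route such buffers through napi_pp_put_page() instead
 * of the normal page free path.
 */
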
static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return napi_pp_put_page(virt_to_page(data));
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb: page pool aware skb
 *
 * Increase the fragment reference count (pp_ref_count) of a skb. This is
 * intended to gain fragment references only for page pool aware skbs,
 * i.e. when skb->pp_recycle is true, and not for fragments in a
 * non-pp-recycling skb. It has a fallback to increase references on normal
 * pages, as page pool aware skbs may also have normal page fragments.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;
	struct page *head_page;
	int i;

	if (!skb->pp_recycle)
		return -EINVAL;

	shinfo = skb_shinfo(skb);

	for (i = 0; i < shinfo->nr_frags; i++) {
		head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
		if (likely(is_pp_page(head_page)))
			page_pool_ref_page(head_page);
		else
			page_ref_inc(head_page);
	}
	return 0;
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(net_hotdata.skb_small_head_cache, head);
	else
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (!skb_data_unref(skb, shinfo))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling
	 */
	skb->pp_recycle = 0;
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

static __always_inline
bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
			       u32_get_bits(reason,
					    SKB_DROP_REASON_SUBSYS_MASK) >=
			       SKB_DROP_REASON_SUBSYS_NUM);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason);
	return true;
}

1209