net: Use backlog-NAPI to clean up the defer_list.
linux-2.6-block.git: net/core/skbuff.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define SKB_SMALL_HEAD_SIZE	SKB_HEAD_ALIGN(MAX_TCP_HEADER)

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE				\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM					\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));

static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
	      sizeof_field(skb_frag_t, len));

static_assert(offsetof(struct bio_vec, bv_offset) ==
	      offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
	      sizeof_field(skb_frag_t, offset));

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - register another drop reason subsystem
 * @subsys: the subsystem to register, must not be the core
 * @list: the list of drop reasons within the subsystem, must point to
 *	a statically initialized list
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
				  const struct drop_reason_list *list)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	/* must point to statically allocated memory, so INIT is OK */
	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 * @subsys: the subsystem to remove, must not be the core
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
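	/* Take one extra reference for each additional 1K fragment this page
	 * will serve; together with the allocation reference, every fragment
	 * then holds exactly one page reference.
	 */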
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid later preprocessor's conditionals
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
				       align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
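	/* The netdev cache may be touched from hard IRQ or IRQ-disabled
	 * context; anything else uses the NAPI cache under BH protection.
	 */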
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
					       align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
					       align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

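	/* Cache empty: refill it with a bulk slab allocation to amortize
	 * the allocation cost over NAPI_SKB_CACHE_BULK skbs.
	 */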
	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));

	return skb;
}

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

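	/* struct skb_shared_info lives at the very end of the buffer and is
	 * not part of the usable data area.
	 */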
	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
				     unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(skb, data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

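	/* Page-fragment backed head: flag it and inherit pfmemalloc from
	 * the underlying page.
	 */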
	if (likely(skb && frag_size)) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	size_t obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
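	/* Heads small enough for the dedicated small-head cache avoid the
	 * generic kmalloc slabs entirely, unless special GFP bits were
	 * requested.
	 */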
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
				node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
		goto out;
	}

	obj_size = kmalloc_size_roundup(obj_size);
	/* The following cast might truncate high-order bits of obj_size, this
	 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
	 */
	*size = (unsigned int)obj_size;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
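	/* NAPI (non-fclone) requests for the local node can be served from
	 * the per-CPU skb cache.
	 */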
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

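	/* Same cache selection rule as the frag allocators above: hard IRQ
	 * or IRQ-disabled context uses the netdev cache, everything else
	 * the NAPI cache under BH protection.
	 */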
	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnels support, and 32 bits arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len = SKB_HEAD_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
			    int off, int size, unsigned int truesize)
{
	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_fill_netmem_desc(skb, i, netmem, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static bool is_pp_page(struct page *page)
{
	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
		    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	u32 size, truesize, len, max_head_size, off;
	struct sk_buff *skb = *pskb, *nskb;
	int err, i, head_off;
	void *data;

	/* XDP does not support fraglist so we need to linearize
	 * the skb.
	 */
	if (skb_has_frag_list(skb))
		return -EOPNOTSUPP;

	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
		return -ENOMEM;

	size = min_t(u32, skb->len, max_head_size);
	truesize = SKB_HEAD_ALIGN(size) + headroom;
	data = page_pool_dev_alloc_va(pool, &truesize);
	if (!data)
		return -ENOMEM;

	nskb = napi_build_skb(data, truesize);
	if (!nskb) {
		page_pool_free_va(pool, data, true);
		return -ENOMEM;
	}

	skb_reserve(nskb, headroom);
	skb_copy_header(nskb, skb);
	skb_mark_for_recycle(nskb);

	err = skb_copy_bits(skb, 0, nskb->data, size);
	if (err) {
		consume_skb(nskb);
		return err;
	}
	skb_put(nskb, size);

	head_off = skb_headroom(nskb) - skb_headroom(skb);
	skb_headers_offset_update(nskb, head_off);

	off = size;
	len = skb->len - off;
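	/* Copy whatever did not fit in the new linear area into freshly
	 * allocated page pool fragments.
	 */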
	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
		struct page *page;
		u32 page_off;

		size = min_t(u32, len, PAGE_SIZE);
		truesize = size;

		page = page_pool_dev_alloc(pool, &page_off, &truesize);
		if (!page) {
			consume_skb(nskb);
			return -ENOMEM;
		}

		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
		err = skb_copy_bits(skb, off, page_address(page) + page_off,
				    size);
		if (err) {
			consume_skb(nskb);
			return err;
		}

		len -= size;
		off += size;
	}

	consume_skb(skb);
	*pskb = nskb;

	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
			 struct bpf_prog *prog)
{
	if (!prog->aux->xdp_has_frags)
		return -EINVAL;

	return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(struct page *page, bool napi_safe)
{
	bool allow_direct = false;
	struct page_pool *pp;

	page = compound_head(page);

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
	 * in order to preserve any existing bits, such as bit 0 for the
	 * head page of compound page and bit 1 for pfmemalloc page, so
	 * mask those bits for freeing side when doing below checking,
	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
	 * to avoid recycling the pfmemalloc page.
	 */
	if (unlikely(!is_pp_page(page)))
		return false;

	pp = page->pp;

	/* Allow direct recycle if we have reasons to believe that we are
	 * in the same context as the consumer would run, so there's
	 * no possible race.
	 * __page_pool_put_page() makes sure we're not in hardirq context
	 * and interrupts are enabled prior to accessing the cache.
	 */
	if (napi_safe || in_softirq()) {
		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
		unsigned int cpuid = smp_processor_id();

		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
	}

	/* Driver set this to memory recycling info. Reset it on recycle.
	 * This will *not* work for NIC using a split-page memory model.
	 * The page will be returned to the pool here regardless of the
	 * 'flipped' fragment being in use or not.
	 */
	page_pool_put_full_page(pp, page, allow_direct);

	return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return napi_pp_put_page(virt_to_page(data), napi_safe);
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb: page pool aware skb
 *
 * Increase the fragment reference count (pp_ref_count) of a skb. This is
 * intended to gain fragment references only for page pool aware skbs,
 * i.e. when skb->pp_recycle is true, and not for fragments in a
 * non-pp-recycling skb. It has a fallback to increase references on normal
 * pages, as page pool aware skbs may also have normal page fragments.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;
	struct page *head_page;
	int i;

	if (!skb->pp_recycle)
		return -EINVAL;

	shinfo = skb_shinfo(skb);

	for (i = 0; i < shinfo->nr_frags; i++) {
		head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
		if (likely(is_pp_page(head_page)))
			page_pool_ref_page(head_page);
		else
			page_ref_inc(head_page);
	}
	return 0;
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(net_hotdata.skb_small_head_cache, head);
	else
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb, bool napi_safe)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head, napi_safe))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
			     bool napi_safe)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (!skb_data_unref(skb, shinfo))
		goto exit;

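	/* Zerocopy completion: if the frag references are managed by the
	 * zerocopy owner (SKBFL_MANAGED_FRAG_REFS), do not drop them here.
	 */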
	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb, napi_safe);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling
	 */
	skb->pp_recycle = 0;
}

/*
 * Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

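	/* Plain skbs go straight back to the skbuff cache; fclone orig/clone
	 * pairs are freed only once fclone_ref drops to zero.
	 */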
	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
			    bool napi_safe)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason, napi_safe);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

static __always_inline
bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

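	/* Warn on obviously bogus reasons: not dropped yet, or a reason from
	 * an unknown subsystem.
	 */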
	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
			       u32_get_bits(reason,
					    SKB_DROP_REASON_SUBSYS_MASK) >=
			       SKB_DROP_REASON_SUBSYS_NUM);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason);
	return true;
}
