// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
#include <linux/crc32.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "devmem.h"
#include "netmem_priv.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
					       GRO_MAX_HEAD_PAD))

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE					\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?				\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :		\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM						\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));

static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
	      sizeof_field(skb_frag_t, len));

static_assert(offsetof(struct bio_vec, bv_offset) ==
	      offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
	      sizeof_field(skb_frag_t, offset));

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - register another drop reason subsystem
 * @subsys: the subsystem to register, must not be the core
 * @list: the list of drop reasons within the subsystem, must point to
 *	a statically initialized list
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
				  const struct drop_reason_list *list)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	/* must point to statically allocated memory, so INIT is OK */
	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 * @subsys: the subsystem to remove, must not be the core
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
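
/* Illustrative sketch only (not part of the upstream file): how a subsystem
 * might hook its own reason strings into drop_reasons_by_subsys. The
 * subsystem id SKB_DROP_REASON_SUBSYS_FOO and the reason names below are
 * hypothetical.
 *
 *	static const char * const foo_drop_reasons[] = {
 *		[0] = "FOO_QUEUE_FULL",
 *		[1] = "FOO_BAD_HDR",
 *	};
 *	static const struct drop_reason_list drop_reasons_foo = {
 *		.reasons = foo_drop_reasons,
 *		.n_reasons = ARRAY_SIZE(foo_drop_reasons),
 *	};
 *
 *	// at module init; @list must stay valid until unregistered
 *	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
 *				     &drop_reasons_foo);
 *	...
 *	// at module exit; synchronizes RCU before the list can go away
 *	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_FOO);
 */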

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

struct napi_alloc_cache {
	local_lock_t bh_lock;
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	data = __page_frag_alloc_align(&nc->page, fragsz,
				       GFP_ATOMIC | __GFP_NOWARN, align_mask);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
	return data;
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		fragsz = SKB_DATA_ALIGN(fragsz);
		data = __page_frag_alloc_align(nc, fragsz,
					       GFP_ATOMIC | __GFP_NOWARN,
					       align_mask);
	} else {
		local_bh_disable();
		data = __napi_alloc_frag_align(fragsz, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
						      GFP_ATOMIC | __GFP_NOWARN,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count)) {
			local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
			return NULL;
		}
	}

	skb = nc->skb_cache[--nc->skb_count];
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
	kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));

	return skb;
}

/**
 * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
 * @skbs: pointer to an at least @n-sized array to fill with skb pointers
 * @n: number of entries to provide
 *
 * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
 * the pointers into the provided array @skbs. If there are fewer entries
 * available, tries to replenish the cache and bulk-allocates the diff from
 * the MM layer if needed.
 * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
 * ready for {,__}build_skb_around() and don't have any data buffers attached.
 * Must be called *only* from the BH context.
 *
 * Return: number of successfully allocated skbs (@n if no actual allocation
 * needed or kmem_cache_alloc_bulk() didn't fail).
 */
u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	u32 bulk, total = n;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);

	if (nc->skb_count >= n)
		goto get;

	/* Not enough cached skbs. Try refilling the cache first */
	bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
	nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
					       GFP_ATOMIC | __GFP_NOWARN, bulk,
					       &nc->skb_cache[nc->skb_count]);
	if (likely(nc->skb_count >= n))
		goto get;

	/* Still not enough. Bulk-allocate the missing part directly, zeroed */
	n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
				   GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
				   n - nc->skb_count, &skbs[nc->skb_count]);
	if (likely(nc->skb_count >= n))
		goto get;

	/* kmem_cache didn't allocate the number we need, limit the output */
	total -= n - nc->skb_count;
	n = nc->skb_count;

get:
	for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
		u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);

		skbs[i] = nc->skb_cache[base + i];

		kasan_mempool_unpoison_object(skbs[i], cache_size);
		memset(skbs[i], 0, offsetof(struct sk_buff, tail));
	}

	nc->skb_count -= n;
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

	return total;
}
EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
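
/* Illustrative sketch only (not part of the upstream file): a hypothetical
 * NAPI poll helper that grabs a batch of zeroed skb heads and attaches a
 * previously filled data buffer to each via build_skb_around(). The
 * my_rx_bufs[] layout (va/truesize/len) is made up for the example.
 *
 *	void *heads[8];
 *	u32 i, got;
 *
 *	got = napi_skb_cache_get_bulk(heads, 8);	// BH context only
 *	for (i = 0; i < got; i++) {
 *		struct sk_buff *skb = heads[i];
 *
 *		build_skb_around(skb, my_rx_bufs[i].va, my_rx_bufs[i].truesize);
 *		skb_put(skb, my_rx_bufs[i].len);
 *	}
 */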

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
				     unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(skb, data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);
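
/* Illustrative sketch only (not part of the upstream file): wrapping a
 * kmalloc()'ed buffer that already holds a received frame. The allocation
 * should leave SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) of spare
 * tailroom for the shared info.
 *
 *	void *buf = kmalloc(len + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
 *			    GFP_ATOMIC);
 *	...				// firmware/hardware fills buf with len bytes
 *	skb = slab_build_skb(buf);
 *	if (!skb)
 *		kfree(buf);		// on failure the data buffer is not freed for us
 *	else
 *		skb_put(skb, len);
 */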

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head_frag and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (likely(skb && frag_size)) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);
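
/* Illustrative sketch only (not part of the upstream file): the Rx-ring
 * pattern described in the notes above, using a page fragment for the head.
 * RX_BUF_LEN and the descriptor field desc->len are hypothetical.
 *
 *	unsigned int truesize = SKB_DATA_ALIGN(NET_SKB_PAD + RX_BUF_LEN) +
 *				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	void *data = netdev_alloc_frag(truesize);	// posted to the NIC ring
 *	...
 *	skb = build_skb(data, truesize);		// after DMA completes
 *	if (skb) {
 *		skb_reserve(skb, NET_SKB_PAD);		// headroom
 *		skb_put(skb, desc->len);		// frame length from the descriptor
 *	}
 */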

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	size_t obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
				node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
		goto out;
	}

	obj_size = kmalloc_size_roundup(obj_size);
	/* The following cast might truncate high-order bits of obj_size, this
	 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
	 */
	*size = (unsigned int)obj_size;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
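
/* Illustrative sketch only (not part of the upstream file): typical use of
 * the alloc_skb() wrapper for a locally generated packet, reserving headroom
 * for headers before filling the payload. HDR_LEN, PAYLOAD_LEN and payload
 * are placeholders.
 *
 *	struct sk_buff *skb = alloc_skb(HDR_LEN + PAYLOAD_LEN, GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, HDR_LEN);			// headroom for headers
 *	skb_put_data(skb, payload, PAYLOAD_LEN);	// append the payload
 *	...						// push headers, hand to the stack
 */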

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
	} else {
		local_bh_disable();
		local_lock_nested_bh(&napi_alloc_cache.bh_lock);

		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);

		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
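
/* Illustrative sketch only (not part of the upstream file): a driver Rx path
 * using the netdev_alloc_skb() wrapper so NET_SKB_PAD headroom is handled for
 * it. frame_len and rx_buf are placeholders.
 *
 *	skb = netdev_alloc_skb(dev, frame_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	skb_put_data(skb, rx_buf, frame_len);	// or DMA directly into skb->data
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */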

/**
 * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	nc = this_cpu_ptr(&napi_alloc_cache);

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(napi_alloc_skb);
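
/* Illustrative sketch only (not part of the upstream file): inside a NAPI
 * poll handler, where napi_alloc_skb() can use the percpu page-frag cache
 * without touching IRQ state. desc->len and rx_buf are placeholders.
 *
 *	skb = napi_alloc_skb(napi, desc->len);
 *	if (!skb)
 *		break;				// ring will be replenished later
 *	skb_put_data(skb, rx_buf, desc->len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 */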

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
			    int off, int size, unsigned int truesize)
{
	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_fill_netmem_desc(skb, i, netmem, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);
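
/* Illustrative sketch only (not part of the upstream file): appending a page
 * fragment to an skb via the page-based wrapper skb_add_rx_frag(), which
 * updates skb->len, skb->data_len and skb->truesize as above. The page,
 * offset and frag_len are placeholders.
 *
 *	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 *			frag_len, PAGE_SIZE);
 */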

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
		    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	u32 size, truesize, len, max_head_size, off;
	struct sk_buff *skb = *pskb, *nskb;
	int err, i, head_off;
	void *data;

	/* XDP does not support fraglist so we need to linearize
	 * the skb.
	 */
	if (skb_has_frag_list(skb))
		return -EOPNOTSUPP;

	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
		return -ENOMEM;

	size = min_t(u32, skb->len, max_head_size);
	truesize = SKB_HEAD_ALIGN(size) + headroom;
	data = page_pool_dev_alloc_va(pool, &truesize);
	if (!data)
		return -ENOMEM;

	nskb = napi_build_skb(data, truesize);
	if (!nskb) {
		page_pool_free_va(pool, data, true);
		return -ENOMEM;
	}

	skb_reserve(nskb, headroom);
	skb_copy_header(nskb, skb);
	skb_mark_for_recycle(nskb);

	err = skb_copy_bits(skb, 0, nskb->data, size);
	if (err) {
		consume_skb(nskb);
		return err;
	}
	skb_put(nskb, size);

	head_off = skb_headroom(nskb) - skb_headroom(skb);
	skb_headers_offset_update(nskb, head_off);

	off = size;
	len = skb->len - off;
	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
		struct page *page;
		u32 page_off;

		size = min_t(u32, len, PAGE_SIZE);
		truesize = size;

		page = page_pool_dev_alloc(pool, &page_off, &truesize);
		if (!page) {
			consume_skb(nskb);
			return -ENOMEM;
		}

		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
		err = skb_copy_bits(skb, off, page_address(page) + page_off,
				    size);
		if (err) {
			consume_skb(nskb);
			return err;
		}

		len -= size;
		off += size;
	}

	consume_skb(skb);
	*pskb = nskb;

	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
			 const struct bpf_prog *prog)
{
	if (!prog->aux->xdp_has_frags)
		return -EINVAL;

	return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(netmem_ref netmem)
{
	netmem = netmem_compound_head(netmem);

	if (unlikely(!netmem_is_pp(netmem)))
		return false;

	page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);

	return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb: page pool aware skb
 *
 * Increase the fragment reference count (pp_ref_count) of a skb. This is
 * intended to gain fragment references only for page pool aware skbs,
 * i.e. when skb->pp_recycle is true, and not for fragments in a
 * non-pp-recycling skb. It has a fallback to increase references on normal
 * pages, as page pool aware skbs may also have normal page fragments.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;
	netmem_ref head_netmem;
	int i;

	if (!skb->pp_recycle)
		return -EINVAL;

	shinfo = skb_shinfo(skb);

	for (i = 0; i < shinfo->nr_frags; i++) {
		head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
		if (likely(netmem_is_pp(head_netmem)))
			page_pool_ref_netmem(head_netmem);
		else
			page_ref_inc(netmem_to_page(head_netmem));
	}
	return 0;
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(net_hotdata.skb_small_head_cache, head);
	else
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

4a96a4e8 1064static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
1da177e4 1065{
ff04a771
ED
1066 struct skb_shared_info *shinfo = skb_shinfo(skb);
1067 int i;
1da177e4 1068
1cface55 1069 if (!skb_data_unref(skb, shinfo))
2cc3aeb5 1070 goto exit;
a6686f2f 1071
753f1ca4
PB
1072 if (skb_zcopy(skb)) {
1073 bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
1074
1075 skb_zcopy_clear(skb, true);
1076 if (skip_unref)
1077 goto free_head;
1078 }
70c43167 1079
ff04a771 1080 for (i = 0; i < shinfo->nr_frags; i++)
f58f3c95 1081 __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
a6686f2f 1082
753f1ca4 1083free_head:
ff04a771 1084 if (shinfo->frag_list)
511a3eda 1085 kfree_skb_list_reason(shinfo->frag_list, reason);
ff04a771 1086
4a96a4e8 1087 skb_free_head(skb);
2cc3aeb5
IA
1088exit:
1089 /* When we clone an SKB we copy the reycling bit. The pp_recycle
1090 * bit is only set on the head though, so in order to avoid races
1091 * while trying to recycle fragments on __skb_frag_unref() we need
1092 * to make one SKB responsible for triggering the recycle path.
1093 * So disable the recycling bit if an SKB is cloned and we have
58e61e41 1094 * additional references to the fragmented part of the SKB.
2cc3aeb5
IA
1095 * Eventually the last SKB will have the recycling bit set and it's
1096 * dataref set to 0, which will trigger the recycling
1097 */
1098 skb->pp_recycle = 0;
1da177e4
LT
1099}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

static __always_inline
bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
			  enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
			       u32_get_bits(reason,
					    SKB_DROP_REASON_SUBSYS_MASK) >=
				SKB_DROP_REASON_SUBSYS_NUM);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
	return true;
}