net: add SKB_HEAD_ALIGN() helper
net/core/skbuff.c (linux-block.git)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *	Alan Cox	:	Fixed the worst of the load
 *				balancer bugs.
 *	Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman:	Timestamp fixes.
 *	Alan Cox	:	Changed buffer format.
 *	Alan Cox	:	destructor hook for AF_UNIX etc.
 *	Linus Torvalds	:	Better skb_clone.
 *	Alan Cox	:	Added skb_copy.
 *	Alan Cox	:	Added all the changed routines Linus
 *				only put in the headers
 *	Ray VanTassle	:	Fixed --skb->lock in free
 *	Alan Cox	:	skb_copy copy arp field
 *	Andi Kleen	:	slabified it.
 *	Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "sock_destructor.h"

struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};
EXPORT_SYMBOL(drop_reasons);

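/* For reference (illustrative, not part of the original file): with FN
 * defined as above, DEFINE_DROP_REASON(FN, FN) expands to one designated
 * initializer per drop reason, e.g.
 *	[SKB_DROP_REASON_NOT_SPECIFIED] = "NOT_SPECIFIED",
 *	[SKB_DROP_REASON_NO_SOCKET]	= "NO_SOCKET",
 * so drop_reasons[] maps an enum skb_drop_reason value to its name string.
 */
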
/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid preprocessor conditionals later on
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

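/* Worked example (illustrative, not part of the original file): with 4K
 * pages, a freshly allocated page holds four 1K slices.  alloc_pages_node()
 * returns the page with refcount 1, the first slice is handed out at
 * offset 3072, and page_ref_add(page, 3072 / SZ_1K) raises the refcount to
 * 4, i.e. one reference per slice (offsets 3072, 2048, 1024 and 0).  Each
 * consumer later drops its reference, and the page is freed once all four
 * slices have been released.
 */
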
struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

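/* Illustrative sketch (not part of the original file): drivers normally use
 * the netdev_alloc_frag() / napi_alloc_frag() wrappers around the helpers
 * above to refill an Rx ring with raw fragments.  "struct example_ring",
 * "buf_len" and example_post_rx_buffer() are hypothetical names.
 */
#if 0	/* example only, not compiled */
static int example_refill_one(struct example_ring *ring)
{
	void *frag = netdev_alloc_frag(ring->buf_len);

	if (unlikely(!frag))
		return -ENOMEM;
	/* hand the fragment to a hardware descriptor; on teardown the
	 * driver releases unused fragments with skb_free_frag()
	 */
	example_post_rx_buffer(ring, frag);
	return 0;
}
#endif
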
static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_unpoison_object_data(skbuff_head_cache, skb);

	return skb;
}

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
				     unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(skb, data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);

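/* Illustrative sketch (not part of the original file): turning a kmalloc()ed
 * buffer into an skb.  Such a buffer must not be fed to build_skb() (its head
 * is not a page fragment); it goes through slab_build_skb() and is ultimately
 * freed with kfree() by skb_free_head().  example_skb_from_kmalloc(), "frame"
 * and "frame_len" are hypothetical names.
 */
#if 0	/* example only, not compiled */
static struct sk_buff *example_skb_from_kmalloc(const void *frame,
						unsigned int frame_len)
{
	struct sk_buff *skb;
	u8 *buf;

	/* room for the payload plus the struct skb_shared_info tail */
	buf = kmalloc(SKB_HEAD_ALIGN(frame_len), GFP_ATOMIC);
	if (!buf)
		return NULL;
	memcpy(buf, frame, frame_len);

	skb = slab_build_skb(buf);
	if (!skb) {
		kfree(buf);
		return NULL;
	}
	skb_put(skb, frame_len);
	return skb;
}
#endif
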
/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes:
 * Before IO, the driver allocates only the data buffer where the NIC
 * will put the incoming frame.
 * The driver should add room at head (NET_SKB_PAD) and
 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)).
 * After IO, the driver calls build_skb() to allocate the sk_buff and
 * populate it before giving the packet to the stack.
 * RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is a wrapper over __build_skb() that additionally
 * takes care of skb->head_frag and skb->pfmemalloc.
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

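/* Illustrative sketch (not part of the original file): the build_skb() usage
 * pattern described in the kerneldoc above, as a hypothetical driver Rx
 * completion handler.  The fragment is assumed to have been filled by the
 * NIC with NET_SKB_PAD headroom and SKB_DATA_ALIGN(sizeof(struct
 * skb_shared_info)) tailroom left free.  "struct example_ring" and its
 * members are hypothetical names.
 */
#if 0	/* example only, not compiled */
static void example_rx_complete(struct example_ring *ring, void *data,
				unsigned int frame_len)
{
	struct sk_buff *skb;

	/* frag_size covers the whole fragment, not just the frame */
	skb = build_skb(data, ring->buf_size);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return;
	}
	skb_reserve(skb, NET_SKB_PAD);	/* headroom the NIC left unused */
	skb_put(skb, frame_len);	/* bytes actually written by the NIC */
	skb->protocol = eth_type_trans(skb, ring->netdev);
	napi_gro_receive(&ring->napi, skb);
}
#endif
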
/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

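/* Illustrative sketch (not part of the original file): zero-copy Rx
 * construction from a page fragment already filled by the device, as a
 * page-fragment based driver would do it in NAPI context.
 * example_build_rx_skb(), "va", "headroom", "frame_len" and "truesize" are
 * hypothetical names; the frag_size passed to napi_build_skb() must cover
 * headroom, data and the skb_shared_info tail.
 */
#if 0	/* example only, not compiled */
static struct sk_buff *example_build_rx_skb(struct napi_struct *napi, void *va,
					    unsigned int headroom,
					    unsigned int frame_len,
					    unsigned int truesize)
{
	struct sk_buff *skb;

	skb = napi_build_skb(va, truesize);
	if (unlikely(!skb))
		return NULL;	/* caller recycles the fragment */

	skb_reserve(skb, headroom);
	skb_put(skb, frame_len);
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
	return skb;
}
#endif
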
/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 */

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *		instead of head cache and allocate a cloned (child) skb.
 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *		allocations in case the data is required for writeback
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of at least size bytes. The object has a reference count
 *	of one. The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	unsigned int osize;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_HEAD_ALIGN(size);
	osize = kmalloc_size_roundup(size);
	data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(osize);
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, osize);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

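/* Illustrative sketch (not part of the original file): typical control-path
 * use of the alloc_skb() wrapper around __alloc_skb().  Headroom for headers
 * that will be pushed later is reserved up front, then the payload lands in
 * the tail room.  example_make_packet(), "hlen", "payload" and "dlen" are
 * hypothetical names.
 */
#if 0	/* example only, not compiled */
static struct sk_buff *example_make_packet(const void *payload,
					   unsigned int hlen, unsigned int dlen)
{
	struct sk_buff *skb;

	skb = alloc_skb(hlen + dlen, GFP_ATOMIC);
	if (!skb)
		return NULL;
	skb_reserve(skb, hlen);			/* room for headers pushed later */
	skb_put_data(skb, payload, dlen);	/* payload goes into the tail room */
	return skb;
}
#endif
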
/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

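/* Illustrative sketch (not part of the original file): classic non-NAPI Rx
 * path using the netdev_alloc_skb_ip_align() wrapper.  The NET_SKB_PAD
 * headroom is already built in by __netdev_alloc_skb(); the wrapper only adds
 * NET_IP_ALIGN on top.  example_rx_packet(), "rx_buf" and "pkt_len" are
 * hypothetical names.
 */
#if 0	/* example only, not compiled */
static void example_rx_packet(struct net_device *dev, const void *rx_buf,
			      unsigned int pkt_len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	skb_put_data(skb, rx_buf, pkt_len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
}
#endif
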
/**
 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *	@napi: napi instance this buffer was allocated for
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
 *	attempt to allocate the head from a special reserved region used
 *	only for NAPI Rx allocation.  By doing this we can save several
 *	CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnel support, and 32-bit arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len = SKB_HEAD_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

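/* Illustrative sketch (not part of the original file): copy-break style use
 * of the napi_alloc_skb() wrapper from a NAPI poll handler (softirq context).
 * The NET_SKB_PAD + NET_IP_ALIGN headroom is reserved by the allocator
 * itself.  example_napi_rx_copy(), "frame" and "frame_len" are hypothetical
 * names.
 */
#if 0	/* example only, not compiled */
static void example_napi_rx_copy(struct napi_struct *napi, const void *frame,
				 unsigned int frame_len)
{
	struct sk_buff *skb;

	skb = napi_alloc_skb(napi, frame_len);
	if (unlikely(!skb))
		return;		/* caller accounts the drop */
	skb_put_data(skb, frame, frame_len);
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}
#endif
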
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

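/* Illustrative sketch (not part of the original file): attaching a received
 * page fragment to an skb that already holds the headers, as multi-buffer Rx
 * paths do.  "page", "offset", "frag_len" and "truesize" are hypothetical
 * names; truesize should reflect the full buffer consumed, not just frag_len.
 */
#if 0	/* example only, not compiled */
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
			offset, frag_len, truesize);
#endif
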
static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return page_pool_return_skb_page(virt_to_page(data));
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		kfree(head);
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling
	 */
	skb->pp_recycle = 0;
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

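/* Illustrative sketch (not part of the original file): callers normally go
 * through the public wrappers rather than __kfree_skb().  kfree_skb_reason()
 * records why the packet was dropped (visible via the kfree_skb tracepoint
 * and drop monitors), while consume_skb() marks a normal, non-error free.
 * example_finish_with_skb() and "sk" are hypothetical names.
 */
#if 0	/* example only, not compiled */
static void example_finish_with_skb(struct sock *sk, struct sk_buff *skb)
{
	if (unlikely(!sk)) {
		/* error path: record why the packet died */
		kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
		return;
	}
	/* normal path: the buffer served its purpose, free it quietly */
	consume_skb(skb);
}
#endif
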
static __always_inline
bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

	DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb);
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason);
	return true;
}
945