// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "sock_destructor.h"

struct kmem_cache *skbuff_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

/* skb_small_head_cache and related code is only supported
 * for CONFIG_SLAB and CONFIG_SLUB.
 * As soon as SLOB is removed from the kernel, we can clean up this.
 */
#if !defined(CONFIG_SLOB)
# define HAVE_SKB_SMALL_HEAD_CACHE 1
#endif

#ifdef HAVE_SKB_SMALL_HEAD_CACHE
static struct kmem_cache *skb_small_head_cache __ro_after_init;

#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE					\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?				\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :		\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM						\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
#endif /* HAVE_SKB_SMALL_HEAD_CACHE */

int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - register another drop reason subsystem
 * @subsys: the subsystem to register, must not be the core
 * @list: the list of drop reasons within the subsystem, must point to
 *	a statically initialized list
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
				  const struct drop_reason_list *list)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	/* must point to statically allocated memory, so INIT is OK */
	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 * @subsys: the subsystem to remove, must not be the core
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
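
/* Illustrative sketch (not part of upstream skbuff.c): how a subsystem
 * might publish its own drop reason strings through the registration
 * helpers above. The subsystem ID SKB_DROP_REASON_SUBSYS_FOO and all
 * "foo_" names are hypothetical placeholders; a real user would have
 * its ID in enum skb_drop_reason_subsys and a statically allocated list.
 */
#if 0	/* example only */
static const char * const foo_drop_reasons[] = {
	"FOO_BAD_HDR",
	"FOO_NO_BUF",
};

static const struct drop_reason_list foo_drop_reason_list = {
	.reasons = foo_drop_reasons,
	.n_reasons = ARRAY_SIZE(foo_drop_reasons),
};

static int __init foo_init(void)
{
	/* list must stay valid until unregistration */
	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
				     &foo_drop_reason_list);
	return 0;
}

static void __exit foo_exit(void)
{
	/* waits for concurrent readers via synchronize_rcu() */
	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_FOO);
}
#endif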

/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited number of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid later preprocessor conditionals
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure the bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_unpoison_object_data(skbuff_cache, skb);

	return skb;
}

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
				     unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(skb, data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);
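
/* Illustrative sketch (not part of upstream skbuff.c): wrapping a
 * kmalloc()'ed buffer in an skb via slab_build_skb(). The allocation
 * must leave room for struct skb_shared_info behind the payload; the
 * helper derives the usable size from ksize(). All "foo_" names are
 * hypothetical.
 */
#if 0	/* example only */
static struct sk_buff *foo_wrap_slab_buf(const void *payload, unsigned int len)
{
	struct sk_buff *skb;
	void *buf;

	buf = kmalloc(len + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
		      GFP_ATOMIC);
	if (!buf)
		return NULL;

	memcpy(buf, payload, len);

	skb = slab_build_skb(buf);	/* takes ownership of buf on success */
	if (!skb) {
		kfree(buf);
		return NULL;
	}

	skb_put(skb, len);		/* payload already sits at skb->data */
	return skb;
}
#endif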

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes:
 * Before IO, the driver allocates only the data buffer where the NIC
 * puts the incoming frame.
 * The driver should add room at head (NET_SKB_PAD) and
 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)).
 * After IO, the driver calls build_skb() to allocate the sk_buff and
 * populate it before giving the packet to the stack.
 * RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head_frag and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (likely(skb && frag_size)) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);
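
/* Illustrative sketch (not part of upstream skbuff.c): the driver RX
 * pattern described in the __build_skb() kernel-doc above. A page
 * fragment is posted to the NIC, and once the frame has landed the
 * driver wraps it with build_skb(), reserving headroom at the front
 * and skb_shared_info space at the tail. Names and the buffer-size
 * choice are hypothetical.
 */
#if 0	/* example only */
#define FOO_RX_BUF_SIZE	(NET_SKB_PAD + NET_IP_ALIGN + 2048 + \
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

static struct sk_buff *foo_rx_to_skb(struct net_device *dev, void *buf,
				     unsigned int frame_len)
{
	struct sk_buff *skb;

	skb = build_skb(buf, FOO_RX_BUF_SIZE);
	if (!skb)
		return NULL;	/* caller still owns and must recycle buf */

	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb_put(skb, frame_len);
	skb->protocol = eth_type_trans(skb, dev);
	return skb;
}
#endif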

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);
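
/* Illustrative sketch (not part of upstream skbuff.c): building an skb
 * from a page-fragment RX buffer inside a NAPI poll loop, where
 * napi_build_skb() can take the sk_buff itself from the per-CPU NAPI
 * cache before handing the packet to GRO. Names and parameters are
 * hypothetical.
 */
#if 0	/* example only */
static void foo_napi_rx_frag(struct napi_struct *napi, void *buf,
			     unsigned int buf_size, unsigned int headroom,
			     unsigned int frame_len)
{
	struct sk_buff *skb;

	skb = napi_build_skb(buf, buf_size);
	if (!skb)
		return;		/* caller keeps ownership of buf */

	skb_reserve(skb, headroom);
	skb_put(skb, frame_len);
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}
#endif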

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	unsigned int obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
#ifdef HAVE_SKB_SMALL_HEAD_CACHE
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(skb_small_head_cache,
					    flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					    node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);
		goto out;
	}
#endif
	*size = obj_size = kmalloc_size_roundup(obj_size);
	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *		instead of head cache and allocate a cloned (child) skb.
 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *		allocations in case the data is required for writeback
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of at least size bytes. The object has a reference count
 *	of one. The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
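
/* Illustrative sketch (not part of upstream skbuff.c): the usual way to
 * consume __alloc_skb() is through the alloc_skb() wrapper, then to
 * partition the buffer with skb_reserve()/skb_put(). The headroom size
 * and all names here are hypothetical.
 */
#if 0	/* example only */
static struct sk_buff *foo_build_packet(const void *payload, unsigned int len)
{
	unsigned int headroom = 64;	/* arbitrary protocol headroom */
	struct sk_buff *skb;

	skb = alloc_skb(headroom + len, GFP_KERNEL);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);		/* leave room for headers */
	skb_put_data(skb, payload, len);	/* append the payload */
	return skb;
}
#endif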

/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
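
/* Illustrative sketch (not part of upstream skbuff.c): a copy-based RX
 * path using the netdev_alloc_skb() wrapper around the function above.
 * NET_SKB_PAD headroom is already reserved on return, so the driver
 * only accounts for its own alignment needs. Names are hypothetical.
 */
#if 0	/* example only */
static int foo_rx_copybreak(struct net_device *dev, const void *frame,
			    unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, NET_IP_ALIGN);	/* align the IP header */
	skb_put_data(skb, frame, len);
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
#endif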

/**
 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *	@napi: napi instance this buffer was allocated for
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 *	Allocate a new sk_buff for use in NAPI receive. This buffer will
 *	attempt to allocate the head from a special reserved region used
 *	only for NAPI Rx allocation. By doing this we can save several
 *	CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments.
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnel support, and 32-bit arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len = SKB_HEAD_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
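
/* Illustrative sketch (not part of upstream skbuff.c): allocating the
 * skb from inside a NAPI poll callback via the napi_alloc_skb()
 * wrapper, which already reserves NET_SKB_PAD + NET_IP_ALIGN headroom.
 * Names are hypothetical.
 */
#if 0	/* example only */
static void foo_napi_rx_one(struct napi_struct *napi, const void *frame,
			    unsigned int len)
{
	struct sk_buff *skb;

	skb = napi_alloc_skb(napi, len);
	if (!skb)
		return;		/* drop on allocation failure */

	skb_put_data(skb, frame, len);
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}
#endif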

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return page_pool_return_skb_page(virt_to_page(data), napi_safe);
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
#ifdef HAVE_SKB_SMALL_HEAD_CACHE
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(skb_small_head_cache, head);
	else
#endif
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb, bool napi_safe)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head, napi_safe))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
			     bool napi_safe)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb, napi_safe);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling.
	 */
	skb->pp_recycle = 0;
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before the
		 * original skb. This test would have no chance to be true
		 * for the clone, while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
			    bool napi_safe)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason, napi_safe);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb.
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
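
/* Illustrative sketch (not part of upstream skbuff.c): callers normally
 * go through kfree_skb()/kfree_skb_reason() on error paths, feeding the
 * drop-reason tracing handled below, and consume_skb() when a packet is
 * disposed of normally. The function and the chosen reason are
 * hypothetical examples of that split.
 */
#if 0	/* example only */
static void foo_deliver(struct sk_buff *skb, struct sock *sk)
{
	if (!sk) {
		/* dropped: records a reason for tracing/drop monitors */
		kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
		return;
	}

	/* ... copy what we need out of the packet ... */

	/* consumed normally: no drop event is emitted */
	consume_skb(skb);
}
#endif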

static __always_inline
bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
			       u32_get_bits(reason,
					    SKB_DROP_REASON_SUBSYS_MASK) >=
				SKB_DROP_REASON_SUBSYS_NUM);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason);
	return true;
}