skbuff: Add a function to check if a page belongs to page_pool
[linux-2.6-block.git] / net / core / skbuff.c
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * Routines having to do with the 'struct sk_buff' memory handlers.
4 *
113aa838 5 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
1da177e4
LT
6 * Florian La Roche <rzsfl@rz.uni-sb.de>
7 *
1da177e4
LT
8 * Fixes:
9 * Alan Cox : Fixed the worst of the load
10 * balancer bugs.
11 * Dave Platt : Interrupt stacking fix.
12 * Richard Kooijman : Timestamp fixes.
13 * Alan Cox : Changed buffer format.
14 * Alan Cox : destructor hook for AF_UNIX etc.
15 * Linus Torvalds : Better skb_clone.
16 * Alan Cox : Added skb_copy.
17 * Alan Cox : Added all the changed routines Linus
18 * only put in the headers
19 * Ray VanTassle : Fixed --skb->lock in free
20 * Alan Cox : skb_copy copy arp field
21 * Andi Kleen : slabified it.
22 * Robert Olsson : Removed skb_head_pool
23 *
24 * NOTE:
25 * The __skb_ routines should be called with interrupts
26 * disabled, or you better be *real* sure that the operation is atomic
27 * with respect to whatever list is being frobbed (e.g. via lock_sock()
28 * or via disabling bottom half handlers, etc).
1da177e4
LT
29 */
30
31/*
32 * The functions in this file will not compile correctly with gcc 2.4.x
33 */
34
e005d193
JP
35#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
36
1da177e4
LT
37#include <linux/module.h>
38#include <linux/types.h>
39#include <linux/kernel.h>
1da177e4
LT
40#include <linux/mm.h>
41#include <linux/interrupt.h>
42#include <linux/in.h>
43#include <linux/inet.h>
44#include <linux/slab.h>
de960aa9
FW
45#include <linux/tcp.h>
46#include <linux/udp.h>
90017acc 47#include <linux/sctp.h>
1da177e4
LT
48#include <linux/netdevice.h>
49#ifdef CONFIG_NET_CLS_ACT
50#include <net/pkt_sched.h>
51#endif
52#include <linux/string.h>
53#include <linux/skbuff.h>
9c55e01c 54#include <linux/splice.h>
1da177e4
LT
55#include <linux/cache.h>
56#include <linux/rtnetlink.h>
57#include <linux/init.h>
716ea3a7 58#include <linux/scatterlist.h>
ac45f602 59#include <linux/errqueue.h>
268bb0ce 60#include <linux/prefetch.h>
071c0fc6 61#include <linux/bitfield.h>
0d5501c1 62#include <linux/if_vlan.h>
2a2ea508 63#include <linux/mpls.h>
183f47fc 64#include <linux/kcov.h>
6d0d4199 65#include <linux/iov_iter.h>
1da177e4
LT
66
67#include <net/protocol.h>
68#include <net/dst.h>
69#include <net/sock.h>
70#include <net/checksum.h>
d457a0e3 71#include <net/gso.h>
ed1f50c3 72#include <net/ip6_checksum.h>
1da177e4 73#include <net/xfrm.h>
8822e270 74#include <net/mpls.h>
3ee17bc7 75#include <net/mptcp.h>
78476d31 76#include <net/mctp.h>
75eaf63e 77#include <net/page_pool/helpers.h>
071c0fc6 78#include <net/dropreason.h>
1da177e4 79
7c0f6ba6 80#include <linux/uaccess.h>
ad8d75ff 81#include <trace/events/skb.h>
51c56b00 82#include <linux/highmem.h>
b245be1f
WB
83#include <linux/capability.h>
84#include <linux/user_namespace.h>
2544af03 85#include <linux/indirect_call_wrapper.h>
2195e2a0 86#include <linux/textsearch.h>
a1f8e7f7 87
39564c3f 88#include "dev.h"
7f678def 89#include "sock_destructor.h"
7b7ed885 90
025a785f 91struct kmem_cache *skbuff_cache __ro_after_init;
08009a76 92static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
df5042f4
FW
93#ifdef CONFIG_SKB_EXTENSIONS
94static struct kmem_cache *skbuff_ext_cache __ro_after_init;
95#endif
bf9f1baa 96
bf9f1baa 97
bf9f1baa
ED
98static struct kmem_cache *skb_small_head_cache __ro_after_init;
99
100#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
101
102/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
103 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
104 * size, and we can differentiate heads from skb_small_head_cache
105 * vs system slabs by looking at their size (skb_end_offset()).
106 */
107#define SKB_SMALL_HEAD_CACHE_SIZE \
108 (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \
109 (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \
110 SKB_SMALL_HEAD_SIZE)
111
112#define SKB_SMALL_HEAD_HEADROOM \
113 SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
bf9f1baa 114
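/* Illustrative note, not from the original source: the figures below are
 * config-dependent assumptions. If SKB_HEAD_ALIGN(MAX_TCP_HEADER) happened
 * to be exactly 1024 (a power of two) with L1_CACHE_BYTES == 64, the cache
 * object would be padded to 1088 bytes, a size no kmalloc slab hands out.
 * A head whose skb_end_offset() equals SKB_SMALL_HEAD_HEADROOM can then
 * only have come from skb_small_head_cache, which is the property
 * skb_kfree_head() relies on further down in this file.
 */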
5f74f82e
HWR
115int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
116EXPORT_SYMBOL(sysctl_max_skb_frags);
1da177e4 117
9cb252c4
MD
118#undef FN
119#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
071c0fc6 120static const char * const drop_reasons[] = {
0e84afe8 121 [SKB_CONSUMED] = "CONSUMED",
9cb252c4
MD
122 DEFINE_DROP_REASON(FN, FN)
123};
071c0fc6
JB
124
125static const struct drop_reason_list drop_reasons_core = {
126 .reasons = drop_reasons,
127 .n_reasons = ARRAY_SIZE(drop_reasons),
128};
129
130const struct drop_reason_list __rcu *
131drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
132 [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
133};
134EXPORT_SYMBOL(drop_reasons_by_subsys);
135
136/**
137 * drop_reasons_register_subsys - register another drop reason subsystem
138 * @subsys: the subsystem to register, must not be the core
139 * @list: the list of drop reasons within the subsystem, must point to
140 * a statically initialized list
141 */
142void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
143 const struct drop_reason_list *list)
144{
145 if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
146 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
147 "invalid subsystem %d\n", subsys))
148 return;
149
150 /* must point to statically allocated memory, so INIT is OK */
151 RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
152}
153EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
154
155/**
156 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
157 * @subsys: the subsystem to remove, must not be the core
158 *
159 * Note: This will synchronize_rcu() to ensure no users when it returns.
160 */
161void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
162{
163 if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
164 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
165 "invalid subsystem %d\n", subsys))
166 return;
167
168 RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
169
170 synchronize_rcu();
171}
172EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
ec43908d 173
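/* Hedged usage sketch (not part of the original file): how a subsystem
 * could publish its own drop reasons. The subsystem id and reason names
 * below are hypothetical placeholders, hence the block is not compiled.
 */
#if 0
static const char * const foo_drop_reasons[] = {
	"FOO_BAD_HDR",		/* reason 0 within the subsystem */
	"FOO_QUEUE_FULL",	/* reason 1 within the subsystem */
};

static const struct drop_reason_list foo_drop_reason_list = {
	.reasons   = foo_drop_reasons,
	.n_reasons = ARRAY_SIZE(foo_drop_reasons),
};

static int __init foo_init(void)
{
	/* SKB_DROP_REASON_SUBSYS_FOO is a made-up id for illustration */
	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
				     &foo_drop_reason_list);
	return 0;
}

static void __exit foo_exit(void)
{
	/* synchronize_rcu() inside guarantees no reader still sees the list */
	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_FOO);
}
#endif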
1da177e4 174/**
f05de73b
JS
175 * skb_panic - private function for out-of-line support
176 * @skb: buffer
177 * @sz: size
178 * @addr: address
99d5851e 179 * @msg: skb_over_panic or skb_under_panic
1da177e4 180 *
f05de73b
JS
181 * Out-of-line support for skb_put() and skb_push().
182 * Called via the wrapper skb_over_panic() or skb_under_panic().
183 * Keep out of line to prevent kernel bloat.
184 * __builtin_return_address is not used because it is not always reliable.
1da177e4 185 */
f05de73b 186static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
99d5851e 187 const char msg[])
1da177e4 188{
41a46913 189 pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
99d5851e 190 msg, addr, skb->len, sz, skb->head, skb->data,
e005d193
JP
191 (unsigned long)skb->tail, (unsigned long)skb->end,
192 skb->dev ? skb->dev->name : "<NULL>");
1da177e4
LT
193 BUG();
194}
195
f05de73b 196static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
1da177e4 197{
f05de73b 198 skb_panic(skb, sz, addr, __func__);
1da177e4
LT
199}
200
f05de73b
JS
201static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
202{
203 skb_panic(skb, sz, addr, __func__);
204}
c93bdd0e 205
50fad4b5 206#define NAPI_SKB_CACHE_SIZE 64
f450d539
AL
207#define NAPI_SKB_CACHE_BULK 16
208#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
50fad4b5 209
dbae2b06
PA
210#if PAGE_SIZE == SZ_4K
211
212#define NAPI_HAS_SMALL_PAGE_FRAG 1
213#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
214
215/* specialized page frag allocator using a single order 0 page
216 * and slicing it into 1K sized fragments. Constrained to systems
217 * where only a very limited number of 1K fragments fit in a single
218 * page - to avoid excessive truesize underestimation
219 */
220
221struct page_frag_1k {
222 void *va;
223 u16 offset;
224 bool pfmemalloc;
225};
226
227static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
228{
229 struct page *page;
230 int offset;
231
232 offset = nc->offset - SZ_1K;
233 if (likely(offset >= 0))
234 goto use_frag;
235
236 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
237 if (!page)
238 return NULL;
239
240 nc->va = page_address(page);
241 nc->pfmemalloc = page_is_pfmemalloc(page);
242 offset = PAGE_SIZE - SZ_1K;
243 page_ref_add(page, offset / SZ_1K);
244
245use_frag:
246 nc->offset = offset;
247 return nc->va + offset;
248}
249#else
250
251/* the small page is actually unused in this build; add dummy helpers
252 * to please the compiler and avoid preprocessor conditionals later on
253 */
254#define NAPI_HAS_SMALL_PAGE_FRAG 0
255#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
256
257struct page_frag_1k {
258};
259
260static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
261{
262 return NULL;
263}
264
265#endif
266
50fad4b5
AL
267struct napi_alloc_cache {
268 struct page_frag_cache page;
dbae2b06 269 struct page_frag_1k page_small;
50fad4b5
AL
270 unsigned int skb_count;
271 void *skb_cache[NAPI_SKB_CACHE_SIZE];
272};
273
274static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
275static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
276
dbae2b06
PA
277/* Double check that napi_get_frags() allocates skbs with
278 * skb->head being backed by slab, not a page fragment.
279 * This is to make sure the bug fixed in 3226b158e67c
280 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
281 * does not accidentally come back.
282 */
283void napi_get_frags_check(struct napi_struct *napi)
284{
285 struct sk_buff *skb;
286
287 local_bh_disable();
288 skb = napi_get_frags(napi);
289 WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
290 napi_free_frags(napi);
291 local_bh_enable();
292}
293
32e3573f 294void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
50fad4b5
AL
295{
296 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
297
50fad4b5
AL
298 fragsz = SKB_DATA_ALIGN(fragsz);
299
32e3573f 300 return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
50fad4b5
AL
301}
302EXPORT_SYMBOL(__napi_alloc_frag_align);
303
304void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
305{
50fad4b5
AL
306 void *data;
307
308 fragsz = SKB_DATA_ALIGN(fragsz);
afa79d08 309 if (in_hardirq() || irqs_disabled()) {
32e3573f
YD
310 struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
311
50fad4b5
AL
312 data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
313 } else {
32e3573f
YD
314 struct napi_alloc_cache *nc;
315
50fad4b5 316 local_bh_disable();
32e3573f
YD
317 nc = this_cpu_ptr(&napi_alloc_cache);
318 data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
50fad4b5
AL
319 local_bh_enable();
320 }
321 return data;
322}
323EXPORT_SYMBOL(__netdev_alloc_frag_align);
324
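/* Hedged usage sketch (not in the original file, not compiled): callers
 * normally reach these helpers through the netdev_alloc_frag() wrapper in
 * skbuff.h; the rx_ring_buf_alloc() name and sizing are assumptions.
 */
#if 0
static void *rx_ring_buf_alloc(unsigned int buf_len)
{
	/* room for the payload plus the trailing skb_shared_info, so the
	 * fragment can later be handed to build_skb()
	 */
	unsigned int truesize = SKB_DATA_ALIGN(buf_len) +
				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* picks the netdev or NAPI per-CPU cache depending on context */
	return netdev_alloc_frag(truesize);
}
#endif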
f450d539
AL
325static struct sk_buff *napi_skb_cache_get(void)
326{
327 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
328 struct sk_buff *skb;
329
49ae83fc 330 if (unlikely(!nc->skb_count)) {
025a785f 331 nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache,
f450d539
AL
332 GFP_ATOMIC,
333 NAPI_SKB_CACHE_BULK,
334 nc->skb_cache);
49ae83fc
SPL
335 if (unlikely(!nc->skb_count))
336 return NULL;
337 }
f450d539
AL
338
339 skb = nc->skb_cache[--nc->skb_count];
025a785f 340 kasan_unpoison_object_data(skbuff_cache, skb);
f450d539
AL
341
342 return skb;
343}
344
ce098da1
KC
345static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
346 unsigned int size)
ba0509b6
JDB
347{
348 struct skb_shared_info *shinfo;
ba0509b6
JDB
349
350 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
351
352 /* Assumes caller memset cleared SKB */
353 skb->truesize = SKB_TRUESIZE(size);
354 refcount_set(&skb->users, 1);
355 skb->head = data;
356 skb->data = data;
357 skb_reset_tail_pointer(skb);
763087da 358 skb_set_end_offset(skb, size);
ba0509b6
JDB
359 skb->mac_header = (typeof(skb->mac_header))~0U;
360 skb->transport_header = (typeof(skb->transport_header))~0U;
68822bdf 361 skb->alloc_cpu = raw_smp_processor_id();
ba0509b6
JDB
362 /* make sure we initialize shinfo sequentially */
363 shinfo = skb_shinfo(skb);
364 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
365 atomic_set(&shinfo->dataref, 1);
366
6370cc3b 367 skb_set_kcov_handle(skb, kcov_common_handle());
ba0509b6
JDB
368}
369
ce098da1
KC
370static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
371 unsigned int *size)
372{
373 void *resized;
374
375 /* Must find the allocation size (and grow it to match). */
376 *size = ksize(data);
377 /* krealloc() will immediately return "data" when
378 * "ksize(data)" is requested: it is the existing upper
379 * bound. As a result, GFP_ATOMIC will be ignored. Note
380 * that this "new" pointer needs to be passed back to the
381 * caller for use so the __alloc_size hinting will be
382 * tracked correctly.
383 */
384 resized = krealloc(data, *size, GFP_ATOMIC);
385 WARN_ON_ONCE(resized != data);
386 return resized;
387}
388
389/* build_skb() variant which can operate on slab buffers.
390 * Note that this should be used sparingly as slab buffers
391 * cannot be combined efficiently by GRO!
392 */
393struct sk_buff *slab_build_skb(void *data)
394{
395 struct sk_buff *skb;
396 unsigned int size;
397
025a785f 398 skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
ce098da1
KC
399 if (unlikely(!skb))
400 return NULL;
401
402 memset(skb, 0, offsetof(struct sk_buff, tail));
403 data = __slab_build_skb(skb, data, &size);
404 __finalize_skb_around(skb, data, size);
405
406 return skb;
407}
408EXPORT_SYMBOL(slab_build_skb);
409
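/* Hedged usage sketch (not in the original file, not compiled): wrapping a
 * kmalloc()'ed buffer with slab_build_skb(); fw_buf/fw_len are placeholders.
 */
#if 0
static struct sk_buff *wrap_slab_buffer(void *fw_buf, unsigned int fw_len)
{
	struct sk_buff *skb;

	/* @fw_buf must come from kmalloc(); on failure it is not freed,
	 * so the caller keeps ownership.
	 */
	skb = slab_build_skb(fw_buf);
	if (!skb)
		return NULL;

	skb_put(skb, fw_len);	/* expose the payload already in the buffer */
	return skb;
}
#endif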
410/* Caller must provide SKB that is memset cleared */
411static void __build_skb_around(struct sk_buff *skb, void *data,
412 unsigned int frag_size)
413{
414 unsigned int size = frag_size;
415
416 /* frag_size == 0 is considered deprecated now. Callers
417 * using slab buffer should use slab_build_skb() instead.
418 */
419 if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
420 data = __slab_build_skb(skb, data, &size);
421
422 __finalize_skb_around(skb, data, size);
423}
424
b2b5ce9d 425/**
2ea2f62c 426 * __build_skb - build a network buffer
b2b5ce9d 427 * @data: data buffer provided by caller
ce098da1 428 * @frag_size: size of data (must not be 0)
b2b5ce9d
ED
429 *
430 * Allocate a new &sk_buff. Caller provides space holding head and
ce098da1
KC
431 * skb_shared_info. @data must have been allocated from the page
432 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
433 * allocation is deprecated, and callers should use slab_build_skb()
434 * instead.)
b2b5ce9d
ED
435 * The return is the new skb buffer.
436 * On a failure the return is %NULL, and @data is not freed.
437 * Notes :
438 * Before IO, the driver allocates only the data buffer where the NIC puts the incoming frame.
439 * Driver should add room at head (NET_SKB_PAD) and
440 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
441 * After IO, the driver calls build_skb() to allocate the sk_buff and populate it
442 * before giving the packet to the stack.
443 * RX rings only contain data buffers, not full skbs.
444 */
2ea2f62c 445struct sk_buff *__build_skb(void *data, unsigned int frag_size)
b2b5ce9d 446{
b2b5ce9d 447 struct sk_buff *skb;
b2b5ce9d 448
025a785f 449 skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
ba0509b6 450 if (unlikely(!skb))
b2b5ce9d
ED
451 return NULL;
452
b2b5ce9d 453 memset(skb, 0, offsetof(struct sk_buff, tail));
483126b3 454 __build_skb_around(skb, data, frag_size);
b2b5ce9d 455
483126b3 456 return skb;
b2b5ce9d 457}
2ea2f62c
ED
458
459/* build_skb() is a wrapper over __build_skb() that specifically
460 * takes care of skb->head and skb->pfmemalloc
2ea2f62c
ED
461 */
462struct sk_buff *build_skb(void *data, unsigned int frag_size)
463{
464 struct sk_buff *skb = __build_skb(data, frag_size);
465
3c640126 466 if (likely(skb && frag_size)) {
2ea2f62c 467 skb->head_frag = 1;
566b6701 468 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
2ea2f62c
ED
469 }
470 return skb;
471}
b2b5ce9d
ED
472EXPORT_SYMBOL(build_skb);
473
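/* Hedged usage sketch (not in the original file, not compiled): the typical
 * RX-ring pattern described in the kernel-doc above; names and sizing are
 * assumptions, not a specific driver's code.
 */
#if 0
static struct sk_buff *rx_build(void *data, unsigned int truesize,
				unsigned int pkt_len)
{
	struct sk_buff *skb;

	/* @data came from a page/frag allocator (e.g. netdev_alloc_frag()),
	 * sized to SKB_DATA_ALIGN(NET_SKB_PAD + max frame) +
	 * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); the NIC has already
	 * written the frame at data + NET_SKB_PAD.
	 */
	skb = build_skb(data, truesize);
	if (unlikely(!skb)) {
		skb_free_frag(data);	/* @data is not freed on failure */
		return NULL;
	}

	skb_reserve(skb, NET_SKB_PAD);	/* skip the reserved headroom */
	skb_put(skb, pkt_len);		/* expose the received bytes */
	return skb;
}
#endif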
ba0509b6
JDB
474/**
475 * build_skb_around - build a network buffer around provided skb
476 * @skb: sk_buff provide by caller, must be memset cleared
477 * @data: data buffer provided by caller
12c1604a 478 * @frag_size: size of data
ba0509b6
JDB
479 */
480struct sk_buff *build_skb_around(struct sk_buff *skb,
481 void *data, unsigned int frag_size)
482{
483 if (unlikely(!skb))
484 return NULL;
485
483126b3 486 __build_skb_around(skb, data, frag_size);
ba0509b6 487
483126b3 488 if (frag_size) {
ba0509b6 489 skb->head_frag = 1;
566b6701 490 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
ba0509b6
JDB
491 }
492 return skb;
493}
494EXPORT_SYMBOL(build_skb_around);
495
f450d539
AL
496/**
497 * __napi_build_skb - build a network buffer
498 * @data: data buffer provided by caller
12c1604a 499 * @frag_size: size of data
f450d539
AL
500 *
501 * Version of __build_skb() that uses NAPI percpu caches to obtain
502 * skbuff_head instead of an in-place allocation.
503 *
504 * Returns a new &sk_buff on success, %NULL on allocation failure.
505 */
506static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
507{
508 struct sk_buff *skb;
509
510 skb = napi_skb_cache_get();
511 if (unlikely(!skb))
512 return NULL;
513
514 memset(skb, 0, offsetof(struct sk_buff, tail));
515 __build_skb_around(skb, data, frag_size);
516
517 return skb;
518}
519
520/**
521 * napi_build_skb - build a network buffer
522 * @data: data buffer provided by caller
12c1604a 523 * @frag_size: size of data
f450d539
AL
524 *
525 * Version of __napi_build_skb() that takes care of skb->head_frag
526 * and skb->pfmemalloc when the data is a page or page fragment.
527 *
528 * Returns a new &sk_buff on success, %NULL on allocation failure.
529 */
530struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
531{
532 struct sk_buff *skb = __napi_build_skb(data, frag_size);
533
534 if (likely(skb) && frag_size) {
535 skb->head_frag = 1;
536 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
537 }
538
539 return skb;
540}
541EXPORT_SYMBOL(napi_build_skb);
542
5381b23d
AL
543/*
544 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
545 * the caller if emergency pfmemalloc reserves are being used. If they are and
546 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
547 * may be used. Otherwise, the packet data may be discarded until enough
548 * memory is free.
549 */
5c0e820c 550static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
ef28095f 551 bool *pfmemalloc)
5381b23d 552{
5381b23d 553 bool ret_pfmemalloc = false;
915d975b 554 size_t obj_size;
5c0e820c 555 void *obj;
5381b23d 556
5c0e820c 557 obj_size = SKB_HEAD_ALIGN(*size);
bf9f1baa
ED
558 if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
559 !(flags & KMALLOC_NOT_NORMAL_BITS)) {
bf9f1baa
ED
560 obj = kmem_cache_alloc_node(skb_small_head_cache,
561 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
562 node);
880ce5f2
ED
563 *size = SKB_SMALL_HEAD_CACHE_SIZE;
564 if (obj || !(gfp_pfmemalloc_allowed(flags)))
bf9f1baa 565 goto out;
880ce5f2
ED
566 /* Try again but now we are using pfmemalloc reserves */
567 ret_pfmemalloc = true;
568 obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);
569 goto out;
bf9f1baa 570 }
915d975b
ED
571
572 obj_size = kmalloc_size_roundup(obj_size);
573 /* The following cast might truncate high-order bits of obj_size, this
574 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
575 */
576 *size = (unsigned int)obj_size;
577
5381b23d
AL
578 /*
579 * Try a regular allocation, when that fails and we're not entitled
580 * to the reserves, fail.
581 */
5c0e820c 582 obj = kmalloc_node_track_caller(obj_size,
5381b23d
AL
583 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
584 node);
585 if (obj || !(gfp_pfmemalloc_allowed(flags)))
586 goto out;
587
588 /* Try again but now we are using pfmemalloc reserves */
589 ret_pfmemalloc = true;
5c0e820c 590 obj = kmalloc_node_track_caller(obj_size, flags, node);
5381b23d
AL
591
592out:
593 if (pfmemalloc)
594 *pfmemalloc = ret_pfmemalloc;
595
596 return obj;
597}
598
599/* Allocate a new skbuff. We do this ourselves so we can fill in a few
600 * 'private' fields and also do memory statistics to find all the
601 * [BEEP] leaks.
602 *
603 */
604
605/**
606 * __alloc_skb - allocate a network buffer
607 * @size: size to allocate
608 * @gfp_mask: allocation mask
609 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
610 * instead of head cache and allocate a cloned (child) skb.
611 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
612 * allocations in case the data is required for writeback
613 * @node: numa node to allocate memory on
614 *
615 * Allocate a new &sk_buff. The returned buffer has no headroom and a
616 * tail room of at least size bytes. The object has a reference count
617 * of one. The return is the buffer. On a failure the return is %NULL.
618 *
619 * Buffers may only be allocated from interrupts using a @gfp_mask of
620 * %GFP_ATOMIC.
621 */
622struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
623 int flags, int node)
624{
625 struct kmem_cache *cache;
5381b23d 626 struct sk_buff *skb;
5381b23d 627 bool pfmemalloc;
a5df6333 628 u8 *data;
5381b23d
AL
629
630 cache = (flags & SKB_ALLOC_FCLONE)
025a785f 631 ? skbuff_fclone_cache : skbuff_cache;
5381b23d
AL
632
633 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
634 gfp_mask |= __GFP_MEMALLOC;
635
636 /* Get the HEAD */
d13612b5
AL
637 if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
638 likely(node == NUMA_NO_NODE || node == numa_mem_id()))
639 skb = napi_skb_cache_get();
640 else
641 skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
df1ae022
AL
642 if (unlikely(!skb))
643 return NULL;
5381b23d
AL
644 prefetchw(skb);
645
646 /* We do our best to align skb_shared_info on a separate cache
647 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
648 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
649 * Both skb->head and skb_shared_info are cache line aligned.
650 */
5c0e820c 651 data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
df1ae022 652 if (unlikely(!data))
5381b23d 653 goto nodata;
12d6c1d3 654 /* kmalloc_size_roundup() might give us more room than requested.
5381b23d
AL
655 * Put skb_shared_info exactly at the end of allocated zone,
656 * to allow max possible filling before reallocation.
657 */
65998d2b 658 prefetchw(data + SKB_WITH_OVERHEAD(size));
5381b23d
AL
659
660 /*
661 * Only clear those fields we need to clear, not those that we will
662 * actually initialise below. Hence, don't put any more fields after
663 * the tail pointer in struct sk_buff!
664 */
665 memset(skb, 0, offsetof(struct sk_buff, tail));
65998d2b 666 __build_skb_around(skb, data, size);
5381b23d 667 skb->pfmemalloc = pfmemalloc;
5381b23d
AL
668
669 if (flags & SKB_ALLOC_FCLONE) {
670 struct sk_buff_fclones *fclones;
671
672 fclones = container_of(skb, struct sk_buff_fclones, skb1);
673
674 skb->fclone = SKB_FCLONE_ORIG;
675 refcount_set(&fclones->fclone_ref, 1);
5381b23d
AL
676 }
677
5381b23d 678 return skb;
df1ae022 679
5381b23d
AL
680nodata:
681 kmem_cache_free(cache, skb);
df1ae022 682 return NULL;
5381b23d
AL
683}
684EXPORT_SYMBOL(__alloc_skb);
685
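/* Hedged usage sketch (not in the original file, not compiled): allocating
 * and filling a buffer through the common alloc_skb() wrapper around
 * __alloc_skb(); the headroom choice is illustrative.
 */
#if 0
static struct sk_buff *build_ctrl_frame(const void *payload, unsigned int len)
{
	unsigned int headroom = LL_MAX_HEADER;
	struct sk_buff *skb;

	/* GFP_ATOMIC because this may run from softirq context */
	skb = alloc_skb(headroom + len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);		/* alloc_skb() gives tail room only */
	skb_put_data(skb, payload, len);	/* append @len payload bytes */
	return skb;
}
#endif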
fd11a83d
AD
686/**
687 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
688 * @dev: network device to receive on
d7499160 689 * @len: length to allocate
fd11a83d
AD
690 * @gfp_mask: get_free_pages mask, passed to alloc_skb
691 *
692 * Allocate a new &sk_buff and assign it a usage count of one. The
693 * buffer has NET_SKB_PAD headroom built in. Users should allocate
694 * the headroom they think they need without accounting for the
695 * built in space. The built in space is used for optimisations.
696 *
697 * %NULL is returned if there is no free memory.
698 */
9451980a
AD
699struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
700 gfp_t gfp_mask)
fd11a83d 701{
b63ae8ca 702 struct page_frag_cache *nc;
fd11a83d 703 struct sk_buff *skb;
9451980a
AD
704 bool pfmemalloc;
705 void *data;
706
707 len += NET_SKB_PAD;
fd11a83d 708
66c55602
AL
709 /* If requested length is either too small or too big,
710 * we use kmalloc() for skb->head allocation.
711 */
712 if (len <= SKB_WITH_OVERHEAD(1024) ||
713 len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
d0164adc 714 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
a080e7bd
AD
715 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
716 if (!skb)
717 goto skb_fail;
718 goto skb_success;
719 }
fd11a83d 720
115f1a5c 721 len = SKB_HEAD_ALIGN(len);
9451980a
AD
722
723 if (sk_memalloc_socks())
724 gfp_mask |= __GFP_MEMALLOC;
725
afa79d08 726 if (in_hardirq() || irqs_disabled()) {
92dcabd7
SAS
727 nc = this_cpu_ptr(&netdev_alloc_cache);
728 data = page_frag_alloc(nc, len, gfp_mask);
729 pfmemalloc = nc->pfmemalloc;
730 } else {
731 local_bh_disable();
732 nc = this_cpu_ptr(&napi_alloc_cache.page);
733 data = page_frag_alloc(nc, len, gfp_mask);
734 pfmemalloc = nc->pfmemalloc;
735 local_bh_enable();
736 }
9451980a
AD
737
738 if (unlikely(!data))
739 return NULL;
740
741 skb = __build_skb(data, len);
742 if (unlikely(!skb)) {
181edb2b 743 skb_free_frag(data);
9451980a 744 return NULL;
7b2e497a 745 }
fd11a83d 746
9451980a
AD
747 if (pfmemalloc)
748 skb->pfmemalloc = 1;
749 skb->head_frag = 1;
750
a080e7bd 751skb_success:
9451980a
AD
752 skb_reserve(skb, NET_SKB_PAD);
753 skb->dev = dev;
754
a080e7bd 755skb_fail:
8af27456
CH
756 return skb;
757}
b4ac530f 758EXPORT_SYMBOL(__netdev_alloc_skb);
1da177e4 759
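/* Hedged usage sketch (not in the original file, not compiled): a
 * copy-based RX path relying on the NET_SKB_PAD headroom that
 * __netdev_alloc_skb() already reserves; names are placeholders.
 */
#if 0
static void rx_copybreak(struct net_device *dev, const void *frame,
			 unsigned int pkt_len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, pkt_len);
	if (unlikely(!skb)) {
		dev->stats.rx_dropped++;
		return;
	}

	skb_put_data(skb, frame, pkt_len);	/* copy the received frame */
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
}
#endif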
fd11a83d
AD
760/**
761 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
762 * @napi: napi instance this buffer was allocated for
d7499160 763 * @len: length to allocate
fd11a83d
AD
764 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
765 *
766 * Allocate a new sk_buff for use in NAPI receive. This buffer will
767 * attempt to allocate the head from a special reserved region used
768 * only for NAPI Rx allocation. By doing this we can save several
769 * CPU cycles by avoiding having to disable and re-enable IRQs.
770 *
771 * %NULL is returned if there is no free memory.
772 */
9451980a
AD
773struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
774 gfp_t gfp_mask)
fd11a83d 775{
3226b158 776 struct napi_alloc_cache *nc;
fd11a83d 777 struct sk_buff *skb;
dbae2b06 778 bool pfmemalloc;
9451980a
AD
779 void *data;
780
ee2640df 781 DEBUG_NET_WARN_ON_ONCE(!in_softirq());
9451980a 782 len += NET_SKB_PAD + NET_IP_ALIGN;
fd11a83d 783
3226b158
ED
784 /* If requested length is either too small or too big,
785 * we use kmalloc() for skb->head allocation.
dbae2b06
PA
786 * When the small frag allocator is available, prefer it over kmalloc
787 * for small fragments
3226b158 788 */
dbae2b06 789 if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
3226b158 790 len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
d0164adc 791 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
cfb8ec65
AL
792 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
793 NUMA_NO_NODE);
a080e7bd
AD
794 if (!skb)
795 goto skb_fail;
796 goto skb_success;
797 }
9451980a 798
3226b158 799 nc = this_cpu_ptr(&napi_alloc_cache);
9451980a
AD
800
801 if (sk_memalloc_socks())
802 gfp_mask |= __GFP_MEMALLOC;
fd11a83d 803
dbae2b06
PA
804 if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
805 /* we are artificially inflating the allocation size, but
806 * that is not as bad as it may look, as:
807 * - 'len' less than GRO_MAX_HEAD makes little sense
808 * - On most systems, larger 'len' values lead to fragment
809 * size above 512 bytes
810 * - kmalloc would use the kmalloc-1k slab for such values
811 * - Builds with smaller GRO_MAX_HEAD will very likely do
812 * little networking, as that implies no WiFi and no
813 * tunnels support, and 32 bits arches.
814 */
815 len = SZ_1K;
816
817 data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
818 pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
819 } else {
115f1a5c 820 len = SKB_HEAD_ALIGN(len);
dbae2b06
PA
821
822 data = page_frag_alloc(&nc->page, len, gfp_mask);
823 pfmemalloc = nc->page.pfmemalloc;
824 }
825
9451980a
AD
826 if (unlikely(!data))
827 return NULL;
828
cfb8ec65 829 skb = __napi_build_skb(data, len);
9451980a 830 if (unlikely(!skb)) {
181edb2b 831 skb_free_frag(data);
9451980a 832 return NULL;
fd11a83d
AD
833 }
834
dbae2b06 835 if (pfmemalloc)
9451980a
AD
836 skb->pfmemalloc = 1;
837 skb->head_frag = 1;
838
a080e7bd 839skb_success:
9451980a
AD
840 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
841 skb->dev = napi->dev;
842
a080e7bd 843skb_fail:
fd11a83d
AD
844 return skb;
845}
846EXPORT_SYMBOL(__napi_alloc_skb);
847
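/* Hedged usage sketch (not in the original file, not compiled): a
 * small-packet copy path inside a NAPI poll handler, the context this
 * allocator is meant for; names are placeholders.
 */
#if 0
static void rx_small_pkt(struct napi_struct *napi, const void *frame,
			 unsigned int pkt_len)
{
	struct sk_buff *skb;

	/* NET_SKB_PAD + NET_IP_ALIGN headroom is already reserved */
	skb = napi_alloc_skb(napi, pkt_len);
	if (unlikely(!skb))
		return;

	skb_put_data(skb, frame, pkt_len);
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}
#endif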
654bed16 848void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
50269e19 849 int size, unsigned int truesize)
654bed16 850{
c123e0d3
ED
851 DEBUG_NET_WARN_ON_ONCE(size > truesize);
852
654bed16
PZ
853 skb_fill_page_desc(skb, i, page, off, size);
854 skb->len += size;
855 skb->data_len += size;
50269e19 856 skb->truesize += truesize;
654bed16
PZ
857}
858EXPORT_SYMBOL(skb_add_rx_frag);
859
f8e617e1
JW
860void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
861 unsigned int truesize)
862{
863 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
864
c123e0d3
ED
865 DEBUG_NET_WARN_ON_ONCE(size > truesize);
866
f8e617e1
JW
867 skb_frag_size_add(frag, size);
868 skb->len += size;
869 skb->data_len += size;
870 skb->truesize += truesize;
871}
872EXPORT_SYMBOL(skb_coalesce_rx_frag);
873
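/* Hedged usage sketch (not in the original file, not compiled): attaching a
 * received page fragment to an skb that already carries the packet headers;
 * the half-page truesize is only an illustrative assumption.
 */
#if 0
static void rx_add_payload(struct sk_buff *skb, struct page *page,
			   unsigned int offset, unsigned int size)
{
	/* truesize is the full buffer footprint, which may exceed @size */
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
			size, PAGE_SIZE / 2);
}
#endif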
27b437c8 874static void skb_drop_list(struct sk_buff **listp)
1da177e4 875{
bd8a7036 876 kfree_skb_list(*listp);
27b437c8 877 *listp = NULL;
1da177e4
LT
878}
879
27b437c8
HX
880static inline void skb_drop_fraglist(struct sk_buff *skb)
881{
882 skb_drop_list(&skb_shinfo(skb)->frag_list);
883}
884
1da177e4
LT
885static void skb_clone_fraglist(struct sk_buff *skb)
886{
887 struct sk_buff *list;
888
fbb398a8 889 skb_walk_frags(skb, list)
1da177e4
LT
890 skb_get(list);
891}
892
8cfa2dee
LC
893static bool is_pp_page(struct page *page)
894{
895 return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
896}
897
75eaf63e
AL
898#if IS_ENABLED(CONFIG_PAGE_POOL)
899bool napi_pp_put_page(struct page *page, bool napi_safe)
900{
5b899c33 901 bool allow_direct = false;
75eaf63e 902 struct page_pool *pp;
75eaf63e
AL
903
904 page = compound_head(page);
905
906 /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
907 * in order to preserve any existing bits, such as bit 0 for the
908 * head page of a compound page and bit 1 for a pfmemalloc page, so
909 * mask those bits on the freeing side when doing the check below,
910 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
911 * to avoid recycling the pfmemalloc page.
912 */
8cfa2dee 913 if (unlikely(!is_pp_page(page)))
75eaf63e
AL
914 return false;
915
916 pp = page->pp;
917
918 /* Allow direct recycle if we have reasons to believe that we are
919 * in the same context in which the consumer would run, so there's
920 * no possible race.
4a36d018
AL
921 * __page_pool_put_page() makes sure we're not in hardirq context
922 * and interrupts are enabled prior to accessing the cache.
75eaf63e 923 */
4a36d018 924 if (napi_safe || in_softirq()) {
5b899c33
AL
925 const struct napi_struct *napi = READ_ONCE(pp->p.napi);
926
927 allow_direct = napi &&
928 READ_ONCE(napi->list_owner) == smp_processor_id();
929 }
75eaf63e
AL
930
931 /* Driver sets this to memory recycling info. Reset it on recycle.
932 * This will *not* work for a NIC using a split-page memory model.
933 * The page will be returned to the pool here regardless of the
934 * 'flipped' fragment being in use or not.
935 */
936 page_pool_put_full_page(pp, page, allow_direct);
937
938 return true;
939}
940EXPORT_SYMBOL(napi_pp_put_page);
941#endif
942
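/* Hedged usage sketch (not in the original file, not compiled): a page_pool
 * based driver opting into the recycling path above. Once
 * skb_mark_for_recycle() sets skb->pp_recycle, freeing the skb routes its
 * page-backed head and frags through napi_pp_put_page() instead of the page
 * allocator; names and sizing are assumptions.
 */
#if 0
static struct sk_buff *rx_build_pp(struct page_pool *pool, struct page *page,
				   unsigned int truesize, unsigned int headroom,
				   unsigned int pkt_len)
{
	struct sk_buff *skb;

	skb = build_skb(page_address(page), truesize);
	if (unlikely(!skb)) {
		/* give the buffer straight back to the pool */
		page_pool_put_full_page(pool, page, true);
		return NULL;
	}

	skb_mark_for_recycle(skb);
	skb_reserve(skb, headroom);
	skb_put(skb, pkt_len);
	return skb;
}
#endif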
b07a2d97 943static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
4727bab4
YL
944{
945 if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
946 return false;
75eaf63e 947 return napi_pp_put_page(virt_to_page(data), napi_safe);
4727bab4
YL
948}
949
bf9f1baa
ED
950static void skb_kfree_head(void *head, unsigned int end_offset)
951{
bf9f1baa
ED
952 if (end_offset == SKB_SMALL_HEAD_HEADROOM)
953 kmem_cache_free(skb_small_head_cache, head);
954 else
bf9f1baa
ED
955 kfree(head);
956}
957
b07a2d97 958static void skb_free_head(struct sk_buff *skb, bool napi_safe)
d3836f21 959{
181edb2b
AD
960 unsigned char *head = skb->head;
961
6a5bcd84 962 if (skb->head_frag) {
b07a2d97 963 if (skb_pp_recycle(skb, head, napi_safe))
6a5bcd84 964 return;
181edb2b 965 skb_free_frag(head);
6a5bcd84 966 } else {
bf9f1baa 967 skb_kfree_head(head, skb_end_offset(skb));
6a5bcd84 968 }
d3836f21
ED
969}
970
b07a2d97
JK
971static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
972 bool napi_safe)
1da177e4 973{
ff04a771
ED
974 struct skb_shared_info *shinfo = skb_shinfo(skb);
975 int i;
1da177e4 976
ff04a771
ED
977 if (skb->cloned &&
978 atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
979 &shinfo->dataref))
2cc3aeb5 980 goto exit;
a6686f2f 981
753f1ca4
PB
982 if (skb_zcopy(skb)) {
983 bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
984
985 skb_zcopy_clear(skb, true);
986 if (skip_unref)
987 goto free_head;
988 }
70c43167 989
ff04a771 990 for (i = 0; i < shinfo->nr_frags; i++)
8c48eea3 991 napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
a6686f2f 992
753f1ca4 993free_head:
ff04a771 994 if (shinfo->frag_list)
511a3eda 995 kfree_skb_list_reason(shinfo->frag_list, reason);
ff04a771 996
b07a2d97 997 skb_free_head(skb, napi_safe);
2cc3aeb5
IA
998exit:
999 /* When we clone an SKB we copy the recycling bit. The pp_recycle
1000 * bit is only set on the head though, so in order to avoid races
1001 * while trying to recycle fragments on __skb_frag_unref() we need
1002 * to make one SKB responsible for triggering the recycle path.
1003 * So disable the recycling bit if an SKB is cloned and we have
58e61e41 1004 * additional references to the fragmented part of the SKB.
2cc3aeb5
IA
1005 * Eventually the last SKB will have the recycling bit set and its
1006 * dataref set to 0, which will trigger the recycling.
1007 */
1008 skb->pp_recycle = 0;
1da177e4
LT
1009}
1010
1011/*
1012 * Free an skbuff by memory without cleaning the state.
1013 */
2d4baff8 1014static void kfree_skbmem(struct sk_buff *skb)
1da177e4 1015{
d0bf4a9e 1016 struct sk_buff_fclones *fclones;
d179cd12 1017
d179cd12
DM
1018 switch (skb->fclone) {
1019 case SKB_FCLONE_UNAVAILABLE:
025a785f 1020 kmem_cache_free(skbuff_cache, skb);
6ffe75eb 1021 return;
d179cd12
DM
1022
1023 case SKB_FCLONE_ORIG:
d0bf4a9e 1024 fclones = container_of(skb, struct sk_buff_fclones, skb1);
d179cd12 1025
6ffe75eb
ED
1026 /* We usually free the clone (TX completion) before original skb
1027 * This test would have no chance to be true for the clone,
1028 * while here, branch prediction will be good.
d179cd12 1029 */
2638595a 1030 if (refcount_read(&fclones->fclone_ref) == 1)
6ffe75eb
ED
1031 goto fastpath;
1032 break;
e7820e39 1033
6ffe75eb
ED
1034 default: /* SKB_FCLONE_CLONE */
1035 fclones = container_of(skb, struct sk_buff_fclones, skb2);
d179cd12 1036 break;
3ff50b79 1037 }
2638595a 1038 if (!refcount_dec_and_test(&fclones->fclone_ref))
6ffe75eb
ED
1039 return;
1040fastpath:
1041 kmem_cache_free(skbuff_fclone_cache, fclones);
1da177e4
LT
1042}
1043
0a463c78 1044void skb_release_head_state(struct sk_buff *skb)
1da177e4 1045{
adf30907 1046 skb_dst_drop(skb);
9c2b3328 1047 if (skb->destructor) {
7890e2f0 1048 DEBUG_NET_WARN_ON_ONCE(in_hardirq());
1da177e4
LT
1049 skb->destructor(skb);
1050 }
a3bf7ae9 1051#if IS_ENABLED(CONFIG_NF_CONNTRACK)
cb9c6836 1052 nf_conntrack_put(skb_nfct(skb));
1da177e4 1053#endif
df5042f4 1054 skb_ext_put(skb);
04a4bb55
LB
1055}
1056
1057/* Free everything but the sk_buff shell. */
b07a2d97
JK
1058static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
1059 bool napi_safe)
04a4bb55
LB
1060{
1061 skb_release_head_state(skb);
a28b1b90 1062 if (likely(skb->head))
b07a2d97 1063 skb_release_data(skb, reason, napi_safe);
2d4baff8
HX
1064}
1065
1066/**
1067 * __kfree_skb - private function
1068 * @skb: buffer
1069 *
1070 * Free an sk_buff. Release anything attached to the buffer.
1071 * Clean the state. This is an internal helper function. Users should
1072 * always call kfree_skb
1073 */
1da177e4 1074
2d4baff8
HX
1075void __kfree_skb(struct sk_buff *skb)
1076{
b07a2d97 1077 skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
1da177e4
LT
1078 kfree_skbmem(skb);
1079}
b4ac530f 1080EXPORT_SYMBOL(__kfree_skb);
1da177e4 1081
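/* Hedged usage sketch (not in the original file, not compiled): callers
 * outside this file pass a drop reason so the tracepoints wired up below
 * can report why a packet died; consume_skb() covers the normal,
 * non-error free. The helper name is a placeholder.
 */
#if 0
static void example_free(struct sk_buff *skb, bool delivered)
{
	if (delivered)
		consume_skb(skb);	/* traced as consumed, not dropped */
	else
		kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
}
#endif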
a4650da2
JDB
1082static __always_inline
1083bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
1084{
1085 if (unlikely(!skb_unref(skb)))
1086 return false;
1087
071c0fc6
JB
1088 DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
1089 u32_get_bits(reason,
1090 SKB_DROP_REASON_SUBSYS_MASK) >=
1091 SKB_DROP_REASON_SUBSYS_NUM);
a4650da2
JDB
1092
1093 if (reason == SKB_CONSUMED)
dd1b5278 1094 trace_consume_skb(skb, __builtin_return_address(0));
a4650da2
JDB
1095 else
1096 trace_kfree_skb(skb, __builtin_return_address(0), reason);
1097 return true;
1098}
1099