// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "dev.h"
#include "sock_destructor.h"

struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
const char * const drop_reasons[] = {
	DEFINE_DROP_REASON(FN, FN)
};
EXPORT_SYMBOL(drop_reasons);

/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

struct napi_alloc_cache {
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);
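
/*
 * Example (illustrative sketch, not part of this file): drivers normally
 * reach the helpers above through the netdev_alloc_frag() / napi_alloc_frag()
 * wrappers declared in <linux/skbuff.h>. A hypothetical RX path might carve
 * a receive buffer out of the per-CPU page_frag cache like this, where
 * RX_BUF_LEN is a made-up placeholder:
 *
 *	void *buf = netdev_alloc_frag(RX_BUF_LEN +
 *			SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	// ... map buf for DMA and post it to the RX ring ...
 *	skb_free_frag(buf);	// undo path if the buffer is never used
 */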

static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_unpoison_object_data(skbuff_head_cache, skb);

	return skb;
}

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	unsigned int size = frag_size ? : ksize(data);

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc().
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes:
 * Before IO, the driver allocates only the data buffer where the NIC
 * puts the incoming frame. The driver should add room at the head
 * (NET_SKB_PAD) and MUST add room at the tail
 * (SKB_DATA_ALIGN(skb_shared_info)).
 * After IO, the driver calls build_skb() to allocate the sk_buff and
 * populate it before handing the packet to the stack.
 * RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head and skb->pfmemalloc.
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc().
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);
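
/*
 * Example (hedged sketch, not kernel code): the RX pattern described in the
 * __build_skb() kerneldoc above. rx_buf, buf_size and frame_len are made-up
 * placeholders for a driver's receive buffer, its total allocation size and
 * the received frame length:
 *
 *	// buf_size = SKB_DATA_ALIGN(NET_SKB_PAD + MAX_FRAME_LEN) +
 *	//	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	struct sk_buff *skb = build_skb(rx_buf, buf_size);
 *
 *	if (!skb)
 *		return NULL;		// rx_buf is NOT freed on failure
 *	skb_reserve(skb, NET_SKB_PAD);	// skip the headroom left for the NIC
 *	skb_put(skb, frame_len);	// account for the DMA'ed bytes
 */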

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);
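
/*
 * Example (sketch, reusing the rx_buf/buf_size/frame_len placeholders from
 * the build_skb() example above): napi_build_skb() is intended for NAPI poll
 * context, since the sk_buff head comes from the per-CPU NAPI cache:
 *
 *	struct sk_buff *skb = napi_build_skb(rx_buf, buf_size);
 *
 *	if (!skb)
 *		return;
 *	skb_reserve(skb, NET_SKB_PAD);
 *	skb_put(skb, frame_len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 */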

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free.
 */
static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation; when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	unsigned int osize;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	osize = ksize(data);
	size = SKB_WITH_OVERHEAD(osize);
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, osize);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
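
/*
 * Example (illustrative sketch only): the usual transmit-side pattern built
 * on top of __alloc_skb() via the alloc_skb() wrapper from <linux/skbuff.h>.
 * The returned buffer has no headroom, so callers reserve what they need
 * before filling in data. hdr_len, payload and payload_len are placeholders:
 *
 *	struct sk_buff *skb = alloc_skb(hdr_len + payload_len, GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hdr_len);			// room to push headers later
 *	skb_put_data(skb, payload, payload_len);	// copy payload into tailroom
 */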

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);
	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (nc->page.pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
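
/*
 * Example (sketch): usage from a NAPI poll handler through the
 * napi_alloc_skb() wrapper, which must run in softirq/NAPI context.
 * frame_len and rx_data are placeholders:
 *
 *	struct sk_buff *skb = napi_alloc_skb(napi, frame_len);
 *
 *	if (!skb)
 *		break;
 *	skb_put_data(skb, rx_data, frame_len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 */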

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);
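
/*
 * Example (illustrative): attaching a received page fragment to an skb.
 * page, offset, frag_len and rx_buf_size are placeholders; truesize should
 * reflect the full buffer backing the fragment, not just the bytes received:
 *
 *	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 *			frag_len, rx_buf_size);
 */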

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		kfree(head);
	}
}

static void skb_release_data(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling.
	 */
	skb->pp_recycle = 0;
}

/*
 * Free the skbuff's memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before the
		 * original skb. This test would have no chance to be true
		 * for the clone, while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb().
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);