drm/i915: Finally remove obj->mm.lock.
[linux-block.git] / net / core / skbuff.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */
30
31/*
32 * The functions in this file will not compile correctly with gcc 2.4.x
33 */
34
e005d193
JP
35#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
36
1da177e4
LT
37#include <linux/module.h>
38#include <linux/types.h>
39#include <linux/kernel.h>
1da177e4
LT
40#include <linux/mm.h>
41#include <linux/interrupt.h>
42#include <linux/in.h>
43#include <linux/inet.h>
44#include <linux/slab.h>
de960aa9
FW
45#include <linux/tcp.h>
46#include <linux/udp.h>
90017acc 47#include <linux/sctp.h>
1da177e4
LT
48#include <linux/netdevice.h>
49#ifdef CONFIG_NET_CLS_ACT
50#include <net/pkt_sched.h>
51#endif
52#include <linux/string.h>
53#include <linux/skbuff.h>
9c55e01c 54#include <linux/splice.h>
1da177e4
LT
55#include <linux/cache.h>
56#include <linux/rtnetlink.h>
57#include <linux/init.h>
716ea3a7 58#include <linux/scatterlist.h>
ac45f602 59#include <linux/errqueue.h>
268bb0ce 60#include <linux/prefetch.h>
0d5501c1 61#include <linux/if_vlan.h>
2a2ea508 62#include <linux/mpls.h>
1da177e4
LT
63
64#include <net/protocol.h>
65#include <net/dst.h>
66#include <net/sock.h>
67#include <net/checksum.h>
ed1f50c3 68#include <net/ip6_checksum.h>
1da177e4 69#include <net/xfrm.h>
8822e270 70#include <net/mpls.h>
3ee17bc7 71#include <net/mptcp.h>
1da177e4 72
7c0f6ba6 73#include <linux/uaccess.h>
ad8d75ff 74#include <trace/events/skb.h>
51c56b00 75#include <linux/highmem.h>
b245be1f
WB
76#include <linux/capability.h>
77#include <linux/user_namespace.h>
2544af03 78#include <linux/indirect_call_wrapper.h>
a1f8e7f7 79
7b7ed885
BVA
80#include "datagram.h"
81
08009a76
AD
82struct kmem_cache *skbuff_head_cache __ro_after_init;
83static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
df5042f4
FW
84#ifdef CONFIG_SKB_EXTENSIONS
85static struct kmem_cache *skbuff_ext_cache __ro_after_init;
86#endif
5f74f82e
HWR
87int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
88EXPORT_SYMBOL(sysctl_max_skb_frags);
1da177e4 89
1da177e4 90/**
f05de73b
JS
91 * skb_panic - private function for out-of-line support
92 * @skb: buffer
93 * @sz: size
94 * @addr: address
99d5851e 95 * @msg: skb_over_panic or skb_under_panic
1da177e4 96 *
f05de73b
JS
97 * Out-of-line support for skb_put() and skb_push().
98 * Called via the wrapper skb_over_panic() or skb_under_panic().
99 * Keep out of line to prevent kernel bloat.
100 * __builtin_return_address is not used because it is not always reliable.
1da177e4 101 */
f05de73b 102static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
99d5851e 103 const char msg[])
1da177e4 104{
41a46913 105 pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
99d5851e 106 msg, addr, skb->len, sz, skb->head, skb->data,
e005d193
JP
107 (unsigned long)skb->tail, (unsigned long)skb->end,
108 skb->dev ? skb->dev->name : "<NULL>");
1da177e4
LT
109 BUG();
110}
111
/* Wrapper around skb_panic() that reports this function's name as @msg. */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	/* __func__ must be expanded here so the log names this wrapper */
	skb_panic(skb, sz, addr, __func__);
}
116
f05de73b
JS
/* Wrapper around skb_panic() that reports this function's name as @msg. */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	/* __func__ must be expanded here so the log names this wrapper */
	skb_panic(skb, sz, addr, __func__);
}
c93bdd0e 121
50fad4b5 122#define NAPI_SKB_CACHE_SIZE 64
f450d539
AL
123#define NAPI_SKB_CACHE_BULK 16
124#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
50fad4b5
AL
125
126struct napi_alloc_cache {
127 struct page_frag_cache page;
128 unsigned int skb_count;
129 void *skb_cache[NAPI_SKB_CACHE_SIZE];
130};
131
132static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
133static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
134
135static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
136 unsigned int align_mask)
137{
138 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
139
140 return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
141}
142
143void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
144{
145 fragsz = SKB_DATA_ALIGN(fragsz);
146
147 return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
148}
149EXPORT_SYMBOL(__napi_alloc_frag_align);
150
151void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
152{
153 struct page_frag_cache *nc;
154 void *data;
155
156 fragsz = SKB_DATA_ALIGN(fragsz);
157 if (in_irq() || irqs_disabled()) {
158 nc = this_cpu_ptr(&netdev_alloc_cache);
159 data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
160 } else {
161 local_bh_disable();
162 data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
163 local_bh_enable();
164 }
165 return data;
166}
167EXPORT_SYMBOL(__netdev_alloc_frag_align);
168
f450d539
AL
169static struct sk_buff *napi_skb_cache_get(void)
170{
171 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
172 struct sk_buff *skb;
173
174 if (unlikely(!nc->skb_count))
175 nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
176 GFP_ATOMIC,
177 NAPI_SKB_CACHE_BULK,
178 nc->skb_cache);
179 if (unlikely(!nc->skb_count))
180 return NULL;
181
182 skb = nc->skb_cache[--nc->skb_count];
183 kasan_unpoison_object_data(skbuff_head_cache, skb);
184
185 return skb;
186}
187
ba0509b6 188/* Caller must provide SKB that is memset cleared */
483126b3
AL
189static void __build_skb_around(struct sk_buff *skb, void *data,
190 unsigned int frag_size)
ba0509b6
JDB
191{
192 struct skb_shared_info *shinfo;
193 unsigned int size = frag_size ? : ksize(data);
194
195 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
196
197 /* Assumes caller memset cleared SKB */
198 skb->truesize = SKB_TRUESIZE(size);
199 refcount_set(&skb->users, 1);
200 skb->head = data;
201 skb->data = data;
202 skb_reset_tail_pointer(skb);
203 skb->end = skb->tail + size;
204 skb->mac_header = (typeof(skb->mac_header))~0U;
205 skb->transport_header = (typeof(skb->transport_header))~0U;
206
207 /* make sure we initialize shinfo sequentially */
208 shinfo = skb_shinfo(skb);
209 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
210 atomic_set(&shinfo->dataref, 1);
211
6370cc3b 212 skb_set_kcov_handle(skb, kcov_common_handle());
ba0509b6
JDB
213}
214
b2b5ce9d 215/**
2ea2f62c 216 * __build_skb - build a network buffer
b2b5ce9d 217 * @data: data buffer provided by caller
2ea2f62c 218 * @frag_size: size of data, or 0 if head was kmalloced
b2b5ce9d
ED
219 *
220 * Allocate a new &sk_buff. Caller provides space holding head and
deceb4c0 221 * skb_shared_info. @data must have been allocated by kmalloc() only if
2ea2f62c
ED
222 * @frag_size is 0, otherwise data should come from the page allocator
223 * or vmalloc()
b2b5ce9d
ED
224 * The return is the new skb buffer.
225 * On a failure the return is %NULL, and @data is not freed.
226 * Notes :
227 * Before IO, driver allocates only data buffer where NIC put incoming frame
228 * Driver should add room at head (NET_SKB_PAD) and
229 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
230 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
231 * before giving packet to stack.
232 * RX rings only contains data buffers, not full skbs.
233 */
2ea2f62c 234struct sk_buff *__build_skb(void *data, unsigned int frag_size)
b2b5ce9d 235{
b2b5ce9d 236 struct sk_buff *skb;
b2b5ce9d
ED
237
238 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
ba0509b6 239 if (unlikely(!skb))
b2b5ce9d
ED
240 return NULL;
241
b2b5ce9d 242 memset(skb, 0, offsetof(struct sk_buff, tail));
483126b3 243 __build_skb_around(skb, data, frag_size);
b2b5ce9d 244
483126b3 245 return skb;
b2b5ce9d 246}
2ea2f62c
ED
247
248/* build_skb() is wrapper over __build_skb(), that specifically
249 * takes care of skb->head and skb->pfmemalloc
250 * This means that if @frag_size is not zero, then @data must be backed
251 * by a page fragment, not kmalloc() or vmalloc()
252 */
253struct sk_buff *build_skb(void *data, unsigned int frag_size)
254{
255 struct sk_buff *skb = __build_skb(data, frag_size);
256
257 if (skb && frag_size) {
258 skb->head_frag = 1;
2f064f34 259 if (page_is_pfmemalloc(virt_to_head_page(data)))
2ea2f62c
ED
260 skb->pfmemalloc = 1;
261 }
262 return skb;
263}
b2b5ce9d
ED
264EXPORT_SYMBOL(build_skb);
265
ba0509b6
JDB
266/**
267 * build_skb_around - build a network buffer around provided skb
268 * @skb: sk_buff provide by caller, must be memset cleared
269 * @data: data buffer provided by caller
270 * @frag_size: size of data, or 0 if head was kmalloced
271 */
272struct sk_buff *build_skb_around(struct sk_buff *skb,
273 void *data, unsigned int frag_size)
274{
275 if (unlikely(!skb))
276 return NULL;
277
483126b3 278 __build_skb_around(skb, data, frag_size);
ba0509b6 279
483126b3 280 if (frag_size) {
ba0509b6
JDB
281 skb->head_frag = 1;
282 if (page_is_pfmemalloc(virt_to_head_page(data)))
283 skb->pfmemalloc = 1;
284 }
285 return skb;
286}
287EXPORT_SYMBOL(build_skb_around);
288
f450d539
AL
289/**
290 * __napi_build_skb - build a network buffer
291 * @data: data buffer provided by caller
292 * @frag_size: size of data, or 0 if head was kmalloced
293 *
294 * Version of __build_skb() that uses NAPI percpu caches to obtain
295 * skbuff_head instead of inplace allocation.
296 *
297 * Returns a new &sk_buff on success, %NULL on allocation failure.
298 */
299static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
300{
301 struct sk_buff *skb;
302
303 skb = napi_skb_cache_get();
304 if (unlikely(!skb))
305 return NULL;
306
307 memset(skb, 0, offsetof(struct sk_buff, tail));
308 __build_skb_around(skb, data, frag_size);
309
310 return skb;
311}
312
313/**
314 * napi_build_skb - build a network buffer
315 * @data: data buffer provided by caller
316 * @frag_size: size of data, or 0 if head was kmalloced
317 *
318 * Version of __napi_build_skb() that takes care of skb->head_frag
319 * and skb->pfmemalloc when the data is a page or page fragment.
320 *
321 * Returns a new &sk_buff on success, %NULL on allocation failure.
322 */
323struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
324{
325 struct sk_buff *skb = __napi_build_skb(data, frag_size);
326
327 if (likely(skb) && frag_size) {
328 skb->head_frag = 1;
329 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
330 }
331
332 return skb;
333}
334EXPORT_SYMBOL(napi_build_skb);
335
5381b23d
AL
336/*
337 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
338 * the caller if emergency pfmemalloc reserves are being used. If it is and
339 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
340 * may be used. Otherwise, the packet data may be discarded until enough
341 * memory is free
342 */
ef28095f
AL
343static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
344 bool *pfmemalloc)
5381b23d
AL
345{
346 void *obj;
347 bool ret_pfmemalloc = false;
348
349 /*
350 * Try a regular allocation, when that fails and we're not entitled
351 * to the reserves, fail.
352 */
353 obj = kmalloc_node_track_caller(size,
354 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
355 node);
356 if (obj || !(gfp_pfmemalloc_allowed(flags)))
357 goto out;
358
359 /* Try again but now we are using pfmemalloc reserves */
360 ret_pfmemalloc = true;
361 obj = kmalloc_node_track_caller(size, flags, node);
362
363out:
364 if (pfmemalloc)
365 *pfmemalloc = ret_pfmemalloc;
366
367 return obj;
368}
369
370/* Allocate a new skbuff. We do this ourselves so we can fill in a few
371 * 'private' fields and also do memory statistics to find all the
372 * [BEEP] leaks.
373 *
374 */
375
376/**
377 * __alloc_skb - allocate a network buffer
378 * @size: size to allocate
379 * @gfp_mask: allocation mask
380 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
381 * instead of head cache and allocate a cloned (child) skb.
382 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
383 * allocations in case the data is required for writeback
384 * @node: numa node to allocate memory on
385 *
386 * Allocate a new &sk_buff. The returned buffer has no headroom and a
387 * tail room of at least size bytes. The object has a reference count
388 * of one. The return is the buffer. On a failure the return is %NULL.
389 *
390 * Buffers may only be allocated from interrupts using a @gfp_mask of
391 * %GFP_ATOMIC.
392 */
393struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
394 int flags, int node)
395{
396 struct kmem_cache *cache;
5381b23d
AL
397 struct sk_buff *skb;
398 u8 *data;
399 bool pfmemalloc;
400
401 cache = (flags & SKB_ALLOC_FCLONE)
402 ? skbuff_fclone_cache : skbuff_head_cache;
403
404 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
405 gfp_mask |= __GFP_MEMALLOC;
406
407 /* Get the HEAD */
d13612b5
AL
408 if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
409 likely(node == NUMA_NO_NODE || node == numa_mem_id()))
410 skb = napi_skb_cache_get();
411 else
412 skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
df1ae022
AL
413 if (unlikely(!skb))
414 return NULL;
5381b23d
AL
415 prefetchw(skb);
416
417 /* We do our best to align skb_shared_info on a separate cache
418 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
419 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
420 * Both skb->head and skb_shared_info are cache line aligned.
421 */
422 size = SKB_DATA_ALIGN(size);
423 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
424 data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
df1ae022 425 if (unlikely(!data))
5381b23d
AL
426 goto nodata;
427 /* kmalloc(size) might give us more room than requested.
428 * Put skb_shared_info exactly at the end of allocated zone,
429 * to allow max possible filling before reallocation.
430 */
431 size = SKB_WITH_OVERHEAD(ksize(data));
432 prefetchw(data + size);
433
434 /*
435 * Only clear those fields we need to clear, not those that we will
436 * actually initialise below. Hence, don't put any more fields after
437 * the tail pointer in struct sk_buff!
438 */
439 memset(skb, 0, offsetof(struct sk_buff, tail));
f9d6725b 440 __build_skb_around(skb, data, 0);
5381b23d 441 skb->pfmemalloc = pfmemalloc;
5381b23d
AL
442
443 if (flags & SKB_ALLOC_FCLONE) {
444 struct sk_buff_fclones *fclones;
445
446 fclones = container_of(skb, struct sk_buff_fclones, skb1);
447
448 skb->fclone = SKB_FCLONE_ORIG;
449 refcount_set(&fclones->fclone_ref, 1);
450
451 fclones->skb2.fclone = SKB_FCLONE_CLONE;
452 }
453
5381b23d 454 return skb;
df1ae022 455
5381b23d
AL
456nodata:
457 kmem_cache_free(cache, skb);
df1ae022 458 return NULL;
5381b23d
AL
459}
460EXPORT_SYMBOL(__alloc_skb);
461
fd11a83d
AD
462/**
463 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
464 * @dev: network device to receive on
d7499160 465 * @len: length to allocate
fd11a83d
AD
466 * @gfp_mask: get_free_pages mask, passed to alloc_skb
467 *
468 * Allocate a new &sk_buff and assign it a usage count of one. The
469 * buffer has NET_SKB_PAD headroom built in. Users should allocate
470 * the headroom they think they need without accounting for the
471 * built in space. The built in space is used for optimisations.
472 *
473 * %NULL is returned if there is no free memory.
474 */
9451980a
AD
475struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
476 gfp_t gfp_mask)
fd11a83d 477{
b63ae8ca 478 struct page_frag_cache *nc;
fd11a83d 479 struct sk_buff *skb;
9451980a
AD
480 bool pfmemalloc;
481 void *data;
482
483 len += NET_SKB_PAD;
fd11a83d 484
66c55602
AL
485 /* If requested length is either too small or too big,
486 * we use kmalloc() for skb->head allocation.
487 */
488 if (len <= SKB_WITH_OVERHEAD(1024) ||
489 len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
d0164adc 490 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
a080e7bd
AD
491 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
492 if (!skb)
493 goto skb_fail;
494 goto skb_success;
495 }
fd11a83d 496
9451980a
AD
497 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
498 len = SKB_DATA_ALIGN(len);
499
500 if (sk_memalloc_socks())
501 gfp_mask |= __GFP_MEMALLOC;
502
92dcabd7
SAS
503 if (in_irq() || irqs_disabled()) {
504 nc = this_cpu_ptr(&netdev_alloc_cache);
505 data = page_frag_alloc(nc, len, gfp_mask);
506 pfmemalloc = nc->pfmemalloc;
507 } else {
508 local_bh_disable();
509 nc = this_cpu_ptr(&napi_alloc_cache.page);
510 data = page_frag_alloc(nc, len, gfp_mask);
511 pfmemalloc = nc->pfmemalloc;
512 local_bh_enable();
513 }
9451980a
AD
514
515 if (unlikely(!data))
516 return NULL;
517
518 skb = __build_skb(data, len);
519 if (unlikely(!skb)) {
181edb2b 520 skb_free_frag(data);
9451980a 521 return NULL;
7b2e497a 522 }
fd11a83d 523
9451980a
AD
524 if (pfmemalloc)
525 skb->pfmemalloc = 1;
526 skb->head_frag = 1;
527
a080e7bd 528skb_success:
9451980a
AD
529 skb_reserve(skb, NET_SKB_PAD);
530 skb->dev = dev;
531
a080e7bd 532skb_fail:
8af27456
CH
533 return skb;
534}
b4ac530f 535EXPORT_SYMBOL(__netdev_alloc_skb);
1da177e4 536
fd11a83d
AD
537/**
538 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
539 * @napi: napi instance this buffer was allocated for
d7499160 540 * @len: length to allocate
fd11a83d
AD
541 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
542 *
543 * Allocate a new sk_buff for use in NAPI receive. This buffer will
544 * attempt to allocate the head from a special reserved region used
545 * only for NAPI Rx allocation. By doing this we can save several
546 * CPU cycles by avoiding having to disable and re-enable IRQs.
547 *
548 * %NULL is returned if there is no free memory.
549 */
9451980a
AD
550struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
551 gfp_t gfp_mask)
fd11a83d 552{
3226b158 553 struct napi_alloc_cache *nc;
fd11a83d 554 struct sk_buff *skb;
9451980a
AD
555 void *data;
556
557 len += NET_SKB_PAD + NET_IP_ALIGN;
fd11a83d 558
3226b158
ED
559 /* If requested length is either too small or too big,
560 * we use kmalloc() for skb->head allocation.
561 */
562 if (len <= SKB_WITH_OVERHEAD(1024) ||
563 len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
d0164adc 564 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
cfb8ec65
AL
565 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
566 NUMA_NO_NODE);
a080e7bd
AD
567 if (!skb)
568 goto skb_fail;
569 goto skb_success;
570 }
9451980a 571
3226b158 572 nc = this_cpu_ptr(&napi_alloc_cache);
9451980a
AD
573 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
574 len = SKB_DATA_ALIGN(len);
575
576 if (sk_memalloc_socks())
577 gfp_mask |= __GFP_MEMALLOC;
fd11a83d 578
8c2dd3e4 579 data = page_frag_alloc(&nc->page, len, gfp_mask);
9451980a
AD
580 if (unlikely(!data))
581 return NULL;
582
cfb8ec65 583 skb = __napi_build_skb(data, len);
9451980a 584 if (unlikely(!skb)) {
181edb2b 585 skb_free_frag(data);
9451980a 586 return NULL;
fd11a83d
AD
587 }
588
795bb1c0 589 if (nc->page.pfmemalloc)
9451980a
AD
590 skb->pfmemalloc = 1;
591 skb->head_frag = 1;
592
a080e7bd 593skb_success:
9451980a
AD
594 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
595 skb->dev = napi->dev;
596
a080e7bd 597skb_fail:
fd11a83d
AD
598 return skb;
599}
600EXPORT_SYMBOL(__napi_alloc_skb);
601
654bed16 602void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
50269e19 603 int size, unsigned int truesize)
654bed16
PZ
604{
605 skb_fill_page_desc(skb, i, page, off, size);
606 skb->len += size;
607 skb->data_len += size;
50269e19 608 skb->truesize += truesize;
654bed16
PZ
609}
610EXPORT_SYMBOL(skb_add_rx_frag);
611
f8e617e1
JW
612void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
613 unsigned int truesize)
614{
615 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
616
617 skb_frag_size_add(frag, size);
618 skb->len += size;
619 skb->data_len += size;
620 skb->truesize += truesize;
621}
622EXPORT_SYMBOL(skb_coalesce_rx_frag);
623
/* Pass the list at *@listp to kfree_skb_list(), then set *@listp to NULL. */
static void skb_drop_list(struct sk_buff **listp)
{
	struct sk_buff *head = *listp;

	kfree_skb_list(head);
	*listp = NULL;
}
629
27b437c8
HX
630static inline void skb_drop_fraglist(struct sk_buff *skb)
631{
632 skb_drop_list(&skb_shinfo(skb)->frag_list);
633}
634
1da177e4
LT
635static void skb_clone_fraglist(struct sk_buff *skb)
636{
637 struct sk_buff *list;
638
fbb398a8 639 skb_walk_frags(skb, list)
1da177e4
LT
640 skb_get(list);
641}
642
d3836f21
ED
643static void skb_free_head(struct sk_buff *skb)
644{
181edb2b
AD
645 unsigned char *head = skb->head;
646
d3836f21 647 if (skb->head_frag)
181edb2b 648 skb_free_frag(head);
d3836f21 649 else
181edb2b 650 kfree(head);
d3836f21
ED
651}
652
5bba1712 653static void skb_release_data(struct sk_buff *skb)
1da177e4 654{
ff04a771
ED
655 struct skb_shared_info *shinfo = skb_shinfo(skb);
656 int i;
1da177e4 657
ff04a771
ED
658 if (skb->cloned &&
659 atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
660 &shinfo->dataref))
661 return;
a6686f2f 662
70c43167
JL
663 skb_zcopy_clear(skb, true);
664
ff04a771
ED
665 for (i = 0; i < shinfo->nr_frags; i++)
666 __skb_frag_unref(&shinfo->frags[i]);
a6686f2f 667
ff04a771
ED
668 if (shinfo->frag_list)
669 kfree_skb_list(shinfo->frag_list);
670
671 skb_free_head(skb);
1da177e4
LT
672}
673
674/*
675 * Free an skbuff by memory without cleaning the state.
676 */
2d4baff8 677static void kfree_skbmem(struct sk_buff *skb)
1da177e4 678{
d0bf4a9e 679 struct sk_buff_fclones *fclones;
d179cd12 680
d179cd12
DM
681 switch (skb->fclone) {
682 case SKB_FCLONE_UNAVAILABLE:
683 kmem_cache_free(skbuff_head_cache, skb);
6ffe75eb 684 return;
d179cd12
DM
685
686 case SKB_FCLONE_ORIG:
d0bf4a9e 687 fclones = container_of(skb, struct sk_buff_fclones, skb1);
d179cd12 688
6ffe75eb
ED
689 /* We usually free the clone (TX completion) before original skb
690 * This test would have no chance to be true for the clone,
691 * while here, branch prediction will be good.
d179cd12 692 */
2638595a 693 if (refcount_read(&fclones->fclone_ref) == 1)
6ffe75eb
ED
694 goto fastpath;
695 break;
e7820e39 696
6ffe75eb
ED
697 default: /* SKB_FCLONE_CLONE */
698 fclones = container_of(skb, struct sk_buff_fclones, skb2);
d179cd12 699 break;
3ff50b79 700 }
2638595a 701 if (!refcount_dec_and_test(&fclones->fclone_ref))
6ffe75eb
ED
702 return;
703fastpath:
704 kmem_cache_free(skbuff_fclone_cache, fclones);
1da177e4
LT
705}
706
0a463c78 707void skb_release_head_state(struct sk_buff *skb)
1da177e4 708{
adf30907 709 skb_dst_drop(skb);
9c2b3328
SH
710 if (skb->destructor) {
711 WARN_ON(in_irq());
1da177e4
LT
712 skb->destructor(skb);
713 }
a3bf7ae9 714#if IS_ENABLED(CONFIG_NF_CONNTRACK)
cb9c6836 715 nf_conntrack_put(skb_nfct(skb));
1da177e4 716#endif
df5042f4 717 skb_ext_put(skb);
04a4bb55
LB
718}
719
720/* Free everything but the sk_buff shell. */
721static void skb_release_all(struct sk_buff *skb)
722{
723 skb_release_head_state(skb);
a28b1b90
FW
724 if (likely(skb->head))
725 skb_release_data(skb);
2d4baff8
HX
726}
727
/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */
void __kfree_skb(struct sk_buff *skb)
{
	/* First drop attached state and data, then free the shell itself */
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
1da177e4 743