net: Use backlog-NAPI to clean up the defer_list.
linux-2.6-block.git: net/core/skbuff.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define SKB_SMALL_HEAD_SIZE	SKB_HEAD_ALIGN(MAX_TCP_HEADER)

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE				\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM					\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));

static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
	      sizeof_field(skb_frag_t, len));

static_assert(offsetof(struct bio_vec, bv_offset) ==
	      offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
	      sizeof_field(skb_frag_t, offset));

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - register another drop reason subsystem
 * @subsys: the subsystem to register, must not be the core
 * @list: the list of drop reasons within the subsystem, must point to
 *	a statically initialized list
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
				  const struct drop_reason_list *list)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	/* must point to statically allocated memory, so INIT is OK */
	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 * @subsys: the subsystem to remove, must not be the core
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
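	/* Take one extra reference for each additional 1K fragment this page
	 * will serve; together with the allocation reference, every fragment
	 * then holds exactly one page reference.
	 */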
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid later preprocessor's conditionals
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
				       align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
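	/* The netdev cache may be touched from hard IRQ or IRQ-disabled
	 * context; anything else uses the NAPI cache under BH protection.
	 */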
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
					       align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
					       align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

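	/* Cache empty: refill it with a bulk slab allocation to amortize
	 * the allocation cost over NAPI_SKB_CACHE_BULK skbs.
	 */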
	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));

	return skb;
}

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

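	/* struct skb_shared_info lives at the very end of the buffer and is
	 * not part of the usable data area.
	 */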
	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
				     unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(skb, data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

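	/* Page-fragment backed head: flag it and inherit pfmemalloc from
	 * the underlying page.
	 */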
	if (likely(skb && frag_size)) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	size_t obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
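	/* Heads small enough for the dedicated small-head cache avoid the
	 * generic kmalloc slabs entirely, unless special GFP bits were
	 * requested.
	 */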
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
				node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
		goto out;
	}

	obj_size = kmalloc_size_roundup(obj_size);
	/* The following cast might truncate high-order bits of obj_size, this
	 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
	 */
	*size = (unsigned int)obj_size;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
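	/* NAPI (non-fclone) requests for the local node can be served from
	 * the per-CPU skb cache.
	 */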
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

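	/* Same cache selection rule as the frag allocators above: hard IRQ
	 * or IRQ-disabled context uses the netdev cache, everything else
	 * the NAPI cache under BH protection.
	 */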
	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnels support, and 32 bits arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len = SKB_HEAD_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
			    int off, int size, unsigned int truesize)
{
	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_fill_netmem_desc(skb, i, netmem, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static bool is_pp_page(struct page *page)
{
	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
		    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	u32 size, truesize, len, max_head_size, off;
	struct sk_buff *skb = *pskb, *nskb;
	int err, i, head_off;
	void *data;

	/* XDP does not support fraglist so we need to linearize
	 * the skb.
	 */
	if (skb_has_frag_list(skb))
		return -EOPNOTSUPP;

	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
		return -ENOMEM;

	size = min_t(u32, skb->len, max_head_size);
	truesize = SKB_HEAD_ALIGN(size) + headroom;
	data = page_pool_dev_alloc_va(pool, &truesize);
	if (!data)
		return -ENOMEM;

	nskb = napi_build_skb(data, truesize);
	if (!nskb) {
		page_pool_free_va(pool, data, true);
		return -ENOMEM;
	}

	skb_reserve(nskb, headroom);
	skb_copy_header(nskb, skb);
	skb_mark_for_recycle(nskb);

	err = skb_copy_bits(skb, 0, nskb->data, size);
	if (err) {
		consume_skb(nskb);
		return err;
	}
	skb_put(nskb, size);

	head_off = skb_headroom(nskb) - skb_headroom(skb);
	skb_headers_offset_update(nskb, head_off);

	off = size;
	len = skb->len - off;
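	/* Copy whatever did not fit in the new linear area into freshly
	 * allocated page pool fragments.
	 */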
	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
		struct page *page;
		u32 page_off;

		size = min_t(u32, len, PAGE_SIZE);
		truesize = size;

		page = page_pool_dev_alloc(pool, &page_off, &truesize);
		if (!page) {
			consume_skb(nskb);
			return -ENOMEM;
		}

		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
		err = skb_copy_bits(skb, off, page_address(page) + page_off,
				    size);
		if (err) {
			consume_skb(nskb);
			return err;
		}

		len -= size;
		off += size;
	}

	consume_skb(skb);
	*pskb = nskb;

	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
			 struct bpf_prog *prog)
{
	if (!prog->aux->xdp_has_frags)
		return -EINVAL;

	return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(struct page *page, bool napi_safe)
{
	bool allow_direct = false;
	struct page_pool *pp;

	page = compound_head(page);

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
	 * in order to preserve any existing bits, such as bit 0 for the
	 * head page of compound page and bit 1 for pfmemalloc page, so
	 * mask those bits for freeing side when doing below checking,
	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
	 * to avoid recycling the pfmemalloc page.
	 */
	if (unlikely(!is_pp_page(page)))
		return false;

	pp = page->pp;

	/* Allow direct recycle if we have reasons to believe that we are
	 * in the same context as the consumer would run, so there's
	 * no possible race.
	 * __page_pool_put_page() makes sure we're not in hardirq context
	 * and interrupts are enabled prior to accessing the cache.
	 */
	if (napi_safe || in_softirq()) {
		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
		unsigned int cpuid = smp_processor_id();

		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
	}

	/* Driver set this to memory recycling info. Reset it on recycle.
	 * This will *not* work for NIC using a split-page memory model.
	 * The page will be returned to the pool here regardless of the
	 * 'flipped' fragment being in use or not.
	 */
	page_pool_put_full_page(pp, page, allow_direct);

	return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return napi_pp_put_page(virt_to_page(data), napi_safe);
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb: page pool aware skb
 *
 * Increase the fragment reference count (pp_ref_count) of a skb. This is
 * intended to gain fragment references only for page pool aware skbs,
 * i.e. when skb->pp_recycle is true, and not for fragments in a
 * non-pp-recycling skb. It has a fallback to increase references on normal
 * pages, as page pool aware skbs may also have normal page fragments.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;
	struct page *head_page;
	int i;

	if (!skb->pp_recycle)
		return -EINVAL;

	shinfo = skb_shinfo(skb);

	for (i = 0; i < shinfo->nr_frags; i++) {
		head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
		if (likely(is_pp_page(head_page)))
			page_pool_ref_page(head_page);
		else
			page_ref_inc(head_page);
	}
	return 0;
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(net_hotdata.skb_small_head_cache, head);
	else
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb, bool napi_safe)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head, napi_safe))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
			     bool napi_safe)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (!skb_data_unref(skb, shinfo))
		goto exit;

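	/* Zerocopy completion: if the frag references are managed by the
	 * zerocopy owner (SKBFL_MANAGED_FRAG_REFS), do not drop them here.
	 */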
	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb, napi_safe);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling
	 */
	skb->pp_recycle = 0;
}

/*
 * Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

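	/* Plain skbs go straight back to the skbuff cache; fclone orig/clone
	 * pairs are freed only once fclone_ref drops to zero.
	 */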
	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
			    bool napi_safe)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason, napi_safe);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

static __always_inline
bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

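	/* Warn on obviously bogus reasons: not dropped yet, or a reason from
	 * an unknown subsystem.
	 */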
	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
			       u32_get_bits(reason,
					    SKB_DROP_REASON_SUBSYS_MASK) >=
			       SKB_DROP_REASON_SUBSYS_NUM);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason);
	return true;
}
