// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
#include <linux/vmalloc.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

static DEFINE_IDA(umem_ida);

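/* Track an XDP socket on the umem's Tx list. Only sockets with a Tx ring
 * are added; updates are serialized by xsk_tx_list_lock, and entries are
 * published with the RCU list helpers so readers can walk the list locklessly.
 */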
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (!xs->tx)
		return;

	spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_tx_list);
	spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (!xs->tx)
		return;

	spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
	list_del_rcu(&xs->list);
	spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
			       u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = umem;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = umem;

	return 0;
}

struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
				       u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].umem;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].umem;

	return NULL;
}
EXPORT_SYMBOL(xdp_get_umem_from_qid);

static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = NULL;
}

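/* Bind the umem to a specific device queue; the caller must hold the rtnl
 * lock. Zero-copy is requested from the driver via ndo_bpf unless XDP_COPY
 * was set. If the driver lacks support and XDP_ZEROCOPY was not forced, we
 * silently fall back to copy mode.
 */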
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	ASSERT_RTNL();

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	if (xdp_get_umem_from_qid(dev, queue_id))
		return -EBUSY;

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		return err;

	umem->dev = dev;
	umem->queue_id = queue_id;

	if (flags & XDP_USE_NEED_WAKEUP) {
		umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
		/* Tx needs to be explicitly woken up the first time.
		 * Also for supporting drivers that do not implement this
		 * feature. They will always have to call sendto().
		 */
		xsk_set_tx_need_wakeup(umem);
	}

	dev_hold(dev);

	if (force_copy)
		/* For copy-mode, we are done. */
		return 0;

	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;

	umem->zc = true;
	return 0;

err_unreg_umem:
	if (!force_zc)
		err = 0; /* fallback to copy mode */
	if (err)
		xdp_clear_umem_at_qid(dev, queue_id);
	return err;
}

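/* Undo xdp_umem_assign_dev(): tear down the driver's zero-copy state via
 * ndo_bpf if it was enabled, clear the queue's umem pointers and drop the
 * device reference. The caller must hold the rtnl lock.
 */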
void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	ASSERT_RTNL();

	if (!umem->dev)
		return;

	if (umem->zc) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);

		if (err)
			WARN(1, "failed to disable umem!\n");
	}

	xdp_clear_umem_at_qid(umem->dev, umem->queue_id);

	dev_put(umem->dev);
	umem->dev = NULL;
	umem->zc = false;
}

static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);

	kfree(umem->pgs);
	umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

static void xdp_umem_release(struct xdp_umem *umem)
{
	rtnl_lock();
	xdp_umem_clear_dev(umem);
	rtnl_unlock();

	ida_simple_remove(&umem_ida, umem->id);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xp_destroy(umem->pool);
	xdp_umem_unpin_pages(umem);

	xdp_umem_unaccount_pages(umem);
	kfree(umem);
}

static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

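/* Drop a reference to the umem. The final teardown is deferred to a
 * workqueue: xdp_umem_release() takes the rtnl lock and unpins pages,
 * so it cannot run from contexts that are not allowed to sleep.
 */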
void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}

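/* Pin the user memory area with FOLL_LONGTERM so the pages backing the umem
 * stay in place for as long as packet buffers reference them. On a partial
 * pin, the pages that were pinned are released again and an error is returned.
 */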
static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	mmap_read_lock(current->mm);
	npgs = pin_user_pages(address, umem->npgs,
			      gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
	mmap_read_unlock(current->mm);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}

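/* Charge the pinned pages against the owning user's RLIMIT_MEMLOCK, unless
 * the task has CAP_IPC_LOCK. The cmpxchg loop updates user->locked_vm
 * atomically, so no additional locking is needed.
 */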
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

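/* Validate a registration request from user space and set up the umem:
 * chunk size, alignment and headroom checks, memlock accounting, page
 * pinning and buffer pool creation.
 */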
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	u64 npgs, addr = mr->addr, size = mr->len;
	unsigned int chunks, chunks_per_page;
	int err;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
			XDP_UMEM_USES_NEED_WAKEUP))
		return -EINVAL;

	if (!unaligned_chunks && !is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this restriction might be lifted later.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	npgs = size >> PAGE_SHIFT;
	if (npgs > U32_MAX)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	if (!unaligned_chunks) {
		chunks_per_page = PAGE_SIZE / chunk_size;
		if (chunks < chunks_per_page || chunks % chunks_per_page)
			return -EINVAL;
	}

	if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
		return -EINVAL;

	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size = chunk_size;
	umem->npgs = (u32)npgs;
	umem->pgs = NULL;
	umem->user = NULL;
	umem->flags = mr->flags;
	INIT_LIST_HEAD(&umem->xsk_tx_list);
	spin_lock_init(&umem->xsk_tx_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		return err;

	err = xdp_umem_pin_pages(umem, (unsigned long)addr);
	if (err)
		goto out_account;

	umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
			       headroom, size, unaligned_chunks);
	if (!umem->pool) {
		err = -ENOMEM;
		goto out_pin;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_account:
	xdp_umem_unaccount_pages(umem);
	return err;
}

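/* Allocate a umem, assign it an id from the global IDA and register the
 * user memory described by @mr. Returns the new umem or an ERR_PTR().
 */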
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL);
	if (err < 0) {
		kfree(umem);
		return ERR_PTR(err);
	}
	umem->id = err;

	err = xdp_umem_reg(umem, mr);
	if (err) {
		ida_simple_remove(&umem_ida, umem->id);
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}

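/* A umem is only usable once both its fill and completion rings exist. */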
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}