[linux-2.6-block.git] / drivers / vhost / net.c

/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>

#include <net/sock.h>

#include "vhost.h"

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others.
 * 0x80000 is 512 KiB; handle_tx() and handle_rx() compare their running
 * byte total against it after every packet. */
#define VHOST_NET_WEIGHT 0x80000

/* Virtqueue indices. They index both vqs[] and poll[] in struct vhost_net;
 * VHOST_NET_VQ_MAX is the number of virtqueues and the size of those
 * arrays. */
enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

/* State of TX socket polling, stored in vhost_net.tx_poll_state. */
enum vhost_net_poll_state {
	/* Set at open time and whenever the TX vq is disabled. */
	VHOST_NET_POLL_DISABLED = 0,
	/* Set by tx_poll_start() after vhost_poll_start() on the socket. */
	VHOST_NET_POLL_STARTED = 1,
	/* Set by tx_poll_stop(), and by vhost_net_enable_vq() just before
	 * it starts polling; the only state tx_poll_start() acts on. */
	VHOST_NET_POLL_STOPPED = 2,
};

/* Per-open-file state of the vhost-net device; allocated in
 * vhost_net_open() and freed in vhost_net_release(). */
struct vhost_net {
	/* Generic vhost device; handlers recover the vhost_net from it
	 * with container_of(). */
	struct vhost_dev dev;
	/* RX and TX virtqueues, indexed by VHOST_NET_VQ_*; handed to
	 * vhost_dev_init(). */
	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	/* Socket pollers, indexed by VHOST_NET_VQ_*: POLLIN for RX,
	 * POLLOUT for TX. */
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Tells us whether we are polling a socket for TX.
	 * We only do this when socket buffer fills up.
	 * Protected by tx vq lock. */
	enum vhost_net_poll_state tx_poll_state;
};

/* Split the leading len bytes off the iovec array 'from'.
 *
 * Each segment visited is described in the matching slot of 'to' (same
 * base, length limited to what is still wanted), and that many bytes are
 * trimmed from the front of the 'from' segment.  Fully consumed 'from'
 * segments stay in place with iov_len == 0.  At most iov_count segments
 * are examined.
 *
 * Returns the number of segments written to 'to'. */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
			  size_t len, int iov_count)
{
	int used;
	size_t chunk;

	for (used = 0; len && used < iov_count; ++used) {
		chunk = min(from[used].iov_len, len);

		to[used].iov_base = from[used].iov_base;
		to[used].iov_len = chunk;

		from[used].iov_base += chunk;
		from[used].iov_len -= chunk;

		len -= chunk;
	}
	return used;
}

/* Stop polling the TX socket if polling is currently running, and record
 * the STOPPED state.  Does nothing in any other state.
 * Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
{
	if (unlikely(net->tx_poll_state == VHOST_NET_POLL_STARTED)) {
		vhost_poll_stop(&net->poll[VHOST_NET_VQ_TX]);
		net->tx_poll_state = VHOST_NET_POLL_STOPPED;
	}
}

/* Start polling sock's file for TX if polling is currently stopped, and
 * record the STARTED state.  Does nothing when polling is disabled or
 * already running.
 * Caller must have TX VQ lock */
static void tx_poll_start(struct vhost_net *net, struct socket *sock)
{
	if (likely(net->tx_poll_state == VHOST_NET_POLL_STOPPED)) {
		vhost_poll_start(&net->poll[VHOST_NET_VQ_TX], sock->file);
		net->tx_poll_state = VHOST_NET_POLL_STARTED;
	}
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU.
 *
 * Drain the TX virtqueue: take guest buffers off the ring, strip the
 * virtio-net header and pass the payload to the backend socket with
 * sendmsg().  Stops when the ring is empty, on a malformed descriptor,
 * on a send error, or after VHOST_NET_WEIGHT bytes (the job is then
 * requeued so the other virtqueue is not starved). */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
	unsigned head, out, in, s;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err, wmem;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);

	/* No backend attached: nothing to transmit to. */
	if (!sock)
		return;

	/* Send buffer already full: make sure we are polling the socket
	 * so we get called again once it has room, and give up for now. */
	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	if (wmem >= sock->sk->sk_sndbuf) {
		mutex_lock(&vq->mutex);
		tx_poll_start(net, sock);
		mutex_unlock(&vq->mutex);
		return;
	}

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);

	/* Only stop polling once the socket has drained below half of its
	 * send buffer.  Testing against twice the buffer size could never
	 * fail after the early return above, which stopped polling on
	 * every pass and left no hysteresis. */
	if (wmem < sock->sk->sk_sndbuf / 2)
		tx_poll_stop(net);
	hdr_size = vq->hdr_size;

	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 NULL, NULL);
		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			/* Socket is getting full: rely on socket polling
			 * rather than on guest notifications. */
			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
				tx_poll_start(net, sock);
				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
				break;
			}
			/* A nonzero return is treated as "more buffers
			 * showed up while enabling": look again. */
			if (unlikely(vhost_enable_notify(vq))) {
				vhost_disable_notify(vq);
				continue;
			}
			break;
		}
		/* TX buffers must be readable by us only. */
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, int %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
		msg.msg_iovlen = out;
		len = iov_length(vq->iov, out);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(NULL, sock, &msg, len);
		if (unlikely(err < 0)) {
			/* Put the descriptor back and retry it when the
			 * socket becomes writable. */
			vhost_discard_vq_desc(vq);
			tx_poll_start(net, sock);
			break;
		}
		if (err != len)
			pr_err("Truncated TX packet: "
			       " len %d != %zd\n", err, len);
		vhost_add_used_and_signal(&net->dev, vq, head, 0);
		total_len += len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU.
 *
 * Fill guest RX buffers from the backend socket: for each available
 * descriptor, receive one packet with recvmsg() into the buffer (after
 * the space reserved for the virtio-net header), write a default header
 * in front of it and return the buffer to the guest.  Stops when the ring
 * is empty, on a malformed descriptor, on a receive error, or after
 * VHOST_NET_WEIGHT bytes (the job is then requeued). */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
	unsigned head, out, in, log, s;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};

	/* Header written in front of every received packet: no flags,
	 * no GSO. */
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};

	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);
	/* No backend, or nothing queued on it: nothing to do. */
	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
		return;

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);
	hdr_size = vq->hdr_size;

	/* Only pass a log buffer along when write logging was negotiated. */
	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;

	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 vq_log, &log);
		/* OK, now we need to know about added descriptors. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(vq);
				continue;
			}
			/* Nothing new?  Wait for eventfd to tell us
			 * they refilled. */
			break;
		}
		/* We don't need to be notified again. */
		/* RX buffers must be writable by us only. */
		if (out) {
			vq_err(vq, "Unexpected descriptor format for RX: "
			       "out %d, int %d\n",
			       out, in);
			break;
		}
		/* Skip header. TODO: support TSO/mergeable rx buffers. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
		msg.msg_iovlen = in;
		len = iov_length(vq->iov, in);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for RX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		err = sock->ops->recvmsg(NULL, sock, &msg,
					 len, MSG_DONTWAIT | MSG_TRUNC);
		/* TODO: Check specific error and bomb out unless EAGAIN? */
		if (err < 0) {
			/* Give the unused descriptor back to the ring. */
			vhost_discard_vq_desc(vq);
			break;
		}
		/* TODO: Should check and handle checksum. */
		/* A result larger than the buffer is treated as a truncated
		 * packet: drop it and reuse the descriptor for the next one. */
		if (err > len) {
			pr_err("Discarded truncated rx packet: "
			       " len %d > %zd\n", err, len);
			vhost_discard_vq_desc(vq);
			continue;
		}
		len = err;
		/* Write the header into the bytes split off by
		 * move_iovec_hdr() above. */
		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
		if (err) {
			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
			       vq->iov->iov_base, err);
			break;
		}
		/* The length reported to the guest covers header + payload. */
		len += hdr_size;
		vhost_add_used_and_signal(&net->dev, vq, head, len);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, len);
		total_len += len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

/* Work handler installed as the TX virtqueue's handle_kick: map the work
 * item back to its virtqueue and owning vhost_net, then run handle_tx(). */
static void handle_tx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq =
		container_of(work, struct vhost_virtqueue, poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

/* Work handler installed as the RX virtqueue's handle_kick: map the work
 * item back to its virtqueue and owning vhost_net, then run handle_rx(). */
static void handle_rx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq =
		container_of(work, struct vhost_virtqueue, poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

/* Work handler for the TX socket poller (POLLOUT): the work item is
 * embedded in net->poll[VHOST_NET_VQ_TX], so recover the vhost_net from
 * it and run handle_tx(). */
static void handle_tx_net(struct work_struct *work)
{
	handle_tx(container_of(work, struct vhost_net,
			       poll[VHOST_NET_VQ_TX].work));
}

/* Work handler for the RX socket poller (POLLIN): the work item is
 * embedded in net->poll[VHOST_NET_VQ_RX], so recover the vhost_net from
 * it and run handle_rx(). */
static void handle_rx_net(struct work_struct *work)
{
	handle_rx(container_of(work, struct vhost_net,
			       poll[VHOST_NET_VQ_RX].work));
}

/* open() handler: allocate and initialise a vhost_net instance and hang
 * it off the file.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the negative
 * error from vhost_dev_init(). */
static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n;
	int r;

	n = kmalloc(sizeof *n, GFP_KERNEL);
	if (!n)
		return -ENOMEM;

	/* Kick handlers must be in place before the device is set up. */
	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;

	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
	if (r < 0) {
		kfree(n);
		return r;
	}

	/* Socket pollers: TX waits for writability, RX for readability. */
	vhost_poll_init(&n->poll[VHOST_NET_VQ_TX], handle_tx_net, POLLOUT);
	vhost_poll_init(&n->poll[VHOST_NET_VQ_RX], handle_rx_net, POLLIN);
	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

	f->private_data = n;
	return 0;
}

/* Stop socket polling for vq.  A virtqueue with no backend attached is
 * left alone.  For the TX queue the poll state is also reset to
 * DISABLED. */
static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	if (!vq->private_data)
		return;

	if (vq != &n->vqs[VHOST_NET_VQ_TX]) {
		vhost_poll_stop(&n->poll[VHOST_NET_VQ_RX]);
		return;
	}

	tx_poll_stop(n);
	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
}

/* Start socket polling for vq on its attached backend.  Does nothing if
 * no backend is attached.  For the TX queue the state is first moved to
 * STOPPED so that tx_poll_start() will actually start polling. */
static void vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct socket *backend = vq->private_data;

	if (!backend)
		return;

	if (vq != &n->vqs[VHOST_NET_VQ_TX]) {
		vhost_poll_start(&n->poll[VHOST_NET_VQ_RX], backend->file);
		return;
	}

	n->tx_poll_state = VHOST_NET_POLL_STOPPED;
	tx_poll_start(n, backend);
}

/* Detach the backend socket from vq under the vq mutex: stop polling,
 * then clear vq->private_data.
 *
 * Returns the socket that was attached (NULL if none).  The socket's
 * file reference is not dropped here; the callers in this file fput() it
 * afterwards. */
static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	/* Must run before private_data is cleared: it is a no-op for a
	 * vq without a backend. */
	vhost_net_disable_vq(n, vq);
	rcu_assign_pointer(vq->private_data, NULL);
	mutex_unlock(&vq->mutex);
	return sock;
}

/* Detach the backends of both virtqueues (TX first, then RX) and hand the
 * previously attached sockets back through *tx_sock and *rx_sock; either
 * may come back NULL. */
static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
}

/* Flush both pollers belonging to virtqueue 'index' (a VHOST_NET_VQ_*
 * value): the socket poller in n->poll[] and the virtqueue's own kick
 * poller. */
static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->dev.vqs[index].poll);
}

/* Flush the pollers of both virtqueues, TX first and then RX. */
static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

/* release() handler: tear down the vhost_net created by vhost_net_open().
 * Detaches both backends, flushes outstanding work, cleans up the vhost
 * device, drops the file references of the detached sockets and frees
 * the instance.  Always returns 0. */
static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_cleanup(&n->dev);
	/* Drop the references on the backends that were still attached. */
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n);
	return 0;
}

/* Look up fd as a socket and accept it only if it is a SOCK_RAW socket
 * whose local address family is AF_PACKET.
 *
 * On success returns the socket, with the reference taken by
 * sockfd_lookup() still held.  On failure that reference is dropped and
 * an ERR_PTR is returned: -ENOTSOCK if the lookup fails,
 * -ESOCKTNOSUPPORT for a non-raw socket, the getname() result if that
 * call fails, or -EPFNOSUPPORT for a non-packet family. */
static struct socket *get_raw_socket(int fd)
{
	/* Room for a link-layer address longer than sockaddr_ll's own
	 * sll_addr field. */
	struct {
		struct sockaddr_ll sa;
		char  buf[MAX_ADDR_LEN];
	} uaddr;
	int uaddr_len = sizeof uaddr;
	int r;
	struct socket *sock;

	sock = sockfd_lookup(fd, &r);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		r = -ESOCKTNOSUPPORT;
	} else {
		r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
				       &uaddr_len, 0);
		if (!r && uaddr.sa.sll_family != AF_PACKET)
			r = -EPFNOSUPPORT;
	}

	if (!r)
		return sock;

	fput(sock->file);
	return ERR_PTR(r);
}

/* Resolve fd to the socket behind a tun/tap or macvtap file.
 *
 * tun is tried first, then macvtap.  On success the file reference taken
 * by fget() stays held and the socket is returned.  If neither accepts
 * the file, the reference is dropped and macvtap's ERR_PTR is returned.
 * Returns ERR_PTR(-EBADF) when fd is not an open file. */
static struct socket *get_tap_socket(int fd)
{
	struct file *file;
	struct socket *sock;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	sock = tun_get_socket(file);
	if (IS_ERR(sock)) {
		sock = macvtap_get_socket(file);
		if (IS_ERR(sock))
			fput(file);
	}
	return sock;
}

/* Turn a backend fd into a socket.
 *
 * fd == -1 is the request to disable the backend and yields NULL.
 * Otherwise the fd is tried as a raw packet socket and then as a tap
 * device; if both fail the result is ERR_PTR(-ENOTSOCK), whatever the
 * underlying errors were. */
static struct socket *get_socket(int fd)
{
	struct socket *sock;

	/* special case to disable backend */
	if (fd == -1)
		return NULL;

	sock = get_raw_socket(fd);
	if (IS_ERR(sock))
		sock = get_tap_socket(fd);

	return IS_ERR(sock) ? ERR_PTR(-ENOTSOCK) : sock;
}

/* VHOST_NET_SET_BACKEND: attach the socket behind fd to virtqueue
 * 'index', replacing whatever backend was attached before (fd == -1
 * detaches without attaching a new one).
 *
 * Returns 0 on success, the error from vhost_dev_check_owner(),
 * -ENOBUFS for an out-of-range index, -EFAULT if the ring is not set up
 * correctly, or the error from get_socket().
 *
 * Locking: dev.mutex is held throughout; vq->mutex is held while the
 * backend pointer is switched and is released on every exit path. */
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = n->vqs + index;
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* start polling new socket */
	oldsock = vq->private_data;
	if (sock != oldsock) {
		vhost_net_disable_vq(n, vq);
		rcu_assign_pointer(vq->private_data, sock);
		vhost_net_enable_vq(n, vq);
	}
	mutex_unlock(&vq->mutex);

	/* Drop one reference on the old backend once no handler can still
	 * be using it.  When the same socket was passed in again, this
	 * releases the extra reference get_socket() just took. */
	if (oldsock) {
		vhost_net_flush_vq(n, index);
		fput(oldsock->file);
	}

	mutex_unlock(&n->dev.mutex);
	return 0;

err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

/* VHOST_RESET_OWNER: detach both backends, flush outstanding work and
 * reset the device's owner.
 *
 * Returns the error from vhost_dev_check_owner() if the caller is not
 * the owner (nothing is changed in that case), otherwise the result of
 * vhost_dev_reset_owner(). */
static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *old_tx = NULL;
	struct socket *old_rx = NULL;
	long err;

	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (!err) {
		vhost_net_stop(n, &old_tx, &old_rx);
		vhost_net_flush(n);
		err = vhost_dev_reset_owner(&n->dev);
	}
	mutex_unlock(&n->dev.mutex);

	/* Socket references are dropped outside the device mutex. */
	if (old_tx)
		fput(old_tx->file);
	if (old_rx)
		fput(old_rx->file);
	return err;
}

/* VHOST_SET_FEATURES: record the feature bits acknowledged by userspace
 * and derive the per-virtqueue header size from them.
 *
 * hdr_size becomes sizeof(struct virtio_net_hdr) when
 * VHOST_NET_F_VIRTIO_NET_HDR is set and 0 otherwise.
 *
 * Returns 0 on success, or -EFAULT if VHOST_F_LOG_ALL is requested while
 * vhost_log_access_ok() rejects the device's log setup. */
static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	/* Masks are built as 64-bit values to match the u64 feature word;
	 * an int-typed "1 << bit" is only valid for bit numbers below 31. */
	size_t hdr_size = features & (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) ?
		sizeof(struct virtio_net_hdr) : 0;
	int i;

	mutex_lock(&n->dev.mutex);
	if ((features & (1ULL << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev)) {
		mutex_unlock(&n->dev.mutex);
		return -EFAULT;
	}
	n->dev.acked_features = features;
	/* Write barrier after publishing the features and before the
	 * per-vq updates below. */
	smp_wmb();
	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		/* hdr_size is read by the handlers under the vq mutex. */
		mutex_lock(&n->vqs[i].mutex);
		n->vqs[i].hdr_size = hdr_size;
		mutex_unlock(&n->vqs[i].mutex);
	}
	vhost_net_flush(n);
	mutex_unlock(&n->dev.mutex);
	return 0;
}

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;
	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		r = copy_from_user(&backend, argp, sizeof backend);
		if (r < 0)
			return r;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_FEATURES;
		return copy_to_user(featurep, &features, sizeof features);
	case VHOST_SET_FEATURES:
		r = copy_from_user(&features, featurep, sizeof features);
		if (r < 0)
			return r;
		if (features & ~VHOST_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
		vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}

#ifdef CONFIG_COMPAT
/* compat ioctl entry point: convert the argument with compat_ptr() and
 * forward the request unchanged to vhost_net_ioctl(). */
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
}
#endif

/* File operations of the vhost-net character device.  Only open,
 * release and the ioctl entry points are provided. */
static const struct file_operations vhost_net_fops = {
	.owner          = THIS_MODULE,
	.release        = vhost_net_release,
	.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = vhost_net_compat_ioctl,
#endif
	.open           = vhost_net_open,
};

static struct miscdevice vhost_net_misc = {
	VHOST_NET_MINOR,
	"vhost-net",
	&vhost_net_fops,
};

/* Module entry point: initialise the vhost core, then register the misc
 * device.  If registration fails the core is cleaned up again.
 * Returns 0 on success or the error from the failing step. */
int vhost_net_init(void)
{
	int r;

	r = vhost_init();
	if (r)
		return r;

	r = misc_register(&vhost_net_misc);
	if (r)
		vhost_cleanup();

	return r;
}
module_init(vhost_net_init);

/* Module exit point: undo vhost_net_init() in reverse order - unregister
 * the misc device, then clean up the vhost core. */
void vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
	vhost_cleanup();
}
module_exit(vhost_net_exit);

/* Module metadata: version, license, author and description. */
MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
Commit	Line	Data
3a4d5c94 MT	1	/* Copyright (C) 2009 Red Hat, Inc.
	2	* Author: Michael S. Tsirkin <mst@redhat.com>
	3	*
	4	* This work is licensed under the terms of the GNU GPL, version 2.
	5	*
	6	* virtio-net server in host kernel.
	7	*/
	8
	9	#include <linux/compat.h>
	10	#include <linux/eventfd.h>
	11	#include <linux/vhost.h>
	12	#include <linux/virtio_net.h>
	13	#include <linux/mmu_context.h>
	14	#include <linux/miscdevice.h>
	15	#include <linux/module.h>
	16	#include <linux/mutex.h>
	17	#include <linux/workqueue.h>
	18	#include <linux/rcupdate.h>
	19	#include <linux/file.h>
	20
	21	#include <linux/net.h>
	22	#include <linux/if_packet.h>
	23	#include <linux/if_arp.h>
	24	#include <linux/if_tun.h>
501c774c	25	#include <linux/if_macvlan.h>
3a4d5c94 MT	26
	27	#include <net/sock.h>
	28
	29	#include "vhost.h"
	30
	31	/* Max number of bytes transferred before requeueing the job.
	32	* Using this limit prevents one virtqueue from starving others. */
	33	#define VHOST_NET_WEIGHT 0x80000
	34
	35	enum {
	36	VHOST_NET_VQ_RX = 0,
	37	VHOST_NET_VQ_TX = 1,
	38	VHOST_NET_VQ_MAX = 2,
	39	};
	40
	41	enum vhost_net_poll_state {
	42	VHOST_NET_POLL_DISABLED = 0,
	43	VHOST_NET_POLL_STARTED = 1,
	44	VHOST_NET_POLL_STOPPED = 2,
	45	};
	46
	47	struct vhost_net {
	48	struct vhost_dev dev;
	49	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	50	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	51	/* Tells us whether we are polling a socket for TX.
	52	* We only do this when socket buffer fills up.
	53	* Protected by tx vq lock. */
	54	enum vhost_net_poll_state tx_poll_state;
	55	};
	56
	57	/* Pop first len bytes from iovec. Return number of segments used. */
	58	static int move_iovec_hdr(struct iovec from, struct iovec to,
	59	size_t len, int iov_count)
	60	{
	61	int seg = 0;
	62	size_t size;
	63	while (len && seg < iov_count) {
	64	size = min(from->iov_len, len);
	65	to->iov_base = from->iov_base;
	66	to->iov_len = size;
	67	from->iov_len -= size;
	68	from->iov_base += size;
	69	len -= size;
	70	++from;
	71	++to;
	72	++seg;
	73	}
	74	return seg;
	75	}
	76
	77	/* Caller must have TX VQ lock */
	78	static void tx_poll_stop(struct vhost_net *net)
	79	{
	80	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
	81	return;
	82	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
	83	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
	84	}
	85
	86	/* Caller must have TX VQ lock */
	87	static void tx_poll_start(struct vhost_net net, struct socket sock)
	88	{
	89	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
90	return;
91	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
92	net->tx_poll_state = VHOST_NET_POLL_STARTED;
93	}
94
95	/* Expects to be always run from workqueue - which acts as
96	* read-size critical section for our kind of RCU. */
97	static void handle_tx(struct vhost_net *net)
98	{
99	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
100	unsigned head, out, in, s;
101	struct msghdr msg = {
102	.msg_name = NULL,
103	.msg_namelen = 0,
104	.msg_control = NULL,
105	.msg_controllen = 0,
106	.msg_iov = vq->iov,
107	.msg_flags = MSG_DONTWAIT,
108	};
109	size_t len, total_len = 0;
110	int err, wmem;
111	size_t hdr_size;
112	struct socket *sock = rcu_dereference(vq->private_data);
113	if (!sock)
114	return;
115
116	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
39286fa4 SS	117	if (wmem >= sock->sk->sk_sndbuf) {
	118	mutex_lock(&vq->mutex);
	119	tx_poll_start(net, sock);
	120	mutex_unlock(&vq->mutex);
3a4d5c94	121	return;
39286fa4	122	}
3a4d5c94 MT	123
	124	use_mm(net->dev.mm);
	125	mutex_lock(&vq->mutex);
	126	vhost_disable_notify(vq);
	127
	128	if (wmem < sock->sk->sk_sndbuf * 2)
	129	tx_poll_stop(net);
	130	hdr_size = vq->hdr_size;
	131
	132	for (;;) {
	133	head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
	134	ARRAY_SIZE(vq->iov),
	135	&out, &in,
	136	NULL, NULL);
	137	/* Nothing new? Wait for eventfd to tell us they refilled. */
	138	if (head == vq->num) {
	139	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	140	if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
	141	tx_poll_start(net, sock);
	142	set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
	143	break;
	144	}
	145	if (unlikely(vhost_enable_notify(vq))) {
	146	vhost_disable_notify(vq);
	147	continue;
	148	}
	149	break;
	150	}
	151	if (in) {
	152	vq_err(vq, "Unexpected descriptor format for TX: "
	153	"out %d, int %d\n", out, in);
	154	break;
	155	}
	156	/* Skip header. TODO: support TSO. */
	157	s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
	158	msg.msg_iovlen = out;
	159	len = iov_length(vq->iov, out);
	160	/* Sanity check */
	161	if (!len) {
	162	vq_err(vq, "Unexpected header len for TX: "
	163	"%zd expected %zd\n",
	164	iov_length(vq->hdr, s), hdr_size);
	165	break;
	166	}
	167	/* TODO: Check specific error and bomb out unless ENOBUFS? */
	168	err = sock->ops->sendmsg(NULL, sock, &msg, len);
	169	if (unlikely(err < 0)) {
	170	vhost_discard_vq_desc(vq);
	171	tx_poll_start(net, sock);
	172	break;
	173	}
	174	if (err != len)
	175	pr_err("Truncated TX packet: "
	176	" len %d != %zd\n", err, len);
	177	vhost_add_used_and_signal(&net->dev, vq, head, 0);
	178	total_len += len;
	179	if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
	180	vhost_poll_queue(&vq->poll);
	181	break;
	182	}
	183	}
	184
	185	mutex_unlock(&vq->mutex);
	186	unuse_mm(net->dev.mm);
187	}
188
189	/* Expects to be always run from workqueue - which acts as
190	* read-size critical section for our kind of RCU. */
191	static void handle_rx(struct vhost_net *net)
192	{
193	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
194	unsigned head, out, in, log, s;
195	struct vhost_log *vq_log;
196	struct msghdr msg = {
197	.msg_name = NULL,
198	.msg_namelen = 0,
199	.msg_control = NULL, /* FIXME: get and handle RX aux data. */
200	.msg_controllen = 0,
201	.msg_iov = vq->iov,
202	.msg_flags = MSG_DONTWAIT,
203	};
204
205	struct virtio_net_hdr hdr = {
206	.flags = 0,
207	.gso_type = VIRTIO_NET_HDR_GSO_NONE
208	};
209
210	size_t len, total_len = 0;
211	int err;
212	size_t hdr_size;
213	struct socket *sock = rcu_dereference(vq->private_data);
214	if (!sock \|\| skb_queue_empty(&sock->sk->sk_receive_queue))
215	return;
216
217	use_mm(net->dev.mm);
218	mutex_lock(&vq->mutex);
219	vhost_disable_notify(vq);
220	hdr_size = vq->hdr_size;
221
222	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
223	vq->log : NULL;
224
225	for (;;) {
226	head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
227	ARRAY_SIZE(vq->iov),
228	&out, &in,
229	vq_log, &log);
230	/* OK, now we need to know about added descriptors. */
231	if (head == vq->num) {
232	if (unlikely(vhost_enable_notify(vq))) {
233	/* They have slipped one in as we were
234	* doing that: check again. */
235	vhost_disable_notify(vq);
236	continue;
237	}
238	/* Nothing new? Wait for eventfd to tell us
239	* they refilled. */
240	break;
241	}
242	/* We don't need to be notified again. */
243	if (out) {
244	vq_err(vq, "Unexpected descriptor format for RX: "
245	"out %d, int %d\n",
246	out, in);
247	break;
248	}
249	/* Skip header. TODO: support TSO/mergeable rx buffers. */
250	s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
251	msg.msg_iovlen = in;
252	len = iov_length(vq->iov, in);
253	/* Sanity check */
254	if (!len) {
255	vq_err(vq, "Unexpected header len for RX: "
256	"%zd expected %zd\n",
257	iov_length(vq->hdr, s), hdr_size);
258	break;
259	}
260	err = sock->ops->recvmsg(NULL, sock, &msg,
261	len, MSG_DONTWAIT \| MSG_TRUNC);
262	/* TODO: Check specific error and bomb out unless EAGAIN? */
263	if (err < 0) {
264	vhost_discard_vq_desc(vq);
265	break;
266	}
267	/* TODO: Should check and handle checksum. */
268	if (err > len) {
269	pr_err("Discarded truncated rx packet: "
270	" len %d > %zd\n", err, len);
271	vhost_discard_vq_desc(vq);
272	continue;
273	}
274	len = err;
275	err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
276	if (err) {
277	vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
278	vq->iov->iov_base, err);
279	break;
280	}
281	len += hdr_size;
282	vhost_add_used_and_signal(&net->dev, vq, head, len);
283	if (unlikely(vq_log))
284	vhost_log_write(vq, vq_log, log, len);
285	total_len += len;
286	if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
287	vhost_poll_queue(&vq->poll);
288	break;
289	}
290	}
291
292	mutex_unlock(&vq->mutex);
293	unuse_mm(net->dev.mm);
294	}
295
296	static void handle_tx_kick(struct work_struct *work)
297	{
298	struct vhost_virtqueue *vq;
299	struct vhost_net *net;
300	vq = container_of(work, struct vhost_virtqueue, poll.work);
301	net = container_of(vq->dev, struct vhost_net, dev);
302	handle_tx(net);
303	}
304
305	static void handle_rx_kick(struct work_struct *work)
306	{
307	struct vhost_virtqueue *vq;
308	struct vhost_net *net;
309	vq = container_of(work, struct vhost_virtqueue, poll.work);
310	net = container_of(vq->dev, struct vhost_net, dev);
311	handle_rx(net);
312	}
313
314	static void handle_tx_net(struct work_struct *work)
315	{
316	struct vhost_net *net;
317	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
318	handle_tx(net);
319	}
320
321	static void handle_rx_net(struct work_struct *work)
322	{
323	struct vhost_net *net;
324	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
325	handle_rx(net);
326	}
327
328	static int vhost_net_open(struct inode inode, struct file f)
329	{
330	struct vhost_net n = kmalloc(sizeof n, GFP_KERNEL);
331	int r;
332	if (!n)
333	return -ENOMEM;
334	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
335	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
336	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
337	if (r < 0) {
338	kfree(n);
339	return r;
340	}
341
342	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
343	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
344	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
345
346	f->private_data = n;
347
348	return 0;
349	}
350
351	static void vhost_net_disable_vq(struct vhost_net *n,
352	struct vhost_virtqueue *vq)
353	{
354	if (!vq->private_data)
355	return;
356	if (vq == n->vqs + VHOST_NET_VQ_TX) {
357	tx_poll_stop(n);
358	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
359	} else
360	vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
361	}
362
363	static void vhost_net_enable_vq(struct vhost_net *n,
364	struct vhost_virtqueue *vq)
365	{
366	struct socket *sock = vq->private_data;
367	if (!sock)
368	return;
369	if (vq == n->vqs + VHOST_NET_VQ_TX) {
370	n->tx_poll_state = VHOST_NET_POLL_STOPPED;
371	tx_poll_start(n, sock);
372	} else
373	vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
374	}
375
376	static struct socket vhost_net_stop_vq(struct vhost_net n,
377	struct vhost_virtqueue *vq)
378	{
379	struct socket *sock;
380
381	mutex_lock(&vq->mutex);
382	sock = vq->private_data;
383	vhost_net_disable_vq(n, vq);
384	rcu_assign_pointer(vq->private_data, NULL);
385	mutex_unlock(&vq->mutex);
386	return sock;
387	}
388
389	static void vhost_net_stop(struct vhost_net n, struct socket *tx_sock,
390	struct socket **rx_sock)
391	{
392	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
393	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
394	}
395
396	static void vhost_net_flush_vq(struct vhost_net *n, int index)
397	{
398	vhost_poll_flush(n->poll + index);
399	vhost_poll_flush(&n->dev.vqs[index].poll);
400	}
401
402	static void vhost_net_flush(struct vhost_net *n)
403	{
404	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
405	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
406	}
407
408	static int vhost_net_release(struct inode inode, struct file f)
409	{
410	struct vhost_net *n = f->private_data;
411	struct socket *tx_sock;
412	struct socket *rx_sock;
413
414	vhost_net_stop(n, &tx_sock, &rx_sock);
415	vhost_net_flush(n);
416	vhost_dev_cleanup(&n->dev);
417	if (tx_sock)
418	fput(tx_sock->file);
419	if (rx_sock)
420	fput(rx_sock->file);
421	/* We do an extra flush before freeing memory,
422	* since jobs can re-queue themselves. */
423	vhost_net_flush(n);
424	kfree(n);
425	return 0;
426	}
427
428	static struct socket *get_raw_socket(int fd)
429	{
430	struct {
431	struct sockaddr_ll sa;
432	char buf[MAX_ADDR_LEN];
433	} uaddr;
434	int uaddr_len = sizeof uaddr, r;
435	struct socket *sock = sockfd_lookup(fd, &r);
436	if (!sock)
437	return ERR_PTR(-ENOTSOCK);
438
439	/* Parameter checking */
440	if (sock->sk->sk_type != SOCK_RAW) {
441	r = -ESOCKTNOSUPPORT;
442	goto err;
443	}
444
445	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
446	&uaddr_len, 0);
447	if (r)
448	goto err;
449
450	if (uaddr.sa.sll_family != AF_PACKET) {
451	r = -EPFNOSUPPORT;
452	goto err;
453	}
454	return sock;
455	err:
456	fput(sock->file);
457	return ERR_PTR(r);
458	}
459
501c774c	460	static struct socket *get_tap_socket(int fd)
3a4d5c94 MT	461	{
	462	struct file *file = fget(fd);
	463	struct socket *sock;
	464	if (!file)
	465	return ERR_PTR(-EBADF);
	466	sock = tun_get_socket(file);
501c774c AB	467	if (!IS_ERR(sock))
	468	return sock;
	469	sock = macvtap_get_socket(file);
3a4d5c94 MT	470	if (IS_ERR(sock))
	471	fput(file);
	472	return sock;
	473	}
	474
	475	static struct socket *get_socket(int fd)
	476	{
	477	struct socket *sock;
	478	/* special case to disable backend */
	479	if (fd == -1)
	480	return NULL;
	481	sock = get_raw_socket(fd);
	482	if (!IS_ERR(sock))
	483	return sock;
501c774c	484	sock = get_tap_socket(fd);
3a4d5c94 MT	485	if (!IS_ERR(sock))
	486	return sock;
	487	return ERR_PTR(-ENOTSOCK);
	488	}
	489
	490	static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
	491	{
	492	struct socket sock, oldsock;
	493	struct vhost_virtqueue *vq;
	494	int r;
	495
	496	mutex_lock(&n->dev.mutex);
	497	r = vhost_dev_check_owner(&n->dev);
	498	if (r)
	499	goto err;
	500
	501	if (index >= VHOST_NET_VQ_MAX) {
	502	r = -ENOBUFS;
	503	goto err;
	504	}
	505	vq = n->vqs + index;
	506	mutex_lock(&vq->mutex);
	507
	508	/* Verify that ring has been setup correctly. */
	509	if (!vhost_vq_access_ok(vq)) {
	510	r = -EFAULT;
	511	goto err;
	512	}
	513	sock = get_socket(fd);
	514	if (IS_ERR(sock)) {
	515	r = PTR_ERR(sock);
	516	goto err;
	517	}
	518
	519	/* start polling new socket */
	520	oldsock = vq->private_data;
	521	if (sock == oldsock)
	522	goto done;
	523
	524	vhost_net_disable_vq(n, vq);
	525	rcu_assign_pointer(vq->private_data, sock);
	526	vhost_net_enable_vq(n, vq);
	527	mutex_unlock(&vq->mutex);
	528	done:
	529	if (oldsock) {
	530	vhost_net_flush_vq(n, index);
	531	fput(oldsock->file);
	532	}
	533	err:
	534	mutex_unlock(&n->dev.mutex);
	535	return r;
	536	}
	537
	538	static long vhost_net_reset_owner(struct vhost_net *n)
	539	{
	540	struct socket *tx_sock = NULL;
	541	struct socket *rx_sock = NULL;
	542	long err;
	543	mutex_lock(&n->dev.mutex);
	544	err = vhost_dev_check_owner(&n->dev);
	545	if (err)
	546	goto done;
	547	vhost_net_stop(n, &tx_sock, &rx_sock);
	548	vhost_net_flush(n);
549	err = vhost_dev_reset_owner(&n->dev);
550	done:
551	mutex_unlock(&n->dev.mutex);
552	if (tx_sock)
553	fput(tx_sock->file);
554	if (rx_sock)
555	fput(rx_sock->file);
556	return err;
557	}
558
559	static int vhost_net_set_features(struct vhost_net *n, u64 features)
560	{
561	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
562	sizeof(struct virtio_net_hdr) : 0;
563	int i;
564	mutex_lock(&n->dev.mutex);
565	if ((features & (1 << VHOST_F_LOG_ALL)) &&
566	!vhost_log_access_ok(&n->dev)) {
567	mutex_unlock(&n->dev.mutex);
568	return -EFAULT;
569	}
570	n->dev.acked_features = features;
571	smp_wmb();
572	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
573	mutex_lock(&n->vqs[i].mutex);
574	n->vqs[i].hdr_size = hdr_size;
575	mutex_unlock(&n->vqs[i].mutex);
576	}
577	vhost_net_flush(n);
578	mutex_unlock(&n->dev.mutex);
579	return 0;
580	}
581
582	static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
583	unsigned long arg)
584	{
585	struct vhost_net *n = f->private_data;
586	void __user argp = (void __user )arg;
587	u64 __user *featurep = argp;
588	struct vhost_vring_file backend;
589	u64 features;
590	int r;
591	switch (ioctl) {
592	case VHOST_NET_SET_BACKEND:
593	r = copy_from_user(&backend, argp, sizeof backend);
594	if (r < 0)
595	return r;
596	return vhost_net_set_backend(n, backend.index, backend.fd);
597	case VHOST_GET_FEATURES:
598	features = VHOST_FEATURES;
599	return copy_to_user(featurep, &features, sizeof features);
600	case VHOST_SET_FEATURES:
601	r = copy_from_user(&features, featurep, sizeof features);
602	if (r < 0)
603	return r;
604	if (features & ~VHOST_FEATURES)
605	return -EOPNOTSUPP;
606	return vhost_net_set_features(n, features);
607	case VHOST_RESET_OWNER:
608	return vhost_net_reset_owner(n);
609	default:
610	mutex_lock(&n->dev.mutex);
611	r = vhost_dev_ioctl(&n->dev, ioctl, arg);
612	vhost_net_flush(n);
613	mutex_unlock(&n->dev.mutex);
614	return r;
615	}
616	}
617
618	#ifdef CONFIG_COMPAT
619	static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
620	unsigned long arg)
621	{
622	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
623	}
624	#endif
625
626	const static struct file_operations vhost_net_fops = {
627	.owner = THIS_MODULE,
628	.release = vhost_net_release,
629	.unlocked_ioctl = vhost_net_ioctl,
630	#ifdef CONFIG_COMPAT
631	.compat_ioctl = vhost_net_compat_ioctl,
632	#endif
633	.open = vhost_net_open,
634	};
635
636	static struct miscdevice vhost_net_misc = {
637	VHOST_NET_MINOR,
638	"vhost-net",
639	&vhost_net_fops,
640	};
641
642	int vhost_net_init(void)
643	{
644	int r = vhost_init();
645	if (r)
646	goto err_init;
647	r = misc_register(&vhost_net_misc);
648	if (r)
649	goto err_reg;
650	return 0;
651	err_reg:
652	vhost_cleanup();
653	err_init:
654	return r;
655
656	}
657	module_init(vhost_net_init);
658
659	void vhost_net_exit(void)
660	{
661	misc_deregister(&vhost_net_misc);
662	vhost_cleanup();
663	}
664	module_exit(vhost_net_exit);
665
666	MODULE_VERSION("0.0.1");
667	MODULE_LICENSE("GPL v2");
668	MODULE_AUTHOR("Michael S. Tsirkin");
669	MODULE_DESCRIPTION("Host kernel accelerator for virtio net");