[linux-block.git] / drivers / block / lguest_blk.c

/*D:400
 * The Guest block driver
 *
 * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
 * The mechanism is simple: we place the information about the request in the
 * device page, then use SEND_DMA (containing the data for a write, or an empty
 * "ping" DMA for a read).
 :*/
/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
//#define DEBUG
#include <linux/init.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/lguest_bus.h>

/* Suffix letter for the next disk name ("lgb" + letter); lguestblk_probe()
 * post-increments it each time it names a disk. */
static char next_block_index = 'a';

/*D:420 Here is the structure which holds all the information we need about
 * each Guest block device.
 *
 * I'm sure at this stage, you're wondering "hey, where was the adventure I was
 * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
 * my blog".  I think Real adventures have boring bits, too, and you're in the
 * middle of one.  But it gets better.  Just not quite yet. */
struct blockdev
{
	/* The block queue infrastructure wants a spinlock: it is held while it
	 * calls our block request function.  We grab it in our interrupt
	 * handler so the responses don't mess with new requests. */
	spinlock_t lock;

	/* The disk structure registered with the kernel. */
	struct gendisk *disk;

	/* The major device number for this disk, and the interrupt.  We only
	 * really keep them here for completeness; we'd need them if we
	 * supported device unplugging. */
	int major;
	int irq;

	/* The physical address of this device's memory page.  It is also the
	 * key we pass to lguest_bind_dma() and lguest_send_dma(). */
	unsigned long phys_addr;
	/* The mapped memory page for convenient access. */
	struct lguest_block_page *lb_page;

	/* We only have a single request outstanding at a time: "req" is it
	 * (NULL when nothing is in flight).  "dma" is the receive buffer we
	 * bind at probe time; read requests are packed into it. */
	struct lguest_dma dma;
	struct request *req;
};

/*D:495 We originally used end_request() throughout the driver, but it turns
 * out that end_request() is deprecated, and doesn't actually end the request
 * (which seems like a good reason to deprecate it!).  It simply ends the first
 * bio.  So if we had 3 bios in a "struct request" we would do all 3,
 * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
 * work as we needed to do.
 *
 * This reinforced to me that I do not understand the block layer.
 *
 * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
 * request.  This improved disk speed by 130%. */
static void end_entire_request(struct request *req, int uptodate)
{
	/* Finish every remaining sector in one call.  A non-zero return
	 * would mean some of the request is still pending, which we treat
	 * as a fatal inconsistency. */
	int leftover = end_that_request_first(req, uptodate,
					      req->hard_nr_sectors);

	BUG_ON(leftover);

	/* Feed the entropy pool, take the request off the queue, then do the
	 * final completion with the same success flag. */
	add_disk_randomness(req->rq_disk);
	blkdev_dequeue_request(req);
	end_that_request_last(req, uptodate);
}

/* I'm told there are only two stories in the world worth telling: love and
 * hate.  So there used to be a love scene here like this:
 *
 *  Launcher:	We could make beautiful I/O together, you and I.
 *  Guest:	My, that's a big disk!
 *
 * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */

/*D:490 This is the interrupt handler, called when a block read or write has
 * been completed for us. */
static irqreturn_t lgb_irq(int irq, void *_bd)
{
	/* request_irq() was given our "struct blockdev" as its cookie, so
	 * that is what arrives here. */
	struct blockdev *dev = _bd;
	unsigned long irqstate;
	int succeeded;

	/* Nothing in flight: this interrupt cannot be for us. */
	if (!dev->req) {
		pr_debug("No work!\n");
		return IRQ_NONE;
	}

	/* A request is in flight but the result field is still 0
	 * (unfinished): not ours to handle either. */
	if (!dev->lb_page->result) {
		pr_debug("No result!\n");
		return IRQ_NONE;
	}

	/* Completion has to happen under the queue lock. */
	spin_lock_irqsave(&dev->lock, irqstate);

	/* The result field holds 1 for success and 2 for failure. */
	succeeded = (dev->lb_page->result == 1);
	end_entire_request(dev->req, succeeded);

	/* Forget the finished request and reset the receive DMA so both are
	 * ready for the next transfer, then let the queue run again. */
	dev->req = NULL;
	dev->dma.used_len = 0;
	blk_start_queue(dev->disk->queue);

	spin_unlock_irqrestore(&dev->lock, irqstate);

	return IRQ_HANDLED;
}

/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
 * each of which contains "struct bio_vec"s, each of which contains a page, an
 * offset and a length.
 *
 * Fortunately there are iterators to help us walk through the "struct
 * request".  Even more fortunately, there were plenty of places to steal the
 * code from.  We pack the "struct request" into our "struct lguest_dma" and
 * return the total length. */
static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
{
	struct req_iterator iter;
	struct bio_vec *seg;
	unsigned int nsegs = 0;
	unsigned int total = 0;

	rq_for_each_segment(seg, req, iter) {
		/* The queue limits set at probe time should prevent the
		 * request from having more segments than we have slots. */
		BUG_ON(nsegs == LGUEST_MAX_DMA_SECTIONS);
		/* A zero length marks the end of the section list, so a
		 * genuine empty segment would truncate the transfer. */
		BUG_ON(!seg->bv_len);

		/* Each slot is the segment's physical address and length. */
		dma->addr[nsegs] = page_to_phys(seg->bv_page) + seg->bv_offset;
		dma->len[nsegs] = seg->bv_len;

		total += seg->bv_len;
		nsegs++;
	}

	/* Terminate the list explicitly unless every slot is in use. */
	if (nsegs < LGUEST_MAX_DMA_SECTIONS)
		dma->len[nsegs] = 0;

	return total;
}

/* This creates an empty DMA, useful for prodding the Host without sending data
 * (ie. when we want to do a read) */
static void empty_dma(struct lguest_dma *dma)
{
	/* A zero length in the first slot ends the section list immediately
	 * (the same end marker req_to_dma() writes), so no data is carried. */
	dma->len[0] = 0;
}

/*D:470 Setting up a request is fairly easy: */
static void setup_req(struct blockdev *bd,
		      int type, struct request *req, struct lguest_dma *dma)
{
	/* The type is 1 (write) or 0 (read). */
	bd->lb_page->type = type;
	/* The sector on disk where the read or write starts. */
	bd->lb_page->sector = req->sector;
	/* The result is initialized to 0 (unfinished). */
	bd->lb_page->result = 0;
	/* The current request (so we can end it in the interrupt handler). */
	bd->req = req;
	/* The number of bytes: returned as a side-effect of req_to_dma(),
	 * which packs the block layer's "struct request" into our "struct
	 * lguest_dma" */
	bd->lb_page->bytes = req_to_dma(req, dma);
}

/*D:450 Write is pretty straightforward: we pack the request into a "struct
 * lguest_dma", then use SEND_DMA to send the request. */
static void do_write(struct blockdev *bd, struct request *req)
{
	struct lguest_dma send;

	pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
	setup_req(bd, 1, req, &send);

	lguest_send_dma(bd->phys_addr, &send);
}

/* Read is similar to write, except we pack the request into our receive
 * "struct lguest_dma" and send through an empty DMA just to tell the Host that
 * there's a request pending. */
static void do_read(struct blockdev *bd, struct request *req)
{
	struct lguest_dma ping;

	pr_debug("lgb: READ sector %li\n", (long)req->sector);
	setup_req(bd, 0, req, &bd->dma);

	empty_dma(&ping);
	lguest_send_dma(bd->phys_addr, &ping);
}

/*D:440 This is where requests come in: we get handed the request queue and are
 * expected to pull a "struct request" off it until we've finished them or
 * we're waiting for a reply: */
static void do_lgb_request(struct request_queue *q)
{
	struct request *rq;
	struct blockdev *dev;

	/* Keep pulling requests until we find one we can actually issue. */
	for (;;) {
		/* The elevator may have nothing for us, even on the first
		 * pass. */
		rq = elv_next_request(q);
		if (!rq)
			return;

		/* probe stored our struct blockdev in the disk's
		 * private_data. */
		dev = rq->rq_disk->private_data;

		/* Only one request may be outstanding; if one already is,
		 * leave this one on the queue for later. */
		if (dev->req)
			return;

		/* A plain filesystem read/write: go and issue it. */
		if (blk_fs_request(rq))
			break;

		/* Anything else is failed on the spot, then we look at the
		 * next request. */
		pr_debug("Got non-command 0x%08x\n", rq->cmd_type);
		rq->errors++;
		end_entire_request(rq, 0);
	}

	if (rq_data_dir(rq) == WRITE)
		do_write(dev, rq);
	else
		do_read(dev, rq);

	/* The request is out: hold the queue until lgb_irq() sees the
	 * completion and restarts it. */
	blk_stop_queue(q);
}

/*D:430 This is the "struct block_device_operations" we attach to the disk at
 * the end of lguestblk_probe().  It doesn't seem to want much. */
static struct block_device_operations lguestblk_fops = {
	/* Only the owning module is filled in; this driver supplies no
	 * operation callbacks of its own. */
	.owner = THIS_MODULE,
};

/*D:425 Setting up a disk device seems to involve a lot of code.  I'm not sure
 * quite why.  I do know that the IDE code sent two or three of the maintainers
 * insane, perhaps this is the fringe of the same disease?
 *
 * As in the console code, the probe function gets handed the generic
 * lguest_device from lguest_bus.c: */
static int lguestblk_probe(struct lguest_device *lgdev)
{
	struct blockdev *bd;
	int err;
	int irqflags = IRQF_SHARED;

	/* First we allocate our own "struct blockdev" and initialize the easy
	 * fields. */
	bd = kmalloc(sizeof(*bd), GFP_KERNEL);
	if (!bd)
		return -ENOMEM;

	spin_lock_init(&bd->lock);
	bd->irq = lgdev_irq(lgdev);
	bd->req = NULL;
	bd->dma.used_len = 0;
	bd->dma.len[0] = 0;
	/* The descriptor in the lguest_devices array provided by the Host
	 * gives the Guest the physical page number of the device's page. */
	bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);

	/* We use lguest_map() to get a pointer to the device page */
	bd->lb_page = lguest_map(bd->phys_addr, 1);
	if (!bd->lb_page) {
		err = -ENOMEM;
		goto out_free_bd;
	}

	/* We need a major device number: 0 means "assign one dynamically". */
	bd->major = register_blkdev(0, "lguestblk");
	if (bd->major < 0) {
		err = bd->major;
		goto out_unmap;
	}

	/* This allocates a "struct gendisk" where we pack all the information
	 * about the disk which the rest of Linux sees.  The argument is the
	 * number of minor devices desired: we need one minor for the main
	 * disk, and one for each partition.  Of course, we can't possibly know
	 * how many partitions are on the disk (add_disk does that).
	 */
	bd->disk = alloc_disk(16);
	if (!bd->disk) {
		err = -ENOMEM;
		goto out_unregister_blkdev;
	}

	/* Every disk needs a queue for requests to come in: we set up the
	 * queue with a callback function (the core of our driver) and the lock
	 * to use. */
	bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
	if (!bd->disk->queue) {
		err = -ENOMEM;
		goto out_put_disk;
	}

	/* We can only handle a certain number of pointers in our SEND_DMA
	 * call, so we set that with blk_queue_max_hw_segments().  This is not
	 * to be confused with blk_queue_max_phys_segments() of course!  I
	 * know, who could possibly confuse the two?
	 *
	 * Well, it's simple to tell them apart: this one seems to work and the
	 * other one didn't. */
	blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);

	/* Due to technical limitations of our Host (and simple coding) we
	 * can't have a single buffer which crosses a page boundary.  Tell it
	 * here.  This means that our maximum request size is 16
	 * (LGUEST_MAX_DMA_SECTIONS) pages. */
	blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);

	/* We name our disk: this becomes the device name when udev does its
	 * magic thing and creates the device node, such as /dev/lgba.
	 * next_block_index is a global which starts at 'a'.  Unfortunately
	 * this simple increment logic means that the 27th disk will be called
	 * "/dev/lgb{".  In that case, I recommend having at least 29 disks, so
	 * your /dev directory will be balanced. */
	sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);

	/* We look to the device descriptor again to see if this device's
	 * interrupts are expected to be random.  If they are, we tell the irq
	 * subsystem.  At the moment this bit is always set. */
	if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
		irqflags |= IRQF_SAMPLE_RANDOM;

	/* Now we have the name and irqflags, we can request the interrupt; we
	 * give it the "struct blockdev" we have set up to pass to lgb_irq()
	 * when there is an interrupt. */
	err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
	if (err)
		goto out_cleanup_queue;

	/* We bind our one-entry DMA pool to the key for this block device so
	 * the Host can reply to our requests.  The key is equal to the
	 * physical address of the device's page, which is conveniently
	 * unique. */
	err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
	if (err)
		goto out_free_irq;

	/* We finish our disk initialization and add the disk to the system. */
	bd->disk->major = bd->major;
	bd->disk->first_minor = 0;
	bd->disk->private_data = bd;
	bd->disk->fops = &lguestblk_fops;
	/* This is initialized to the disk size by the Launcher. */
	set_capacity(bd->disk, bd->lb_page->num_sectors);
	add_disk(bd->disk);

	printk(KERN_INFO "%s: device %i at major %d\n",
	       bd->disk->disk_name, lgdev->index, bd->major);

	/* We don't need to keep the "struct blockdev" around, but if we ever
	 * implemented device removal, we'd need this. */
	lgdev->private = bd;
	return 0;

out_free_irq:
	free_irq(bd->irq, bd);
out_cleanup_queue:
	blk_cleanup_queue(bd->disk->queue);
out_put_disk:
	put_disk(bd->disk);
out_unregister_blkdev:
	unregister_blkdev(bd->major, "lguestblk");
out_unmap:
	lguest_unmap(bd->lb_page);
out_free_bd:
	kfree(bd);
	return err;
}

/*D:410 The boilerplate code for registering the lguest block driver is just
 * like the console: */
static struct lguest_driver lguestblk_drv = {
	.name = "lguestblk",
	.owner = THIS_MODULE,
	/* We claim block-type lguest devices; presumably the lguest bus
	 * calls .probe for each matching device (confirm in lguest_bus.c). */
	.device_type = LGUEST_DEVICE_T_BLOCK,
	.probe = lguestblk_probe,
};

/* Module entry point: registers lguestblk_drv via register_lguest_driver()
 * and returns that call's result unchanged. */
static __init int lguestblk_init(void)
{
	return register_lguest_driver(&lguestblk_drv);
}
module_init(lguestblk_init);

/* Module metadata. */
MODULE_DESCRIPTION("Lguest block driver");
MODULE_LICENSE("GPL");
Commit	Line	Data
e2c97843 RR	1	/*D:400
e2c97843 RR	2	* The Guest block driver
b754416b	3	*
e2c97843 RR	4	* This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
	5	* The mechanism is simple: we place the information about the request in the
	6	* device page, then use SEND_DMA (containing the data for a write, or an empty
	7	* "ping" DMA for a read).
	8	:*/
	9	/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
b754416b RR	10	*
	11	* This program is free software; you can redistribute it and/or modify
	12	* it under the terms of the GNU General Public License as published by
	13	* the Free Software Foundation; either version 2 of the License, or
	14	* (at your option) any later version.
	15	*
	16	* This program is distributed in the hope that it will be useful,
	17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	* GNU General Public License for more details.
	20	*
	21	* You should have received a copy of the GNU General Public License
	22	* along with this program; if not, write to the Free Software
	23	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	24	*/
	25	//#define DEBUG
	26	#include <linux/init.h>
	27	#include <linux/types.h>
	28	#include <linux/blkdev.h>
	29	#include <linux/interrupt.h>
	30	#include <linux/lguest_bus.h>
	31
	32	static char next_block_index = 'a';
	33
e2c97843 RR	34	/*D:420 Here is the structure which holds all the information we need about
	35	* each Guest block device.
	36	*
	37	* I'm sure at this stage, you're wondering "hey, where was the adventure I was
	38	* promised?" and thinking "Rusty sucks, I shall say nasty things about him on
	39	* my blog". I think Real adventures have boring bits, too, and you're in the
	40	* middle of one. But it gets better. Just not quite yet. */
b754416b RR	41	struct blockdev
b754416b RR	42	{
e2c97843 RR	43	/* The block queue infrastructure wants a spinlock: it is held while it
	44	* calls our block request function. We grab it in our interrupt
	45	* handler so the responses don't mess with new requests. */
b754416b RR	46	spinlock_t lock;
b754416b RR	47
e2c97843	48	/* The disk structure registered with kernel. */
b754416b RR	49	struct gendisk *disk;
b754416b RR	50
e2c97843 RR	51	/* The major device number for this disk, and the interrupt. We only
	52	* really keep them here for completeness; we'd need them if we
	53	* supported device unplugging. */
b754416b RR	54	int major;
	55	int irq;
	56
e2c97843	57	/* The physical address of this device's memory page */
b754416b	58	unsigned long phys_addr;
e2c97843	59	/* The mapped memory page for convenient acces. */
b754416b RR	60	struct lguest_block_page *lb_page;
b754416b RR	61
e2c97843	62	/* We only have a single request outstanding at a time: this is it. */
b754416b RR	63	struct lguest_dma dma;
	64	struct request *req;
	65	};
	66
e2c97843 RR	67	/*D:495 We originally used end_request() throughout the driver, but it turns
	68	* out that end_request() is deprecated, and doesn't actually end the request
	69	* (which seems like a good reason to deprecate it!). It simply ends the first
	70	* bio. So if we had 3 bios in a "struct request" we would do all 3,
	71	* end_request(), do 2, end_request(), do 1 and end_request(): twice as much
	72	* work as we needed to do.
	73	*
	74	* This reinforced to me that I do not understand the block layer.
	75	*
	76	* Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
	77	* request. This improved disk speed by 130%. */
b754416b RR	78	static void end_entire_request(struct request *req, int uptodate)
	79	{
	80	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
	81	BUG();
	82	add_disk_randomness(req->rq_disk);
	83	blkdev_dequeue_request(req);
	84	end_that_request_last(req, uptodate);
	85	}
	86
e2c97843 RR	87	/* I'm told there are only two stories in the world worth telling: love and
	88	* hate. So there used to be a love scene here like this:
	89	*
	90	* Launcher: We could make beautiful I/O together, you and I.
	91	* Guest: My, that's a big disk!
	92	*
	93	* Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
	94
	95	/*D:490 This is the interrupt handler, called when a block read or write has
	96	* been completed for us. */
b754416b RR	97	static irqreturn_t lgb_irq(int irq, void *_bd)
b754416b RR	98	{
e2c97843 RR	99	/* We handed our "struct blockdev" as the argument to request_irq(), so
	100	* it is passed through to us here. This tells us which device we're
	101	* dealing with in case we have more than one. */
b754416b RR	102	struct blockdev *bd = _bd;
	103	unsigned long flags;
	104
e2c97843 RR	105	/* We weren't doing anything? Strange, but could happen if we shared
e2c97843 RR	106	* interrupts (we don't!). */
b754416b RR	107	if (!bd->req) {
	108	pr_debug("No work!\n");
	109	return IRQ_NONE;
	110	}
	111
e2c97843	112	/* Not done yet? That's equally strange. */
b754416b RR	113	if (!bd->lb_page->result) {
	114	pr_debug("No result!\n");
	115	return IRQ_NONE;
	116	}
	117
e2c97843	118	/* We have to grab the lock before ending the request. */
b754416b	119	spin_lock_irqsave(&bd->lock, flags);
e2c97843 RR	120	/* "result" is 1 for success, 2 for failure: end_entire_request() wants
e2c97843 RR	121	* to know whether this succeeded or not. */
b754416b	122	end_entire_request(bd->req, bd->lb_page->result == 1);
e2c97843	123	/* Clear out request, it's done. */
b754416b	124	bd->req = NULL;
e2c97843	125	/* Reset incoming DMA for next time. */
b754416b	126	bd->dma.used_len = 0;
e2c97843	127	/* Ready for more reads or writes */
b754416b RR	128	blk_start_queue(bd->disk->queue);
b754416b RR	129	spin_unlock_irqrestore(&bd->lock, flags);
e2c97843 RR	130
e2c97843 RR	131	/* The interrupt was for us, we dealt with it. */
b754416b RR	132	return IRQ_HANDLED;
	133	}
	134
e2c97843 RR	135	/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
	136	* each of which contains "struct bio_vec"s, each of which contains a page, an
	137	* offset and a length.
	138	*
	139	* Fortunately there are iterators to help us walk through the "struct
	140	* request". Even more fortunately, there were plenty of places to steal the
	141	* code from. We pack the "struct request" into our "struct lguest_dma" and
	142	* return the total length. */
b754416b RR	143	static unsigned int req_to_dma(struct request req, struct lguest_dma dma)
b754416b RR	144	{
5705f702 N	145	unsigned int i = 0, len = 0;
	146	struct req_iterator iter;
	147	struct bio_vec *bvec;
b754416b	148
5705f702	149	rq_for_each_segment(bvec, req, iter) {
6c92e699 JA	150	/* We told the block layer not to give us too many. */
	151	BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
	152	/* If we had a zero-length segment, it would look like
	153	* the end of the data referred to by the "struct
	154	* lguest_dma", so make sure that doesn't happen. */
	155	BUG_ON(!bvec->bv_len);
	156	/* Convert page & offset to a physical address */
	157	dma->addr[i] = page_to_phys(bvec->bv_page)
	158	+ bvec->bv_offset;
	159	dma->len[i] = bvec->bv_len;
	160	len += bvec->bv_len;
	161	i++;
b754416b	162	}
e2c97843	163	/* If the array isn't full, we mark the end with a 0 length */
b754416b RR	164	if (i < LGUEST_MAX_DMA_SECTIONS)
	165	dma->len[i] = 0;
	166	return len;
	167	}
	168
e2c97843 RR	169	/* This creates an empty DMA, useful for prodding the Host without sending data
e2c97843 RR	170	* (ie. when we want to do a read) */
b754416b RR	171	static void empty_dma(struct lguest_dma *dma)
	172	{
	173	dma->len[0] = 0;
	174	}
	175
e2c97843	176	/D:470 Setting up a request is fairly easy: /
b754416b RR	177	static void setup_req(struct blockdev *bd,
	178	int type, struct request req, struct lguest_dma dma)
	179	{
e2c97843	180	/* The type is 1 (write) or 0 (read). */
b754416b	181	bd->lb_page->type = type;
e2c97843	182	/* The sector on disk where the read or write starts. */
b754416b	183	bd->lb_page->sector = req->sector;
e2c97843	184	/* The result is initialized to 0 (unfinished). */
b754416b	185	bd->lb_page->result = 0;
e2c97843	186	/* The current request (so we can end it in the interrupt handler). */
b754416b	187	bd->req = req;
e2c97843 RR	188	/* The number of bytes: returned as a side-effect of req_to_dma(),
	189	* which packs the block layer's "struct request" into our "struct
	190	* lguest_dma" */
b754416b RR	191	bd->lb_page->bytes = req_to_dma(req, dma);
	192	}
	193
e2c97843 RR	194	/*D:450 Write is pretty straightforward: we pack the request into a "struct
e2c97843 RR	195	* lguest_dma", then use SEND_DMA to send the request. */
b754416b RR	196	static void do_write(struct blockdev bd, struct request req)
	197	{
	198	struct lguest_dma send;
	199
	200	pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
	201	setup_req(bd, 1, req, &send);
	202
	203	lguest_send_dma(bd->phys_addr, &send);
	204	}
	205
e2c97843 RR	206	/* Read is similar to write, except we pack the request into our receive
	207	* "struct lguest_dma" and send through an empty DMA just to tell the Host that
	208	* there's a request pending. */
b754416b RR	209	static void do_read(struct blockdev bd, struct request req)
	210	{
	211	struct lguest_dma ping;
	212
	213	pr_debug("lgb: READ sector %li\n", (long)req->sector);
	214	setup_req(bd, 0, req, &bd->dma);
	215
	216	empty_dma(&ping);
	217	lguest_send_dma(bd->phys_addr, &ping);
	218	}
	219
e2c97843 RR	220	/*D:440 This where requests come in: we get handed the request queue and are
	221	* expected to pull a "struct request" off it until we've finished them or
	222	* we're waiting for a reply: */
165125e1	223	static void do_lgb_request(struct request_queue *q)
b754416b RR	224	{
	225	struct blockdev *bd;
	226	struct request *req;
	227
	228	again:
e2c97843 RR	229	/* This sometimes returns NULL even on the very first time around. I
	230	* wonder if it's something to do with letting elves handle the request
	231	* queue... */
b754416b RR	232	req = elv_next_request(q);
	233	if (!req)
	234	return;
	235
e2c97843	236	/* We attached the struct blockdev to the disk: get it back */
b754416b	237	bd = req->rq_disk->private_data;
e2c97843 RR	238	/* Sometimes we get repeated requests after blk_stop_queue(), but we
e2c97843 RR	239	* can only handle one at a time. */
b754416b RR	240	if (bd->req)
	241	return;
	242
e2c97843	243	/* We only do reads and writes: no tricky business! */
b754416b RR	244	if (!blk_fs_request(req)) {
	245	pr_debug("Got non-command 0x%08x\n", req->cmd_type);
	246	req->errors++;
	247	end_entire_request(req, 0);
	248	goto again;
	249	}
	250
	251	if (rq_data_dir(req) == WRITE)
	252	do_write(bd, req);
	253	else
	254	do_read(bd, req);
	255
e2c97843 RR	256	/* We've put out the request, so stop any more coming in until we get
e2c97843 RR	257	* an interrupt, which takes us to lgb_irq() to re-enable the queue. */
b754416b RR	258	blk_stop_queue(q);
	259	}
	260
e2c97843 RR	261	/*D:430 This is the "struct block_device_operations" we attach to the disk at
e2c97843 RR	262	* the end of lguestblk_probe(). It doesn't seem to want much. */
b754416b RR	263	static struct block_device_operations lguestblk_fops = {
	264	.owner = THIS_MODULE,
	265	};
	266
e2c97843 RR	267	/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
	268	* quite why. I do know that the IDE code sent two or three of the maintainers
	269	* insane, perhaps this is the fringe of the same disease?
	270	*
	271	* As in the console code, the probe function gets handed the generic
	272	* lguest_device from lguest_bus.c: */
b754416b RR	273	static int lguestblk_probe(struct lguest_device *lgdev)
	274	{
	275	struct blockdev *bd;
	276	int err;
	277	int irqflags = IRQF_SHARED;
	278
e2c97843 RR	279	/* First we allocate our own "struct blockdev" and initialize the easy
e2c97843 RR	280	* fields. */
b754416b RR	281	bd = kmalloc(sizeof(*bd), GFP_KERNEL);
	282	if (!bd)
	283	return -ENOMEM;
	284
	285	spin_lock_init(&bd->lock);
	286	bd->irq = lgdev_irq(lgdev);
	287	bd->req = NULL;
	288	bd->dma.used_len = 0;
	289	bd->dma.len[0] = 0;
e2c97843 RR	290	/* The descriptor in the lguest_devices array provided by the Host
e2c97843 RR	291	* gives the Guest the physical page number of the device's page. */
b754416b RR	292	bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
b754416b RR	293
e2c97843	294	/* We use lguest_map() to get a pointer to the device page */
b754416b RR	295	bd->lb_page = lguest_map(bd->phys_addr, 1);
	296	if (!bd->lb_page) {
	297	err = -ENOMEM;
	298	goto out_free_bd;
	299	}
	300
e2c97843	301	/* We need a major device number: 0 means "assign one dynamically". */
b754416b RR	302	bd->major = register_blkdev(0, "lguestblk");
	303	if (bd->major < 0) {
	304	err = bd->major;
	305	goto out_unmap;
	306	}
	307
e2c97843	308	/* This allocates a "struct gendisk" where we pack all the information
9ef7ad22 RR	309	* about the disk which the rest of Linux sees. The argument is the
	310	* number of minor devices desired: we need one minor for the main
	311	* disk, and one for each partition. Of course, we can't possibly know
	312	* how many partitions are on the disk (add_disk does that).
	313	*/
	314	bd->disk = alloc_disk(16);
b754416b RR	315	if (!bd->disk) {
	316	err = -ENOMEM;
	317	goto out_unregister_blkdev;
	318	}
	319
e2c97843 RR	320	/* Every disk needs a queue for requests to come in: we set up the
	321	* queue with a callback function (the core of our driver) and the lock
	322	* to use. */
b754416b RR	323	bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
	324	if (!bd->disk->queue) {
	325	err = -ENOMEM;
	326	goto out_put_disk;
	327	}
	328
e2c97843 RR	329	/* We can only handle a certain number of pointers in our SEND_DMA
	330	* call, so we set that with blk_queue_max_hw_segments(). This is not
	331	* to be confused with blk_queue_max_phys_segments() of course! I
	332	* know, who could possibly confuse the two?
	333	*
	334	* Well, it's simple to tell them apart: this one seems to work and the
	335	* other one didn't. */
b754416b	336	blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
e2c97843 RR	337
	338	/* Due to technical limitations of our Host (and simple coding) we
	339	* can't have a single buffer which crosses a page boundary. Tell it
	340	* here. This means that our maximum request size is 16
	341	* (LGUEST_MAX_DMA_SECTIONS) pages. */
b754416b RR	342	blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
b754416b RR	343
e2c97843 RR	344	/* We name our disk: this becomes the device name when udev does its
	345	* magic thing and creates the device node, such as /dev/lgba.
	346	* next_block_index is a global which starts at 'a'. Unfortunately
	347	* this simple increment logic means that the 27th disk will be called
	348	* "/dev/lgb{". In that case, I recommend having at least 29 disks, so
	349	* your /dev directory will be balanced. */
b754416b	350	sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
e2c97843 RR	351
	352	/* We look to the device descriptor again to see if this device's
	353	* interrupts are expected to be random. If they are, we tell the irq
	354	* subsystem. At the moment this bit is always set. */
b754416b RR	355	if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
b754416b RR	356	irqflags \|= IRQF_SAMPLE_RANDOM;
e2c97843 RR	357
	358	/* Now we have the name and irqflags, we can request the interrupt; we
	359	* give it the "struct blockdev" we have set up to pass to lgb_irq()
	360	* when there is an interrupt. */
b754416b RR	361	err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
	362	if (err)
	363	goto out_cleanup_queue;
	364
e2c97843 RR	365	/* We bind our one-entry DMA pool to the key for this block device so
	366	* the Host can reply to our requests. The key is equal to the
	367	* physical address of the device's page, which is conveniently
	368	* unique. */
b754416b RR	369	err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
	370	if (err)
	371	goto out_free_irq;
	372
e2c97843	373	/* We finish our disk initialization and add the disk to the system. */
b754416b RR	374	bd->disk->major = bd->major;
	375	bd->disk->first_minor = 0;
	376	bd->disk->private_data = bd;
	377	bd->disk->fops = &lguestblk_fops;
e2c97843	378	/* This is initialized to the disk size by the Launcher. */
b754416b RR	379	set_capacity(bd->disk, bd->lb_page->num_sectors);
	380	add_disk(bd->disk);
	381
	382	printk(KERN_INFO "%s: device %i at major %d\n",
	383	bd->disk->disk_name, lgdev->index, bd->major);
	384
e2c97843 RR	385	/* We don't need to keep the "struct blockdev" around, but if we ever
e2c97843 RR	386	* implemented device removal, we'd need this. */
b754416b RR	387	lgdev->private = bd;
	388	return 0;
	389
	390	out_free_irq:
	391	free_irq(bd->irq, bd);
	392	out_cleanup_queue:
	393	blk_cleanup_queue(bd->disk->queue);
	394	out_put_disk:
	395	put_disk(bd->disk);
	396	out_unregister_blkdev:
	397	unregister_blkdev(bd->major, "lguestblk");
	398	out_unmap:
	399	lguest_unmap(bd->lb_page);
	400	out_free_bd:
	401	kfree(bd);
	402	return err;
	403	}
	404
e2c97843 RR	405	/*D:410 The boilerplate code for registering the lguest block driver is just
e2c97843 RR	406	* like the console: */
b754416b RR	407	static struct lguest_driver lguestblk_drv = {
	408	.name = "lguestblk",
	409	.owner = THIS_MODULE,
	410	.device_type = LGUEST_DEVICE_T_BLOCK,
	411	.probe = lguestblk_probe,
	412	};
	413
	414	static __init int lguestblk_init(void)
	415	{
	416	return register_lguest_driver(&lguestblk_drv);
	417	}
	418	module_init(lguestblk_init);
	419
	420	MODULE_DESCRIPTION("Lguest block driver");
	421	MODULE_LICENSE("GPL");