[PATCH] fcache: add file system frontend cache fcache
authorJens Axboe <axboe@kernel.dk>
Thu, 21 Sep 2006 11:52:16 +0000 (13:52 +0200)
committerJens Axboe <axboe@kernel.dk>
Thu, 21 Sep 2006 11:52:16 +0000 (13:52 +0200)
fcache is a frontend cache for a file system that aims to reduce a
dominantly repeatable, seeky read workload to sequential access. It does
this by first running the workload in a priming mode, logging the
accesses to the file system on a cache device. Subsequent runs in
non-priming mode then access the data sequentially from the frontend
cache instead of punting to the real device.

The primary intended application is to reduce boot time by attaching
to the root file system.
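
As a rough illustration (a sketch only -- the actual wiring is done in the
fs/ext3/super.c changes below, and the mount option handling is not shown
here), a file system attaches and detaches the cache through the two
exported hooks:

	/* at mount time: attach the cache device to the fs device */
	ret = fcache_dev_open(sb->s_bdev, cache_devnum, priming, serial);

	/* at umount time: write back extents and header, then detach */
	fcache_dev_close(sb->s_bdev, serial);

where cache_devnum, priming and serial are placeholders for values the
file system takes from its mount options and its own bookkeeping.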

Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/ll_rw_blk.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/fcache.c [new file with mode: 0644]
fs/ext3/super.c
include/linux/bio.h
include/linux/ext3_fs.h
include/linux/fs.h

index ddd9253f9d55f267120ddf04a014eaecd12d283c..bc0085e811030e39c60b9d74848d4160b2208e48 100644 (file)
@@ -2819,12 +2819,10 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
         */
        if (bio_rw_ahead(bio) || bio_failfast(bio))
                req->flags |= REQ_FAILFAST;
-
-       /*
-        * REQ_BARRIER implies no merging, but lets make it explicit
-        */
        if (unlikely(bio_barrier(bio)))
-               req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+               req->flags |= REQ_HARDBARRIER;
+       if (!bio_mergeable(bio))
+               req->flags |= REQ_NOMERGE;
 
        if (bio_sync(bio))
                req->flags |= REQ_RW_SYNC;
@@ -2875,7 +2873,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 
        spin_lock_irq(q->queue_lock);
 
-       if (unlikely(barrier) || elv_queue_empty(q))
+       if (!bio_mergeable(bio) || elv_queue_empty(q))
                goto get_rq;
 
        el_ret = elv_merge(q, &req, bio);
@@ -3114,6 +3112,7 @@ void submit_bio(int rw, struct bio *bio)
 
        BIO_BUG_ON(!bio->bi_size);
        BIO_BUG_ON(!bio->bi_io_vec);
+       BIO_BUG_ON(bio->bi_next);
        bio->bi_rw |= rw;
        if (rw & WRITE)
                count_vm_events(PGPGOUT, count);
index b5382cedf0c09cad0376df575c32b194d5ba7c0d..ec3af195e0b69f0bb1cd6d474542c80d5e95e5d4 100644 (file)
@@ -467,4 +467,19 @@ config ATA_OVER_ETH
        This driver provides Support for ATA over Ethernet block
        devices like the Coraid EtherDrive (R) Storage Blade.
 
+config BLK_FCACHE
+       bool "Frontend block device cache driver"
+       help
+       This driver monitors file system read activity on a partition
+       and logs that to a separate cache device. The intention is to
+       reduce a mostly repeatable seeky read workload to a much faster
+       sequential access pattern from the cache device. The primary
+       intended application for fcache is to reduce boot time by attaching
+       it to the root file system.
+
+       There is a simple userspace consistency checker available through
+       git at:
+
+       git://brick.kernel.dk/data/git/fcache.git
+
 endmenu
index 410f259a8031a651abbb13fb7f084d75da150450..5efeee98e6ae2920cb6073abecdec668b977cc9f 100644 (file)
@@ -5,6 +5,7 @@
 # Rewritten to use lists instead of if-statements.
 # 
 
+obj-$(CONFIG_BLK_FCACHE)       += fcache.o
 obj-$(CONFIG_MAC_FLOPPY)       += swim3.o
 obj-$(CONFIG_BLK_DEV_FD)       += floppy.o
 obj-$(CONFIG_AMIGA_FLOPPY)     += amiflop.o
diff --git a/drivers/block/fcache.c b/drivers/block/fcache.c
new file mode 100644 (file)
index 0000000..375b153
--- /dev/null
@@ -0,0 +1,1853 @@
+/*
+ * A frontend cache for a block device. The purpose is to speed up a
+ * fairly random but repeated read workload, like the boot of a system.
+ *
+ * When run in priming mode, fcache allocates and writes data read from
+ * the source drive to our extent cache in the order in which it is
+ * accessed. When later run in non-priming mode, data accessed in the
+ * same order will be linearly available in the cache.
+ *
+ * Priming is slower than running without fcache would be. If the fcache
+ * is located on another disk, the hit should be small. If the fcache is
+ * located on the same disk (another partition), it runs at about half
+ * the speed. Non-priming performance should be fairly similar on
+ * same/other disk.
+ *
+ * On-disk format is as follows:
+ *     Block0:         header
+ *     Block1..X       extent maps
+ *     BlockX+1..Y     extent data
+ *
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <linux/prio_tree.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+#define FCACHE_MAGIC   0x61786663
+#define FCACHE_VERSION 0x03
+
+#define FCACHE_HEADER_BLOCK    0
+#define FCACHE_EXTENT_BLOCK    1
+
+/*
+ * If some extents see repeated overwrites, just get rid of them.
+ */
+#define FCACHE_MAX_OVERWRITES  5
+#define FCACHE_OVERWRITE_TIME  HZ
+
+#undef FCACHE_PAGES_PROTECTED
+
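+/*
+ * Per-device state. 'bdev' is the cache device, 'fs_bdev' the file
+ * system device being monitored, and 'mfn' the original
+ * make_request_fn that fcache_make_request() wraps.
+ */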
+struct fcache_dev {
+       struct block_device *bdev;
+       struct block_device *fs_bdev;
+       make_request_fn *mfn;
+       struct prio_tree_root prio_root;
+       struct list_head fe_list;
+       unsigned long next_cache_block;
+       unsigned long nr_extents;
+       unsigned long max_extents;
+       unsigned int old_bs;
+       spinlock_t lock;
+
+       sector_t cache_start_sector;
+       unsigned long cache_blocks;
+       sector_t fs_start_sector;
+       sector_t fs_sectors;
+
+       unsigned long flags;
+       int priming;
+       int serial;
+       int chop_ios;
+
+       struct list_head list;
+       struct work_struct work;
+
+       struct fcache_extent *last_fe;
+
+       struct rb_root rb_root;         /* sorted pcache pages */
+
+       /*
+        * stats
+        */
+       unsigned int ios[2];
+       unsigned int hits;
+       unsigned int misses;
+       unsigned int partial;
+       unsigned int overwrites;
+       unsigned int extent_merges;
+       unsigned int extent_drops;
+       unsigned long pcache_kb;
+       unsigned int pcache_hits;
+};
+
+enum {
+       FDEV_F_DOWN = 0,
+};
+
+static struct fcache_dev fcache_dev;
+
+static int disable;
+module_param(disable, int, 0444);
+static int pcache_disable;
+module_param(pcache_disable, int, 0444);
+
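+/*
+ * Tracks one cache transfer: the fs range involved, where it lives on
+ * the cache device, and the original bio to complete once all chopped
+ * pieces of the io have finished.
+ */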
+struct fcache_endio_data {
+       struct fcache_dev *fdev;
+       sector_t fs_sector;
+       unsigned int fs_size;
+       sector_t cache_sector;
+       atomic_t completions;
+       struct bio *bio;
+       int io_error;
+       struct list_head list;
+};
+
+/*
+ * Maps a file system block to the fcache
+ */
+struct fcache_extent {
+       sector_t fs_sector;     /* real device offset */
+       unsigned int fs_size;   /* extent length */
+       sector_t cache_sector;  /* cache device offset */
+
+       struct prio_tree_node prio_node;        /* fs_sector sorted */
+       struct list_head fe_list;               /* cache_sector sorted */
+
+       struct page **pages;    /* prefetch window pages */
+       int numpages;
+       int readpages;
+
+       unsigned int valid;     /* extent is valid */
+       unsigned int overwrites;        /* number of times we wrote over */
+       unsigned long ow_time;  /* overwrite time */
+};
+
+/*
+ * On-disk representation of fcache_extent.
+ */
+struct fcache_ondisk_extent {
+       __be64 fs_sector;
+       __be64 cache_sector;
+       __be32 fs_size;
+};
+
+/*
+ * Header on fcache device - will take up the first page of data, so
+ * plenty of room to go around.
+ */
+struct fcache_header {
+       u32 magic;              /* fcache magic */
+       u32 version;            /* fcache version */
+       u32 nr_extents;         /* nr of extents in cache */
+       u32 max_extents;        /* max nr of extents available */
+       u32 serial;             /* fs and cache serial */
+       u32 extent_offset;      /* where extents start */
+       u64 fs_start_sector;    /* where fs starts */
+       u64 fs_sectors;         /* how big fs is */
+       char fs_dev[BDEVNAME_SIZE];     /* fs partition */
+       u64 cache_blocks;       /* number of blocks in cache */
+       u64 cache_blocks_used;  /* used blocks in cache */
+};
+
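+/*
+ * The cache is managed in page sized blocks; FBLOCK_SHIFT converts a
+ * cache block index into 512-byte sectors.
+ */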
+#define FBLOCK_SHIFT (PAGE_SHIFT - 9)
+#define FBLOCK_SIZE (1 << FBLOCK_SHIFT)
+#define FBLOCK_MASK (FBLOCK_SIZE - 1)
+
+static struct kmem_cache *fcache_slab;
+static struct kmem_cache *fcache_fed_slab;
+static mempool_t *fed_pool;
+static struct workqueue_struct *fcache_workqueue;
+
+static void __fcache_fe_drop(struct fcache_extent *fe);
+
+static int fcache_rw_page_endio(struct bio *bio, unsigned int bytes, int err)
+{
+       if (bio->bi_size)
+               return 1;
+
+       complete(bio->bi_private);
+       return 0;
+}
+
+/*
+ * Reads or writes a single page at the given cache block index and
+ * waits for it to complete.
+ */
+static int fcache_rw_page(struct fcache_dev *fdev, sector_t index,
+                         struct page *page, int rw)
+{
+       DECLARE_COMPLETION(wait);
+       struct bio *bio;
+       int ret = 0;
+
+       bio = bio_alloc(GFP_KERNEL, 1);
+
+       bio->bi_sector = index << FBLOCK_SHIFT;
+       bio->bi_bdev = fdev->bdev;
+       bio->bi_rw |= (1 << BIO_RW_SYNC);
+       bio->bi_end_io = fcache_rw_page_endio;
+       bio->bi_private = &wait;
+
+       bio_add_page(bio, page, PAGE_SIZE, 0);
+       submit_bio(rw, bio);
+
+       wait_for_completion(&wait);
+
+       if (!bio_flagged(bio, BIO_UPTODATE))
+               ret = -EIO;
+
+       bio_put(bio);
+       return ret;
+}
+
+static inline void fcache_fill_header(struct fcache_dev *fdev,
+                                     struct fcache_header *header,
+                                     unsigned int nr_extents)
+{
+       /*
+        * See how many pages we need for extent headers, then we know where
+        * to start putting data. Assume worst case of 1 page per extent, and
+        * reserve the first page for the header.
+        */
+
+       header->magic = FCACHE_MAGIC;
+       header->version = FCACHE_VERSION;
+       header->nr_extents = nr_extents;
+       header->max_extents = ((fdev->cache_blocks - 1) * PAGE_SIZE) / (PAGE_SIZE - sizeof(struct fcache_ondisk_extent));
+       header->serial = fdev->serial;
+
+       header->extent_offset = 1 + (header->max_extents * sizeof(struct fcache_ondisk_extent) / PAGE_SIZE);
+
+       header->fs_start_sector = fdev->fs_start_sector;
+       header->fs_sectors = fdev->fs_sectors;
+       bdevname(fdev->fs_bdev, header->fs_dev);
+       header->cache_blocks = fdev->cache_blocks;
+       header->cache_blocks_used = fdev->next_cache_block;
+}
+
+static int fcache_write_new_header(struct fcache_dev *fdev)
+{
+       struct fcache_header *header;
+       struct page *page;
+       int ret;
+
+       page = alloc_page(GFP_HIGHUSER);
+       if (unlikely(!page))
+               return -ENOMEM;
+
+       header = kmap_atomic(page, KM_USER0);
+       clear_page(header);
+       fcache_fill_header(fdev, header, 0);
+       fdev->next_cache_block = header->extent_offset;
+       fdev->max_extents = header->max_extents;
+       kunmap_atomic(header, KM_USER0);
+
+       printk("fcache: new header: first block %lu, max %lu\n",
+                               fdev->next_cache_block, fdev->max_extents);
+       ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
+       __free_page(page);
+       return ret;
+}
+
+static void fcache_free_links(struct fcache_dev *fdev)
+{
+       struct fcache_extent *fe;
+       int i;
+
+       while (!list_empty(&fdev->fe_list)) {
+               fe = list_entry(fdev->fe_list.next, struct fcache_extent, fe_list);
+
+               list_del_init(&fe->fe_list);
+
+               for (i = 0; i < fe->numpages; i++) {
+                       wait_on_page_locked(fe->pages[i]);
+                       __free_page(fe->pages[i]);
+               }
+
+               prio_tree_remove(&fdev->prio_root, &fe->prio_node);
+               kmem_cache_free(fcache_slab, fe);
+       }
+}
+
+/*
+ * First clear the header, write extents, then write real header.
+ */
+static int fcache_write_extents(struct fcache_dev *fdev)
+{
+       struct fcache_header *header;
+       sector_t index, sectors;
+       unsigned int nr_extents, this_extents;
+       struct fcache_extent *fe;
+       struct prio_tree_iter iter;
+       struct prio_tree_node *node;
+       struct page *page;
+       void *p;
+       int ret;
+
+       page = alloc_page(GFP_KERNEL);
+       if (unlikely(!page))
+               return -ENOMEM;
+
+       header = page_address(page);
+       clear_page(header);
+       fcache_fill_header(fdev, header, 0);
+       ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
+       if (ret)
+               goto err;
+
+       /*
+        * Now write the extents in page size chunks.
+        */
+       p = page_address(page);
+       clear_page(p);
+       index = FCACHE_EXTENT_BLOCK;
+       sectors = 0;
+       this_extents = nr_extents = 0;
+
+       prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
+
+       do {
+               struct fcache_ondisk_extent *foe;
+
+               node = prio_tree_next(&iter);
+               if (!node)
+                       break;
+
+               fe = prio_tree_entry(node, struct fcache_extent, prio_node);
+               if (!fe->valid)
+                       continue;
+
+               foe = p;
+               foe->fs_sector = cpu_to_be64(fe->fs_sector);
+               foe->cache_sector = cpu_to_be64(fe->cache_sector);
+               foe->fs_size = cpu_to_be32(fe->fs_size);
+
+               nr_extents++;
+               this_extents++;
+               sectors += fe->fs_size >> 9;
+               p += sizeof(*foe);
+               if ((this_extents + 1) * sizeof(*foe) > PAGE_SIZE) {
+                       ret = fcache_rw_page(fdev, index, page, WRITE);
+                       if (ret)
+                               break;
+
+                       this_extents = 0;
+                       index++;
+                       p = page_address(page);
+               }
+       } while (1);
+
+       if (this_extents)
+               ret = fcache_rw_page(fdev, index, page, WRITE);
+
+       fdev->nr_extents = nr_extents;
+       printk("fcache: wrote %d extents, holding %llu sectors of data\n",
+                               nr_extents, (unsigned long long) sectors);
+err:
+       __free_page(page);
+       return ret;
+}
+
+static int fcache_write_header(struct fcache_dev *fdev)
+{
+       struct page *page;
+       int ret;
+
+       page = alloc_page(GFP_KERNEL);
+       if (unlikely(!page))
+               return -ENOMEM;
+
+       ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
+       if (!ret) {
+               struct fcache_header *header = page_address(page);
+
+               fcache_fill_header(fdev, header, fdev->nr_extents);
+               ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
+               printk("fcache: wrote header (extents=%lu,serial=%d)\n",
+                                               fdev->nr_extents, fdev->serial);
+       }
+
+       __free_page(page);
+       return ret;
+}
+
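+/*
+ * Insert 'fe' into the cache_sector sorted fe_list. We walk backwards
+ * from the tail, since new extents are normally appended in cache
+ * order.
+ */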
+static void fcache_fe_link(struct fcache_dev *fdev, struct fcache_extent *fe)
+{
+       struct fcache_extent *__fe;
+       struct list_head *entry;
+
+       entry = &fdev->fe_list;
+       while ((entry = entry->prev) != &fdev->fe_list) {
+               __fe = list_entry(entry, struct fcache_extent, fe_list);
+               if (__fe->cache_sector < fe->cache_sector)
+                       break;
+       }
+
+       list_add(&fe->fe_list, entry);
+}
+
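+/*
+ * Link 'fe' into the fs_sector indexed prio tree. Returns 0 on success,
+ * 1 if an identical range is already present.
+ */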
+static int fcache_tree_link(struct fcache_dev *fdev, struct fcache_extent *fe)
+{
+       struct prio_tree_node *node = &fe->prio_node;
+       struct prio_tree_node *ret;
+
+       node->start = fe->fs_sector;
+       node->last = fe->fs_sector + (fe->fs_size >> 9) - 1;
+
+       ret = prio_tree_insert(&fdev->prio_root, node);
+       if (ret == node)
+               return 0;
+
+       return 1;
+}
+
+static void fcache_tree_reposition(struct fcache_dev *fdev,
+                                  struct fcache_extent *fe)
+{
+       struct prio_tree_node *node = &fe->prio_node;
+       unsigned long flags;
+
+       spin_lock_irqsave(&fdev->lock, flags);
+       prio_tree_remove(&fdev->prio_root, node);
+       node->start = fe->fs_sector;
+       node->last = fe->fs_sector + (fe->fs_size >> 9) - 1;
+
+       prio_tree_insert(&fdev->prio_root, node);
+       spin_unlock_irqrestore(&fdev->lock, flags);
+}
+
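+/*
+ * Allocate and initialize a new extent. An ow_time of -1 means the
+ * overwrite window has not been started yet.
+ */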
+static struct fcache_extent *fcache_get_fe(gfp_t gfp_mask)
+{
+       struct fcache_extent *fe = kmem_cache_alloc(fcache_slab, gfp_mask);
+
+       if (fe) {
+               memset(fe, 0, sizeof(*fe));
+               INIT_PRIO_TREE_NODE(&fe->prio_node);
+               INIT_LIST_HEAD(&fe->fe_list);
+               fe->valid = 1;
+               fe->ow_time = -1;
+       }
+
+       return fe;
+}
+
+#define MAX_FE 16
+
+/*
+ * Lookup the range of a given request in the prio tree. Used for both
+ * looking up a range covering a read operation to be served from cache,
+ * and to lookup potential conflicts from a new write with an existing
+ * extent.
+ */
+static int fcache_lookup_extent(struct fcache_dev *fdev, sector_t offset,
+                               sector_t end_sector, struct fcache_extent **map)
+{
+       struct prio_tree_node *node;
+       struct prio_tree_iter iter;
+       int i = 0, need_lock;
+
+       prio_tree_iter_init(&iter, &fdev->prio_root, offset, end_sector);
+
+       /*
+        * We only need to lock if we are priming. The prio tree does
+        * not change when in normal mode.
+        */
+       need_lock = fdev->priming;
+       if (need_lock)
+               spin_lock_irq(&fdev->lock);
+
+       do {
+               struct fcache_extent *fe;
+
+               node = prio_tree_next(&iter);
+               if (!node)
+                       break;
+
+               fe = prio_tree_entry(node, struct fcache_extent, prio_node);
+               smp_wmb();
+               if (fe->valid)
+                       map[i++] = fe;
+       } while (i < MAX_FE);
+
+       if (need_lock)
+               spin_unlock_irq(&fdev->lock);
+
+       return i;
+}
+
+/*
+ * Our data write is done; now insert the fcache extent into the prio tree.
+ */
+static int fcache_instantiate_extent(struct fcache_dev *fdev,
+                                    struct fcache_endio_data *fed)
+{
+       struct fcache_extent *fe = fdev->last_fe;
+
+       /*
+        * Don't instantiate an errored io.
+        */
+       if (fed->io_error)
+               return -EIO;
+
+       /*
+        * Check if we can merge this extent with the previous one.
+        */
+       if (fe && (fe->fs_sector + (fe->fs_size >> 9) == fed->fs_sector) &&
+           (fe->cache_sector + (fe->fs_size >> 9) == fed->cache_sector) &&
+           fe->valid) {
+               fe->fs_size += fed->fs_size;
+               fcache_tree_reposition(fdev, fe);
+               fdev->nr_extents--;
+               fdev->extent_merges++;
+       } else {
+               unsigned long flags;
+
+               fe = fcache_get_fe(GFP_ATOMIC);
+               if (unlikely(!fe))
+                       return -ENOMEM;
+
+               fe->fs_sector = fed->fs_sector;
+               fe->fs_size = fed->fs_size;
+               fe->cache_sector = fed->cache_sector;
+
+               spin_lock_irqsave(&fdev->lock, flags);
+               if (!fcache_tree_link(fdev, fe)) {
+                       fcache_fe_link(fdev, fe);
+                       fdev->last_fe = fe;
+               } else {
+                       fdev->nr_extents--;
+                       kmem_cache_free(fcache_slab, fe);
+               }
+
+               spin_unlock_irqrestore(&fdev->lock, flags);
+       }
+
+       return 0;
+}
+
+/*
+ * Hang on to the bio and its pages - ideally we would want to ensure
+ * that the page data doesn't change between calling this function and
+ * fcache_put_bio_pages() as well...
+ */
+static void fcache_get_bio_pages(struct fcache_dev *fdev, struct bio *bio)
+{
+       /*
+        * Currently stubbed out, as we cannot end the bio read before
+        * the write completes without also making sure that the pages
+        * don't get reused for something else in the mean time.
+        */
+#ifdef FCACHE_PAGES_PROTECTED
+       struct bio_vec *bvec;
+       int i;
+
+       bio_get(bio);
+
+       __bio_for_each_segment(bvec, bio, i, 0)
+               get_page(bvec->bv_page);
+#endif
+}
+
+static void fcache_put_bio_pages(struct fcache_dev *fdev, struct bio *bio)
+{
+#ifdef FCACHE_PAGES_PROTECTED
+       struct bio_vec *bvec;
+       int i;
+
+       __bio_for_each_segment(bvec, bio, i, 0)
+               put_page(bvec->bv_page);
+
+       bio_put(bio);
+#endif
+}
+
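+/*
+ * A piece of the cache write has completed. When the last piece is in,
+ * end the original read and instantiate the extent.
+ */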
+static void fcache_chop_write_done(struct fcache_endio_data *fed)
+{
+       /*
+        * Last io completes.
+        */
+       if (atomic_dec_and_test(&fed->completions)) {
+               struct fcache_dev *fdev = fed->fdev;
+               struct bio *bio = fed->bio;
+
+               /*
+                * Release our reference to the original bio and
+                * its pages.
+                */
+               fcache_put_bio_pages(fdev, bio);
+
+               /*
+                * End the read!
+                */
+               bio_endio(bio, bio->bi_size, fed->io_error);
+
+               /*
+                * If we don't get the extent added, remember to decrement
+                * the number of extents.
+                */
+               if (fcache_instantiate_extent(fdev, fed)) {
+                       unsigned long flags;
+
+                       spin_lock_irqsave(&fdev->lock, flags);
+                       fdev->nr_extents--;
+                       spin_unlock_irqrestore(&fdev->lock, flags);
+               }
+
+               mempool_free(fed, fed_pool);
+       }
+}
+
+/*
+ * Our data write to the cache has completed; we can free our clone and
+ * instantiate the extent block.
+ */
+static int fcache_extent_write_endio(struct bio *bio, unsigned int bytes,
+                                    int err)
+{
+       struct fcache_endio_data *fed;
+
+       if (bio->bi_size)
+               return 1;
+
+       fed = bio->bi_private;
+
+       if (!bio_flagged(bio, BIO_UPTODATE))
+               fed->io_error = -EIO;
+
+       bio_put(bio);
+       fcache_chop_write_done(fed);
+       return 0;
+}
+
+static void fcache_chop_read_done(struct fcache_endio_data *fed)
+{
+       if (atomic_dec_and_test(&fed->completions)) {
+               struct bio *bio = fed->bio;
+
+               bio_endio(bio, bio->bi_size, fed->io_error);
+               mempool_free(fed, fed_pool);
+       }
+}
+
+static int fcache_chop_read_endio(struct bio *bio, unsigned int bytes, int err)
+{
+       struct fcache_endio_data *fed;
+
+       if (bio->bi_size)
+               return 1;
+
+       fed = bio->bi_private;
+
+       if (!bio_flagged(bio, BIO_UPTODATE))
+               fed->io_error = -EIO;
+
+       bio_put(bio);
+       fcache_chop_read_done(fed);
+       return 0;
+}
+
+typedef void (chopper_done_t) (struct fcache_endio_data *);
+
+/*
+ * This is our io chopper - it hacks a bio into smaller pieces, suitable
+ * for the target device. Caller supplies suitable end_io and done functions.
+ */
+static void fcache_io_chopper(struct fcache_dev *fdev,
+                             struct fcache_endio_data *fed,
+                             bio_end_io_t *endio, chopper_done_t *done, int rw)
+{
+       struct bio *bio = NULL;
+       struct bio_vec *bv;
+       unsigned int total_bytes;
+       sector_t sector;
+       int i, vecs;
+
+       /*
+        * Make sure 'fed' doesn't disappear while we are still issuing
+        * ios, the artificial reference is dropped at the end.
+        */
+       atomic_set(&fed->completions, 1);
+
+       sector = fed->cache_sector;
+       total_bytes = fed->fs_size;
+       vecs = fed->bio->bi_vcnt;
+       bio_for_each_segment(bv, fed->bio, i) {
+               unsigned int len;
+
+               if (!total_bytes)
+                       break;
+
+               len = bv->bv_len;
+               if (len > total_bytes)
+                       len = total_bytes;
+
+               do {
+                       unsigned int l;
+
+                       if (!bio) {
+                               bio = bio_alloc(GFP_NOFS, vecs);
+
+                               bio->bi_sector = sector;
+                               bio->bi_bdev = fdev->bdev;
+                               bio->bi_end_io = endio;
+                               bio->bi_private = fed;
+                       }
+
+                       /*
+                        * If successful, break out of this loop and move on.
+                        */
+                       l = bio_add_page(bio, bv->bv_page, len, bv->bv_offset);
+                       if (l == len)
+                               break;
+
+                       BUG_ON(!bio->bi_size);
+
+                       /*
+                        * We could not add this page, submit what we have
+                        * and alloc a new bio.
+                        */
+                       atomic_inc(&fed->completions);
+                       submit_bio(rw, bio);
+                       bio = NULL;
+               } while (1);
+
+               total_bytes -= len;
+               sector += len >> 9;
+               vecs--;
+       }
+
+       if (bio) {
+               atomic_inc(&fed->completions);
+               submit_bio(rw, bio);
+       }
+
+       /*
+        * Drop our reference to fed.
+        */
+       done(fed);
+}
+
+/*
+ * The cache device queue is no more restrictive than the fs device
+ * queue - in that case, we can resubmit the bio to the device directly.
+ */
+static void fcache_direct_cache_write(struct fcache_dev *fdev,
+                                     struct fcache_endio_data *fed)
+{
+       struct bio *bio = bio_clone(fed->bio, GFP_NOFS);
+
+       bio->bi_sector = fed->cache_sector;
+       bio->bi_bdev = fdev->bdev;
+       bio->bi_rw &= ~(1 << BIO_RW_SYNC);
+       bio->bi_end_io = fcache_extent_write_endio;
+       bio->bi_private = fed;
+
+       atomic_set(&fed->completions, 1);
+       submit_bio(WRITE, bio);
+}
+
+/*
+ * If the cache device has more conservative restrictions than the fs
+ * device, the safest approach is to split up the bio and let bio_add_page()
+ * decide when it's time to submit the pieces.
+ */
+static void fcache_submit_cache_write(struct fcache_dev *fdev,
+                                     struct fcache_endio_data *fed)
+{
+       if (!fdev->chop_ios)
+               fcache_direct_cache_write(fdev, fed);
+       else
+               fcache_io_chopper(fdev, fed, fcache_extent_write_endio,
+                                       fcache_chop_write_done, WRITE);
+}
+
+/*
+ * We punt work to fcache_work() whenever we need to do work that blocks. The
+ * only thing that this thread handles is submitting the extent write
+ * when the real read has completed. We used to do the extent instantiation
+ * here as well, but fcache_extent_write_endio handles that now.
+ */
+static void fcache_work(void *data)
+{
+       struct fcache_dev *fdev = data;
+
+       do {
+               struct fcache_endio_data *fed = NULL;
+               struct bio *bio;
+
+               spin_lock_irq(&fdev->lock);
+               if (!list_empty(&fdev->list)) {
+                       fed = list_entry(fdev->list.next, struct fcache_endio_data,list);
+                       list_del_init(&fed->list);
+               }
+               spin_unlock_irq(&fdev->lock);
+
+               if (!fed)
+                       break;
+
+               bio = fed->bio;
+
+               if (fed->io_error) {
+                       printk(KERN_ERR "fcache: read error from device\n");
+                       bio_endio(bio, bio->bi_size, fed->io_error);
+                       continue;
+               }
+
+               /*
+                * Get a ref on the original bio and pages, then
+                * we should be able to signal completion of the READ
+                * without waiting for the write to finish first.
+                */
+               fcache_get_bio_pages(fdev, bio);
+
+               /*
+                * Submit the read data as cache writes.
+                */
+               fcache_submit_cache_write(fdev, fed);
+
+               /*
+                * If fcache_get_bio_pages() could protect the pages from
+                * being changed, we could end the io here instead of in
+                * fcache_extent_fed_completes().
+                */
+       } while (1);
+}
+
+/*
+ * Align bio to start at extent and stop sooner if extent is short. Must
+ * be called cautiously - it's only allowed to modify the bio if this is
+ * a clone and a write request; reads must be fully aligned and only
+ * possibly require a starting offset modification.
+ */
+static void fcache_bio_align(struct bio *bio, struct fcache_extent *fe)
+{
+       struct bio_vec *bvec;
+       sector_t start, end;
+       sector_t org_start, org_end;
+       unsigned int org_size, org_idx;
+       int i;
+
+       start = bio->bi_sector;
+       bio->bi_sector = fe->cache_sector;
+
+       /*
+        * Nothing to do, perfectly aligned.
+        */
+       if (start == fe->fs_sector && bio->bi_size == fe->fs_size)
+               return;
+
+       org_start = bio->bi_sector;
+       org_end = bio->bi_sector + (bio->bi_size >> 9);
+       org_size = bio->bi_size;
+       org_idx = bio->bi_idx;
+
+       /*
+        * Adjust beginning.
+        */
+       if (start > fe->fs_sector)
+               bio->bi_sector += (start - fe->fs_sector);
+       else if (start < fe->fs_sector) {
+               sector_t diff = fe->fs_sector - start;
+               int idx = 0;
+
+               BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
+               BUG_ON(bio_data_dir(bio) != WRITE);
+
+               /*
+                * Adjust where bio starts
+                */
+               __bio_for_each_segment(bvec, bio, i, 0) {
+                       unsigned int bsec = bvec->bv_len >> 9;
+                       unsigned int this_diff = bsec;
+
+                       if (!diff)
+                               break;
+                       if (this_diff > diff)
+                               this_diff = diff;
+
+                       bio->bi_sector += this_diff;
+                       bio->bi_size -= (this_diff << 9);
+
+                       /*
+                        * Bigger than this chunk, skip ahead.
+                        */
+                       if (this_diff == bsec) {
+                               idx++;
+                               diff -= this_diff;
+                               continue;
+                       }
+
+                       /*
+                        * Adjust this bvec
+                        */
+                       bvec->bv_offset += (this_diff << 9);
+                       bvec->bv_len -= (this_diff << 9);
+                       break;
+               }
+               bio->bi_idx += idx;
+       }
+
+       /*
+        * Goes beyond the end, shrink size.
+        */
+       end = bio->bi_sector + (bio->bi_size >> 9);
+       if (end > fe->cache_sector + (fe->fs_size >> 9)) {
+               sector_t diff = end - (fe->cache_sector + (fe->fs_size >> 9));
+               int vecs = 0;
+
+               BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
+               BUG_ON(bio_data_dir(bio) != WRITE);
+
+               /*
+                * This is __bio_for_each_segment_reverse().
+                */
+               for (i = bio->bi_vcnt - 1; i >= bio->bi_idx; i--) {
+                       struct bio_vec *bvec = &bio->bi_io_vec[i];
+                       unsigned int bsec = bvec->bv_len >> 9;
+                       unsigned int this_diff = bsec;
+
+                       if (!diff)
+                               break;
+                       if (this_diff > diff)
+                               this_diff = diff;
+
+                       bio->bi_size -= (this_diff << 9);
+
+                       /*
+                        * Bigger than this chunk, skip ahead.
+                        */
+                       if (this_diff == bsec) {
+                               vecs++;
+                               diff -= this_diff;
+                               continue;
+                       }
+
+                       /*
+                        * Adjust this bvec
+                        */
+                       bvec->bv_len -= (this_diff << 9);
+                       break;
+               }
+               bio->bi_vcnt -= vecs;
+       }
+
+       BUG_ON(bio->bi_sector < fe->cache_sector);
+       BUG_ON(bio->bi_sector + (bio->bi_size >> 9) > fe->cache_sector + (fe->fs_size >> 9));
+
+       /*
+        * Invalidate the segment counts, we changed the bio layout.
+        */
+       bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+       bio->bi_flags |= (1 << BIO_NOMERGE);
+}
+
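+/*
+ * If an overwrite of already cached data fails, the cache contents can
+ * no longer be trusted, so take the device offline.
+ */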
+static int fcache_overwrite_endio(struct bio *bio, unsigned int bytes, int err)
+{
+       if (bio->bi_size)
+               return 1;
+
+       if (!bio_flagged(bio, BIO_UPTODATE)) {
+               struct fcache_dev *fdev = bio->bi_private;
+
+               printk(KERN_ERR "fcache: overwrite error, cache off\n");
+               set_bit(FDEV_F_DOWN, &fdev->flags);
+       }
+
+       bio_put(bio);
+       return 0;
+}
+
+/*
+ * Schedule overwrite of some existing block(s).
+ */
+static int fcache_overwrite_extent(struct fcache_dev *fdev,
+                                  struct fcache_extent *fe, struct bio *bio)
+{
+       struct bio *clone = bio_clone(bio, GFP_NOFS);
+
+       /*
+        * Just drop prefetch info on overwrites.
+        */
+       if (fe->pages)
+               __fcache_fe_drop(fe);
+
+       clone->bi_bdev = fdev->bdev;
+       clone->bi_end_io = fcache_overwrite_endio;
+       clone->bi_private = fdev;
+       fcache_bio_align(clone, fe);
+       submit_bio(WRITE, clone);
+       return 0;
+}
+
+/*
+ * Our real data read is complete. Kick our process context handler so it
+ * can submit the write to our cache.
+ */
+static int fcache_extent_endio(struct bio *bio, unsigned int bytes, int err)
+{
+       struct fcache_dev *fdev;
+       struct fcache_endio_data *fed;
+       unsigned long flags;
+
+       if (bio->bi_size)
+               return 1;
+
+       fed = bio->bi_private;
+
+       if (!bio_flagged(bio, BIO_UPTODATE))
+               fed->io_error = -EIO;
+
+       bio_put(bio);
+
+       fdev = fed->fdev;
+       spin_lock_irqsave(&fdev->lock, flags);
+       list_add_tail(&fed->list, &fdev->list);
+       spin_unlock_irqrestore(&fdev->lock, flags);
+       queue_work(fcache_workqueue, &fdev->work);
+       return 0;
+}
+
+/*
+ * This initiates adding an extent to our list. We do this by cloning the
+ * original bio and submitting that to the real device and when that completes
+ * we write that out to the cache device and instantiate the extent.
+ */
+static int fcache_add_extent(struct fcache_dev *fdev, struct bio *bio)
+{
+       struct fcache_endio_data *fed;
+       struct bio *clone;
+
+       fed = mempool_alloc(fed_pool, GFP_NOIO);
+
+       fed->fdev = fdev;
+       fed->fs_sector = bio->bi_sector;
+       fed->fs_size = bio->bi_size;
+       fed->cache_sector = -1;
+       fed->bio = bio;
+       fed->io_error = 0;
+       INIT_LIST_HEAD(&fed->list);
+
+       /*
+        * Allocate/assign an extent block for this range
+        */
+       spin_lock_irq(&fdev->lock);
+       if (fdev->nr_extents < fdev->max_extents) {
+               unsigned long nr = (bio->bi_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+               if (fdev->next_cache_block + nr <= fdev->cache_blocks) {
+                       fdev->nr_extents++;
+                       fed->cache_sector = fdev->next_cache_block << FBLOCK_SHIFT;
+                       fdev->next_cache_block += nr;
+               }
+       }
+       spin_unlock_irq(&fdev->lock);
+
+       /*
+        * Ran out of room
+        */
+       if (fed->cache_sector == -1) {
+               printk(KERN_ERR "fcache: ran out of space, priming now off\n");
+               fdev->priming = 0;
+               mempool_free(fed, fed_pool);
+               return -ENOENT;
+       }
+
+       clone = bio_clone(bio, GFP_NOFS);
+       clone->bi_private = fed;
+       clone->bi_end_io = fcache_extent_endio;
+       clone->bi_rw |= (1 << BIO_RW_SYNC);
+
+       generic_make_request(clone);
+       return 0;
+}
+
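+/*
+ * Parse one page worth of on-disk extents into in-memory extents and
+ * link them into the prio tree and fe_list. Returns the number parsed,
+ * or a negative error.
+ */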
+static int fcache_parse_extents(struct fcache_dev *fdev, void *addr,
+                               unsigned int max_extents)
+{
+       int nr_extents = PAGE_SIZE / sizeof(struct fcache_ondisk_extent);
+       int extents_read;
+
+       if (nr_extents > max_extents)
+               nr_extents = max_extents;
+
+       extents_read = 0;
+       while (nr_extents) {
+               struct fcache_ondisk_extent *foe = addr;
+               struct fcache_extent *fe;
+
+               fe = fcache_get_fe(GFP_KERNEL);
+               if (unlikely(!fe))
+                       return -ENOMEM;
+
+               fe->fs_sector = be64_to_cpu(foe->fs_sector);
+               fe->cache_sector = be64_to_cpu(foe->cache_sector);
+               fe->fs_size = be32_to_cpu(foe->fs_size);
+
+               spin_lock_irq(&fdev->lock);
+               if (fcache_tree_link(fdev, fe))
+                       BUG();
+               fcache_fe_link(fdev, fe);
+               spin_unlock_irq(&fdev->lock);
+
+               nr_extents--;
+               extents_read++;
+               addr += sizeof(*foe);
+       }
+
+       return extents_read;
+}
+
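+/*
+ * Read the on-disk extent blocks back in, a page at a time, and rebuild
+ * the in-memory extent structures. Returns the number of extents
+ * loaded, or a negative error.
+ */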
+static int fcache_read_extents(struct fcache_dev *fdev)
+{
+       unsigned int nr_extents = fdev->nr_extents;
+       int ret, extents, total_extents;
+       struct page *page;
+       sector_t index;
+       void *p;
+
+       page = alloc_page(GFP_KERNEL);
+       if (unlikely(!page))
+               return -ENOMEM;
+
+       ret = 0;
+       total_extents = 0;
+       index = FCACHE_EXTENT_BLOCK;
+       while (nr_extents) {
+               ret = fcache_rw_page(fdev, index, page, READ);
+               if (ret)
+                       break;
+
+               p = page_address(page);
+               extents = fcache_parse_extents(fdev, p, nr_extents);
+
+               if (extents < 0) {
+                       ret = extents;
+                       break;
+               }
+
+               index++;
+               nr_extents -= extents;
+               total_extents += extents;
+       }
+
+       __free_page(page);
+
+       if (ret)
+               return ret;
+
+       return total_extents;
+}
+
+/*
+ * Read an existing fcache header from the device, and then proceed to
+ * reading and adding the extents to our prio tree.
+ */
+static int fcache_load_header(struct fcache_dev *fdev, int serial)
+{
+       struct fcache_header *header = NULL;
+       struct page *page;
+       int ret, wrong_serial = 0;
+       char b[BDEVNAME_SIZE];
+
+       page = alloc_page(GFP_HIGHUSER);
+       if (unlikely(!page))
+               return -ENOMEM;
+
+       ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
+       if (unlikely(ret))
+               goto err;
+
+       ret = -EINVAL;
+       header = kmap_atomic(page, KM_USER0);
+       if (header->magic != FCACHE_MAGIC) {
+               printk(KERN_ERR "fcache: bad magic %x\n", header->magic);
+               goto err;
+       }
+       if (header->version != FCACHE_VERSION) {
+               printk(KERN_ERR "fcache: bad version %d\n", header->version);
+               goto err;
+       }
+       if (strcmp(bdevname(fdev->fs_bdev, b), header->fs_dev)) {
+               printk(KERN_ERR "fcache: device mismatch (%s/%s)\n", b,
+                                                       header->fs_dev);
+               goto err;
+       }
+       if (header->fs_start_sector != fdev->fs_start_sector ||
+           header->fs_sectors != fdev->fs_sectors) {
+               printk(KERN_ERR "fcache: fs appears to have changed size\n");
+               goto err;
+       }
+
+       fdev->nr_extents = header->nr_extents;
+       fdev->max_extents = header->max_extents;
+
+       /*
+        * Don't fail on out-of-date serial, just warn that the user needs
+        * to prime the cache again. Until then we'll just bypass the cache.
+        */
+       if (header->serial != serial) {
+               printk(KERN_ERR "fcache: found serial %d, expected %d.\n",
+                                                       header->serial, serial);
+               printk(KERN_ERR "fcache: reprime the cache!\n");
+               wrong_serial = 1;
+       }
+
+       fdev->serial = header->serial;
+       kunmap_atomic(header, KM_USER0);
+       __free_page(page);
+
+       if (!wrong_serial) {
+               printk("fcache: header looks valid (extents=%ld, serial=%u)\n",
+                                               fdev->nr_extents, fdev->serial);
+
+               ret = fcache_read_extents(fdev);
+               printk("fcache: loaded %d extents\n", ret);
+
+               /*
+                * If we don't find all the extents we require, fail.
+                */
+               if (ret != fdev->nr_extents) {
+                       fcache_free_links(fdev);
+                       ret = -EINVAL;
+               } else
+                       ret = 0;
+       }
+
+       return ret;
+err:
+       __free_page(page);
+       if (header)
+               kunmap_atomic(header, KM_USER0);
+       return ret;
+}
+
+/*
+ * We use this range to decide when to log an io to the target device.
+ */
+static void fcache_fill_fs_size(struct fcache_dev *fdev)
+{
+       struct block_device *bdev = fdev->fs_bdev;
+
+       /*
+        * Partition or whole device?
+        */
+       if (bdev != bdev->bd_contains) {
+               struct hd_struct *p = bdev->bd_part;
+
+               fdev->fs_start_sector = p->start_sect;
+               fdev->fs_sectors = p->nr_sects;
+       } else {
+               fdev->fs_start_sector = 0;
+               fdev->fs_sectors = bdev->bd_inode->i_size >> 9;
+       }
+}
+
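+/*
+ * Work out where the cache device starts and how many page sized blocks
+ * it holds.
+ */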
+static void fcache_fill_cache_size(struct fcache_dev *fdev)
+{
+       struct block_device *bdev = fdev->bdev;
+
+       /*
+        * Partition or whole device?
+        */
+       if (bdev != bdev->bd_contains) {
+               struct hd_struct *p = bdev->bd_part;
+
+               fdev->cache_start_sector = p->start_sect;
+               fdev->cache_blocks = p->nr_sects >> FBLOCK_SHIFT;
+       } else {
+               fdev->cache_start_sector = 0;
+               fdev->cache_blocks = bdev->bd_inode->i_size >> PAGE_SHIFT;
+       }
+}
+
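+/*
+ * A prefetch read completed: clear the uptodate bit on error and unlock
+ * the pages so fcache_read_fe_cached() can use (or discard) them.
+ */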
+static int fcache_prefetch_endio(struct bio *bio, unsigned int bytes, int err)
+{
+       struct bio_vec *bvec;
+       int i;
+
+       if (bio->bi_size)
+               return 1;
+
+       __bio_for_each_segment(bvec, bio, i, 0) {
+               if (!bio_flagged(bio, BIO_UPTODATE))
+                       ClearPageUptodate(bvec->bv_page);
+
+               unlock_page(bvec->bv_page);
+       }
+
+       bio_put(bio);
+       return 0;
+}
+
+/*
+ * Prefetch from this fe and forward for at least 128kb. It may be more,
+ * since we always prefetch full extents. We do not prefetch extents of
+ * 64kb or more, as those should go at full speed anyway.
+ */
+static void fcache_do_prefetch(struct fcache_dev *fdev,
+                              struct fcache_extent *fe)
+{
+       struct page **map;
+       struct fcache_extent *fenxt;
+       unsigned int pages, i, ret, bytes, this_pages;
+       struct bio *bio;
+       sector_t sec;
+
+       if (pcache_disable)
+               return;
+       if (fe->fs_size >= 64 * 1024)
+               return;
+       if (fe->pages)
+               return;
+
+       sec = fe->cache_sector;
+       bytes = 0;
+       this_pages = 0;
+       bio = NULL;
+       while (bytes < 128*1024) {
+               pages = (fe->fs_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+               map = kmalloc(pages * sizeof(struct page *), GFP_NOIO);
+               fe->pages = map;
+               fe->numpages = pages;
+
+               for (i = 0; i < pages; i++) {
+                       map[i] = alloc_page(GFP_NOIO);
+                       SetPageUptodate(map[i]);
+                       SetPageLocked(map[i]);
+               }
+
+               for (i = 0; i < pages; i++) {
+                       if (!bio) {
+get_bio:
+                               bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+
+                               bio->bi_sector = sec;
+                               bio->bi_bdev = fdev->bdev;
+                               bio->bi_end_io = fcache_prefetch_endio;
+                               this_pages = 0;
+                       }
+
+                       ret = bio_add_page(bio, map[i], PAGE_CACHE_SIZE, 0);
+                       if (ret != PAGE_CACHE_SIZE) {
+                               submit_bio(READ, bio);
+                               bio = NULL;
+                               goto get_bio;
+                       }
+                       this_pages++;
+                       bytes += PAGE_CACHE_SIZE;
+                       sec += FBLOCK_SIZE;
+               }
+
+               if (fe->fe_list.next == &fdev->fe_list)
+                       break;
+
+               fenxt = list_entry(fe->fe_list.next, struct fcache_extent, fe_list);
+               if (fenxt->pages)
+                       break;
+               if (fe->cache_sector + (fe->fs_size >> 9) != fenxt->cache_sector)
+                       break;
+               fe = fenxt;
+       }
+
+       if (bio)
+               submit_bio(READ, bio);
+
+       fdev->pcache_kb += bytes >> 10;
+}
+
+static void __fcache_fe_drop(struct fcache_extent *fe)
+{
+       struct page **pages = xchg(&fe->pages, NULL);
+
+       if (pages) {
+               int i;
+
+               for (i = 0; i < fe->numpages; i++)
+                       __free_page(pages[i]);
+
+               fe->numpages = fe->readpages = 0;
+               kfree(pages);
+       }
+}
+
+/*
+ * Assume that we can drop the prefetch data once we have seen as many
+ * reads as pages. That won't be true if the data is read multiple times,
+ * but the page cache should prevent that.
+ */
+static void fcache_fe_checkdrop(struct fcache_extent *fe)
+{
+       if (fe->readpages >= fe->numpages)
+               __fcache_fe_drop(fe);
+}
+
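+/*
+ * Satisfy a read directly from the prefetched pages attached to this
+ * (and possibly the following) extent. Returns 0 on success, 1 if the
+ * caller must read from the cache device instead.
+ */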
+static int fcache_read_fe_cached(struct fcache_dev *fdev,
+                                struct fcache_extent *fe, struct bio *bio)
+{
+       unsigned long pgoff, boff;
+       struct bio_vec *bvec;
+       char *src, *dst;
+       int i;
+
+       if (pcache_disable)
+               return 1;
+
+       /*
+        * page index offset
+        */
+       pgoff = ((bio->bi_sector - fe->fs_sector) << 9) / PAGE_CACHE_SIZE;
+       boff = ((bio->bi_sector - fe->fs_sector) << 9) & (PAGE_CACHE_SIZE - 1);
+
+       /*
+        * Loop over bio, copying data.
+        */
+       bio_for_each_segment(bvec, bio, i) {
+               if (pgoff == fe->numpages) {
+                       fcache_fe_checkdrop(fe);
+                       BUG_ON(fe->fe_list.next == &fdev->fe_list);
+                       fe = list_entry(fe->fe_list.next, struct fcache_extent, fe_list);
+                       pgoff = 0;
+               }
+
+               wait_on_page_locked(fe->pages[pgoff]);
+
+               /*
+                * IO error, bail.
+                */
+               if (unlikely(!PageUptodate(fe->pages[pgoff]))) {
+                       __fcache_fe_drop(fe);
+                       return 1;
+               }
+
+               WARN_ON(boff + bvec->bv_len > PAGE_CACHE_SIZE);
+
+               src = kmap_atomic(fe->pages[pgoff], KM_USER0);
+               dst = kmap_atomic(bvec->bv_page, KM_USER1);
+
+               memcpy(dst + bvec->bv_offset, src + boff, bvec->bv_len);
+
+               kunmap_atomic(src, KM_USER0);
+               kunmap_atomic(dst, KM_USER1);
+
+               pgoff++;
+               boff = 0;
+               fe->readpages++;
+       }
+
+       fcache_fe_checkdrop(fe);
+       return 0;
+}
+
+static int fcache_read_fe(struct fcache_dev *fdev, struct fcache_extent *fe,
+                         struct bio *bio)
+{
+       fdev->hits++;
+
+       fcache_do_prefetch(fdev, fe);
+
+       /*
+        * If we have the data already (it was prefetched), just copy it
+        * over and end it.
+        */
+       if (fe->pages && !fcache_read_fe_cached(fdev, fe, bio)) {
+               fdev->pcache_hits++;
+               bio_endio(bio, bio->bi_size, 0);
+               return 0;
+       }
+
+       /*
+        * We need to issue IO to our cache, align and submit appropriately.
+        */
+       fcache_bio_align(bio, fe);
+
+       /*
+        * If we don't have to chop it up, just let generic_make_request()
+        * handle the stacking. Otherwise, return handled and pass to chopper.
+        */
+       if (fdev->chop_ios) {
+               struct fcache_endio_data *fed;
+
+               fed = mempool_alloc(fed_pool, GFP_NOIO);
+
+               fed->fdev = fdev;
+               fed->cache_sector = bio->bi_sector;
+               fed->fs_size = bio->bi_size;
+               fed->bio = bio;
+               fed->io_error = 0;
+               fcache_io_chopper(fdev, fed, fcache_chop_read_endio,
+                                       fcache_chop_read_done, READ);
+               return 0;
+       }
+
+       bio->bi_bdev = fdev->bdev;
+       return 1;
+}
+
+/*
+ * This is a read request, check if we have that block. If we do, then
+ * just redirect. If not, pass it through.
+ */
+static int fcache_read_request(struct fcache_dev *fdev, request_queue_t *q,
+                              struct bio *bio)
+{
+       struct fcache_extent *extents[MAX_FE];
+       struct fcache_extent *fe;
+       sector_t end_sector;
+       int i, nr;
+
+       end_sector = bio->bi_sector + (bio->bi_size >> 9) - 1;
+       nr = fcache_lookup_extent(fdev, bio->bi_sector, end_sector, extents);
+       if (!nr) {
+               /*
+                * Not there, redirect to original but schedule adding this
+                * extent to our list if we are priming.
+                */
+               if (fdev->priming && !fcache_add_extent(fdev, bio))
+                       return 0;
+
+               fdev->misses++;
+               return fdev->mfn(q, bio);
+       }
+
+       if (nr == MAX_FE)
+               printk("FOUND MAX\n");
+
+       /*
+        * If range is at least as big, we use our cache. If not, cop out
+        * and just submit to real device.
+        */
+       for (i = 0; i < nr; i++) {
+               sector_t end_fe, end_bi;
+               fe = extents[i];
+
+               end_fe = fe->fs_sector + (fe->fs_size >> 9);
+               end_bi = bio->bi_sector + (bio->bi_size >> 9);
+
+               /*
+                * match!
+                */
+               if (bio->bi_sector >= fe->fs_sector && end_bi <= end_fe)
+                       break;
+
+               fe = NULL;
+       }
+
+       /*
+        * Nopes, send to real device.
+        */
+       if (!fe) {
+               fdev->partial++;
+               fdev->misses++;
+               return fdev->mfn(q, bio);
+       }
+
+       return fcache_read_fe(fdev, fe, bio);
+}
+
+/*
+ * If we are priming the cache, always add this block. If not, then we still
+ * need to overwrite this block if it's in our cache.
+ */
+static int fcache_write_request(struct fcache_dev *fdev, request_queue_t *q,
+                               struct bio *bio)
+{
+       struct fcache_extent *extents[MAX_FE];
+       struct fcache_extent *fe;
+       sector_t start = bio->bi_sector;
+       sector_t end = start + (bio->bi_size >> 9) - 1;
+       int i, nr;
+
+repeat:
+       nr = fcache_lookup_extent(fdev, start, end, extents);
+
+       /*
+        * Find out what to overwrite, if anything.
+        */
+       for (i = 0; i < nr; i++) {
+               fe = extents[i];
+
+               if (fe->ow_time == -1UL)
+                       fe->ow_time = jiffies + FCACHE_OVERWRITE_TIME;
+
+               if (!fdev->priming && time_before(jiffies, fe->ow_time) &&
+                   ++fe->overwrites >= FCACHE_MAX_OVERWRITES) {
+                       fdev->extent_drops++;
+                       fe->valid = 0;
+               } else {
+                       /*
+                        * TODO: possibly split extent in two (or chop off
+                        * a part of the end/front) instead of always
+                        * overwriting.
+                        */
+                       fdev->overwrites++;
+                       fcache_overwrite_extent(fdev, fe, bio);
+               }
+       }
+
+       /*
+        * If i == MAX_FE, there _may_ be more extents. Repeat the lookup,
+        * starting from the end of the last extent found.
+        */
+       if (i == MAX_FE) {
+               fe = extents[i - 1];
+               start = fe->fs_sector + (fe->fs_size >> 9);
+               if (start < end)
+                       goto repeat;
+       }
+
+       return fdev->mfn(q, bio);
+}
+
+/*
+ * This is the only case where we resubmit an io to the device but don't
+ * want to count it as part of io we log.
+ */
+#define fcache_bio_seen(bio)   ((bio)->bi_end_io == fcache_extent_endio)
+
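+/*
+ * Replacement make_request_fn for the fs device: route reads and writes
+ * within the monitored range through the cache handling, and pass
+ * everything else straight to the original make_request_fn.
+ */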
+static int fcache_make_request(request_queue_t *q, struct bio *bio)
+{
+       struct fcache_dev *fdev = &fcache_dev;
+
+       /*
+        * If it's in the sector range we are monitoring and the device isn't
+        * being shut down, then pass it on. Assume a bio doesn't span into
+        * the next partition, so don't bother accounting for size.
+        */
+       if ((bio->bi_sector >= fdev->fs_start_sector) &&
+           (bio->bi_sector < (fdev->fs_start_sector + fdev->fs_sectors)) &&
+           !test_bit(FDEV_F_DOWN, &fdev->flags) &&
+           !fcache_bio_seen(bio)) {
+
+               fdev->ios[bio_data_dir(bio)]++;
+
+               if (bio_data_dir(bio) == READ)
+                       return fcache_read_request(fdev, q, bio);
+
+               return fcache_write_request(fdev, q, bio);
+       }
+
+       /*
+        * Pass through to original make_request_fn.
+        */
+       return fdev->mfn(q, bio);
+}
+
+/*
+ * Attach the cache device 'bdev' to 'fdev'.
+ */
+static int fcache_setup_dev(struct fcache_dev *fdev,
+                           struct block_device *fs_bdev,
+                           struct block_device *bdev,
+                           int priming, int serial)
+{
+       request_queue_t *fs_q, *cache_q;
+       char b[BDEVNAME_SIZE];
+       int ret;
+
+       memset(fdev, 0, sizeof(*fdev));
+       INIT_PRIO_TREE_ROOT(&fdev->prio_root);
+       spin_lock_init(&fdev->lock);
+       INIT_LIST_HEAD(&fdev->list);
+       INIT_LIST_HEAD(&fdev->fe_list);
+       INIT_WORK(&fdev->work, fcache_work, fdev);
+       fdev->priming = priming;
+       fdev->fs_bdev = fs_bdev;
+       fdev->bdev = bdev;
+
+       ret = -EINVAL;
+
+       fs_q = bdev_get_queue(fs_bdev);
+       cache_q = bdev_get_queue(bdev);
+       if (!fs_q || !cache_q)
+               goto out;
+
+       /*
+        * Chop up outgoing ios if the target is a different queue. We could
+        * look closer at limits, but it's fragile and pretty pointless.
+        */
+       if (fs_q != cache_q)
+               fdev->chop_ios = 1;
+
+       /*
+        * Some other device already monitoring this block_device?
+        */
+       ret = -EBUSY;
+       if (fs_q->make_request_fn == fcache_make_request)
+               goto out;
+
+       ret = bd_claim(bdev, fcache_setup_dev);
+       if (ret < 0)
+               goto out;
+
+       ret = block_size(bdev);
+       if (ret != PAGE_SIZE) {
+               fdev->old_bs = ret;
+               ret = set_blocksize(bdev, PAGE_SIZE);
+               if (ret < 0)
+                       goto out_release;
+       } else
+               ret = 0;
+
+       fcache_fill_cache_size(fdev);
+       fcache_fill_fs_size(fdev);
+
+       if (priming) {
+               fdev->serial = serial;
+               ret = fcache_write_new_header(fdev);
+       } else
+               ret = fcache_load_header(fdev, serial);
+
+       if (!ret) {
+               printk("fcache: %s opened successfully (%spriming)\n",
+                                               bdevname(bdev, b),
+                                               priming ? "" : "not ");
+               return 0;
+       }
+
+out_release:
+       if (fdev->old_bs)
+               set_blocksize(fdev->bdev, fdev->old_bs);
+       bd_release(fdev->bdev);
+out:
+       blkdev_put(fdev->bdev);
+       fdev->bdev = NULL;
+       return ret;
+}
+
+/*
+ * Return fdev->bdev to its original state: restore the original
+ * make_request_fn on the file system queue, sync the cache device and
+ * restore its block size, then release and drop our reference to it.
+ */
+static void fcache_shutdown_dev(struct fcache_dev *fdev,
+                               struct block_device *bdev)
+{
+       if (fdev->bdev) {
+               if (fdev->mfn) {
+                       request_queue_t *q = bdev_get_queue(bdev);
+
+                       (void) xchg(&q->make_request_fn, fdev->mfn);
+               }
+               sync_blockdev(fdev->bdev);
+               if (fdev->old_bs)
+                       set_blocksize(fdev->bdev, fdev->old_bs);
+
+               bd_release(fdev->bdev);
+               blkdev_put(fdev->bdev);
+               fdev->bdev = NULL;
+               INIT_PRIO_TREE_ROOT(&fdev->prio_root);
+       }
+}
+
+/*
+ * bdev is the file system device, cache_dev is the device number of the
+ * device we want to store the cache on.
+ */
+int fcache_dev_open(struct block_device *bdev, unsigned long cache_dev,
+                   int priming, int serial)
+{
+       struct block_device *fcache_bdev;
+       request_queue_t *q;
+       int ret;
+
+       if (disable)
+               return 0;
+       if (fcache_dev.bdev)
+               return -EBUSY;
+
+       fcache_bdev = open_by_devnum(cache_dev, FMODE_READ|FMODE_WRITE);
+       if (IS_ERR(fcache_bdev))
+               return PTR_ERR(fcache_bdev);
+
+       ret = fcache_setup_dev(&fcache_dev, bdev, fcache_bdev, priming, serial);
+       if (ret)
+               return ret;
+
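+       /*
+        * Hook into the file system queue by swapping in our make_request_fn.
+        * The original is saved so it can be chained to and later restored.
+        */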
+       q = bdev_get_queue(bdev);
+       fcache_dev.mfn = xchg(&q->make_request_fn, fcache_make_request);
+       return 0;
+}
+
+EXPORT_SYMBOL(fcache_dev_open);
+
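+/*
+ * Detach fcache from the file system: dump statistics, write out the
+ * extent log (when priming) and the cache header, free the in-core
+ * extent state, and return both devices to their original state.
+ */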
+void fcache_dev_close(struct block_device *bdev, int serial)
+{
+       struct fcache_dev *fdev = &fcache_dev;
+
+       if (disable)
+               return;
+
+       if (!fdev->bdev)
+               return;
+
+       printk(KERN_INFO "fcache: ios r/w %u/%u, hits %u, misses %u, "
+                                       "overwrites %u, extent_merges %d, "
+                                       "extent_drops %d, pcache_kb %lu, "
+                                       "pcache_hits %d, partial %u\n",
+                                       fdev->ios[0], fdev->ios[1], fdev->hits,
+                                       fdev->misses, fdev->overwrites,
+                                       fdev->extent_merges,
+                                       fdev->extent_drops,
+                                       fdev->pcache_kb, fdev->pcache_hits,
+                                       fdev->partial);
+
+       fdev->serial = serial;
+
+       sync_blockdev(bdev);
+       set_bit(FDEV_F_DOWN, &fdev->flags);
+
+       if (fdev->priming)
+               fcache_write_extents(fdev);
+
+       fcache_write_header(fdev);
+       fcache_free_links(fdev);
+       fcache_shutdown_dev(fdev, bdev);
+}
+
+EXPORT_SYMBOL(fcache_dev_close);
+
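+/*
+ * Create the slab caches for extents and endio data, the mempool backing
+ * fcache_endio_data allocations, and the single threaded fcached
+ * workqueue.
+ */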
+static int fcache_init(void)
+{
+       fcache_slab = kmem_cache_create("fcache", sizeof(struct fcache_extent),
+                                       0, 0, NULL, NULL);
+       if (!fcache_slab)
+               return -ENOMEM;
+
+       fcache_fed_slab = kmem_cache_create("fcache_fed",
+                                       sizeof(struct fcache_endio_data), 0, 0,
+                                       NULL, NULL);
+       if (!fcache_fed_slab) {
+               kmem_cache_destroy(fcache_slab);
+               return -ENOMEM;
+       }
+
+       fed_pool = mempool_create_slab_pool(1, fcache_fed_slab);
+       if (!fed_pool) {
+               kmem_cache_destroy(fcache_slab);
+               kmem_cache_destroy(fcache_fed_slab);
+               return -ENOMEM;
+       }
+
+       fcache_workqueue = create_singlethread_workqueue("fcached");
+       if (!fcache_workqueue) {
+               mempool_destroy(fed_pool);
+               kmem_cache_destroy(fcache_slab);
+               kmem_cache_destroy(fcache_fed_slab);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void fcache_exit(void)
+{
+       destroy_workqueue(fcache_workqueue);
+       mempool_destroy(fed_pool);
+       kmem_cache_destroy(fcache_slab);
+       kmem_cache_destroy(fcache_fed_slab);
+}
+
+MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
+MODULE_LICENSE("GPL");
+
+module_init(fcache_init);
+module_exit(fcache_exit);
index 3559086eee5fb45ce45a79ed4f076b62759e2ce5..5d8cabc5b85383c660c1c9107c9813aeeb898f17 100644 (file)
@@ -383,11 +383,40 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
        }
 }
 
+static void ext3_close_fcache(struct super_block *sb)
+{
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       struct ext3_super_block *es = sbi->s_es;
+       int serial = le16_to_cpu(es->s_mnt_count);
+
+       fcache_dev_close(sb->s_bdev, serial);
+}
+
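+/*
+ * The superblock mount count is passed to fcache as the serial number,
+ * tying the cached data to a particular mount generation.
+ */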
+static int ext3_open_fcache(struct super_block *sb, unsigned long cachedev)
+{
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       struct ext3_super_block *es = sbi->s_es;
+       int priming = test_opt(sb, FCACHEPRIME);
+       int serial = le16_to_cpu(es->s_mnt_count);
+       int ret;
+
+       ret = fcache_dev_open(sb->s_bdev, cachedev, priming, serial);
+       if (!ret) {
+               set_opt(sbi->s_mount_opt, FCACHE);
+               return 0;
+       }
+
+       printk(KERN_ERR "EXT3-fs: failed to open fcache (err=%d)\n", ret);
+       return ret;
+}
+
 static void ext3_put_super (struct super_block * sb)
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
        struct ext3_super_block *es = sbi->s_es;
-       int i;
+       int i, has_fcache;
+
+       has_fcache = test_opt(sb, FCACHE);
 
        ext3_xattr_put_super(sb);
        journal_destroy(sbi->s_journal);
@@ -430,6 +459,8 @@ static void ext3_put_super (struct super_block * sb)
                invalidate_bdev(sbi->journal_bdev, 0);
                ext3_blkdev_remove(sbi);
        }
+       if (has_fcache)
+               ext3_close_fcache(sb);
        sb->s_fs_info = NULL;
        kfree(sbi);
        return;
@@ -677,7 +708,7 @@ enum {
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-       Opt_grpquota
+       Opt_grpquota, Opt_fcache_dev, Opt_fcache_prime,
 };
 
 static match_table_t tokens = {
@@ -727,6 +758,8 @@ static match_table_t tokens = {
        {Opt_quota, "quota"},
        {Opt_usrquota, "usrquota"},
        {Opt_barrier, "barrier=%u"},
+       {Opt_fcache_dev, "fdev=%s"},
+       {Opt_fcache_prime, "fprime"},
        {Opt_err, NULL},
        {Opt_resize, "resize"},
 };
@@ -754,6 +787,7 @@ static ext3_fsblk_t get_sb_block(void **data)
 
 static int parse_options (char *options, struct super_block *sb,
                          unsigned long *inum, unsigned long *journal_devnum,
+                         unsigned long *fcache_devnum,
                          ext3_fsblk_t *n_blocks_count, int is_remount)
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
@@ -766,6 +800,8 @@ static int parse_options (char *options, struct super_block *sb,
        char *qname;
 #endif
 
+       clear_opt(sbi->s_mount_opt, FCACHEPRIME);
+
        if (!options)
                return 1;
 
@@ -1059,6 +1095,24 @@ clear_qf_name:
                case Opt_bh:
                        clear_opt(sbi->s_mount_opt, NOBH);
                        break;
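+               /*
+                * fdev=<major>:<minor> selects the cache device and fprime
+                * selects priming mode. Example (device numbers purely
+                * illustrative):
+                *
+                *      mount -t ext3 -o fdev=8:17,fprime /dev/sda1 /mnt
+                */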
+               case Opt_fcache_dev: {
+                       int maj, min;
+                       char *p, *pm;
+
+                       if (!fcache_devnum)
+                               break;
+                       p = match_strdup(&args[0]);
+                       if (!p)
+                               return 0;
+                       maj = simple_strtol(p, &pm, 10);
+                       min = 0;
+                       if (*pm)
+                               min = simple_strtol(pm + 1, NULL, 10);
+                       *fcache_devnum = maj << MINORBITS | min;
+                       kfree(p);
+                       break;
+                       }
+               case Opt_fcache_prime:
+                       set_opt(sbi->s_mount_opt, FCACHEPRIME);
+                       break;
                default:
                        printk (KERN_ERR
                                "EXT3-fs: Unrecognized mount option \"%s\" "
@@ -1392,6 +1446,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        unsigned long offset = 0;
        unsigned long journal_inum = 0;
        unsigned long journal_devnum = 0;
+       unsigned long fcache_devnum = 0;
        unsigned long def_mount_opts;
        struct inode *root;
        int blocksize;
@@ -1399,6 +1454,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        int db_count;
        int i;
        int needs_recovery;
+       int fcache = 0;
        __le32 features;
 
        sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
@@ -1473,7 +1529,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        set_opt(sbi->s_mount_opt, RESERVATION);
 
        if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
-                           NULL, 0))
+                           &fcache_devnum, NULL, 0))
                goto failed_mount;
 
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -1712,6 +1768,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount3;
        }
 
+       if (fcache_devnum)
+               fcache = ext3_open_fcache(sb, fcache_devnum);
+
        /* We have now updated the journal if required, so we can
         * validate the data journaling mode. */
        switch (test_opt(sb, DATA_FLAGS)) {
@@ -1794,6 +1853,8 @@ cantfind_ext3:
        goto failed_mount;
 
 failed_mount4:
+       if (!fcache)
+               ext3_close_fcache(sb);
        journal_destroy(sbi->s_journal);
 failed_mount3:
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -2263,6 +2324,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
        struct ext3_sb_info *sbi = EXT3_SB(sb);
        ext3_fsblk_t n_blocks_count = 0;
        unsigned long old_sb_flags;
+       unsigned long fcache_devnum = 0;
        struct ext3_mount_options old_opts;
        int err;
 #ifdef CONFIG_QUOTA
@@ -2284,7 +2346,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
        /*
         * Allow the "check" option to be passed as a remount option.
         */
-       if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+       if (!parse_options(data, sb, NULL, NULL, &fcache_devnum, &n_blocks_count, 1)) {
                err = -EINVAL;
                goto restore_opts;
        }
@@ -2299,6 +2361,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 
        ext3_init_journal_params(sb, sbi->s_journal);
 
+       if (fcache_devnum) {
+               ext3_close_fcache(sb);
+               ext3_open_fcache(sb, fcache_devnum);
+       }
+
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
                n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
                if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
index 76bdaeab6f622240fe8f898a9caf0815770f7c38..d1ce1e84075fde54bce60bc2cff6153dcd1bd32e 100644 (file)
@@ -124,6 +124,7 @@ struct bio {
 #define BIO_BOUNCED    5       /* bio is a bounce bio */
 #define BIO_USER_MAPPED 6      /* contains user pages */
 #define BIO_EOPNOTSUPP 7       /* not supported */
+#define BIO_NOMERGE    8       /* bio not mergeable */
 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -179,6 +180,14 @@ struct bio {
 #define bio_failfast(bio)      ((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)      ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
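+/*
+ * bio_mergeable() below: a bio is a merge candidate unless it is a
+ * barrier, its io_vec has already been advanced (bi_idx != 0), or it
+ * has been explicitly flagged BIO_NOMERGE.
+ */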
 
+static inline int bio_mergeable(struct bio *bio)
+{
+       if (!bio_barrier(bio) && !bio->bi_idx && !bio_flagged(bio, BIO_NOMERGE))
+               return 1;
+
+       return 0;
+}
+
 /*
  * will die
  */
index 9f9cce7bd86dd0f5e4991040cf94dee4401c8f95..0558f4e7163c8ac8664fe54a730ce66f848a37bc 100644 (file)
@@ -371,6 +371,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA               0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA            0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA            0x200000 /* "old" group quota */
+#define EXT3_MOUNT_FCACHE              0x400000 /* using fcache */
+#define EXT3_MOUNT_FCACHEPRIME         0x800000 /* priming fcache */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
index 555bc195c4207cbecf1dba61a43268a1e7692326..a942c6ae1fe3525ed569a93c2a9959497762aa9b 100644 (file)
@@ -1940,5 +1940,21 @@ static inline void free_secdata(void *secdata)
 { }
 #endif /* CONFIG_SECURITY */
 
+#ifdef CONFIG_BLK_FCACHE
+extern int fcache_dev_open(struct block_device *, unsigned long, int, int);
+extern void fcache_dev_close(struct block_device *, int);
+#else
+static inline int fcache_dev_open(struct block_device *bdev,
+                                 unsigned long cache_dev,
+                                 int priming, int serial)
+{
+       return 0;
+}
+
+static inline void fcache_dev_close(struct block_device *bdev, int serial)
+{
+}
+#endif /* CONFIG_BLK_FCACHE */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FS_H */