summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Axboe <jens.axboe@oracle.com>2009-03-03 12:36:43 +0100
committerJens Axboe <jens.axboe@oracle.com>2009-03-03 12:36:43 +0100
commite85a418ce59532bc1d31e09c9afa2ca6ac515d0e (patch)
tree4af124bec79d74de1be0878ac636e1fed0db29d8
parent31bd2bc7c95258356c7292d126c114749ce6d50f (diff)
loop: fastfs support (branch: loop-extent_map)
Add code to support redirecting IO directly to the filesystem blocks instead of going through the page cache.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
-rw-r--r--drivers/block/loop.c485
-rw-r--r--include/linux/loop.h14
2 files changed, 492 insertions, 7 deletions
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index edbaac6c0573..a43f49c062aa 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -75,6 +75,7 @@
#include <linux/gfp.h>
#include <linux/kthread.h>
#include <linux/splice.h>
+#include <linux/extent_map.h>
#include <asm/uaccess.h>
@@ -482,16 +483,67 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
return ret;
}
+/*
+ * Sleep on @wq until @condition becomes true, dropping @lock (an
+ * irq-disabling spinlock, held by the caller) around the schedule.
+ * NOTE(review): the expansion also references 'lo' directly via
+ * lo->lo_event, so this macro is only usable where a loop_device
+ * named 'lo' is in scope.
+ */
+#define __lo_throttle(wq, lock, condition)				\
+do {									\
+	DEFINE_WAIT(__wait);						\
+	for (;;) {							\
+		prepare_to_wait((wq), &__wait, TASK_UNINTERRUPTIBLE);	\
+		if (condition)						\
+			break;						\
+		spin_unlock_irq((lock));				\
+		wake_up(&lo->lo_event);					\
+		io_schedule();						\
+		spin_lock_irq((lock));					\
+	}								\
+	finish_wait((wq), &__wait);					\
+} while (0)
+
+/* Max bios queued internally before writers throttle, and the low-water
+ * mark at which throttled writers are woken again. */
+#define LO_BIO_THROTTLE		128
+#define LO_BIO_THROTTLE_LOW	(LO_BIO_THROTTLE / 2)
+
/*
- * Add bio to back of pending list
+ * A normal block device will throttle on request allocation. Do the same
+ * for loop to prevent millions of bio's queued internally.
+ */
+/*
+ * Block until the internally queued bio count drops below
+ * LO_BIO_THROTTLE.  Caller holds lo->lo_lock; the throttle macro drops
+ * and retakes it while sleeping.  @bio is currently unused.
+ */
+static void loop_bio_throttle(struct loop_device *lo, struct bio *bio)
+{
+	__lo_throttle(&lo->lo_bio_wait, &lo->lo_lock,
+				lo->lo_bio_cnt < LO_BIO_THROTTLE);
+}
+
+/* Deferred-wakeup timer: kick the loop thread waiting on lo->lo_event. */
+static void loop_bio_timer(unsigned long data)
+{
+	struct loop_device *lo = (struct loop_device *) data;
+
+	wake_up(&lo->lo_event);
+}
+
+/*
+ * Add bio to back of pending list and wakeup thread.  Caller holds
+ * lo->lo_lock; loop_bio_throttle() may drop/retake it while sleeping.
+ * Wakeups are batched: with more than 8 bios pending the thread is
+ * woken immediately, otherwise a one-jiffy timer defers the wakeup.
 */
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
{
+	loop_bio_throttle(lo, bio);
+
	if (lo->lo_biotail) {
		lo->lo_biotail->bi_next = bio;
		lo->lo_biotail = bio;
	} else
		lo->lo_bio = lo->lo_biotail = bio;
+
+	lo->lo_bio_cnt++;
+
+	smp_mb(); /* NOTE(review): pairing barrier not visible here -- confirm */
+	if (lo->lo_bio_cnt > 8) {
+		/* enough queued: cancel any deferred wakeup, wake now */
+		if (timer_pending(&lo->lo_bio_timer))
+			del_timer(&lo->lo_bio_timer);
+
+		if (waitqueue_active(&lo->lo_event))
+			wake_up(&lo->lo_event);
+	} else if (!timer_pending(&lo->lo_bio_timer)) {
+		/* small batch: defer the wakeup by one jiffy */
+		lo->lo_bio_timer.expires = jiffies + 1;
+		add_timer(&lo->lo_bio_timer);
+	}
}
/*
@@ -511,6 +563,241 @@ static struct bio *loop_get_bio(struct loop_device *lo)
return bio;
}
+/*
+ * Leave fastfs mode: drop page cache instantiated while filling holes,
+ * turn queue ordering back off, and clear S_SWAPFILE so the backing
+ * inode can be truncated again.
+ */
+static void loop_exit_fastfs(struct loop_device *lo)
+{
+	struct inode *inode = lo->lo_backing_file->f_mapping->host;
+
+	/*
+	 * drop what page cache we instantiated filling holes
+	 */
+	invalidate_inode_pages2(lo->lo_backing_file->f_mapping);
+
+	blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_NONE, NULL);
+
+	mutex_lock(&inode->i_mutex);
+	inode->i_flags &= ~S_SWAPFILE;
+	mutex_unlock(&inode->i_mutex);
+}
+
+/* Byte offset into the backing file of this bio's first sector. */
+static inline u64 lo_bio_offset(struct loop_device *lo, struct bio *bio)
+{
+	return (u64)lo->lo_offset + ((u64)bio->bi_sector << 9);
+}
+
+/*
+ * Find extent mapping this lo device block to the file block on the real
+ * device.  Looks up one backing-file block (1 << lo->blkbits bytes) at
+ * byte @offset, without creating/allocating (create arg is 0).  Per the
+ * callers, the result may be NULL (no cached mapping) or an ERR_PTR.
+ */
+static struct extent_map *loop_lookup_extent(struct loop_device *lo,
+					     u64 offset, gfp_t gfp_mask)
+{
+	struct address_space *mapping = lo->lo_backing_file->f_mapping;
+	u64 len = 1 << lo->blkbits;
+
+	return mapping->a_ops->map_extent(mapping, NULL, 0, offset, len, 0,
+						gfp_mask);
+}
+
+/*
+ * Completion handler for the cloned bio issued by fill_extent_hole().
+ * Notify the filesystem that IO into the formerly-hole range finished,
+ * then complete the original bio.  orig_bio->bi_sector was rewritten by
+ * fill_extent_hole() to hold the file offset in sectors.
+ */
+static void end_bio_hole_filling(struct bio *bio, int err)
+{
+	struct address_space *mapping = bio->bi_bdev->bd_inode->i_mapping;
+	struct bio *orig_bio = bio->bi_private;
+
+	if (mapping->a_ops->extent_io_complete) {
+		/*
+		 * Widen before shifting: sector_t may be 32 bits, and
+		 * bi_sector << 9 would truncate large file offsets.
+		 */
+		u64 start = (u64) orig_bio->bi_sector << 9;
+		u64 len = bio->bi_size;
+
+		mapping->a_ops->extent_io_complete(mapping, start, len);
+	}
+
+	bio_put(bio);
+	bio_endio(orig_bio, err);
+}
+
+/*
+ * Write into a hole in the backing file: ask the filesystem to allocate
+ * blocks for the range (map_extent with create=1, under i_mutex), then
+ * resubmit a clone of @bio aimed at the newly mapped disk blocks.
+ * Runs in process context (loop thread or sync path) and may sleep.
+ */
+static void fill_extent_hole(struct loop_device *lo, struct bio *bio)
+{
+	struct address_space *mapping = lo->lo_backing_file->f_mapping;
+	struct bio *new_bio;
+	struct extent_map *em;
+	u64 len = bio->bi_size;
+	u64 start = lo_bio_offset(lo, bio);
+	u64 disk_block;
+	u64 extent_off;
+
+	/*
+	 * change the sector so we can find the correct file offset in our
+	 * endio (end_bio_hole_filling reconstructs the file offset from it)
+	 */
+	bio->bi_sector = start >> 9;
+
+	mutex_lock(&mapping->host->i_mutex);
+
+	em = mapping->a_ops->map_extent(mapping, NULL, 0,
+					start, len, 1, GFP_KERNEL);
+	mark_inode_dirty(mapping->host);
+	mutex_unlock(&mapping->host->i_mutex);
+
+	if (em && !IS_ERR(em)) {
+		/* block_start/start are byte offsets on disk/in the file */
+		disk_block = em->block_start;
+		extent_off = start - em->start;
+
+		/*
+		 * bio_clone() is mempool backed, so if __GFP_WAIT is set
+		 * it wont ever fail
+		 */
+		new_bio = bio_clone(bio, GFP_NOIO);
+		new_bio->bi_sector = (disk_block + extent_off) >> 9;
+		new_bio->bi_bdev = em->bdev;
+		new_bio->bi_private = bio;
+		new_bio->bi_size = bio->bi_size;
+		new_bio->bi_end_io = end_bio_hole_filling;
+		free_extent_map(em);
+
+		generic_make_request(new_bio);
+	} else
+		bio_endio(bio, -EIO);
+}
+
+/*
+ * Destructor for the on-stack hint bio used by the synchronous path of
+ * loop_schedule_extent_mapping().  bi_flags is (ab)used there to carry
+ * the completion pointer; completing it releases the stack frame owner.
+ */
+static void loop_bio_destructor(struct bio *bio)
+{
+	struct completion *c = (struct completion *) bio->bi_flags;
+
+	complete(c);
+}
+
+/*
+ * Alloc a hint bio to tell the loop thread to read file blocks for a given
+ * range.  Called with lo->lo_lock held.  If the atomic allocation fails,
+ * fall back to an on-stack bio and wait synchronously for the thread to
+ * consume it, dropping the lock around the wait; bi_flags smuggles the
+ * completion pointer to loop_bio_destructor().
+ */
+static void loop_schedule_extent_mapping(struct loop_device *lo,
+					 struct bio *old_bio)
+{
+	DECLARE_COMPLETION_ONSTACK(comp);
+	struct bio *bio, stackbio;
+	int do_sync = 0;
+
+	bio = bio_alloc(GFP_ATOMIC, 0);
+	if (!bio) {
+		bio = &stackbio;
+		bio_init(bio);
+		bio->bi_destructor = loop_bio_destructor;
+		bio->bi_flags = (unsigned long) &comp;
+		do_sync = 1;
+	}
+
+	bio->bi_rw = LOOP_EXTENT_RW_MAGIC;
+	bio->bi_private = old_bio;
+
+	loop_add_bio(lo, bio);
+
+	if (do_sync) {
+		spin_unlock_irq(&lo->lo_lock);
+		wait_for_completion(&comp);
+		spin_lock_irq(&lo->lo_lock);
+	}
+}
+
+/*
+ * Handle IO that maps to a hole in the backing file.  Reads are
+ * satisfied by zero-filling the bio in place.  Writes are deferred so
+ * the filesystem can allocate blocks: queued to the loop thread when
+ * !sync, or handled directly via fill_extent_hole() when already
+ * running in the thread (sync).
+ */
+static void loop_handle_extent_hole(struct loop_device *lo, struct bio *bio,
+				    int sync)
+{
+	/*
+	 * for a read, just zero the data and end the io
+	 */
+	if (bio_data_dir(bio) == READ) {
+		struct bio_vec *bvec;
+		unsigned long flags;
+		int i;
+
+		bio_for_each_segment(bvec, bio, i) {
+			char *dst = bvec_kmap_irq(bvec, &flags);
+
+			memset(dst, 0, bvec->bv_len);
+			bvec_kunmap_irq(dst, &flags);
+		}
+		bio_endio(bio, 0);
+	} else {
+		/*
+		 * let the page cache handling path do this bio, and then
+		 * lookup the mapped blocks after the io has been issued to
+		 * instantiate extents.
+		 */
+		if (!sync)
+			loop_add_bio(lo, bio);
+		else
+			fill_extent_hole(lo, bio);
+	}
+}
+
+/* Hint bio queued by loop_switch(): no target bdev, switch magic. */
+static inline int lo_is_switch_bio(struct bio *bio)
+{
+	return !bio->bi_bdev && bio->bi_rw == LOOP_SWITCH_RW_MAGIC;
+}
+
+/* Hint bio queued by loop_schedule_extent_mapping(): extent-map magic. */
+static inline int lo_is_map_bio(struct bio *bio)
+{
+	return !bio->bi_bdev && bio->bi_rw == LOOP_EXTENT_RW_MAGIC;
+}
+
+/*
+ * Redirect @bio according to extent @em.  Holes are dispatched to
+ * loop_handle_extent_hole(); otherwise bi_bdev/bi_sector are rewritten
+ * to point at the real device.  Consumes the @em reference on all
+ * paths.  Returns 1 when the caller should submit the bio, 0 when it
+ * was already taken care of.
+ */
+static int __loop_redirect_bio(struct loop_device *lo, struct extent_map *em,
+			       struct bio *bio, int sync)
+{
+	u64 extent_off;
+	u64 disk_block;
+
+	/*
+	 * handle sparse io
+	 */
+	if (em->block_start == EXTENT_MAP_HOLE) {
+		loop_handle_extent_hole(lo, bio, sync);
+		free_extent_map(em);
+		return 0;
+	}
+
+	/*
+	 * not a hole, redirect
+	 */
+	disk_block = em->block_start;
+	extent_off = lo_bio_offset(lo, bio) - em->start;
+	bio->bi_bdev = em->bdev;
+	bio->bi_sector = (disk_block + extent_off) >> 9;
+	free_extent_map(em);
+	return 1;
+}
+
+/*
+ * Change mapping of the bio, so that it points to the real bdev and offset.
+ * Returns 1 if the bio is ready for submission, 0 when it was completed
+ * (lookup error) or handed to the loop thread because no extent is
+ * cached yet.  Called with lo->lo_lock held (GFP_ATOMIC lookup).
+ */
+static int loop_redirect_bio(struct loop_device *lo, struct bio *bio)
+{
+	u64 start = lo_bio_offset(lo, bio);
+	struct extent_map *em;
+
+	em = loop_lookup_extent(lo, start, GFP_ATOMIC);
+	if (IS_ERR(em)) {
+		bio_endio(bio, PTR_ERR(em));
+		return 0;
+	} else if (!em) {
+		/* no cached extent: let the thread map it in GFP_KERNEL */
+		loop_schedule_extent_mapping(lo, bio);
+		return 0;
+	}
+
+	return __loop_redirect_bio(lo, em, bio, 0);
+}
+
+/*
+ * Wait on bio's on our list to complete before sending a barrier bio
+ * to the below device. Called with lo_lock held.
+ */
+static void loop_wait_on_bios(struct loop_device *lo)
+{
+	__lo_throttle(&lo->lo_bio_wait, &lo->lo_lock, !lo->lo_bio);
+}
+
+/* Wait for an in-progress backing-file switch to finish (lo_lock held). */
+static void loop_wait_on_switch(struct loop_device *lo)
+{
+	__lo_throttle(&lo->lo_bio_wait, &lo->lo_lock, !lo->lo_switch);
+}
+
static int loop_make_request(struct request_queue *q, struct bio *old_bio)
{
struct loop_device *lo = q->queuedata;
@@ -526,15 +813,39 @@ static int loop_make_request(struct request_queue *q, struct bio *old_bio)
goto out;
if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
goto out;
+ if (lo->lo_flags & LO_FLAGS_FASTFS) {
+ /*
+ * If we get a barrier bio, then we just need to wait for
+ * existing bio's to be complete. This can only happen
+ * on the 'new' extent mapped loop, since that is the only
+ * one that supports barriers.
+ */
+ if (bio_barrier(old_bio))
+ loop_wait_on_bios(lo);
+
+ /*
+ * if file switch is in progress, wait for it to complete
+ */
+ if (!lo_is_switch_bio(old_bio) && lo->lo_switch)
+ loop_wait_on_switch(lo);
+
+ if (loop_redirect_bio(lo, old_bio))
+ goto out_redir;
+ goto out_end;
+ }
loop_add_bio(lo, old_bio);
- wake_up(&lo->lo_event);
spin_unlock_irq(&lo->lo_lock);
return 0;
out:
- spin_unlock_irq(&lo->lo_lock);
bio_io_error(old_bio);
+out_end:
+ spin_unlock_irq(&lo->lo_lock);
return 0;
+
+out_redir:
+ spin_unlock_irq(&lo->lo_lock);
+ return 1;
}
/*
@@ -548,21 +859,53 @@ static void loop_unplug(struct request_queue *q)
blk_run_address_space(lo->lo_backing_file->f_mapping);
}
+/*
+ * Unplug for fastfs mode: bios were remapped straight to the device
+ * backing the filesystem, so forward the unplug to that queue.
+ */
+static void loop_unplug_fastfs(struct request_queue *q)
+{
+	struct loop_device *lo = q->queuedata;
+	struct request_queue *rq = bdev_get_queue(lo->fs_bdev);
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	if (blk_remove_plug(q) && rq->unplug_fn)
+		rq->unplug_fn(rq);
+
+	local_irq_restore(flags);
+}
+
struct switch_request {
struct file *file;
struct completion wait;
};
static void do_loop_switch(struct loop_device *, struct switch_request *);
+static int loop_init_fastfs(struct loop_device *);
static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
{
-	if (unlikely(!bio->bi_bdev)) {
+	if (lo_is_map_bio(bio)) {
+		struct bio *org_bio = bio->bi_private;
+		struct extent_map *em;
+
+		em = loop_lookup_extent(lo, lo_bio_offset(lo, org_bio),
+						GFP_KERNEL);
+
+		/*
+		 * The lookup can return NULL or an ERR_PTR (as the
+		 * loop_redirect_bio() path already handles); don't hand
+		 * those to __loop_redirect_bio(), which dereferences em
+		 * unconditionally.
+		 */
+		if (!em || IS_ERR(em))
+			bio_endio(org_bio, -EIO);
+		else if (__loop_redirect_bio(lo, em, org_bio, 1))
+			generic_make_request(org_bio);
+
+		bio_put(bio);
+	} else if (lo_is_switch_bio(bio)) {
		do_loop_switch(lo, bio->bi_private);
		bio_put(bio);
	} else {
-		int ret = do_bio_filebacked(lo, bio);
-		bio_endio(bio, ret);
+		if (lo->lo_flags & LO_FLAGS_FASTFS) {
+			/* we only get here when filling holes */
+			fill_extent_hole(lo, bio);
+		} else {
+			int ret = do_bio_filebacked(lo, bio);
+
+			bio_endio(bio, ret);
+		}
	}
}
@@ -589,7 +932,6 @@ static int loop_thread(void *data)
wait_event_interruptible(lo->lo_event,
lo->lo_bio || kthread_should_stop());
-
if (!lo->lo_bio)
continue;
spin_lock_irq(&lo->lo_lock);
@@ -597,7 +939,13 @@ static int loop_thread(void *data)
spin_unlock_irq(&lo->lo_lock);
BUG_ON(!bio);
+
loop_handle_bio(lo, bio);
+
+ spin_lock_irq(&lo->lo_lock);
+ if (--lo->lo_bio_cnt < LO_BIO_THROTTLE_LOW || !lo->lo_bio)
+ wake_up(&lo->lo_bio_wait);
+ spin_unlock_irq(&lo->lo_lock);
}
return 0;
@@ -618,6 +966,8 @@ static int loop_switch(struct loop_device *lo, struct file *file)
w.file = file;
bio->bi_private = &w;
bio->bi_bdev = NULL;
+ bio->bi_rw = LOOP_SWITCH_RW_MAGIC;
+ lo->lo_switch = 1;
loop_make_request(lo->lo_queue, bio);
wait_for_completion(&w.wait);
return 0;
@@ -643,11 +993,15 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
struct file *file = p->file;
struct file *old_file = lo->lo_backing_file;
struct address_space *mapping;
+ const int fastfs = lo->lo_flags & LO_FLAGS_FASTFS;
/* if no new file, only flush of queued bios requested */
if (!file)
goto out;
+ if (fastfs)
+ loop_exit_fastfs(lo);
+
mapping = file->f_mapping;
mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
lo->lo_backing_file = file;
@@ -655,6 +1009,12 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+ if (fastfs)
+ loop_init_fastfs(lo);
+
+ lo->lo_switch = 0;
+ wake_up(&lo->lo_bio_wait);
out:
complete(&p->wait);
}
@@ -721,6 +1081,92 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
return error;
}
+/*
+ * See if adding this bvec would cause us to spill into a new extent. If so,
+ * disallow the add to start a new bio. This ensures that the bio we receive
+ * in loop_make_request() never spans two extents or more.  Returns the
+ * number of bytes that may be added (bvec->bv_len or 0).
+ */
+static int loop_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm,
+			   struct bio_vec *bvec)
+{
+	struct loop_device *lo = q->queuedata;
+	struct extent_map *em;
+	unsigned int ret;
+	u64 start;
+	u64 len;
+
+	/* always allow the first bvec of an empty bio */
+	if (!bvm->bi_size)
+		return bvec->bv_len;
+
+	start = (u64) lo->lo_offset + ((u64)bvm->bi_sector << 9);
+	len = bvm->bi_size + bvec->bv_len;
+	ret = bvec->bv_len;
+
+	em = loop_lookup_extent(lo, start, GFP_ATOMIC);
+	if (em && !IS_ERR(em)) {
+		/*
+		 * have extent, disallow if outside that extent
+		 */
+		if (start + len > em->start + em->len || start < em->start)
+			ret = 0;
+
+		free_extent_map(em);
+	} else
+		ret = 0; /* no mapping known: refuse the merge */
+
+	return ret;
+}
+
+/*
+ * Initialize the members pertaining to extent mapping. We will populate
+ * the tree lazily on demand, as a full scan of a big file can take some
+ * time.  Requires a regular file whose address_space provides
+ * map_extent().  Returns 0 on success or a negative errno.
+ */
+static int loop_init_fastfs(struct loop_device *lo)
+{
+	struct file *file = lo->lo_backing_file;
+	struct inode *inode = file->f_mapping->host;
+	struct request_queue *fs_q;
+	int ret;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	/*
+	 * Need a working extent_map
+	 */
+	if (inode->i_mapping->a_ops->map_extent == NULL)
+		return -EINVAL;
+	/*
+	 * invalidate all page cache belonging to this file, it could become
+	 * stale when we directly overwrite blocks.
+	 */
+	ret = invalidate_inode_pages2(file->f_mapping);
+	if (unlikely(ret))
+		return ret;
+
+	/*
+	 * disable truncate on this file (S_SWAPFILE has that effect)
+	 */
+	mutex_lock(&inode->i_mutex);
+	inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&inode->i_mutex);
+
+	lo->blkbits = inode->i_blkbits;
+	lo->fs_bdev = file->f_mapping->host->i_sb->s_bdev;
+	lo->lo_flags |= LO_FLAGS_FASTFS;
+	lo->lo_queue->unplug_fn = loop_unplug_fastfs;
+
+	/* keep bios within one extent; drain for barrier support */
+	blk_queue_merge_bvec(lo->lo_queue, loop_merge_bvec);
+	blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN, NULL);
+
+	/* inherit the limits of the queue we redirect into */
+	fs_q = bdev_get_queue(lo->fs_bdev);
+	blk_queue_stack_limits(lo->lo_queue, fs_q);
+
+	printk(KERN_INFO "loop%d: fast redirect\n", lo->lo_number);
+	return 0;
+}
+
static inline int is_loop_device(struct file *file)
{
struct inode *i = file->f_mapping->host;
@@ -769,6 +1215,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
mapping = file->f_mapping;
inode = mapping->host;
+ lo->lo_flags = 0;
if (!(file->f_mode & FMODE_WRITE))
lo_flags |= LO_FLAGS_READ_ONLY;
@@ -832,6 +1279,12 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_blocksize(bdev, lo_blocksize);
+ /*
+ * This needs to be done after setup with another ioctl,
+ * not automatically like this.
+ */
+ loop_init_fastfs(lo);
+
lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
lo->lo_number);
if (IS_ERR(lo->lo_thread)) {
@@ -919,6 +1372,9 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
kthread_stop(lo->lo_thread);
+ if (lo->lo_flags & LO_FLAGS_FASTFS)
+ loop_exit_fastfs(lo);
+
lo->lo_queue->unplug_fn = NULL;
lo->lo_backing_file = NULL;
@@ -973,6 +1429,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
if (info->lo_encrypt_type) {
unsigned int type = info->lo_encrypt_type;
+ if (lo->lo_flags & LO_FLAGS_FASTFS)
+ return -EINVAL;
+
if (type >= MAX_LO_CRYPT)
return -EINVAL;
xfer = xfer_funcs[type];
@@ -981,6 +1440,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
} else
xfer = NULL;
+ /*
+ * for remaps, offset must be a multiple of full blocks
+ */
+ if ((lo->lo_flags & LO_FLAGS_FASTFS) &&
+ (((1 << lo->blkbits) - 1) & info->lo_offset))
+ return -EINVAL;
+
err = loop_init_xfer(lo, xfer, info);
if (err)
return err;
@@ -1187,6 +1653,9 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
case LOOP_GET_STATUS64:
err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
break;
+ case LOOP_SET_FASTFS:
+ err = loop_init_fastfs(lo);
+ break;
default:
err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
}
@@ -1466,6 +1935,8 @@ static struct loop_device *loop_alloc(int i)
lo->lo_number = i;
lo->lo_thread = NULL;
init_waitqueue_head(&lo->lo_event);
+ init_waitqueue_head(&lo->lo_bio_wait);
+ setup_timer(&lo->lo_bio_timer, loop_bio_timer, (unsigned long) lo);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 6ffd6db5bb0d..e5d3cb5aaa49 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -50,22 +50,29 @@ struct loop_device {
struct file * lo_backing_file;
struct block_device *lo_device;
+ struct block_device *fs_bdev;
unsigned lo_blocksize;
void *key_data;
+ unsigned int lo_switch;
gfp_t old_gfp_mask;
spinlock_t lo_lock;
struct bio *lo_bio;
struct bio *lo_biotail;
+ unsigned int lo_bio_cnt;
int lo_state;
struct mutex lo_ctl_mutex;
struct task_struct *lo_thread;
wait_queue_head_t lo_event;
+ wait_queue_head_t lo_bio_wait;
+ struct timer_list lo_bio_timer;
struct request_queue *lo_queue;
struct gendisk *lo_disk;
struct list_head lo_list;
+
+ unsigned int blkbits;
};
#endif /* __KERNEL__ */
@@ -77,6 +84,7 @@ enum {
LO_FLAGS_READ_ONLY = 1,
LO_FLAGS_USE_AOPS = 2,
LO_FLAGS_AUTOCLEAR = 4,
+ LO_FLAGS_FASTFS = 8,
};
#include <asm/posix_types.h> /* for __kernel_old_dev_t */
@@ -160,5 +168,11 @@ int loop_unregister_transfer(int number);
#define LOOP_SET_STATUS64 0x4C04
#define LOOP_GET_STATUS64 0x4C05
#define LOOP_CHANGE_FD 0x4C06
+#define LOOP_SET_FASTFS 0x4C07
+
+enum {
+ LOOP_EXTENT_RW_MAGIC = 0x19283744,
+ LOOP_SWITCH_RW_MAGIC = 0xfeedbeec,
+};
#endif