md/r5cache: enable chunk_aligned_read with write back cache
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0e8ed2c327b07fd849c1720d7b272dd860b949b9..76c0e5063f1b3d7278697078810261a3c813af91 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -20,6 +20,7 @@
 #include <linux/crc32c.h>
 #include <linux/random.h>
 #include <linux/kthread.h>
+#include <linux/types.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -162,8 +163,61 @@ struct r5l_log {
 
        /* to submit async io_units, to fulfill ordering of flush */
        struct work_struct deferred_io_work;
+       /* to disable write back in degraded mode */
+       struct work_struct disable_writeback_work;
+
+       /* to support chunk_aligned_read in writeback mode, details below */
+       spinlock_t tree_lock;
+       struct radix_tree_root big_stripe_tree;
 };
 
+/*
+ * Enable chunk_aligned_read() with write back cache.
+ *
+ * Each chunk may contain more than one stripe (for example, a 256kB
+ * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
+ * chunk_aligned_read, these stripes are grouped into one "big_stripe".
+ * For each big_stripe, we count how many stripes of this big_stripe
+ * are in the write back cache. This count is tracked in a radix tree
+ * (big_stripe_tree), using the radix_tree item pointer as the counter.
+ * r5c_tree_index() is used to calculate keys for the radix tree.
+ *
+ * chunk_aligned_read() calls r5c_big_stripe_cached() to look up the
+ * big_stripe of each chunk in the tree. If this big_stripe is in the
+ * tree, chunk_aligned_read() aborts. This lookup is protected by
+ * rcu_read_lock().
+ *
+ * It is necessary to remember whether a stripe is counted in
+ * big_stripe_tree. Instead of adding a new flag, we reuse two existing
+ * flags: STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either
+ * of these two flags is set, the stripe is counted in big_stripe_tree.
+ * This requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
+ * r5c_try_caching_write(), and moving the clear_bit of
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
+ * r5c_finish_stripe_write_out().
+ */
+
+/*
+ * The radix tree requires the lowest 2 bits of the data pointer to be
+ * 2b'00, so it is necessary to left shift the counter by 2 bits before
+ * using it as the data pointer of the tree.
+ */
+#define R5C_RADIX_COUNT_SHIFT 2
+
+/*
+ * calculate key for big_stripe_tree
+ *
+ * sect: align_bi->bi_iter.bi_sector or sh->sector
+ */
+static inline sector_t r5c_tree_index(struct r5conf *conf,
+                                     sector_t sect)
+{
+       sector_t offset;
+
+       offset = sector_div(sect, conf->chunk_sectors);
+       return sect;
+}
+
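As a worked example of the key calculation above (illustrative only, not part of the patch): with a 256kB chunk, conf->chunk_sectors is 512, so sector_div() leaves the chunk number in sect, and sectors 0..511 map to tree index 0, sectors 512..1023 to index 1, and so on. The counter stored in the tree is encoded as sketched below; these helper names are hypothetical.

	/*
	 * Hypothetical helpers illustrating the big_stripe_tree counter
	 * encoding: the item pointer stored in the tree is
	 * (count << R5C_RADIX_COUNT_SHIFT), which keeps the lowest 2 bits
	 * clear as the radix tree requires.
	 */
	static inline void *r5c_count_to_item(uintptr_t count)
	{
		return (void *)(count << R5C_RADIX_COUNT_SHIFT);
	}

	static inline uintptr_t r5c_item_to_count(void *item)
	{
		return (uintptr_t)item >> R5C_RADIX_COUNT_SHIFT;
	}
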
 /*
  * an IO range starts from a meta data block and end at the next meta data
  * block. The io unit's the meta data block tracks data/parity followed it. io
@@ -410,16 +464,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh)
 
        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                atomic_inc(&conf->preread_active_stripes);
-
-       if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
-               BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
-               atomic_dec(&conf->r5c_cached_partial_stripes);
-       }
-
-       if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
-               BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
-               atomic_dec(&conf->r5c_cached_full_stripes);
-       }
 }
 
 static void r5c_handle_data_cached(struct stripe_head *sh)
@@ -611,6 +655,21 @@ static void r5l_submit_io_async(struct work_struct *work)
                r5l_do_submit_io(log, io);
 }
 
+static void r5c_disable_writeback_async(struct work_struct *work)
+{
+       struct r5l_log *log = container_of(work, struct r5l_log,
+                                          disable_writeback_work);
+       struct mddev *mddev = log->rdev->mddev;
+
+       if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+               return;
+       pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
+               mdname(mddev));
+       mddev_suspend(mddev);
+       log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+       mddev_resume(mddev);
+}
+
 static void r5l_submit_current_io(struct r5l_log *log)
 {
        struct r5l_io_unit *io = log->current_io;
@@ -1393,8 +1452,6 @@ static void r5l_do_reclaim(struct r5l_log *log)
        next_checkpoint = r5c_calculate_new_cp(conf);
        spin_unlock_irq(&log->io_list_lock);
 
-       BUG_ON(reclaimable < 0);
-
        if (reclaimable == 0 || !write_super)
                return;
 
@@ -2062,7 +2119,7 @@ static int
 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
                                       struct r5l_recovery_ctx *ctx)
 {
-       struct stripe_head *sh, *next;
+       struct stripe_head *sh;
        struct mddev *mddev = log->rdev->mddev;
        struct page *page;
        sector_t next_checkpoint = MaxSector;
@@ -2076,7 +2133,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 
        WARN_ON(list_empty(&ctx->cached_list));
 
-       list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+       list_for_each_entry(sh, &ctx->cached_list, lru) {
                struct r5l_meta_block *mb;
                int i;
                int offset;
@@ -2126,14 +2183,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
                ctx->pos = write_pos;
                ctx->seq += 1;
                next_checkpoint = sh->log_start;
-               list_del_init(&sh->lru);
-               raid5_release_stripe(sh);
        }
        log->next_checkpoint = next_checkpoint;
        __free_page(page);
        return 0;
 }
 
+static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
+                                                struct r5l_recovery_ctx *ctx)
+{
+       struct mddev *mddev = log->rdev->mddev;
+       struct r5conf *conf = mddev->private;
+       struct stripe_head *sh, *next;
+
+       if (ctx->data_only_stripes == 0)
+               return;
+
+       log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
+
+       list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+               r5c_make_stripe_write_out(sh);
+               set_bit(STRIPE_HANDLE, &sh->state);
+               list_del_init(&sh->lru);
+               raid5_release_stripe(sh);
+       }
+
+       md_wakeup_thread(conf->mddev->thread);
+       /* reuse conf->wait_for_quiescent in recovery */
+       wait_event(conf->wait_for_quiescent,
+                  atomic_read(&conf->active_stripes) == 0);
+
+       log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
        struct mddev *mddev = log->rdev->mddev;
@@ -2160,32 +2242,31 @@ static int r5l_recovery_log(struct r5l_log *log)
        pos = ctx.pos;
        ctx.seq += 10000;
 
-       if (ctx.data_only_stripes == 0) {
-               log->next_checkpoint = ctx.pos;
-               r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
-               ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-       }
 
        if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
                pr_debug("md/raid:%s: starting from clean shutdown\n",
                         mdname(mddev));
-       else {
+       else
                pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
                         mdname(mddev), ctx.data_only_stripes,
                         ctx.data_parity_stripes);
 
-               if (ctx.data_only_stripes > 0)
-                       if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
-                               pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
-                                      mdname(mddev));
-                               return -EIO;
-                       }
+       if (ctx.data_only_stripes == 0) {
+               log->next_checkpoint = ctx.pos;
+               r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+               ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+       } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+               pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+                      mdname(mddev));
+               return -EIO;
        }
 
        log->log_start = ctx.pos;
        log->seq = ctx.seq;
        log->last_checkpoint = pos;
        r5l_write_super(log, pos);
+
+       r5c_recovery_flush_data_only_stripes(log, &ctx);
        return 0;
 }
 
@@ -2247,6 +2328,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
            val > R5C_JOURNAL_MODE_WRITE_BACK)
                return -EINVAL;
 
+       if (raid5_calc_degraded(conf) > 0 &&
+           val == R5C_JOURNAL_MODE_WRITE_BACK)
+               return -EINVAL;
+
        mddev_suspend(mddev);
        conf->log->r5c_journal_mode = val;
        mddev_resume(mddev);
@@ -2277,6 +2362,10 @@ int r5c_try_caching_write(struct r5conf *conf,
        int i;
        struct r5dev *dev;
        int to_cache = 0;
+       void **pslot;
+       sector_t tree_index;
+       int ret;
+       uintptr_t refcount;
 
        BUG_ON(!r5c_is_writeback(log));
 
@@ -2301,6 +2390,16 @@ int r5c_try_caching_write(struct r5conf *conf,
                set_bit(STRIPE_R5C_CACHING, &sh->state);
        }
 
+       /*
+        * When the array runs in degraded mode, it is set to write-through
+        * mode. This check helps drain pending writes safely during the
+        * transition to write-through mode.
+        */
+       if (s->failed) {
+               r5c_make_stripe_write_out(sh);
+               return -EAGAIN;
+       }
+
        for (i = disks; i--; ) {
                dev = &sh->dev[i];
                /* if non-overwrite, use writing-out phase */
@@ -2311,6 +2410,44 @@ int r5c_try_caching_write(struct r5conf *conf,
                }
        }
 
+       /* if the stripe is not counted in big_stripe_tree, add it now */
+       if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+           !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               tree_index = r5c_tree_index(conf, sh->sector);
+               spin_lock(&log->tree_lock);
+               pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+                                              tree_index);
+               if (pslot) {
+                       refcount = (uintptr_t)radix_tree_deref_slot_protected(
+                               pslot, &log->tree_lock) >>
+                               R5C_RADIX_COUNT_SHIFT;
+                       radix_tree_replace_slot(
+                               &log->big_stripe_tree, pslot,
+                               (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
+               } else {
+                       /*
+                        * A failed insert here falls back to write-out
+                        * below, so no need to call radix_tree_preload().
+                        */
+                       ret = radix_tree_insert(
+                               &log->big_stripe_tree, tree_index,
+                               (void *)(1 << R5C_RADIX_COUNT_SHIFT));
+                       if (ret) {
+                               spin_unlock(&log->tree_lock);
+                               r5c_make_stripe_write_out(sh);
+                               return -EAGAIN;
+                       }
+               }
+               spin_unlock(&log->tree_lock);
+
+               /*
+                * Set STRIPE_R5C_PARTIAL_STRIPE to show the stripe is
+                * counted in the radix tree.
+                */
+               set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+               atomic_inc(&conf->r5c_cached_partial_stripes);
+       }
+
        for (i = disks; i--; ) {
                dev = &sh->dev[i];
                if (dev->towrite) {
@@ -2351,6 +2488,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
                        struct page *p = sh->dev[i].orig_page;
 
                        sh->dev[i].orig_page = sh->dev[i].page;
+                       clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
                        if (!using_disk_info_extra_page)
                                put_page(p);
                }
@@ -2383,17 +2522,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
                                 struct stripe_head *sh,
                                 struct stripe_head_state *s)
 {
+       struct r5l_log *log = conf->log;
        int i;
        int do_wakeup = 0;
+       sector_t tree_index;
+       void **pslot;
+       uintptr_t refcount;
 
-       if (!conf->log ||
-           !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
+       if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
                return;
 
        WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
        clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 
-       if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+       if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
                return;
 
        for (i = sh->disks; i--; ) {
@@ -2415,12 +2557,43 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
        if (do_wakeup)
                wake_up(&conf->wait_for_overlap);
 
-       spin_lock_irq(&conf->log->stripe_in_journal_lock);
+       spin_lock_irq(&log->stripe_in_journal_lock);
        list_del_init(&sh->r5c);
-       spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+       spin_unlock_irq(&log->stripe_in_journal_lock);
        sh->log_start = MaxSector;
-       atomic_dec(&conf->log->stripe_in_journal_count);
-       r5c_update_log_state(conf->log);
+
+       atomic_dec(&log->stripe_in_journal_count);
+       r5c_update_log_state(log);
+
+       /* stop counting this stripe in big_stripe_tree */
+       if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+           test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               tree_index = r5c_tree_index(conf, sh->sector);
+               spin_lock(&log->tree_lock);
+               pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+                                              tree_index);
+               BUG_ON(pslot == NULL);
+               refcount = (uintptr_t)radix_tree_deref_slot_protected(
+                       pslot, &log->tree_lock) >>
+                       R5C_RADIX_COUNT_SHIFT;
+               if (refcount == 1)
+                       radix_tree_delete(&log->big_stripe_tree, tree_index);
+               else
+                       radix_tree_replace_slot(
+                               &log->big_stripe_tree, pslot,
+                               (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
+               spin_unlock(&log->tree_lock);
+       }
+
+       if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+               BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+               atomic_dec(&conf->r5c_cached_partial_stripes);
+       }
+
+       if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+               BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+               atomic_dec(&conf->r5c_cached_full_stripes);
+       }
 }
 
 int
@@ -2480,6 +2653,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
        return 0;
 }
 
+/* Check whether this big stripe is in the write back cache. */
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
+{
+       struct r5l_log *log = conf->log;
+       sector_t tree_index;
+       void *slot;
+
+       if (!log)
+               return false;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       tree_index = r5c_tree_index(conf, sect);
+       slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
+       return slot != NULL;
+}
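
For context, a hedged sketch of how a read-side caller would use this helper under rcu_read_lock(); the actual change to chunk_aligned_read() lives in raid5.c and is not shown in this diff, and the function name below is hypothetical.

	/*
	 * Hypothetical read-side caller: skip the chunk-aligned fast path
	 * when any stripe of this chunk is in the write back cache.
	 */
	static bool r5c_may_read_chunk_aligned(struct r5conf *conf,
					       sector_t sector)
	{
		bool cached;

		rcu_read_lock();
		cached = r5c_big_stripe_cached(conf, sector);
		rcu_read_unlock();

		return !cached;
	}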
+
 static int r5l_load_log(struct r5l_log *log)
 {
        struct md_rdev *rdev = log->rdev;
@@ -2555,6 +2744,19 @@ ioerr:
        return ret;
 }
 
+void r5c_update_on_rdev_error(struct mddev *mddev)
+{
+       struct r5conf *conf = mddev->private;
+       struct r5l_log *log = conf->log;
+
+       if (!log)
+               return;
+
+       if (raid5_calc_degraded(conf) > 0 &&
+           log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+               schedule_work(&log->disable_writeback_work);
+}
+
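This is meant to be called from raid5's disk failure handling; a minimal sketch of such a call site follows (the raid5.c hunk is not part of this file, and the surrounding code here is illustrative only).

	/*
	 * Illustrative call site: after a member disk is marked Faulty,
	 * let the cache code disable writeback for the degraded array.
	 */
	static void raid5_error_sketch(struct mddev *mddev,
				       struct md_rdev *rdev)
	{
		set_bit(Faulty, &rdev->flags);
		r5c_update_on_rdev_error(mddev);
	}
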
 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
        struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -2613,6 +2815,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
        if (!log->meta_pool)
                goto out_mempool;
 
+       spin_lock_init(&log->tree_lock);
+       INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+
        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                                 log->rdev->mddev, "reclaim");
        if (!log->reclaim_thread)
@@ -2627,6 +2832,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
        spin_lock_init(&log->no_space_stripes_lock);
 
        INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+       INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
 
        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
        INIT_LIST_HEAD(&log->stripe_in_journal_list);
@@ -2659,6 +2865,7 @@ io_kc:
 
 void r5l_exit_log(struct r5l_log *log)
 {
+       flush_work(&log->disable_writeback_work);
        md_unregister_thread(&log->reclaim_thread);
        mempool_destroy(log->meta_pool);
        bioset_free(log->bs);