bcache: Kill unused freelist
author Kent Overstreet <kmo@daterainc.com>
Mon, 17 Mar 2014 23:55:55 +0000 (16:55 -0700)
committer Kent Overstreet <kmo@daterainc.com>
Tue, 18 Mar 2014 19:23:36 +0000 (12:23 -0700)
This was originally added as an optimization that for various reasons isn't
needed anymore, but it does add a lot of nasty corner cases (and it was
responsible for some recently fixed bugs). Just get rid of it now.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/btree.c
drivers/md/bcache/btree.h
drivers/md/bcache/super.c
include/trace/events/bcache.h

index a59ef6147fc72f775225247796d72193a69a4f92..443d03fbac4705bd97f5acf77f622e0562b7fc55 100644
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
        ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
        WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
 
-       if (CACHE_SYNC(&ca->set->sb)) {
-               ca->need_save_prio = max(ca->need_save_prio,
-                                        bucket_disk_gen(b));
-               WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
-       }
-
        return ret;
 }
 
@@ -120,58 +114,46 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
        mutex_unlock(&c->bucket_lock);
 }
 
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
 {
-       return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
-               bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+       return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
 }
 
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-       BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
+       BUG_ON(!ca->set->gc_mark_valid);
 
-       if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
-               unsigned i;
-
-               for (i = 0; i < RESERVE_NONE; i++)
-                       if (!fifo_full(&ca->free[i]))
-                               goto add;
-
-               return false;
-       }
-add:
-       b->prio = 0;
-
-       if (can_inc_bucket_gen(b) &&
-           fifo_push(&ca->unused, b - ca->buckets)) {
-               atomic_inc(&b->pin);
-               return true;
-       }
-
-       return false;
-}
-
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
        return (!GC_MARK(b) ||
                GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
                !atomic_read(&b->pin) &&
                can_inc_bucket_gen(b);
 }
 
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 {
-       size_t bucket = b - ca->buckets;
+       lockdep_assert_held(&ca->set->bucket_lock);
+       BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
 
        if (GC_SECTORS_USED(b))
-               trace_bcache_invalidate(ca, bucket);
+               trace_bcache_invalidate(ca, b - ca->buckets);
 
        bch_inc_gen(ca, b);
        b->prio = INITIAL_PRIO;
        atomic_inc(&b->pin);
-       fifo_push(&ca->free_inc, bucket);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+       __bch_invalidate_one_bucket(ca, b);
+
+       fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
 /*
@@ -201,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
        ca->heap.used = 0;
 
        for_each_bucket(b, ca) {
-               /*
-                * If we fill up the unused list, if we then return before
-                * adding anything to the free_inc list we'll skip writing
-                * prios/gens and just go back to allocating from the unused
-                * list:
-                */
-               if (fifo_full(&ca->unused))
-                       return;
-
-               if (!can_invalidate_bucket(ca, b))
-                       continue;
-
-               if (!GC_SECTORS_USED(b) &&
-                   bch_bucket_add_unused(ca, b))
+               if (!bch_can_invalidate_bucket(ca, b))
                        continue;
 
                if (!heap_full(&ca->heap))
@@ -239,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
                        return;
                }
 
-               invalidate_one_bucket(ca, b);
+               bch_invalidate_one_bucket(ca, b);
        }
 }
 
@@ -255,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
 
                b = ca->buckets + ca->fifo_last_bucket++;
 
-               if (can_invalidate_bucket(ca, b))
-                       invalidate_one_bucket(ca, b);
+               if (bch_can_invalidate_bucket(ca, b))
+                       bch_invalidate_one_bucket(ca, b);
 
                if (++checked >= ca->sb.nbuckets) {
                        ca->invalidate_needs_gc = 1;
@@ -280,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
 
                b = ca->buckets + n;
 
-               if (can_invalidate_bucket(ca, b))
-                       invalidate_one_bucket(ca, b);
+               if (bch_can_invalidate_bucket(ca, b))
+                       bch_invalidate_one_bucket(ca, b);
 
                if (++checked >= ca->sb.nbuckets / 2) {
                        ca->invalidate_needs_gc = 1;
@@ -293,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
 
 static void invalidate_buckets(struct cache *ca)
 {
-       if (ca->invalidate_needs_gc)
-               return;
+       BUG_ON(ca->invalidate_needs_gc);
 
        switch (CACHE_REPLACEMENT(&ca->sb)) {
        case CACHE_REPLACEMENT_LRU:
@@ -354,17 +322,10 @@ static int bch_allocator_thread(void *arg)
                 * possibly issue discards to them, then we add the bucket to
                 * the free list:
                 */
-               while (1) {
+               while (!fifo_empty(&ca->free_inc)) {
                        long bucket;
 
-                       if ((!atomic_read(&ca->set->prio_blocked) ||
-                            !CACHE_SYNC(&ca->set->sb)) &&
-                           !fifo_empty(&ca->unused))
-                               fifo_pop(&ca->unused, bucket);
-                       else if (!fifo_empty(&ca->free_inc))
-                               fifo_pop(&ca->free_inc, bucket);
-                       else
-                               break;
+                       fifo_pop(&ca->free_inc, bucket);
 
                        if (ca->discard) {
                                mutex_unlock(&ca->set->bucket_lock);
@@ -385,9 +346,9 @@ static int bch_allocator_thread(void *arg)
                 * them to the free_inc list:
                 */
 
+retry_invalidate:
                allocator_wait(ca, ca->set->gc_mark_valid &&
-                              (ca->need_save_prio > 64 ||
-                               !ca->invalidate_needs_gc));
+                              !ca->invalidate_needs_gc);
                invalidate_buckets(ca);
 
                /*
@@ -395,13 +356,28 @@ static int bch_allocator_thread(void *arg)
                 * new stuff to them:
                 */
                allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
-               if (CACHE_SYNC(&ca->set->sb) &&
-                   (!fifo_empty(&ca->free_inc) ||
-                    ca->need_save_prio > 64))
+               if (CACHE_SYNC(&ca->set->sb)) {
+                       /*
+                        * This could deadlock if an allocation with a btree
+                        * node locked ever blocked - having the btree node
+                        * locked would block garbage collection, but here we're
+                        * waiting on garbage collection before we invalidate
+                        * and free anything.
+                        *
+                        * But this should be safe since the btree code always
+                        * uses btree_check_reserve() before allocating now, and
+                        * if it fails it blocks without btree nodes locked.
+                        */
+                       if (!fifo_full(&ca->free_inc))
+                               goto retry_invalidate;
+
                        bch_prio_write(ca);
+               }
        }
 }
 
+/* Allocation */
+
 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
        DEFINE_WAIT(w);
@@ -447,8 +423,6 @@ out:
                                BUG_ON(i == r);
                fifo_for_each(i, &ca->free_inc, iter)
                        BUG_ON(i == r);
-               fifo_for_each(i, &ca->unused, iter)
-                       BUG_ON(i == r);
        }
 
        b = ca->buckets + r;
@@ -470,17 +444,19 @@ out:
        return r;
 }
 
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+       SET_GC_MARK(b, 0);
+       SET_GC_SECTORS_USED(b, 0);
+}
+
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
        unsigned i;
 
-       for (i = 0; i < KEY_PTRS(k); i++) {
-               struct bucket *b = PTR_BUCKET(c, k, i);
-
-               SET_GC_MARK(b, 0);
-               SET_GC_SECTORS_USED(b, 0);
-               bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
-       }
+       for (i = 0; i < KEY_PTRS(k); i++)
+               __bch_bucket_free(PTR_CACHE(c, k, i),
+                                 PTR_BUCKET(c, k, i));
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
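
With the unused list gone, the allocator pipeline in alloc.c reduces to three
steps: invalidate buckets into free_inc, persist the new gens with
bch_prio_write(), then drain free_inc onto the free lists. Below is a minimal
userspace sketch of that flow; the fifo helpers and step functions are
simplified stand-ins for the kernel's DECLARE_FIFO()/fifo_push()/fifo_pop()
machinery, not bcache code:

/*
 * Illustrative sketch only: a userspace model of the simplified
 * allocator pipeline after this patch.
 */
#include <stdbool.h>
#include <stdio.h>

#define FIFO_SIZE 8

struct fifo {
        long data[FIFO_SIZE];
        unsigned front, back;   /* back - front == entries in the fifo */
};

static bool fifo_empty(const struct fifo *f) { return f->front == f->back; }
static bool fifo_full(const struct fifo *f)  { return f->back - f->front == FIFO_SIZE; }
static void fifo_push(struct fifo *f, long v) { f->data[f->back++ % FIFO_SIZE] = v; }
static long fifo_pop(struct fifo *f)          { return f->data[f->front++ % FIFO_SIZE]; }

int main(void)
{
        struct fifo free_inc = { 0 }, free_list = { 0 };
        long next = 0;

        /* invalidate_buckets(): bump gens and fill free_inc */
        while (!fifo_full(&free_inc))
                fifo_push(&free_inc, next++);

        /* bch_prio_write(): the new gens must hit disk before reuse */
        printf("prio_write: %u invalidated buckets\n",
               free_inc.back - free_inc.front);

        /* drain free_inc onto the free list (discards elided) */
        while (!fifo_empty(&free_inc) && !fifo_full(&free_list))
                fifo_push(&free_list, fifo_pop(&free_inc));

        /* bch_bucket_alloc() now only ever pops the free lists */
        printf("allocated bucket %ld\n", fifo_pop(&free_list));
        return 0;
}
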
index 171cda89cb6b25bf79c1fe8915f681adfe2fabda..200efc1b5cf8b312dd3711eab19ef5cb1b2e4955 100644
@@ -195,7 +195,6 @@ struct bucket {
        atomic_t        pin;
        uint16_t        prio;
        uint8_t         gen;
-       uint8_t         disk_gen;
        uint8_t         last_gc; /* Most out of date gen in the btree */
        uint8_t         gc_gen;
        uint16_t        gc_mark; /* Bitfield used by GC. See below for field */
@@ -426,14 +425,9 @@ struct cache {
         * their new gen to disk. After prio_write() finishes writing the new
         * gens/prios, they'll be moved to the free list (and possibly discarded
         * in the process)
-        *
-        * unused: GC found nothing pointing into these buckets (possibly
-        * because all the data they contained was overwritten), so we only
-        * need to discard them before they can be moved to the free list.
         */
        DECLARE_FIFO(long, free)[RESERVE_NR];
        DECLARE_FIFO(long, free_inc);
-       DECLARE_FIFO(long, unused);
 
        size_t                  fifo_last_bucket;
 
@@ -442,12 +436,6 @@ struct cache {
 
        DECLARE_HEAP(struct bucket *, heap);
 
-       /*
-        * max(gen - disk_gen) for all buckets. When it gets too big we have to
-        * call prio_write() to keep gens from wrapping.
-        */
-       uint8_t                 need_save_prio;
-
        /*
         * If nonzero, we know we aren't going to find any buckets to invalidate
         * until a gc finishes - otherwise we could pointlessly burn a ton of
@@ -848,9 +836,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
  */
 
 static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -858,13 +843,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
        return b->gen - b->last_gc;
 }
 
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
-       return b->gen - b->disk_gen;
-}
-
 #define BUCKET_GC_GEN_MAX      96U
-#define BUCKET_DISK_GEN_MAX    64U
 
 #define kobj_attribute_write(n, fn)                                    \
        static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
@@ -897,11 +876,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
 
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
 
+long bch_bucket_alloc(struct cache *, unsigned, bool);
 int __bch_bucket_alloc_set(struct cache_set *, unsigned,
                           struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
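
The surviving gen bookkeeping relies on unsigned 8-bit wraparound:
bucket_gc_gen() returns b->gen - b->last_gc, which is computed modulo 256 and
so stays correct across wraparound as long as the distance is capped well
below 256 (hence BUCKET_GC_GEN_MAX at 96, which garbage collection enforces).
A standalone demonstration of why the subtraction survives wraparound:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint8_t last_gc = 250;  /* oldest gen referenced from the btree */
        uint8_t gen = 250;

        /* Bump the bucket's gen 30 times; it wraps past 255 to 24. */
        for (int i = 0; i < 30; i++)
                gen++;

        /*
         * gen - last_gc is taken modulo 256, so the distance is still
         * 30 even though gen (24) is numerically below last_gc (250).
         */
        uint8_t gc_gen = gen - last_gc;
        assert(gc_gen == 30);
        printf("gen %u, last_gc %u, bucket_gc_gen %u\n", gen, last_gc, gc_gen);
        return 0;
}
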
index be90596a9e2ad8992aee0eb3227049bae8c1e0cb..4c340c85b122e93a7229b9c484aa76061928e116 100644
@@ -1641,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)
        mutex_unlock(&c->bucket_lock);
 }
 
-size_t bch_btree_gc_finish(struct cache_set *c)
+static size_t bch_btree_gc_finish(struct cache_set *c)
 {
        size_t available = 0;
        struct bucket *b;
@@ -1703,9 +1703,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 
                        if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
                                available++;
-
-                       if (!GC_MARK(b))
-                               bch_bucket_add_unused(ca, b);
                }
        }
 
@@ -1836,6 +1833,42 @@ int bch_btree_check(struct cache_set *c)
        return btree_root(check_recurse, c, &op);
 }
 
+void bch_initial_gc_finish(struct cache_set *c)
+{
+       struct cache *ca;
+       struct bucket *b;
+       unsigned i;
+
+       bch_btree_gc_finish(c);
+
+       mutex_lock(&c->bucket_lock);
+
+       /*
+        * We need to put some unused buckets directly on the prio freelist in
+        * order to get the allocator thread started - it needs freed buckets in
+        * order to rewrite the prios and gens, and it needs to rewrite prios
+        * and gens in order to free buckets.
+        *
+        * This is only safe for buckets that have no live data in them, which
+        * there should always be some of.
+        */
+       for_each_cache(ca, c, i) {
+               for_each_bucket(b, ca) {
+                       if (fifo_full(&ca->free[RESERVE_PRIO]))
+                               break;
+
+                       if (bch_can_invalidate_bucket(ca, b) &&
+                           !GC_MARK(b)) {
+                               __bch_invalidate_one_bucket(ca, b);
+                               fifo_push(&ca->free[RESERVE_PRIO],
+                                         b - ca->buckets);
+                       }
+               }
+       }
+
+       mutex_unlock(&c->bucket_lock);
+}
+
 /* Btree insertion */
 
 static bool btree_insert_key(struct btree *b, struct bkey *k,
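
bch_initial_gc_finish() exists to break a startup cycle: the allocator needs
free buckets before it can rewrite prios and gens, but buckets ordinarily only
become free after prios and gens have been rewritten. Seeding RESERVE_PRIO
with buckets that GC found empty breaks the cycle. A toy model of that
dependency (names and counts here are illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

static int prio_reserve;        /* free buckets usable for prio writes */
static int invalidated = 8;     /* buckets waiting on a prio write */

/* Modeled on bch_prio_write(): it needs a free bucket to write into. */
static bool write_prios(void)
{
        if (!prio_reserve)
                return false;   /* nowhere to write: allocator is stuck */
        prio_reserve--;
        printf("prios written; %d buckets become allocatable\n", invalidated);
        invalidated = 0;
        return true;
}

int main(void)
{
        /* Without seeding, startup deadlocks: */
        printf("unseeded: %s\n", write_prios() ? "ok" : "stuck");

        /* bch_initial_gc_finish() seeds RESERVE_PRIO from empty buckets: */
        prio_reserve = 2;
        printf("seeded:   %s\n", write_prios() ? "ok" : "stuck");
        return 0;
}
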
index 3ce371fa7f98678b63f54cb11145258ff7cb8e73..91dfa5e696857ded36b3de7b66a8f862d9540cea 100644
@@ -252,7 +252,7 @@ int bch_btree_insert(struct cache_set *, struct keylist *,
                     atomic_t *, struct bkey *);
 
 int bch_gc_thread_start(struct cache_set *);
-size_t bch_btree_gc_finish(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
 void bch_moving_gc(struct cache_set *);
 int bch_btree_check(struct cache_set *);
 void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
index 2d4a56219ec751987fa48cc086782598eb1c7ad3..a8c57d59a726bce7e8d122ae235672247c492b84 100644
@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
        closure_sync(cl);
 }
 
-#define buckets_free(c)        "free %zu, free_inc %zu, unused %zu",           \
-       fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
-
 void bch_prio_write(struct cache *ca)
 {
        int i;
@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca)
 
        lockdep_assert_held(&ca->set->bucket_lock);
 
-       for (b = ca->buckets;
-            b < ca->buckets + ca->sb.nbuckets; b++)
-               b->disk_gen = b->gen;
-
        ca->disk_buckets->seq++;
 
        atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca)
 
        mutex_lock(&ca->set->bucket_lock);
 
-       ca->need_save_prio = 0;
-
        /*
         * Don't want the old priorities to get garbage collected until after we
         * finish writing the new ones, and they're journalled
         */
-       for (i = 0; i < prio_buckets(ca); i++)
+       for (i = 0; i < prio_buckets(ca); i++) {
+               if (ca->prio_last_buckets[i])
+                       __bch_bucket_free(ca,
+                               &ca->buckets[ca->prio_last_buckets[i]]);
+
                ca->prio_last_buckets[i] = ca->prio_buckets[i];
+       }
 }
 
 static void prio_read(struct cache *ca, uint64_t bucket)
@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
                }
 
                b->prio = le16_to_cpu(d->prio);
-               b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+               b->gen = b->last_gc = b->gc_gen = d->gen;
        }
 }
 
@@ -1606,7 +1602,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                bch_journal_mark(c, &journal);
-               bch_btree_gc_finish(c);
+               bch_initial_gc_finish(c);
                pr_debug("btree_check() done");
 
                /*
@@ -1648,7 +1644,7 @@ static void run_cache_set(struct cache_set *c)
                                ca->sb.d[j] = ca->sb.first_bucket + j;
                }
 
-               bch_btree_gc_finish(c);
+               bch_initial_gc_finish(c);
 
                err = "error starting allocator thread";
                for_each_cache(ca, c, i)
@@ -1794,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj)
        vfree(ca->buckets);
 
        free_heap(&ca->heap);
-       free_fifo(&ca->unused);
        free_fifo(&ca->free_inc);
 
        for (i = 0; i < RESERVE_NR; i++)
@@ -1831,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
            !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
            !init_fifo(&ca->free_inc,   free << 2, GFP_KERNEL) ||
-           !init_fifo(&ca->unused,     free << 2, GFP_KERNEL) ||
            !init_heap(&ca->heap,       free << 3, GFP_KERNEL) ||
            !(ca->buckets       = vzalloc(sizeof(struct bucket) *
                                          ca->sb.nbuckets)) ||
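
The bch_prio_write() change above makes the prio buckets an explicit double
buffer: the buckets holding the previous copy are released with
__bch_bucket_free() only once the new copy is on disk and journalled, and
prio_last_buckets then advances to the new set. A generic sketch of that
rotation (buffer size and bucket numbers are hypothetical):

#include <stdio.h>

#define NBUF 2

int main(void)
{
        long cur[NBUF]  = { 0 };        /* buckets holding the live prios */
        long last[NBUF] = { 0 };        /* buckets holding the previous copy */
        long next_bucket = 10;

        for (int gen = 0; gen < 3; gen++) {
                /* Write the new prios into freshly allocated buckets. */
                for (int i = 0; i < NBUF; i++)
                        cur[i] = next_bucket++;

                /* ... new copy is on disk and journalled here ... */

                /* Only now is it safe to recycle the previous copy. */
                for (int i = 0; i < NBUF; i++) {
                        if (last[i])
                                printf("gen %d: free old prio bucket %ld\n",
                                       gen, last[i]);
                        last[i] = cur[i];
                }
        }
        return 0;
}
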
index 8fc2a7134d3c9e2a4e50d9df9bfec9621b95bf40..c9c3c044b32f060b749c63381b956fceabc64f89 100644
@@ -446,7 +446,6 @@ TRACE_EVENT(bcache_alloc_fail,
                __field(dev_t,          dev                     )
                __field(unsigned,       free                    )
                __field(unsigned,       free_inc                )
-               __field(unsigned,       unused                  )
                __field(unsigned,       blocked                 )
        ),
 
@@ -454,13 +453,12 @@ TRACE_EVENT(bcache_alloc_fail,
                __entry->dev            = ca->bdev->bd_dev;
                __entry->free           = fifo_used(&ca->free[reserve]);
                __entry->free_inc       = fifo_used(&ca->free_inc);
-               __entry->unused         = fifo_used(&ca->unused);
                __entry->blocked        = atomic_read(&ca->set->prio_blocked);
        ),
 
-       TP_printk("alloc fail %d,%d free %u free_inc %u unused %u blocked %u",
+       TP_printk("alloc fail %d,%d free %u free_inc %u blocked %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->free,
-                 __entry->free_inc, __entry->unused, __entry->blocked)
+                 __entry->free_inc, __entry->blocked)
 );
 
 /* Background writeback */