bcachefs: optimize __bch2_trans_get(), kill DEBUG_TRANSACTIONS
author Kent Overstreet <kent.overstreet@linux.dev>
Mon, 11 Dec 2023 16:11:22 +0000 (11:11 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Mon, 1 Jan 2024 16:47:44 +0000 (11:47 -0500)
 - Some tweaks to greatly reduce locking overhead for the list of btree
   transactions, so that the list can always be enabled: leave btree_trans
   objects on the list while they're on the percpu single-item freelist,
   and only check for duplicates in the same process when
   CONFIG_BCACHEFS_DEBUG is enabled (the freelist pattern is sketched
   below)

 - don't zero out the full btree_trans unless we allocated it from
   the mempool
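
The percpu single-item freelist that makes the fast path cheap is a stock
pattern; here is a minimal standalone sketch (struct obj, obj_get() and
obj_put() are illustrative names, not the bcachefs API, and error handling
is elided; the real code additionally keeps the cached btree_trans on the
global list the whole time):

  struct obj { long payload[32]; /* stand-in for expensive state */ };
  struct obj_buf { struct obj *obj; };
  static DEFINE_PER_CPU(struct obj_buf, obj_bufs);

  static struct obj *obj_get(void)
  {
          /* Atomically take this CPU's cached object, if any: */
          struct obj *o = this_cpu_xchg(obj_bufs.obj, NULL);

          if (!o)
                  o = kzalloc(sizeof(*o), GFP_NOFS); /* slow path */
          return o;
  }

  static void obj_put(struct obj *o)
  {
          /* Cache in the percpu slot; free whatever was displaced: */
          o = this_cpu_xchg(obj_bufs.obj, o);
          kfree(o);
  }

Only the slow path needs full initialization; the fast path gets away with
the partial memset visible in the btree_iter.c hunk below.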

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/Kconfig
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_locking.c
fs/bcachefs/btree_types.h
fs/bcachefs/debug.c

diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 7d3ac9556c9867d479ae9eaa42fac6d8ac039467..5cdfef3b551a78fddf62a1bb47dbfedfa0c149c5 100644
@@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL
        depends on BCACHEFS_FS
        select FS_POSIX_ACL
 
-config BCACHEFS_DEBUG_TRANSACTIONS
-       bool "bcachefs runtime info"
-       depends on BCACHEFS_FS
-       help
-       This makes the list of running btree transactions available in debugfs.
-
-       This is a highly useful debugging feature but does add a small amount of overhead.
-
 config BCACHEFS_DEBUG
        bool "bcachefs debugging"
        depends on BCACHEFS_FS
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 818676b97436253b4966563adca3ddb0cbbacb69..c35a262f608145bdff255f860c39d2afa0500eed 100644
@@ -2714,6 +2714,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
 
 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 {
+       struct bch_fs *c = trans->c;
        unsigned new_top = trans->mem_top + size;
        unsigned old_bytes = trans->mem_bytes;
        unsigned new_bytes = roundup_pow_of_two(new_top);
@@ -2721,17 +2722,19 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        void *new_mem;
        void *p;
 
-       trans->mem_max = max(trans->mem_max, new_top);
-
        WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
 
+       struct btree_transaction_stats *s = btree_trans_stats(trans);
+       if (s)
+               s->max_mem = max(s->max_mem, new_bytes);
+
        new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
        if (unlikely(!new_mem)) {
                bch2_trans_unlock(trans);
 
                new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
                if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-                       new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+                       new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
                        new_bytes = BTREE_TRANS_MEM_MAX;
                        kfree(trans->mem);
                }
@@ -2751,7 +2754,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        trans->mem_bytes = new_bytes;
 
        if (old_bytes) {
-               trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+               trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
        }
 
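__bch2_trans_kmalloc() above is the grow path of a per-transaction bump
allocator: the common case just advances trans->mem_top, and growing the
buffer forces a transaction restart because krealloc() may move it out
from under pointers the transaction already handed out. A hedged sketch of
that shape (illustrative struct and names, not the real signatures;
assumes <linux/slab.h> and <linux/log2.h>):

  struct bump {
          void            *mem;
          unsigned        top, bytes;
  };

  static void *bump_alloc(struct bump *b, size_t size, bool *restart)
  {
          unsigned new_top = b->top + size;
          void *p;

          if (new_top > b->bytes) {
                  unsigned old_bytes = b->bytes;
                  unsigned new_bytes = roundup_pow_of_two(new_top);
                  void *new_mem = krealloc(b->mem, new_bytes, GFP_KERNEL);

                  if (!new_mem)
                          return NULL;
                  b->mem   = new_mem;
                  b->bytes = new_bytes;
                  /*
                   * The buffer may have moved; callers holding pointers
                   * into it must restart, as the real code signals via
                   * BCH_ERR_transaction_restart_mem_realloced:
                   */
                  *restart = old_bytes != 0;
          }

          p = b->mem + b->top;
          b->top = new_top;
          return p;
  }
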
@@ -2860,25 +2863,6 @@ u32 bch2_trans_begin(struct btree_trans *trans)
        return trans->restart_count;
 }
 
-static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
-{
-       struct btree_trans *trans;
-
-       if (IS_ENABLED(__KERNEL__)) {
-               trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
-               if (trans)
-                       return trans;
-       }
-
-       trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
-       /*
-        * paths need to be zeroed, bch2_check_for_deadlock looks at
-        * paths in other threads
-        */
-       memset(&trans->paths, 0, sizeof(trans->paths));
-       return trans;
-}
-
 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
 
 unsigned bch2_trans_get_fn_idx(const char *fn)
@@ -2900,22 +2884,55 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
        __acquires(&c->btree_trans_barrier)
 {
        struct btree_trans *trans;
-       struct btree_transaction_stats *s;
 
-       trans = bch2_trans_alloc(c);
+       if (IS_ENABLED(__KERNEL__)) {
+               trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+               if (trans) {
+                       memset(trans, 0, offsetof(struct btree_trans, updates));
+                       goto got_trans;
+               }
+       }
 
+       trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
        memset(trans, 0, sizeof(*trans));
+       closure_init_stack(&trans->ref);
+
+       seqmutex_lock(&c->btree_trans_lock);
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+               struct btree_trans *pos;
+               pid_t pid = current->pid;
+
+               trans->locking_wait.task = current;
+
+               list_for_each_entry(pos, &c->btree_trans_list, list) {
+                       struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
+                       /*
+                        * We'd much prefer to be stricter here and completely
+                        * disallow multiple btree_trans in the same thread -
+                        * but the data move path calls bch2_write when we
+                        * already have a btree_trans initialized.
+                        */
+                       BUG_ON(pos_task &&
+                              pid == pos_task->pid &&
+                              bch2_trans_locked(pos));
+
+                       if (pos_task && pid < pos_task->pid) {
+                               list_add_tail(&trans->list, &pos->list);
+                               goto list_add_done;
+                       }
+               }
+       }
+       list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
+       seqmutex_unlock(&c->btree_trans_lock);
+got_trans:
        trans->c                = c;
-       trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
-               ? bch2_btree_transaction_fns[fn_idx] : NULL;
        trans->last_begin_time  = local_clock();
        trans->fn_idx           = fn_idx;
        trans->locking_wait.task = current;
        trans->journal_replay_not_finished =
                unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
                atomic_inc_not_zero(&c->journal_keys.ref);
-       closure_init_stack(&trans->ref);
-
        trans->paths_allocated  = trans->_paths_allocated;
        trans->sorted           = trans->_sorted;
        trans->paths            = trans->_paths;
@@ -2924,21 +2941,19 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 
        trans->paths_allocated[0] = 1;
 
-       s = btree_trans_stats(trans);
-       if (s && s->max_mem) {
-               unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+       if (fn_idx < BCH_TRANSACTIONS_NR) {
+               trans->fn = bch2_btree_transaction_fns[fn_idx];
 
-               trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+               struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
 
-               if (!unlikely(trans->mem)) {
-                       trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
-                       trans->mem_bytes = BTREE_TRANS_MEM_MAX;
-               } else {
-                       trans->mem_bytes = expected_mem_bytes;
+               if (s->max_mem) {
+                       unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+                       trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+                       if (likely(trans->mem))
+                               trans->mem_bytes = expected_mem_bytes;
                }
-       }
 
-       if (s) {
                trans->nr_paths_max = s->nr_max_paths;
                trans->journal_entries_size = s->journal_entries_size;
        }
@@ -2946,31 +2961,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
        trans->srcu_idx         = srcu_read_lock(&c->btree_trans_barrier);
        trans->srcu_lock_time   = jiffies;
        trans->srcu_held        = true;
-
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
-               struct btree_trans *pos;
-
-               seqmutex_lock(&c->btree_trans_lock);
-               list_for_each_entry(pos, &c->btree_trans_list, list) {
-                       /*
-                        * We'd much prefer to be stricter here and completely
-                        * disallow multiple btree_trans in the same thread -
-                        * but the data move path calls bch2_write when we
-                        * already have a btree_trans initialized.
-                        */
-                       BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
-                              bch2_trans_locked(pos));
-
-                       if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
-                               list_add_tail(&trans->list, &pos->list);
-                               goto list_add_done;
-                       }
-               }
-               list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
-               seqmutex_unlock(&c->btree_trans_lock);
-       }
-
        return trans;
 }
 
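The insertion loop above keeps c->btree_trans_list sorted by pid, so the
debugfs readers further down can drop the lock mid-walk and resume at the
first pid greater than the last one printed; note that in this patch the
sorted insert (and the duplicate check) only runs under
CONFIG_BCACHEFS_DEBUG, non-debug builds just append. Reduced to its
essentials (illustrative helper name, ignoring the NULL-task freelist
entries):

  static void trans_list_add_sorted(struct bch_fs *c, struct btree_trans *new)
  {
          struct btree_trans *pos;
          pid_t pid = new->locking_wait.task->pid;

          list_for_each_entry(pos, &c->btree_trans_list, list)
                  if (pid < pos->locking_wait.task->pid) {
                          /* list_add_tail() on a member inserts before it: */
                          list_add_tail(&new->list, &pos->list);
                          return;
                  }

          list_add_tail(&new->list, &c->btree_trans_list);
  }
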
@@ -3001,24 +2991,13 @@ void bch2_trans_put(struct btree_trans *trans)
        __releases(&c->btree_trans_barrier)
 {
        struct bch_fs *c = trans->c;
-       struct btree_transaction_stats *s = btree_trans_stats(trans);
 
        bch2_trans_unlock(trans);
 
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
-               seqmutex_lock(&c->btree_trans_lock);
-               list_del(&trans->list);
-               seqmutex_unlock(&c->btree_trans_lock);
-       }
-
-       closure_sync(&trans->ref);
-
-       if (s)
-               s->max_mem = max(s->max_mem, trans->mem_max);
-
        trans_for_each_update(trans, i)
                __btree_path_put(trans->paths + i->path, true);
-       trans->nr_updates               = 0;
+       trans->nr_updates       = 0;
+       trans->locking_wait.task = NULL;
 
        check_btree_paths_leaked(trans);
 
@@ -3047,8 +3026,16 @@ void bch2_trans_put(struct btree_trans *trans)
        /* Userspace doesn't have a real percpu implementation: */
        if (IS_ENABLED(__KERNEL__))
                trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
-       if (trans)
+
+       if (trans) {
+               closure_sync(&trans->ref);
+
+               seqmutex_lock(&c->btree_trans_lock);
+               list_del(&trans->list);
+               seqmutex_unlock(&c->btree_trans_lock);
+
                mempool_free(trans, &c->btree_trans_pool);
+       }
 }
 
 static void __maybe_unused
@@ -3146,15 +3133,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
        struct btree_trans *trans;
        int cpu;
 
+       if (c->btree_trans_bufs)
+               for_each_possible_cpu(cpu) {
+                       struct btree_trans *trans =
+                               per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
+
+                       if (trans) {
+                               closure_sync(&trans->ref);
+
+                               seqmutex_lock(&c->btree_trans_lock);
+                               list_del(&trans->list);
+                               seqmutex_unlock(&c->btree_trans_lock);
+                       }
+                       kfree(trans);
+               }
+       free_percpu(c->btree_trans_bufs);
+
        trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
        if (trans)
                panic("%s leaked btree_trans\n", trans->fn);
 
-       if (c->btree_trans_bufs)
-               for_each_possible_cpu(cpu)
-                       kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
-       free_percpu(c->btree_trans_bufs);
-
        for (s = c->btree_transaction_stats;
             s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
             s++) {
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index d68e1211029da5af628dbbc3948626f182e96bb8..1ed8327a9fa2cc409cba3eb8c2ac1848999c7508 100644
@@ -95,9 +95,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
        struct trans_waiting_for_lock *i;
 
        for (i = g->g; i != g->g + g->nr; i++) {
+               struct task_struct *task = i->trans->locking_wait.task;
                if (i != g->g)
                        prt_str(out, "<- ");
-               prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+               prt_printf(out, "%u ", task ? task->pid : 0);
        }
        prt_newline(out);
 }
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 28bac4ce20e1616b1ad196580afc7cb8e9a1853c..3baf688177c48c25f8058d1c1da8150627e48d6f 100644
@@ -386,7 +386,6 @@ struct btree_trans {
 
        void                    *mem;
        unsigned                mem_top;
-       unsigned                mem_max;
        unsigned                mem_bytes;
 
        btree_path_idx_t        nr_sorted;
@@ -413,8 +412,6 @@ struct btree_trans {
        unsigned long           srcu_lock_time;
 
        const char              *fn;
-       struct closure          ref;
-       struct list_head        list;
        struct btree_bkey_cached_common *locking;
        struct six_lock_waiter  locking_wait;
        int                     srcu_idx;
@@ -424,7 +421,6 @@ struct btree_trans {
        u16                     journal_entries_size;
        struct jset_entry       *journal_entries;
 
-       struct btree_insert_entry updates[BTREE_ITER_MAX];
        struct btree_trans_commit_hook *hooks;
        struct journal_entry_pin *journal_pin;
 
@@ -435,6 +431,13 @@ struct btree_trans {
        unsigned                extra_disk_res; /* XXX kill */
        struct replicas_delta_list *fs_usage_deltas;
 
+       /* Entries before this are zeroed out on every bch2_trans_get() call */
+
+       struct btree_insert_entry updates[BTREE_ITER_MAX];
+
+       struct list_head        list;
+       struct closure          ref;
+
        unsigned long           _paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)];
        struct btree_trans_paths trans_paths;
        struct btree_path       _paths[BTREE_ITER_MAX];
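
The field reordering above is what the cheap reinit in __bch2_trans_get()
relies on: everything that must be reset per transaction sits before
updates, while list, ref and the path arrays keep their values across
trips through the percpu freelist. The two initialization cases, lifted
from the btree_iter.c hunk:

  /*
   * Case 1, reused from the percpu freelist: still on btree_trans_list,
   * closure ref still initialized, path arrays intact; clear only the
   * per-transaction state:
   */
  memset(trans, 0, offsetof(struct btree_trans, updates));

  /*
   * Case 2, freshly allocated from the mempool: zero everything, then
   * do the one-time setup:
   */
  memset(trans, 0, sizeof(*trans));
  closure_init_stack(&trans->ref);
  /* ...then add to c->btree_trans_list under c->btree_trans_lock */
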
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index bbf6fce3a78907379c5c992b5bffb406a507114f..c0b4d9057f29eacea1bd6574929f8d4ae4427e41 100644
@@ -592,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = {
        .read           = bch2_cached_btree_nodes_read,
 };
 
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
 static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
                                            size_t size, loff_t *ppos)
 {
@@ -608,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 restart:
        seqmutex_lock(&c->btree_trans_lock);
        list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if (trans->locking_wait.task->pid <= i->iter)
+               struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+               if (!task || task->pid <= i->iter)
                        continue;
 
                closure_get(&trans->ref);
@@ -626,11 +627,11 @@ restart:
                prt_printf(&i->buf, "backtrace:");
                prt_newline(&i->buf);
                printbuf_indent_add(&i->buf, 2);
-               bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+               bch2_prt_task_backtrace(&i->buf, task);
                printbuf_indent_sub(&i->buf, 2);
                prt_newline(&i->buf);
 
-               i->iter = trans->locking_wait.task->pid;
+               i->iter = task->pid;
 
                closure_put(&trans->ref);
 
@@ -654,7 +655,6 @@ static const struct file_operations btree_transactions_ops = {
        .release        = bch2_dump_release,
        .read           = bch2_btree_transactions_read,
 };
-#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
 
 static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
                                      size_t size, loff_t *ppos)
@@ -811,7 +811,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 restart:
        seqmutex_lock(&c->btree_trans_lock);
        list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if (trans->locking_wait.task->pid <= i->iter)
+               struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+               if (!task || task->pid <= i->iter)
                        continue;
 
                closure_get(&trans->ref);
@@ -826,7 +828,7 @@ restart:
 
                bch2_check_for_deadlock(trans, &i->buf);
 
-               i->iter = trans->locking_wait.task->pid;
+               i->iter = task->pid;
 
                closure_put(&trans->ref);
 
@@ -873,10 +875,8 @@ void bch2_fs_debug_init(struct bch_fs *c)
        debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
                            c->btree_debug, &cached_btree_nodes_ops);
 
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
        debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
                            c->btree_debug, &btree_transactions_ops);
-#endif
 
        debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
                            c->btree_debug, &journal_pins_ops);
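
Since freelist transactions now stay on btree_trans_list with
locking_wait.task set to NULL, both debugfs readers above skip NULL tasks
and pin a transaction with its closure ref before dropping
btree_trans_lock to emit output. A sketch of that walk, assuming the
seqmutex_seq()/seqmutex_relock() helpers from fs/bcachefs/seqmutex.h and a
hypothetical print_one_trans():

  pid_t last_pid = 0;
  restart:
          seqmutex_lock(&c->btree_trans_lock);
          list_for_each_entry(trans, &c->btree_trans_list, list) {
                  struct task_struct *task = READ_ONCE(trans->locking_wait.task);

                  /* NULL task: a freelist entry, nothing to report: */
                  if (!task || task->pid <= last_pid)
                          continue;

                  closure_get(&trans->ref); /* trans can't be freed now */
                  u32 seq = seqmutex_seq(&c->btree_trans_lock);
                  seqmutex_unlock(&c->btree_trans_lock);

                  print_one_trans(trans, task); /* hypothetical */
                  last_pid = task->pid;

                  closure_put(&trans->ref);
                  if (!seqmutex_relock(&c->btree_trans_lock, seq))
                          goto restart;
          }
          seqmutex_unlock(&c->btree_trans_lock);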