[linux-block.git] / fs / btrfs / defrag.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Pop a record for an inode into the defrag tree.  The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set IN_DEFRAG flag and evict the inode from memory,
		 * and then re-read this inode, this new inode doesn't have
		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defragable inode that we want, if it doesn't exist, we will get the
 * next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				       BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to
 * better reflect disk order
 */

int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
	 * a deadlock (attempting to write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}
Commit	Line	Data
c1d7c514	1	// SPDX-License-Identifier: GPL-2.0
6702ed49 CM	2	/*
6702ed49 CM	3	* Copyright (C) 2007 Oracle. All rights reserved.
6702ed49 CM	4	*/
	5
	6	#include <linux/sched.h>
	7	#include "ctree.h"
	8	#include "disk-io.h"
	9	#include "print-tree.h"
	10	#include "transaction.h"
e7a84565	11	#include "locking.h"
07e81dc9	12	#include "accessors.h"
6702ed49	13
6e3df18b JB	14	static struct kmem_cache *btrfs_inode_defrag_cachep;
	15
	16	/*
	17	* When auto defrag is enabled we queue up these defrag structs to remember
	18	* which inodes need defragging passes.
	19	*/
	20	struct inode_defrag {
	21	struct rb_node rb_node;
	22	/* Inode number */
	23	u64 ino;
	24	/*
	25	* Transid where the defrag was added, we search for extents newer than
	26	* this.
	27	*/
	28	u64 transid;
	29
	30	/* Root objectid */
	31	u64 root;
	32
	33	/*
	34	* The extent size threshold for autodefrag.
	35	*
	36	* This value is different for compressed/non-compressed extents, thus
	37	* needs to be passed from higher layer.
	38	* (aka, inode_should_defrag())
	39	*/
	40	u32 extent_thresh;
	41	};
	42
	43	static int __compare_inode_defrag(struct inode_defrag *defrag1,
	44	struct inode_defrag *defrag2)
	45	{
	46	if (defrag1->root > defrag2->root)
	47	return 1;
	48	else if (defrag1->root < defrag2->root)
	49	return -1;
	50	else if (defrag1->ino > defrag2->ino)
	51	return 1;
	52	else if (defrag1->ino < defrag2->ino)
	53	return -1;
	54	else
	55	return 0;
	56	}
	57
	58	/*
	59	* Pop a record for an inode into the defrag tree. The lock must be held
	60	* already.
	61	*
	62	* If you're inserting a record for an older transid than an existing record,
	63	* the transid already in the tree is lowered.
	64	*
	65	* If an existing record is found the defrag item you pass in is freed.
	66	*/
	67	static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
	68	struct inode_defrag *defrag)
	69	{
	70	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	71	struct inode_defrag *entry;
	72	struct rb_node **p;
	73	struct rb_node *parent = NULL;
	74	int ret;
	75
	76	p = &fs_info->defrag_inodes.rb_node;
	77	while (*p) {
78	parent = *p;
79	entry = rb_entry(parent, struct inode_defrag, rb_node);
80
81	ret = __compare_inode_defrag(defrag, entry);
82	if (ret < 0)
83	p = &parent->rb_left;
84	else if (ret > 0)
85	p = &parent->rb_right;
86	else {
87	/*
88	* If we're reinserting an entry for an old defrag run,
89	* make sure to lower the transid of our existing
90	* record.
91	*/
92	if (defrag->transid < entry->transid)
93	entry->transid = defrag->transid;
94	entry->extent_thresh = min(defrag->extent_thresh,
95	entry->extent_thresh);
96	return -EEXIST;
97	}
98	}
99	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
100	rb_link_node(&defrag->rb_node, parent, p);
101	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
102	return 0;
103	}
104
105	static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
106	{
107	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
108	return 0;
109
110	if (btrfs_fs_closing(fs_info))
111	return 0;
112
113	return 1;
114	}
115
116	/*
117	* Insert a defrag record for this inode if auto defrag is enabled.
118	*/
119	int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
120	struct btrfs_inode *inode, u32 extent_thresh)
121	{
122	struct btrfs_root *root = inode->root;
123	struct btrfs_fs_info *fs_info = root->fs_info;
124	struct inode_defrag *defrag;
125	u64 transid;
126	int ret;
127
128	if (!__need_auto_defrag(fs_info))
129	return 0;
130
131	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
132	return 0;
133
134	if (trans)
135	transid = trans->transid;
136	else
137	transid = inode->root->last_trans;
138
139	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
140	if (!defrag)
141	return -ENOMEM;
142
143	defrag->ino = btrfs_ino(inode);
144	defrag->transid = transid;
145	defrag->root = root->root_key.objectid;
146	defrag->extent_thresh = extent_thresh;
147
148	spin_lock(&fs_info->defrag_inodes_lock);
149	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
150	/*
151	* If we set IN_DEFRAG flag and evict the inode from memory,
152	* and then re-read this inode, this new inode doesn't have
153	* IN_DEFRAG flag. At the case, we may find the existed defrag.
154	*/
155	ret = __btrfs_add_inode_defrag(inode, defrag);
156	if (ret)
157	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
158	} else {
159	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
160	}
161	spin_unlock(&fs_info->defrag_inodes_lock);
162	return 0;
163	}
164
165	/*
166	* Pick the defragable inode that we want, if it doesn't exist, we will get the
167	* next one.
168	*/
169	static struct inode_defrag *btrfs_pick_defrag_inode(
170	struct btrfs_fs_info *fs_info, u64 root, u64 ino)
171	{
172	struct inode_defrag *entry = NULL;
173	struct inode_defrag tmp;
174	struct rb_node *p;
175	struct rb_node *parent = NULL;
176	int ret;
177
178	tmp.ino = ino;
179	tmp.root = root;
180
181	spin_lock(&fs_info->defrag_inodes_lock);
182	p = fs_info->defrag_inodes.rb_node;
183	while (p) {
184	parent = p;
185	entry = rb_entry(parent, struct inode_defrag, rb_node);
186
187	ret = __compare_inode_defrag(&tmp, entry);
188	if (ret < 0)
189	p = parent->rb_left;
190	else if (ret > 0)
191	p = parent->rb_right;
192	else
193	goto out;
194	}
195
196	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
197	parent = rb_next(parent);
198	if (parent)
199	entry = rb_entry(parent, struct inode_defrag, rb_node);
200	else
201	entry = NULL;
202	}
203	out:
204	if (entry)
205	rb_erase(parent, &fs_info->defrag_inodes);
206	spin_unlock(&fs_info->defrag_inodes_lock);
207	return entry;
208	}
209
210	void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
211	{
212	struct inode_defrag *defrag;
213	struct rb_node *node;
214
215	spin_lock(&fs_info->defrag_inodes_lock);
216	node = rb_first(&fs_info->defrag_inodes);
217	while (node) {
218	rb_erase(node, &fs_info->defrag_inodes);
219	defrag = rb_entry(node, struct inode_defrag, rb_node);
220	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
221
222	cond_resched_lock(&fs_info->defrag_inodes_lock);
223
224	node = rb_first(&fs_info->defrag_inodes);
225	}
226	spin_unlock(&fs_info->defrag_inodes_lock);
227	}
228
229	#define BTRFS_DEFRAG_BATCH 1024
230
231	static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
232	struct inode_defrag *defrag)
233	{
234	struct btrfs_root *inode_root;
235	struct inode *inode;
236	struct btrfs_ioctl_defrag_range_args range;
237	int ret = 0;
238	u64 cur = 0;
239
240	again:
241	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
242	goto cleanup;
243	if (!__need_auto_defrag(fs_info))
244	goto cleanup;
245
246	/* Get the inode */
247	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
248	if (IS_ERR(inode_root)) {
249	ret = PTR_ERR(inode_root);
250	goto cleanup;
251	}
252
253	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
254	btrfs_put_root(inode_root);
255	if (IS_ERR(inode)) {
256	ret = PTR_ERR(inode);
257	goto cleanup;
258	}
259
260	if (cur >= i_size_read(inode)) {
261	iput(inode);
262	goto cleanup;
263	}
264
265	/* Do a chunk of defrag */
266	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
267	memset(&range, 0, sizeof(range));
268	range.len = (u64)-1;
269	range.start = cur;
270	range.extent_thresh = defrag->extent_thresh;
271
272	sb_start_write(fs_info->sb);
273	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
274	BTRFS_DEFRAG_BATCH);
275	sb_end_write(fs_info->sb);
276	iput(inode);
277
278	if (ret < 0)
279	goto cleanup;
280
281	cur = max(cur + fs_info->sectorsize, range.start);
282	goto again;
283
284	cleanup:
285	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
286	return ret;
287	}
288
289	/*
290	* Run through the list of inodes in the FS that need defragging.
291	*/
292	int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
293	{
294	struct inode_defrag *defrag;
295	u64 first_ino = 0;
296	u64 root_objectid = 0;
297
298	atomic_inc(&fs_info->defrag_running);
299	while (1) {
300	/* Pause the auto defragger. */
301	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
302	break;
303
304	if (!__need_auto_defrag(fs_info))
305	break;
306
307	/* find an inode to defrag */
308	defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
309	if (!defrag) {
310	if (root_objectid \|\| first_ino) {
311	root_objectid = 0;
312	first_ino = 0;
313	continue;
314	} else {
315	break;
316	}
317	}
318
319	first_ino = defrag->ino + 1;
320	root_objectid = defrag->root;
321
322	__btrfs_run_defrag_inode(fs_info, defrag);
323	}
324	atomic_dec(&fs_info->defrag_running);
325
326	/*
327	* During unmount, we use the transaction_wait queue to wait for the
328	* defragger to stop.
329	*/
330	wake_up(&fs_info->transaction_wait);
331	return 0;
332	}
333
de78b51a ES	334	/*
	335	* Defrag all the leaves in a given btree.
	336	* Read all the leaves and try to get key order to
d352ac68 CM	337	* better reflect disk order
d352ac68 CM	338	*/
d397712b	339
6702ed49	340	int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
de78b51a	341	struct btrfs_root *root)
6702ed49 CM	342	{
6702ed49 CM	343	struct btrfs_path *path = NULL;
e7a84565	344	struct btrfs_key key;
6702ed49 CM	345	int ret = 0;
	346	int wret;
	347	int level;
e7a84565	348	int next_key_ret = 0;
e9d0b13b	349	u64 last_ret = 0;
3f157a2f	350
92a7cc42	351	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
6702ed49	352	goto out;
5f39d397	353
6702ed49 CM	354	path = btrfs_alloc_path();
	355	if (!path)
	356	return -ENOMEM;
	357
5f39d397	358	level = btrfs_header_level(root->node);
0f1ebbd1	359
d397712b	360	if (level == 0)
6702ed49	361	goto out;
d397712b	362
6702ed49	363	if (root->defrag_progress.objectid == 0) {
e7a84565	364	struct extent_buffer *root_node;
0ef3e66b CM	365	u32 nritems;
0ef3e66b CM	366
e7a84565 CM	367	root_node = btrfs_lock_root_node(root);
e7a84565 CM	368	nritems = btrfs_header_nritems(root_node);
0ef3e66b CM	369	root->defrag_max.objectid = 0;
0ef3e66b CM	370	/* from above we know this is not a leaf */
e7a84565	371	btrfs_node_key_to_cpu(root_node, &root->defrag_max,
0ef3e66b	372	nritems - 1);
e7a84565 CM	373	btrfs_tree_unlock(root_node);
	374	free_extent_buffer(root_node);
	375	memset(&key, 0, sizeof(key));
6702ed49	376	} else {
e7a84565	377	memcpy(&key, &root->defrag_progress, sizeof(key));
6702ed49 CM	378	}
6702ed49 CM	379
e7a84565	380	path->keep_locks = 1;
3f157a2f	381
7c829b72	382	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
3f157a2f CM	383	if (ret < 0)
	384	goto out;
	385	if (ret > 0) {
	386	ret = 0;
	387	goto out;
	388	}
b3b4aa74	389	btrfs_release_path(path);
0376374a FM	390	/*
	391	* We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	392	* leafs from path->nodes[1], so set lowest_level to 1 to avoid later
	393	* a deadlock (attempting to write lock an already write locked leaf).
	394	*/
	395	path->lowest_level = 1;
e7a84565	396	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6702ed49	397
e7a84565 CM	398	if (wret < 0) {
	399	ret = wret;
	400	goto out;
	401	}
	402	if (!path->nodes[1]) {
	403	ret = 0;
	404	goto out;
	405	}
0376374a FM	406	/*
	407	* The node at level 1 must always be locked when our path has
	408	* keep_locks set and lowest_level is 1, regardless of the value of
	409	* path->slots[1].
	410	*/
	411	BUG_ON(path->locks[1] == 0);
e7a84565 CM	412	ret = btrfs_realloc_node(trans, root,
e7a84565 CM	413	path->nodes[1], 0,
de78b51a	414	&last_ret,
e7a84565	415	&root->defrag_progress);
8929ecfa YZ	416	if (ret) {
	417	WARN_ON(ret == -EAGAIN);
	418	goto out;
	419	}
0376374a FM	420	/*
	421	* Now that we reallocated the node we can find the next key. Note that
	422	* btrfs_find_next_key() can release our path and do another search
	423	* without COWing, this is because even with path->keep_locks = 1,
	424	* btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
	425	* node when path->slots[node_level - 1] does not point to the last
	426	* item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	427	* we search for the next key after reallocating our node.
	428	*/
	429	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	430	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
7c829b72	431	BTRFS_OLDEST_GENERATION);
e7a84565 CM	432	if (next_key_ret == 0) {
	433	memcpy(&root->defrag_progress, &key, sizeof(key));
	434	ret = -EAGAIN;
6702ed49	435	}
6702ed49	436	out:
527afb44	437	btrfs_free_path(path);
0ef3e66b CM	438	if (ret == -EAGAIN) {
	439	if (root->defrag_max.objectid > root->defrag_progress.objectid)
	440	goto done;
	441	if (root->defrag_max.type > root->defrag_progress.type)
	442	goto done;
	443	if (root->defrag_max.offset > root->defrag_progress.offset)
	444	goto done;
	445	ret = 0;
	446	}
	447	done:
a2570ef3	448	if (ret != -EAGAIN)
6702ed49 CM	449	memset(&root->defrag_progress, 0,
6702ed49 CM	450	sizeof(root->defrag_progress));
a2570ef3	451
6702ed49 CM	452	return ret;
6702ed49 CM	453	}
6e3df18b JB	454
	455	void __cold btrfs_auto_defrag_exit(void)
	456	{
	457	kmem_cache_destroy(btrfs_inode_defrag_cachep);
	458	}
	459
	460	int __init btrfs_auto_defrag_init(void)
	461	{
	462	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
	463	sizeof(struct inode_defrag), 0,
	464	SLAB_MEM_SPREAD,
	465	NULL);
	466	if (!btrfs_inode_defrag_cachep)
	467	return -ENOMEM;
	468
	469	return 0;
	470	}