fs/btrfs/free-space-cache.c (linux-block.git, as of "btrfs: only update i_size in truncate paths that care")
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);

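/*
 * Look up the free space cache inode for a block group: read the
 * FREE_SPACE item at @offset in the given root, then iget the inode it
 * points at. The iget is done under memalloc_nofs_save() because the
 * caller is often already inside a transaction handle.
 */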
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
				      struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!block_group->iref) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}

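/*
 * Create the on-disk items that back a free space cache inode: the inode
 * item itself (NOCOMPRESS/PREALLOC/NODATASUM/NODATACOW) and the free space
 * header item keyed by BTRFS_FREE_SPACE_OBJECTID at @offset.
 */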
static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			      sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;
}

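/* Allocate a new objectid in the tree root and create the cache inode for this block group. */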
int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(inode);
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (block_group->iref) {
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(inode);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

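/*
 * Check that the given block reserve holds enough space to truncate the
 * free space cache inode: one metadata insertion plus one inode update.
 */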
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *rsv)
{
	u64 needed_bytes;
	int ret;

	/* 1 for slack space, 1 for updating the inode */
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		       btrfs_calc_metadata_size(fs_info, 1);

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
	return ret;
}

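/*
 * Truncate the free space cache inode down to zero so the cache can be
 * rewritten from scratch. When called for a block group this also waits
 * for any in-flight cache IO and marks the group BTRFS_DC_CLEAR.
 */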
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *vfs_inode)
{
	struct btrfs_truncate_control control = {
		.new_size = 0,
		.min_type = BTRFS_EXTENT_DATA_KEY,
	};
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(inode, 0);
	truncate_pagecache(vfs_inode, 0);

	lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, inode, &control);

	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, root, inode);

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

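/* Kick off readahead over the whole cache file before we parse it. */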
static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

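/*
 * Set up the io_ctl used to read or write the cache file: one page pointer
 * per PAGE_SIZE chunk of the inode. For writes, the per-page crc array and
 * the generation must fit in the first page.
 */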
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

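/*
 * The helpers below manage the array of cache file pages: free the page
 * array, map/unmap the page currently being filled or parsed, and drop
 * (unlock + put) all pages once we are done with them.
 */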
static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			btrfs_page_clear_checked(io_ctl->fs_info,
						 io_ctl->pages[i],
						 page_offset(io_ctl->pages[i]),
						 PAGE_SIZE);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

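/*
 * Find or create every page of the cache file. For reads (@uptodate set),
 * also read each page in and verify it still belongs to the cache inode
 * and came back uptodate.
 */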
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++)
		clear_page_dirty_for_io(io_ctl->pages[i]);

	return 0;
}

4c6d1d85 487static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
a67509c3 488{
a67509c3
JB
489 io_ctl_map_page(io_ctl, 1);
490
491 /*
5b0e95bf
JB
492 * Skip the csum areas. If we don't check crcs then we just have a
493 * 64bit chunk at the front of the first page.
a67509c3 494 */
7dbdb443
NB
495 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
496 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
a67509c3 497
6994ca36 498 put_unaligned_le64(generation, io_ctl->cur);
a67509c3 499 io_ctl->cur += sizeof(u64);
a67509c3
JB
500}
501
4c6d1d85 502static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
a67509c3 503{
6994ca36 504 u64 cache_gen;
a67509c3 505
5b0e95bf
JB
506 /*
507 * Skip the crc area. If we don't check crcs then we just have a 64bit
508 * chunk at the front of the first page.
509 */
7dbdb443
NB
510 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
511 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
a67509c3 512
6994ca36
DS
513 cache_gen = get_unaligned_le64(io_ctl->cur);
514 if (cache_gen != generation) {
f15376df 515 btrfs_err_rl(io_ctl->fs_info,
94647322 516 "space cache generation (%llu) does not match inode (%llu)",
6994ca36 517 cache_gen, generation);
a67509c3
JB
518 io_ctl_unmap_page(io_ctl);
519 return -EIO;
520 }
521 io_ctl->cur += sizeof(u64);
5b0e95bf
JB
522 return 0;
523}
524
4c6d1d85 525static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
5b0e95bf
JB
526{
527 u32 *tmp;
528 u32 crc = ~(u32)0;
529 unsigned offset = 0;
530
5b0e95bf 531 if (index == 0)
cb54f257 532 offset = sizeof(u32) * io_ctl->num_pages;
5b0e95bf 533
4bb3c2e2
JT
534 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
535 btrfs_crc32c_final(crc, (u8 *)&crc);
5b0e95bf 536 io_ctl_unmap_page(io_ctl);
2b108268 537 tmp = page_address(io_ctl->pages[0]);
5b0e95bf
JB
538 tmp += index;
539 *tmp = crc;
5b0e95bf
JB
540}
541
4c6d1d85 542static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
5b0e95bf
JB
543{
544 u32 *tmp, val;
545 u32 crc = ~(u32)0;
546 unsigned offset = 0;
547
5b0e95bf
JB
548 if (index == 0)
549 offset = sizeof(u32) * io_ctl->num_pages;
550
2b108268 551 tmp = page_address(io_ctl->pages[0]);
5b0e95bf
JB
552 tmp += index;
553 val = *tmp;
5b0e95bf
JB
554
555 io_ctl_map_page(io_ctl, 0);
4bb3c2e2
JT
556 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
557 btrfs_crc32c_final(crc, (u8 *)&crc);
5b0e95bf 558 if (val != crc) {
f15376df 559 btrfs_err_rl(io_ctl->fs_info,
94647322 560 "csum mismatch on free space cache");
5b0e95bf
JB
561 io_ctl_unmap_page(io_ctl);
562 return -EIO;
563 }
564
a67509c3
JB
565 return 0;
566}
567
4c6d1d85 568static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
a67509c3
JB
569 void *bitmap)
570{
571 struct btrfs_free_space_entry *entry;
572
573 if (!io_ctl->cur)
574 return -ENOSPC;
575
576 entry = io_ctl->cur;
6994ca36
DS
577 put_unaligned_le64(offset, &entry->offset);
578 put_unaligned_le64(bytes, &entry->bytes);
a67509c3
JB
579 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
580 BTRFS_FREE_SPACE_EXTENT;
581 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
582 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
583
584 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
585 return 0;
586
5b0e95bf 587 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
588
589 /* No more pages to map */
590 if (io_ctl->index >= io_ctl->num_pages)
591 return 0;
592
593 /* map the next page */
594 io_ctl_map_page(io_ctl, 1);
595 return 0;
596}
597
4c6d1d85 598static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
a67509c3
JB
599{
600 if (!io_ctl->cur)
601 return -ENOSPC;
602
603 /*
604 * If we aren't at the start of the current page, unmap this one and
605 * map the next one if there is any left.
606 */
607 if (io_ctl->cur != io_ctl->orig) {
5b0e95bf 608 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
609 if (io_ctl->index >= io_ctl->num_pages)
610 return -ENOSPC;
611 io_ctl_map_page(io_ctl, 0);
612 }
613
69d24804 614 copy_page(io_ctl->cur, bitmap);
5b0e95bf 615 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
616 if (io_ctl->index < io_ctl->num_pages)
617 io_ctl_map_page(io_ctl, 0);
618 return 0;
619}
620
4c6d1d85 621static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
a67509c3 622{
5b0e95bf
JB
623 /*
624 * If we're not on the boundary we know we've modified the page and we
625 * need to crc the page.
626 */
627 if (io_ctl->cur != io_ctl->orig)
628 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
629 else
630 io_ctl_unmap_page(io_ctl);
a67509c3
JB
631
632 while (io_ctl->index < io_ctl->num_pages) {
633 io_ctl_map_page(io_ctl, 1);
5b0e95bf 634 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
635 }
636}
637
4c6d1d85 638static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
5b0e95bf 639 struct btrfs_free_space *entry, u8 *type)
a67509c3
JB
640{
641 struct btrfs_free_space_entry *e;
2f120c05
JB
642 int ret;
643
644 if (!io_ctl->cur) {
645 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
646 if (ret)
647 return ret;
648 }
a67509c3
JB
649
650 e = io_ctl->cur;
6994ca36
DS
651 entry->offset = get_unaligned_le64(&e->offset);
652 entry->bytes = get_unaligned_le64(&e->bytes);
5b0e95bf 653 *type = e->type;
a67509c3
JB
654 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
655 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
656
657 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
5b0e95bf 658 return 0;
a67509c3
JB
659
660 io_ctl_unmap_page(io_ctl);
661
2f120c05 662 return 0;
a67509c3
JB
663}
664
4c6d1d85 665static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
5b0e95bf 666 struct btrfs_free_space *entry)
a67509c3 667{
5b0e95bf
JB
668 int ret;
669
5b0e95bf
JB
670 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
671 if (ret)
672 return ret;
673
69d24804 674 copy_page(entry->bitmap, io_ctl->cur);
a67509c3 675 io_ctl_unmap_page(io_ctl);
5b0e95bf
JB
676
677 return 0;
a67509c3
JB
678}
679
fa598b06
DS
680static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
681{
364be842 682 struct btrfs_block_group *block_group = ctl->block_group;
fa598b06
DS
683 u64 max_bytes;
684 u64 bitmap_bytes;
685 u64 extent_bytes;
686 u64 size = block_group->length;
687 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
688 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
689
690 max_bitmaps = max_t(u64, max_bitmaps, 1);
691
692 ASSERT(ctl->total_bitmaps <= max_bitmaps);
693
694 /*
695 * We are trying to keep the total amount of memory used per 1GiB of
696 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
697 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
698 * bitmaps, we may end up using more memory than this.
699 */
700 if (size < SZ_1G)
701 max_bytes = MAX_CACHE_BYTES_PER_GIG;
702 else
703 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
704
705 bitmap_bytes = ctl->total_bitmaps * ctl->unit;
706
707 /*
708 * we want the extent entry threshold to always be at most 1/2 the max
709 * bytes we can have, or whatever is less than that.
710 */
711 extent_bytes = max_bytes - bitmap_bytes;
712 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
713
714 ctl->extents_thresh =
715 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
716}
717
48a3b636
ES
718static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
719 struct btrfs_free_space_ctl *ctl,
720 struct btrfs_path *path, u64 offset)
9d66e233 721{
3ffbd68c 722 struct btrfs_fs_info *fs_info = root->fs_info;
9d66e233
JB
723 struct btrfs_free_space_header *header;
724 struct extent_buffer *leaf;
4c6d1d85 725 struct btrfs_io_ctl io_ctl;
9d66e233 726 struct btrfs_key key;
a67509c3 727 struct btrfs_free_space *e, *n;
b76808fc 728 LIST_HEAD(bitmaps);
9d66e233
JB
729 u64 num_entries;
730 u64 num_bitmaps;
731 u64 generation;
a67509c3 732 u8 type;
f6a39829 733 int ret = 0;
9d66e233 734
9d66e233 735 /* Nothing in the space cache, goodbye */
0414efae 736 if (!i_size_read(inode))
a67509c3 737 return 0;
9d66e233
JB
738
739 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0414efae 740 key.offset = offset;
9d66e233
JB
741 key.type = 0;
742
743 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
0414efae 744 if (ret < 0)
a67509c3 745 return 0;
0414efae 746 else if (ret > 0) {
945d8962 747 btrfs_release_path(path);
a67509c3 748 return 0;
9d66e233
JB
749 }
750
0414efae
LZ
751 ret = -1;
752
9d66e233
JB
753 leaf = path->nodes[0];
754 header = btrfs_item_ptr(leaf, path->slots[0],
755 struct btrfs_free_space_header);
756 num_entries = btrfs_free_space_entries(leaf, header);
757 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
758 generation = btrfs_free_space_generation(leaf, header);
945d8962 759 btrfs_release_path(path);
9d66e233 760
e570fd27 761 if (!BTRFS_I(inode)->generation) {
0b246afa 762 btrfs_info(fs_info,
913e1535 763 "the free space cache file (%llu) is invalid, skip it",
e570fd27
MX
764 offset);
765 return 0;
766 }
767
9d66e233 768 if (BTRFS_I(inode)->generation != generation) {
0b246afa
JM
769 btrfs_err(fs_info,
770 "free space inode generation (%llu) did not match free space cache generation (%llu)",
771 BTRFS_I(inode)->generation, generation);
a67509c3 772 return 0;
9d66e233
JB
773 }
774
775 if (!num_entries)
a67509c3 776 return 0;
9d66e233 777
f15376df 778 ret = io_ctl_init(&io_ctl, inode, 0);
706efc66
LZ
779 if (ret)
780 return ret;
781
1d480538 782 readahead_cache(inode);
9d66e233 783
7a195f6d 784 ret = io_ctl_prepare_pages(&io_ctl, true);
a67509c3
JB
785 if (ret)
786 goto out;
9d66e233 787
5b0e95bf
JB
788 ret = io_ctl_check_crc(&io_ctl, 0);
789 if (ret)
790 goto free_cache;
791
a67509c3
JB
792 ret = io_ctl_check_generation(&io_ctl, generation);
793 if (ret)
794 goto free_cache;
9d66e233 795
a67509c3
JB
796 while (num_entries) {
797 e = kmem_cache_zalloc(btrfs_free_space_cachep,
798 GFP_NOFS);
3cc64e7e
ZC
799 if (!e) {
800 ret = -ENOMEM;
9d66e233 801 goto free_cache;
3cc64e7e 802 }
9d66e233 803
5b0e95bf
JB
804 ret = io_ctl_read_entry(&io_ctl, e, &type);
805 if (ret) {
806 kmem_cache_free(btrfs_free_space_cachep, e);
807 goto free_cache;
808 }
809
a67509c3 810 if (!e->bytes) {
3cc64e7e 811 ret = -1;
a67509c3
JB
812 kmem_cache_free(btrfs_free_space_cachep, e);
813 goto free_cache;
9d66e233 814 }
a67509c3
JB
815
816 if (type == BTRFS_FREE_SPACE_EXTENT) {
817 spin_lock(&ctl->tree_lock);
818 ret = link_free_space(ctl, e);
819 spin_unlock(&ctl->tree_lock);
820 if (ret) {
0b246afa 821 btrfs_err(fs_info,
c2cf52eb 822 "Duplicate entries in free space cache, dumping");
a67509c3 823 kmem_cache_free(btrfs_free_space_cachep, e);
9d66e233
JB
824 goto free_cache;
825 }
a67509c3 826 } else {
b12d6869 827 ASSERT(num_bitmaps);
a67509c3 828 num_bitmaps--;
3acd4850
CL
829 e->bitmap = kmem_cache_zalloc(
830 btrfs_free_space_bitmap_cachep, GFP_NOFS);
a67509c3 831 if (!e->bitmap) {
3cc64e7e 832 ret = -ENOMEM;
a67509c3
JB
833 kmem_cache_free(
834 btrfs_free_space_cachep, e);
9d66e233
JB
835 goto free_cache;
836 }
a67509c3
JB
837 spin_lock(&ctl->tree_lock);
838 ret = link_free_space(ctl, e);
839 ctl->total_bitmaps++;
fa598b06 840 recalculate_thresholds(ctl);
a67509c3
JB
841 spin_unlock(&ctl->tree_lock);
842 if (ret) {
0b246afa 843 btrfs_err(fs_info,
c2cf52eb 844 "Duplicate entries in free space cache, dumping");
dc89e982 845 kmem_cache_free(btrfs_free_space_cachep, e);
9d66e233
JB
846 goto free_cache;
847 }
a67509c3 848 list_add_tail(&e->list, &bitmaps);
9d66e233
JB
849 }
850
a67509c3
JB
851 num_entries--;
852 }
9d66e233 853
2f120c05
JB
854 io_ctl_unmap_page(&io_ctl);
855
a67509c3
JB
856 /*
857 * We add the bitmaps at the end of the entries in order that
858 * the bitmap entries are added to the cache.
859 */
860 list_for_each_entry_safe(e, n, &bitmaps, list) {
9d66e233 861 list_del_init(&e->list);
5b0e95bf
JB
862 ret = io_ctl_read_bitmap(&io_ctl, e);
863 if (ret)
864 goto free_cache;
9d66e233
JB
865 }
866
a67509c3 867 io_ctl_drop_pages(&io_ctl);
9d66e233
JB
868 ret = 1;
869out:
a67509c3 870 io_ctl_free(&io_ctl);
9d66e233 871 return ret;
9d66e233 872free_cache:
a67509c3 873 io_ctl_drop_pages(&io_ctl);
0414efae 874 __btrfs_remove_free_space_cache(ctl);
9d66e233
JB
875 goto out;
876}
877
cd79909b
JB
878static int copy_free_space_cache(struct btrfs_block_group *block_group,
879 struct btrfs_free_space_ctl *ctl)
880{
881 struct btrfs_free_space *info;
882 struct rb_node *n;
883 int ret = 0;
884
885 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
886 info = rb_entry(n, struct btrfs_free_space, offset_index);
887 if (!info->bitmap) {
32e1649b 888 unlink_free_space(ctl, info, true);
cd79909b
JB
889 ret = btrfs_add_free_space(block_group, info->offset,
890 info->bytes);
891 kmem_cache_free(btrfs_free_space_cachep, info);
892 } else {
893 u64 offset = info->offset;
894 u64 bytes = ctl->unit;
895
896 while (search_bitmap(ctl, info, &offset, &bytes,
897 false) == 0) {
898 ret = btrfs_add_free_space(block_group, offset,
899 bytes);
900 if (ret)
901 break;
f594f13c 902 bitmap_clear_bits(ctl, info, offset, bytes, true);
cd79909b
JB
903 offset = info->offset;
904 bytes = ctl->unit;
905 }
906 free_bitmap(ctl, info);
907 }
908 cond_resched();
909 }
910 return ret;
911}
912
32da5386 913int load_free_space_cache(struct btrfs_block_group *block_group)
0cb59c99 914{
bb6cb1c5 915 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 916 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
cd79909b 917 struct btrfs_free_space_ctl tmp_ctl = {};
0414efae
LZ
918 struct inode *inode;
919 struct btrfs_path *path;
5b0e95bf 920 int ret = 0;
0414efae 921 bool matched;
bf38be65 922 u64 used = block_group->used;
0414efae 923
cd79909b
JB
924 /*
925 * Because we could potentially discard our loaded free space, we want
926 * to load everything into a temporary structure first, and then if it's
927 * valid copy it all into the actual free space ctl.
928 */
929 btrfs_init_free_space_ctl(block_group, &tmp_ctl);
930
0414efae
LZ
931 /*
932 * If this block group has been marked to be cleared for one reason or
933 * another then we can't trust the on disk cache, so just return.
934 */
9d66e233 935 spin_lock(&block_group->lock);
0414efae
LZ
936 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
937 spin_unlock(&block_group->lock);
938 return 0;
939 }
9d66e233 940 spin_unlock(&block_group->lock);
0414efae
LZ
941
942 path = btrfs_alloc_path();
943 if (!path)
944 return 0;
d53ba474
JB
945 path->search_commit_root = 1;
946 path->skip_locking = 1;
0414efae 947
4222ea71
FM
948 /*
949 * We must pass a path with search_commit_root set to btrfs_iget in
950 * order to avoid a deadlock when allocating extents for the tree root.
951 *
952 * When we are COWing an extent buffer from the tree root, when looking
953 * for a free extent, at extent-tree.c:find_free_extent(), we can find
954 * block group without its free space cache loaded. When we find one
955 * we must load its space cache which requires reading its free space
956 * cache's inode item from the root tree. If this inode item is located
957 * in the same leaf that we started COWing before, then we end up in
958 * deadlock on the extent buffer (trying to read lock it when we
959 * previously write locked it).
960 *
961 * It's safe to read the inode item using the commit root because
962 * block groups, once loaded, stay in memory forever (until they are
963 * removed) as well as their space caches once loaded. New block groups
964 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
965 * we will never try to read their inode item while the fs is mounted.
966 */
7949f339 967 inode = lookup_free_space_inode(block_group, path);
0414efae
LZ
968 if (IS_ERR(inode)) {
969 btrfs_free_path(path);
970 return 0;
971 }
972
5b0e95bf
JB
973 /* We may have converted the inode and made the cache invalid. */
974 spin_lock(&block_group->lock);
975 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
976 spin_unlock(&block_group->lock);
a7e221e9 977 btrfs_free_path(path);
5b0e95bf
JB
978 goto out;
979 }
980 spin_unlock(&block_group->lock);
981
cd79909b 982 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
b3470b5d 983 path, block_group->start);
0414efae
LZ
984 btrfs_free_path(path);
985 if (ret <= 0)
986 goto out;
987
cd79909b
JB
988 matched = (tmp_ctl.free_space == (block_group->length - used -
989 block_group->bytes_super));
0414efae 990
cd79909b
JB
991 if (matched) {
992 ret = copy_free_space_cache(block_group, &tmp_ctl);
993 /*
994 * ret == 1 means we successfully loaded the free space cache,
995 * so we need to re-set it here.
996 */
997 if (ret == 0)
998 ret = 1;
999 } else {
1000 __btrfs_remove_free_space_cache(&tmp_ctl);
5d163e0e
JM
1001 btrfs_warn(fs_info,
1002 "block group %llu has wrong amount of free space",
b3470b5d 1003 block_group->start);
0414efae
LZ
1004 ret = -1;
1005 }
1006out:
1007 if (ret < 0) {
1008 /* This cache is bogus, make sure it gets cleared */
1009 spin_lock(&block_group->lock);
1010 block_group->disk_cache_state = BTRFS_DC_CLEAR;
1011 spin_unlock(&block_group->lock);
82d5902d 1012 ret = 0;
0414efae 1013
5d163e0e
JM
1014 btrfs_warn(fs_info,
1015 "failed to load free space cache for block group %llu, rebuilding it now",
b3470b5d 1016 block_group->start);
0414efae
LZ
1017 }
1018
66b53bae
JB
1019 spin_lock(&ctl->tree_lock);
1020 btrfs_discard_update_discardable(block_group);
1021 spin_unlock(&ctl->tree_lock);
0414efae
LZ
1022 iput(inode);
1023 return ret;
9d66e233
JB
1024}
1025
d4452bc5 1026static noinline_for_stack
4c6d1d85 1027int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
d4452bc5 1028 struct btrfs_free_space_ctl *ctl,
32da5386 1029 struct btrfs_block_group *block_group,
d4452bc5
CM
1030 int *entries, int *bitmaps,
1031 struct list_head *bitmap_list)
0cb59c99 1032{
c09544e0 1033 int ret;
d4452bc5 1034 struct btrfs_free_cluster *cluster = NULL;
1bbc621e 1035 struct btrfs_free_cluster *cluster_locked = NULL;
d4452bc5 1036 struct rb_node *node = rb_first(&ctl->free_space_offset);
55507ce3 1037 struct btrfs_trim_range *trim_entry;
be1a12a0 1038
43be2146 1039 /* Get the cluster for this block_group if it exists */
d4452bc5 1040 if (block_group && !list_empty(&block_group->cluster_list)) {
43be2146
JB
1041 cluster = list_entry(block_group->cluster_list.next,
1042 struct btrfs_free_cluster,
1043 block_group_list);
d4452bc5 1044 }
43be2146 1045
f75b130e 1046 if (!node && cluster) {
1bbc621e
CM
1047 cluster_locked = cluster;
1048 spin_lock(&cluster_locked->lock);
f75b130e
JB
1049 node = rb_first(&cluster->root);
1050 cluster = NULL;
1051 }
1052
a67509c3
JB
1053 /* Write out the extent entries */
1054 while (node) {
1055 struct btrfs_free_space *e;
0cb59c99 1056
a67509c3 1057 e = rb_entry(node, struct btrfs_free_space, offset_index);
d4452bc5 1058 *entries += 1;
0cb59c99 1059
d4452bc5 1060 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
a67509c3
JB
1061 e->bitmap);
1062 if (ret)
d4452bc5 1063 goto fail;
2f356126 1064
a67509c3 1065 if (e->bitmap) {
d4452bc5
CM
1066 list_add_tail(&e->list, bitmap_list);
1067 *bitmaps += 1;
2f356126 1068 }
a67509c3
JB
1069 node = rb_next(node);
1070 if (!node && cluster) {
1071 node = rb_first(&cluster->root);
1bbc621e
CM
1072 cluster_locked = cluster;
1073 spin_lock(&cluster_locked->lock);
a67509c3 1074 cluster = NULL;
43be2146 1075 }
a67509c3 1076 }
1bbc621e
CM
1077 if (cluster_locked) {
1078 spin_unlock(&cluster_locked->lock);
1079 cluster_locked = NULL;
1080 }
55507ce3
FM
1081
1082 /*
1083 * Make sure we don't miss any range that was removed from our rbtree
1084 * because trimming is running. Otherwise after a umount+mount (or crash
1085 * after committing the transaction) we would leak free space and get
1086 * an inconsistent free space cache report from fsck.
1087 */
1088 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1089 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1090 trim_entry->bytes, NULL);
1091 if (ret)
1092 goto fail;
1093 *entries += 1;
1094 }
1095
d4452bc5
CM
1096 return 0;
1097fail:
1bbc621e
CM
1098 if (cluster_locked)
1099 spin_unlock(&cluster_locked->lock);
d4452bc5
CM
1100 return -ENOSPC;
1101}
1102
1103static noinline_for_stack int
1104update_cache_item(struct btrfs_trans_handle *trans,
1105 struct btrfs_root *root,
1106 struct inode *inode,
1107 struct btrfs_path *path, u64 offset,
1108 int entries, int bitmaps)
1109{
1110 struct btrfs_key key;
1111 struct btrfs_free_space_header *header;
1112 struct extent_buffer *leaf;
1113 int ret;
1114
1115 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1116 key.offset = offset;
1117 key.type = 0;
1118
1119 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1120 if (ret < 0) {
1121 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
e182163d 1122 EXTENT_DELALLOC, 0, 0, NULL);
d4452bc5
CM
1123 goto fail;
1124 }
1125 leaf = path->nodes[0];
1126 if (ret > 0) {
1127 struct btrfs_key found_key;
1128 ASSERT(path->slots[0]);
1129 path->slots[0]--;
1130 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1131 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1132 found_key.offset != offset) {
1133 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
e182163d
OS
1134 inode->i_size - 1, EXTENT_DELALLOC, 0,
1135 0, NULL);
d4452bc5
CM
1136 btrfs_release_path(path);
1137 goto fail;
1138 }
1139 }
1140
1141 BTRFS_I(inode)->generation = trans->transid;
1142 header = btrfs_item_ptr(leaf, path->slots[0],
1143 struct btrfs_free_space_header);
1144 btrfs_set_free_space_entries(leaf, header, entries);
1145 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1146 btrfs_set_free_space_generation(leaf, header, trans->transid);
1147 btrfs_mark_buffer_dirty(leaf);
1148 btrfs_release_path(path);
1149
1150 return 0;
1151
1152fail:
1153 return -1;
1154}
1155
6701bdb3 1156static noinline_for_stack int write_pinned_extent_entries(
6b45f641 1157 struct btrfs_trans_handle *trans,
32da5386 1158 struct btrfs_block_group *block_group,
4c6d1d85 1159 struct btrfs_io_ctl *io_ctl,
5349d6c3 1160 int *entries)
d4452bc5
CM
1161{
1162 u64 start, extent_start, extent_end, len;
d4452bc5
CM
1163 struct extent_io_tree *unpin = NULL;
1164 int ret;
43be2146 1165
5349d6c3
MX
1166 if (!block_group)
1167 return 0;
1168
a67509c3
JB
1169 /*
1170 * We want to add any pinned extents to our free space cache
1171 * so we don't leak the space
d4452bc5 1172 *
db804f23
LZ
1173 * We shouldn't have switched the pinned extents yet so this is the
1174 * right one
1175 */
fe119a6e 1176 unpin = &trans->transaction->pinned_extents;
db804f23 1177
b3470b5d 1178 start = block_group->start;
db804f23 1179
b3470b5d 1180 while (start < block_group->start + block_group->length) {
db804f23
LZ
1181 ret = find_first_extent_bit(unpin, start,
1182 &extent_start, &extent_end,
e6138876 1183 EXTENT_DIRTY, NULL);
5349d6c3
MX
1184 if (ret)
1185 return 0;
0cb59c99 1186
a67509c3 1187 /* This pinned extent is out of our range */
b3470b5d 1188 if (extent_start >= block_group->start + block_group->length)
5349d6c3 1189 return 0;
2f356126 1190
db804f23 1191 extent_start = max(extent_start, start);
b3470b5d
DS
1192 extent_end = min(block_group->start + block_group->length,
1193 extent_end + 1);
db804f23 1194 len = extent_end - extent_start;
0cb59c99 1195
d4452bc5
CM
1196 *entries += 1;
1197 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
a67509c3 1198 if (ret)
5349d6c3 1199 return -ENOSPC;
0cb59c99 1200
db804f23 1201 start = extent_end;
a67509c3 1202 }
0cb59c99 1203
5349d6c3
MX
1204 return 0;
1205}
1206
1207static noinline_for_stack int
4c6d1d85 1208write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
5349d6c3 1209{
7ae1681e 1210 struct btrfs_free_space *entry, *next;
5349d6c3
MX
1211 int ret;
1212
0cb59c99 1213 /* Write out the bitmaps */
7ae1681e 1214 list_for_each_entry_safe(entry, next, bitmap_list, list) {
d4452bc5 1215 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
a67509c3 1216 if (ret)
5349d6c3 1217 return -ENOSPC;
0cb59c99 1218 list_del_init(&entry->list);
be1a12a0
JB
1219 }
1220
5349d6c3
MX
1221 return 0;
1222}
0cb59c99 1223
5349d6c3
MX
1224static int flush_dirty_cache(struct inode *inode)
1225{
1226 int ret;
be1a12a0 1227
0ef8b726 1228 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5349d6c3 1229 if (ret)
0ef8b726 1230 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
e182163d 1231 EXTENT_DELALLOC, 0, 0, NULL);
0cb59c99 1232
5349d6c3 1233 return ret;
d4452bc5
CM
1234}
1235
1236static void noinline_for_stack
a3bdccc4 1237cleanup_bitmap_list(struct list_head *bitmap_list)
d4452bc5 1238{
7ae1681e 1239 struct btrfs_free_space *entry, *next;
5349d6c3 1240
7ae1681e 1241 list_for_each_entry_safe(entry, next, bitmap_list, list)
d4452bc5 1242 list_del_init(&entry->list);
a3bdccc4
CM
1243}
1244
1245static void noinline_for_stack
1246cleanup_write_cache_enospc(struct inode *inode,
1247 struct btrfs_io_ctl *io_ctl,
7bf1a159 1248 struct extent_state **cached_state)
a3bdccc4 1249{
d4452bc5
CM
1250 io_ctl_drop_pages(io_ctl);
1251 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
e43bbe5e 1252 i_size_read(inode) - 1, cached_state);
d4452bc5 1253}
549b4fdb 1254
afdb5718
JM
1255static int __btrfs_wait_cache_io(struct btrfs_root *root,
1256 struct btrfs_trans_handle *trans,
32da5386 1257 struct btrfs_block_group *block_group,
afdb5718
JM
1258 struct btrfs_io_ctl *io_ctl,
1259 struct btrfs_path *path, u64 offset)
c9dc4c65
CM
1260{
1261 int ret;
1262 struct inode *inode = io_ctl->inode;
1263
1bbc621e
CM
1264 if (!inode)
1265 return 0;
1266
c9dc4c65
CM
1267 /* Flush the dirty pages in the cache file. */
1268 ret = flush_dirty_cache(inode);
1269 if (ret)
1270 goto out;
1271
1272 /* Update the cache item to tell everyone this cache file is valid. */
1273 ret = update_cache_item(trans, root, inode, path, offset,
1274 io_ctl->entries, io_ctl->bitmaps);
1275out:
c9dc4c65
CM
1276 if (ret) {
1277 invalidate_inode_pages2(inode->i_mapping);
1278 BTRFS_I(inode)->generation = 0;
bbcd1f4d
FM
1279 if (block_group)
1280 btrfs_debug(root->fs_info,
2e69a7a6
FM
1281 "failed to write free space cache for block group %llu error %d",
1282 block_group->start, ret);
c9dc4c65 1283 }
9a56fcd1 1284 btrfs_update_inode(trans, root, BTRFS_I(inode));
c9dc4c65
CM
1285
1286 if (block_group) {
1bbc621e
CM
1287 /* the dirty list is protected by the dirty_bgs_lock */
1288 spin_lock(&trans->transaction->dirty_bgs_lock);
1289
1290 /* the disk_cache_state is protected by the block group lock */
c9dc4c65
CM
1291 spin_lock(&block_group->lock);
1292
1293 /*
1294 * only mark this as written if we didn't get put back on
1bbc621e
CM
1295 * the dirty list while waiting for IO. Otherwise our
1296 * cache state won't be right, and we won't get written again
c9dc4c65
CM
1297 */
1298 if (!ret && list_empty(&block_group->dirty_list))
1299 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1300 else if (ret)
1301 block_group->disk_cache_state = BTRFS_DC_ERROR;
1302
1303 spin_unlock(&block_group->lock);
1bbc621e 1304 spin_unlock(&trans->transaction->dirty_bgs_lock);
c9dc4c65
CM
1305 io_ctl->inode = NULL;
1306 iput(inode);
1307 }
1308
1309 return ret;
1310
1311}
1312
afdb5718 1313int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
32da5386 1314 struct btrfs_block_group *block_group,
afdb5718
JM
1315 struct btrfs_path *path)
1316{
1317 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1318 block_group, &block_group->io_ctl,
b3470b5d 1319 path, block_group->start);
afdb5718
JM
1320}
1321
d4452bc5 1322/**
f092cf3c
NB
1323 * Write out cached info to an inode
1324 *
1325 * @root: root the inode belongs to
1326 * @inode: freespace inode we are writing out
1327 * @ctl: free space cache we are going to write out
1328 * @block_group: block_group for this cache if it belongs to a block_group
1329 * @io_ctl: holds context for the io
1330 * @trans: the trans handle
d4452bc5
CM
1331 *
1332 * This function writes out a free space cache struct to disk for quick recovery
8cd1e731 1333 * on mount. This will return 0 if it was successful in writing the cache out,
b8605454 1334 * or an errno if it was not.
d4452bc5
CM
1335 */
1336static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1337 struct btrfs_free_space_ctl *ctl,
32da5386 1338 struct btrfs_block_group *block_group,
c9dc4c65 1339 struct btrfs_io_ctl *io_ctl,
0e8d931a 1340 struct btrfs_trans_handle *trans)
d4452bc5
CM
1341{
1342 struct extent_state *cached_state = NULL;
5349d6c3 1343 LIST_HEAD(bitmap_list);
d4452bc5
CM
1344 int entries = 0;
1345 int bitmaps = 0;
1346 int ret;
c9dc4c65 1347 int must_iput = 0;
d4452bc5
CM
1348
1349 if (!i_size_read(inode))
b8605454 1350 return -EIO;
d4452bc5 1351
c9dc4c65 1352 WARN_ON(io_ctl->pages);
f15376df 1353 ret = io_ctl_init(io_ctl, inode, 1);
d4452bc5 1354 if (ret)
b8605454 1355 return ret;
d4452bc5 1356
e570fd27
MX
1357 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1358 down_write(&block_group->data_rwsem);
1359 spin_lock(&block_group->lock);
1360 if (block_group->delalloc_bytes) {
1361 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1362 spin_unlock(&block_group->lock);
1363 up_write(&block_group->data_rwsem);
1364 BTRFS_I(inode)->generation = 0;
1365 ret = 0;
c9dc4c65 1366 must_iput = 1;
e570fd27
MX
1367 goto out;
1368 }
1369 spin_unlock(&block_group->lock);
1370 }
1371
d4452bc5 1372 /* Lock all pages first so we can lock the extent safely. */
7a195f6d 1373 ret = io_ctl_prepare_pages(io_ctl, false);
b8605454 1374 if (ret)
b77000ed 1375 goto out_unlock;
d4452bc5
CM
1376
1377 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
ff13db41 1378 &cached_state);
d4452bc5 1379
c9dc4c65 1380 io_ctl_set_generation(io_ctl, trans->transid);
d4452bc5 1381
55507ce3 1382 mutex_lock(&ctl->cache_writeout_mutex);
5349d6c3 1383 /* Write out the extent entries in the free space cache */
1bbc621e 1384 spin_lock(&ctl->tree_lock);
c9dc4c65 1385 ret = write_cache_extent_entries(io_ctl, ctl,
d4452bc5
CM
1386 block_group, &entries, &bitmaps,
1387 &bitmap_list);
a3bdccc4
CM
1388 if (ret)
1389 goto out_nospc_locked;
d4452bc5 1390
5349d6c3
MX
1391 /*
1392 * Some spaces that are freed in the current transaction are pinned,
1393 * they will be added into free space cache after the transaction is
1394 * committed, we shouldn't lose them.
1bbc621e
CM
1395 *
1396 * If this changes while we are working we'll get added back to
1397 * the dirty list and redo it. No locking needed
5349d6c3 1398 */
6b45f641 1399 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
a3bdccc4
CM
1400 if (ret)
1401 goto out_nospc_locked;
5349d6c3 1402
55507ce3
FM
1403 /*
1404 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1405 * locked while doing it because a concurrent trim can be manipulating
1406 * or freeing the bitmap.
1407 */
c9dc4c65 1408 ret = write_bitmap_entries(io_ctl, &bitmap_list);
1bbc621e 1409 spin_unlock(&ctl->tree_lock);
55507ce3 1410 mutex_unlock(&ctl->cache_writeout_mutex);
5349d6c3
MX
1411 if (ret)
1412 goto out_nospc;
1413
1414 /* Zero out the rest of the pages just to make sure */
c9dc4c65 1415 io_ctl_zero_remaining_pages(io_ctl);
d4452bc5 1416
5349d6c3 1417 /* Everything is written out, now we dirty the pages in the file. */
088545f6
NB
1418 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1419 io_ctl->num_pages, 0, i_size_read(inode),
aa8c1a41 1420 &cached_state, false);
5349d6c3 1421 if (ret)
d4452bc5 1422 goto out_nospc;
5349d6c3 1423
e570fd27
MX
1424 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1425 up_write(&block_group->data_rwsem);
5349d6c3
MX
1426 /*
1427 * Release the pages and unlock the extent, we will flush
1428 * them out later
1429 */
c9dc4c65 1430 io_ctl_drop_pages(io_ctl);
bbc37d6e 1431 io_ctl_free(io_ctl);
5349d6c3
MX
1432
1433 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
e43bbe5e 1434 i_size_read(inode) - 1, &cached_state);
5349d6c3 1435
c9dc4c65
CM
1436 /*
1437 * at this point the pages are under IO and we're happy,
260db43c 1438 * The caller is responsible for waiting on them and updating
c9dc4c65
CM
1439 * the cache and the inode
1440 */
1441 io_ctl->entries = entries;
1442 io_ctl->bitmaps = bitmaps;
1443
1444 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
5349d6c3 1445 if (ret)
d4452bc5
CM
1446 goto out;
1447
c9dc4c65
CM
1448 return 0;
1449
a3bdccc4
CM
1450out_nospc_locked:
1451 cleanup_bitmap_list(&bitmap_list);
1452 spin_unlock(&ctl->tree_lock);
1453 mutex_unlock(&ctl->cache_writeout_mutex);
1454
a67509c3 1455out_nospc:
7bf1a159 1456 cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
e570fd27 1457
b77000ed 1458out_unlock:
e570fd27
MX
1459 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1460 up_write(&block_group->data_rwsem);
1461
fd8efa81
JT
1462out:
1463 io_ctl->inode = NULL;
1464 io_ctl_free(io_ctl);
1465 if (ret) {
1466 invalidate_inode_pages2(inode->i_mapping);
1467 BTRFS_I(inode)->generation = 0;
1468 }
9a56fcd1 1469 btrfs_update_inode(trans, root, BTRFS_I(inode));
fd8efa81
JT
1470 if (must_iput)
1471 iput(inode);
1472 return ret;
0414efae
LZ
1473}
1474
fe041534 1475int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
32da5386 1476 struct btrfs_block_group *block_group,
0414efae
LZ
1477 struct btrfs_path *path)
1478{
fe041534 1479 struct btrfs_fs_info *fs_info = trans->fs_info;
0414efae
LZ
1480 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1481 struct inode *inode;
1482 int ret = 0;
1483
0414efae
LZ
1484 spin_lock(&block_group->lock);
1485 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1486 spin_unlock(&block_group->lock);
e570fd27
MX
1487 return 0;
1488 }
0414efae
LZ
1489 spin_unlock(&block_group->lock);
1490
7949f339 1491 inode = lookup_free_space_inode(block_group, path);
0414efae
LZ
1492 if (IS_ERR(inode))
1493 return 0;
1494
77ab86bf
JM
1495 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1496 block_group, &block_group->io_ctl, trans);
c09544e0 1497 if (ret) {
bbcd1f4d 1498 btrfs_debug(fs_info,
2e69a7a6
FM
1499 "failed to write free space cache for block group %llu error %d",
1500 block_group->start, ret);
c9dc4c65
CM
1501 spin_lock(&block_group->lock);
1502 block_group->disk_cache_state = BTRFS_DC_ERROR;
1503 spin_unlock(&block_group->lock);
1504
1505 block_group->io_ctl.inode = NULL;
1506 iput(inode);
0414efae
LZ
1507 }
1508
c9dc4c65
CM
1509 /*
1510 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1511 * to wait for IO and put the inode
1512 */
1513
0cb59c99
JB
1514 return ret;
1515}
1516
34d52cb6 1517static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
96303081 1518 u64 offset)
0f9dd46c 1519{
b12d6869 1520 ASSERT(offset >= bitmap_start);
96303081 1521 offset -= bitmap_start;
34d52cb6 1522 return (unsigned long)(div_u64(offset, unit));
96303081 1523}
0f9dd46c 1524
34d52cb6 1525static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
96303081 1526{
34d52cb6 1527 return (unsigned long)(div_u64(bytes, unit));
96303081 1528}
0f9dd46c 1529
34d52cb6 1530static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
1531 u64 offset)
1532{
1533 u64 bitmap_start;
0ef6447a 1534 u64 bytes_per_bitmap;
0f9dd46c 1535
34d52cb6
LZ
1536 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1537 bitmap_start = offset - ctl->start;
0ef6447a 1538 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
96303081 1539 bitmap_start *= bytes_per_bitmap;
34d52cb6 1540 bitmap_start += ctl->start;
0f9dd46c 1541
96303081 1542 return bitmap_start;
0f9dd46c
JB
1543}
1544
96303081
JB
1545static int tree_insert_offset(struct rb_root *root, u64 offset,
1546 struct rb_node *node, int bitmap)
0f9dd46c
JB
1547{
1548 struct rb_node **p = &root->rb_node;
1549 struct rb_node *parent = NULL;
1550 struct btrfs_free_space *info;
1551
1552 while (*p) {
1553 parent = *p;
96303081 1554 info = rb_entry(parent, struct btrfs_free_space, offset_index);
0f9dd46c 1555
96303081 1556 if (offset < info->offset) {
0f9dd46c 1557 p = &(*p)->rb_left;
96303081 1558 } else if (offset > info->offset) {
0f9dd46c 1559 p = &(*p)->rb_right;
96303081
JB
1560 } else {
1561 /*
1562 * we could have a bitmap entry and an extent entry
1563 * share the same offset. If this is the case, we want
1564 * the extent entry to always be found first if we do a
1565 * linear search through the tree, since we want to have
1566 * the quickest allocation time, and allocating from an
1567 * extent is faster than allocating from a bitmap. So
1568 * if we're inserting a bitmap and we find an entry at
1569 * this offset, we want to go right, or after this entry
1570 * logically. If we are inserting an extent and we've
1571 * found a bitmap, we want to go left, or before
1572 * logically.
1573 */
1574 if (bitmap) {
207dde82
JB
1575 if (info->bitmap) {
1576 WARN_ON_ONCE(1);
1577 return -EEXIST;
1578 }
96303081
JB
1579 p = &(*p)->rb_right;
1580 } else {
207dde82
JB
1581 if (!info->bitmap) {
1582 WARN_ON_ONCE(1);
1583 return -EEXIST;
1584 }
96303081
JB
1585 p = &(*p)->rb_left;
1586 }
1587 }
0f9dd46c
JB
1588 }
1589
1590 rb_link_node(node, parent, p);
1591 rb_insert_color(node, root);
1592
1593 return 0;
1594}
1595
59c7b566
JB
1596/*
1597 * This is a little subtle. We *only* have ->max_extent_size set if we actually
1598 * searched through the bitmap and figured out the largest ->max_extent_size,
1599 * otherwise it's 0. In the case that it's 0 we don't want to tell the
1600 * allocator the wrong thing, we want to use the actual real max_extent_size
1601 * we've found already if it's larger, or we want to use ->bytes.
1602 *
1603 * This matters because find_free_space() will skip entries who's ->bytes is
1604 * less than the required bytes. So if we didn't search down this bitmap, we
1605 * may pick some previous entry that has a smaller ->max_extent_size than we
1606 * have. For example, assume we have two entries, one that has
1607 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
1608 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
1609 * call into find_free_space(), and return with max_extent_size == 4K, because
1610 * that first bitmap entry had ->max_extent_size set, but the second one did
1611 * not. If instead we returned 8K we'd come in searching for 8K, and find the
1612 * 8K contiguous range.
1613 *
1614 * Consider the other case, we have 2 8K chunks in that second entry and still
1615 * don't have ->max_extent_size set. We'll return 16K, and the next time the
1616 * allocator comes in it'll fully search our second bitmap, and this time it'll
1617 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
1618 * right allocation the next loop through.
1619 */
1620static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
1621{
1622 if (entry->bitmap && entry->max_extent_size)
1623 return entry->max_extent_size;
1624 return entry->bytes;
1625}
1626
1627/*
1628 * We want the largest entry to be leftmost, so this is inverted from what you'd
1629 * normally expect.
1630 */
1631static bool entry_less(struct rb_node *node, const struct rb_node *parent)
1632{
1633 const struct btrfs_free_space *entry, *exist;
1634
1635 entry = rb_entry(node, struct btrfs_free_space, bytes_index);
1636 exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
1637 return get_max_extent_size(exist) < get_max_extent_size(entry);
1638}
1639
0f9dd46c 1640/*
70cb0743
JB
1641 * searches the tree for the given offset.
1642 *
96303081
JB
1643 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1644 * want a section that has at least bytes size and comes at or after the given
1645 * offset.
0f9dd46c 1646 */
96303081 1647static struct btrfs_free_space *
34d52cb6 1648tree_search_offset(struct btrfs_free_space_ctl *ctl,
96303081 1649 u64 offset, int bitmap_only, int fuzzy)
0f9dd46c 1650{
34d52cb6 1651 struct rb_node *n = ctl->free_space_offset.rb_node;
f1a8fc62 1652 struct btrfs_free_space *entry = NULL, *prev = NULL;
96303081
JB
1653
1654 /* find entry that is closest to the 'offset' */
f1a8fc62 1655 while (n) {
0f9dd46c 1656 entry = rb_entry(n, struct btrfs_free_space, offset_index);
96303081 1657 prev = entry;
0f9dd46c 1658
96303081 1659 if (offset < entry->offset)
0f9dd46c 1660 n = n->rb_left;
96303081 1661 else if (offset > entry->offset)
0f9dd46c 1662 n = n->rb_right;
96303081 1663 else
0f9dd46c 1664 break;
f1a8fc62
NB
1665
1666 entry = NULL;
0f9dd46c
JB
1667 }
1668
96303081
JB
1669 if (bitmap_only) {
1670 if (!entry)
1671 return NULL;
1672 if (entry->bitmap)
1673 return entry;
0f9dd46c 1674
96303081
JB
1675 /*
1676 * bitmap entry and extent entry may share same offset,
1677 * in that case, bitmap entry comes after extent entry.
1678 */
1679 n = rb_next(n);
1680 if (!n)
1681 return NULL;
1682 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1683 if (entry->offset != offset)
1684 return NULL;
0f9dd46c 1685
96303081
JB
1686 WARN_ON(!entry->bitmap);
1687 return entry;
1688 } else if (entry) {
1689 if (entry->bitmap) {
0f9dd46c 1690 /*
96303081
JB
1691 * if previous extent entry covers the offset,
1692 * we should return it instead of the bitmap entry
0f9dd46c 1693 */
de6c4115
MX
1694 n = rb_prev(&entry->offset_index);
1695 if (n) {
96303081
JB
1696 prev = rb_entry(n, struct btrfs_free_space,
1697 offset_index);
de6c4115
MX
1698 if (!prev->bitmap &&
1699 prev->offset + prev->bytes > offset)
1700 entry = prev;
0f9dd46c 1701 }
96303081
JB
1702 }
1703 return entry;
1704 }
1705
1706 if (!prev)
1707 return NULL;
1708
1709 /* find last entry before the 'offset' */
1710 entry = prev;
1711 if (entry->offset > offset) {
1712 n = rb_prev(&entry->offset_index);
1713 if (n) {
1714 entry = rb_entry(n, struct btrfs_free_space,
1715 offset_index);
b12d6869 1716 ASSERT(entry->offset <= offset);
0f9dd46c 1717 } else {
96303081
JB
1718 if (fuzzy)
1719 return entry;
1720 else
1721 return NULL;
0f9dd46c
JB
1722 }
1723 }
1724
96303081 1725 if (entry->bitmap) {
de6c4115
MX
1726 n = rb_prev(&entry->offset_index);
1727 if (n) {
96303081
JB
1728 prev = rb_entry(n, struct btrfs_free_space,
1729 offset_index);
de6c4115
MX
1730 if (!prev->bitmap &&
1731 prev->offset + prev->bytes > offset)
1732 return prev;
96303081 1733 }
34d52cb6 1734 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
96303081
JB
1735 return entry;
1736 } else if (entry->offset + entry->bytes > offset)
1737 return entry;
1738
1739 if (!fuzzy)
1740 return NULL;
1741
1742 while (1) {
167c0bd3
NB
1743 n = rb_next(&entry->offset_index);
1744 if (!n)
1745 return NULL;
1746 entry = rb_entry(n, struct btrfs_free_space, offset_index);
96303081
JB
1747 if (entry->bitmap) {
1748 if (entry->offset + BITS_PER_BITMAP *
34d52cb6 1749 ctl->unit > offset)
96303081
JB
1750 break;
1751 } else {
1752 if (entry->offset + entry->bytes > offset)
1753 break;
1754 }
96303081
JB
1755 }
1756 return entry;
0f9dd46c
JB
1757}
1758
32e1649b
NB
1759static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1760 struct btrfs_free_space *info,
1761 bool update_stat)
0f9dd46c 1762{
34d52cb6 1763 rb_erase(&info->offset_index, &ctl->free_space_offset);
59c7b566 1764 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
34d52cb6 1765 ctl->free_extents--;
dfb79ddb 1766
5dc7c10b 1767 if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
dfb79ddb 1768 ctl->discardable_extents[BTRFS_STAT_CURR]--;
5dc7c10b
DZ
1769 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1770 }
f333adb5 1771
32e1649b
NB
1772 if (update_stat)
1773 ctl->free_space -= info->bytes;
0f9dd46c
JB
1774}
1775
34d52cb6 1776static int link_free_space(struct btrfs_free_space_ctl *ctl,
0f9dd46c
JB
1777 struct btrfs_free_space *info)
1778{
1779 int ret = 0;
1780
b12d6869 1781 ASSERT(info->bytes || info->bitmap);
34d52cb6 1782 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
96303081 1783 &info->offset_index, (info->bitmap != NULL));
0f9dd46c
JB
1784 if (ret)
1785 return ret;
1786
59c7b566
JB
1787 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1788
5dc7c10b 1789 if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
dfb79ddb 1790 ctl->discardable_extents[BTRFS_STAT_CURR]++;
5dc7c10b
DZ
1791 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1792 }
dfb79ddb 1793
34d52cb6
LZ
1794 ctl->free_space += info->bytes;
1795 ctl->free_extents++;
96303081
JB
1796 return ret;
1797}
1798
59c7b566
JB
1799static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
1800 struct btrfs_free_space *info)
1801{
1802 ASSERT(info->bitmap);
1803
1804 /*
1805 * If our entry is empty it's because we're on a cluster and we don't
1806 * want to re-link it into our ctl bytes index.
1807 */
1808 if (RB_EMPTY_NODE(&info->bytes_index))
1809 return;
1810
1811 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1812 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1813}
1814
f594f13c
NB
1815static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1816 struct btrfs_free_space *info,
1817 u64 offset, u64 bytes, bool update_stat)
96303081 1818{
dfb79ddb
DZ
1819 unsigned long start, count, end;
1820 int extent_delta = -1;
96303081 1821
34d52cb6
LZ
1822 start = offset_to_bit(info->offset, ctl->unit, offset);
1823 count = bytes_to_bits(bytes, ctl->unit);
dfb79ddb
DZ
1824 end = start + count;
1825 ASSERT(end <= BITS_PER_BITMAP);
96303081 1826
f38b6e75 1827 bitmap_clear(info->bitmap, start, count);
96303081
JB
1828
1829 info->bytes -= bytes;
553cceb4
JB
1830 if (info->max_extent_size > ctl->unit)
1831 info->max_extent_size = 0;
dfb79ddb 1832
59c7b566
JB
1833 relink_bitmap_entry(ctl, info);
1834
dfb79ddb
DZ
1835 if (start && test_bit(start - 1, info->bitmap))
1836 extent_delta++;
1837
1838 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1839 extent_delta++;
1840
1841 info->bitmap_extents += extent_delta;
5dc7c10b 1842 if (!btrfs_free_space_trimmed(info)) {
dfb79ddb 1843 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
5dc7c10b
DZ
1844 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1845 }
bb3ac5a4 1846
f594f13c
NB
1847 if (update_stat)
1848 ctl->free_space -= bytes;
96303081
JB
1849}
1850
34d52cb6 1851static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
817d52f8
JB
1852 struct btrfs_free_space *info, u64 offset,
1853 u64 bytes)
96303081 1854{
dfb79ddb
DZ
1855 unsigned long start, count, end;
1856 int extent_delta = 1;
96303081 1857
34d52cb6
LZ
1858 start = offset_to_bit(info->offset, ctl->unit, offset);
1859 count = bytes_to_bits(bytes, ctl->unit);
dfb79ddb
DZ
1860 end = start + count;
1861 ASSERT(end <= BITS_PER_BITMAP);
96303081 1862
f38b6e75 1863 bitmap_set(info->bitmap, start, count);
96303081 1864
59c7b566
JB
1865 /*
 1866 * We set some bytes, so we have no idea what the max extent size is
1867 * anymore.
1868 */
1869 info->max_extent_size = 0;
96303081 1870 info->bytes += bytes;
34d52cb6 1871 ctl->free_space += bytes;
dfb79ddb 1872
59c7b566
JB
1873 relink_bitmap_entry(ctl, info);
1874
dfb79ddb
DZ
1875 if (start && test_bit(start - 1, info->bitmap))
1876 extent_delta--;
1877
1878 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1879 extent_delta--;
1880
1881 info->bitmap_extents += extent_delta;
5dc7c10b 1882 if (!btrfs_free_space_trimmed(info)) {
dfb79ddb 1883 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
5dc7c10b
DZ
1884 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1885 }
96303081
JB
1886}
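/*
 * Editor's note: a small sketch (not kernel code) of the index math used by
 * bitmap_set_bits()/bitmap_clear_bits() above, assuming ctl->unit == 4096.
 * offset_to_bit() and bytes_to_bits(), defined earlier in this file, reduce
 * to integer division by the unit:
 *
 *	start = (offset - info->offset) / 4096;
 *	count = bytes / 4096;
 *
 * For example, setting 8K of space that begins 12K into the bitmap touches
 * bits 3 and 4; the extent_delta bookkeeping then checks bits 2 and 5 to
 * see whether the new run merged with already-set neighbours.
 */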
1887
a4820398
MX
1888/*
 1889 * If we cannot find a suitable extent, we will use bytes to record
1890 * the size of the max extent.
1891 */
34d52cb6 1892static int search_bitmap(struct btrfs_free_space_ctl *ctl,
96303081 1893 struct btrfs_free_space *bitmap_info, u64 *offset,
0584f718 1894 u64 *bytes, bool for_alloc)
96303081
JB
1895{
1896 unsigned long found_bits = 0;
a4820398 1897 unsigned long max_bits = 0;
96303081
JB
1898 unsigned long bits, i;
1899 unsigned long next_zero;
a4820398 1900 unsigned long extent_bits;
96303081 1901
cef40483
JB
1902 /*
1903 * Skip searching the bitmap if we don't have a contiguous section that
1904 * is large enough for this allocation.
1905 */
0584f718
JB
1906 if (for_alloc &&
1907 bitmap_info->max_extent_size &&
cef40483
JB
1908 bitmap_info->max_extent_size < *bytes) {
1909 *bytes = bitmap_info->max_extent_size;
1910 return -1;
1911 }
1912
34d52cb6 1913 i = offset_to_bit(bitmap_info->offset, ctl->unit,
96303081 1914 max_t(u64, *offset, bitmap_info->offset));
34d52cb6 1915 bits = bytes_to_bits(*bytes, ctl->unit);
96303081 1916
ebb3dad4 1917 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
0584f718
JB
1918 if (for_alloc && bits == 1) {
1919 found_bits = 1;
1920 break;
1921 }
96303081
JB
1922 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1923 BITS_PER_BITMAP, i);
a4820398
MX
1924 extent_bits = next_zero - i;
1925 if (extent_bits >= bits) {
1926 found_bits = extent_bits;
96303081 1927 break;
a4820398
MX
1928 } else if (extent_bits > max_bits) {
1929 max_bits = extent_bits;
96303081
JB
1930 }
1931 i = next_zero;
1932 }
1933
1934 if (found_bits) {
34d52cb6
LZ
1935 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1936 *bytes = (u64)(found_bits) * ctl->unit;
96303081
JB
1937 return 0;
1938 }
1939
a4820398 1940 *bytes = (u64)(max_bits) * ctl->unit;
cef40483 1941 bitmap_info->max_extent_size = *bytes;
59c7b566 1942 relink_bitmap_entry(ctl, bitmap_info);
96303081
JB
1943 return -1;
1944}
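/*
 * Editor's note: an illustrative recap (not kernel code) of the contract
 * above.  On success search_bitmap() returns 0 and updates *offset/*bytes
 * to the start and length of a run at least as large as requested.  On
 * failure it returns -1, stores the largest run it did find in *bytes, and
 * caches that value in bitmap_info->max_extent_size so the next for_alloc
 * search can bail out early without rescanning the whole bitmap.
 */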
1945
a4820398 1946/* Cache the size of the max extent in bytes */
34d52cb6 1947static struct btrfs_free_space *
53b381b3 1948find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
59c7b566 1949 unsigned long align, u64 *max_extent_size, bool use_bytes_index)
96303081
JB
1950{
1951 struct btrfs_free_space *entry;
1952 struct rb_node *node;
53b381b3
DW
1953 u64 tmp;
1954 u64 align_off;
96303081
JB
1955 int ret;
1956
34d52cb6 1957 if (!ctl->free_space_offset.rb_node)
a4820398 1958 goto out;
59c7b566
JB
1959again:
1960 if (use_bytes_index) {
1961 node = rb_first_cached(&ctl->free_space_bytes);
1962 } else {
1963 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
1964 0, 1);
1965 if (!entry)
1966 goto out;
1967 node = &entry->offset_index;
1968 }
96303081 1969
59c7b566
JB
1970 for (; node; node = rb_next(node)) {
1971 if (use_bytes_index)
1972 entry = rb_entry(node, struct btrfs_free_space,
1973 bytes_index);
1974 else
1975 entry = rb_entry(node, struct btrfs_free_space,
1976 offset_index);
96303081 1977
59c7b566
JB
1978 /*
1979 * If we are using the bytes index then all subsequent entries
1980 * in this tree are going to be < bytes, so simply set the max
1981 * extent size and exit the loop.
1982 *
1983 * If we're using the offset index then we need to keep going
1984 * through the rest of the tree.
1985 */
a4820398 1986 if (entry->bytes < *bytes) {
ad22cf6e
JB
1987 *max_extent_size = max(get_max_extent_size(entry),
1988 *max_extent_size);
59c7b566
JB
1989 if (use_bytes_index)
1990 break;
96303081 1991 continue;
a4820398 1992 }
96303081 1993
53b381b3
DW
1994 /* make sure the space returned is big enough
1995 * to match our requested alignment
1996 */
1997 if (*bytes >= align) {
a4820398 1998 tmp = entry->offset - ctl->start + align - 1;
47c5713f 1999 tmp = div64_u64(tmp, align);
53b381b3
DW
2000 tmp = tmp * align + ctl->start;
2001 align_off = tmp - entry->offset;
2002 } else {
2003 align_off = 0;
2004 tmp = entry->offset;
2005 }
2006
59c7b566
JB
2007 /*
2008 * We don't break here if we're using the bytes index because we
2009 * may have another entry that has the correct alignment that is
2010 * the right size, so we don't want to miss that possibility.
2011 * At worst this adds another loop through the logic, but if we
2012 * broke here we could prematurely ENOSPC.
2013 */
a4820398 2014 if (entry->bytes < *bytes + align_off) {
ad22cf6e
JB
2015 *max_extent_size = max(get_max_extent_size(entry),
2016 *max_extent_size);
53b381b3 2017 continue;
a4820398 2018 }
53b381b3 2019
96303081 2020 if (entry->bitmap) {
59c7b566 2021 struct rb_node *old_next = rb_next(node);
a4820398
MX
2022 u64 size = *bytes;
2023
0584f718 2024 ret = search_bitmap(ctl, entry, &tmp, &size, true);
53b381b3
DW
2025 if (!ret) {
2026 *offset = tmp;
a4820398 2027 *bytes = size;
96303081 2028 return entry;
ad22cf6e
JB
2029 } else {
2030 *max_extent_size =
2031 max(get_max_extent_size(entry),
2032 *max_extent_size);
53b381b3 2033 }
59c7b566
JB
2034
2035 /*
2036 * The bitmap may have gotten re-arranged in the space
2037 * index here because the max_extent_size may have been
2038 * updated. Start from the beginning again if this
2039 * happened.
2040 */
2041 if (use_bytes_index && old_next != rb_next(node))
2042 goto again;
96303081
JB
2043 continue;
2044 }
2045
53b381b3
DW
2046 *offset = tmp;
2047 *bytes = entry->bytes - align_off;
96303081
JB
2048 return entry;
2049 }
a4820398 2050out:
96303081
JB
2051 return NULL;
2052}
2053
34d52cb6 2054static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
2055 struct btrfs_free_space *info, u64 offset)
2056{
34d52cb6 2057 info->offset = offset_to_bitmap(ctl, offset);
f019f426 2058 info->bytes = 0;
dfb79ddb 2059 info->bitmap_extents = 0;
f2d0f676 2060 INIT_LIST_HEAD(&info->list);
34d52cb6
LZ
2061 link_free_space(ctl, info);
2062 ctl->total_bitmaps++;
fa598b06 2063 recalculate_thresholds(ctl);
96303081
JB
2064}
2065
34d52cb6 2066static void free_bitmap(struct btrfs_free_space_ctl *ctl,
edf6e2d1
LZ
2067 struct btrfs_free_space *bitmap_info)
2068{
27f0afc7
DZ
2069 /*
2070 * Normally when this is called, the bitmap is completely empty. However,
2071 * if we are blowing up the free space cache for one reason or another
2072 * via __btrfs_remove_free_space_cache(), then it may not be freed and
2073 * we may leave stats on the table.
2074 */
2075 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
2076 ctl->discardable_extents[BTRFS_STAT_CURR] -=
2077 bitmap_info->bitmap_extents;
2078 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
2079
2080 }
32e1649b 2081 unlink_free_space(ctl, bitmap_info, true);
3acd4850 2082 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
dc89e982 2083 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
34d52cb6 2084 ctl->total_bitmaps--;
fa598b06 2085 recalculate_thresholds(ctl);
edf6e2d1
LZ
2086}
2087
34d52cb6 2088static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
2089 struct btrfs_free_space *bitmap_info,
2090 u64 *offset, u64 *bytes)
2091{
2092 u64 end;
6606bb97
JB
2093 u64 search_start, search_bytes;
2094 int ret;
96303081
JB
2095
2096again:
34d52cb6 2097 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
96303081 2098
6606bb97 2099 /*
bdb7d303
JB
2100 * We need to search for bits in this bitmap. We could only cover some
2101 * of the extent in this bitmap thanks to how we add space, so we need
 2102 * to search for as much of it as we can and clear that amount, and then
2103 * go searching for the next bit.
6606bb97
JB
2104 */
2105 search_start = *offset;
bdb7d303 2106 search_bytes = ctl->unit;
13dbc089 2107 search_bytes = min(search_bytes, end - search_start + 1);
0584f718
JB
2108 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2109 false);
b50c6e25
JB
2110 if (ret < 0 || search_start != *offset)
2111 return -EINVAL;
6606bb97 2112
bdb7d303
JB
2113 /* We may have found more bits than what we need */
2114 search_bytes = min(search_bytes, *bytes);
2115
2116 /* Cannot clear past the end of the bitmap */
2117 search_bytes = min(search_bytes, end - search_start + 1);
2118
f594f13c 2119 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
bdb7d303
JB
2120 *offset += search_bytes;
2121 *bytes -= search_bytes;
96303081
JB
2122
2123 if (*bytes) {
6606bb97 2124 struct rb_node *next = rb_next(&bitmap_info->offset_index);
edf6e2d1 2125 if (!bitmap_info->bytes)
34d52cb6 2126 free_bitmap(ctl, bitmap_info);
96303081 2127
6606bb97
JB
2128 /*
2129 * no entry after this bitmap, but we still have bytes to
2130 * remove, so something has gone wrong.
2131 */
2132 if (!next)
96303081
JB
2133 return -EINVAL;
2134
6606bb97
JB
2135 bitmap_info = rb_entry(next, struct btrfs_free_space,
2136 offset_index);
2137
2138 /*
2139 * if the next entry isn't a bitmap we need to return to let the
2140 * extent stuff do its work.
2141 */
96303081
JB
2142 if (!bitmap_info->bitmap)
2143 return -EAGAIN;
2144
6606bb97
JB
2145 /*
2146 * Ok the next item is a bitmap, but it may not actually hold
2147 * the information for the rest of this free space stuff, so
2148 * look for it, and if we don't find it return so we can try
2149 * everything over again.
2150 */
2151 search_start = *offset;
bdb7d303 2152 search_bytes = ctl->unit;
34d52cb6 2153 ret = search_bitmap(ctl, bitmap_info, &search_start,
0584f718 2154 &search_bytes, false);
6606bb97
JB
2155 if (ret < 0 || search_start != *offset)
2156 return -EAGAIN;
2157
96303081 2158 goto again;
edf6e2d1 2159 } else if (!bitmap_info->bytes)
34d52cb6 2160 free_bitmap(ctl, bitmap_info);
96303081
JB
2161
2162 return 0;
2163}
2164
2cdc342c
JB
2165static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2166 struct btrfs_free_space *info, u64 offset,
da080fe1 2167 u64 bytes, enum btrfs_trim_state trim_state)
2cdc342c
JB
2168{
2169 u64 bytes_to_set = 0;
2170 u64 end;
2171
da080fe1
DZ
2172 /*
2173 * This is a tradeoff to make bitmap trim state minimal. We mark the
2174 * whole bitmap untrimmed if at any point we add untrimmed regions.
2175 */
dfb79ddb 2176 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
5dc7c10b 2177 if (btrfs_free_space_trimmed(info)) {
dfb79ddb
DZ
2178 ctl->discardable_extents[BTRFS_STAT_CURR] +=
2179 info->bitmap_extents;
5dc7c10b
DZ
2180 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2181 }
da080fe1 2182 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
dfb79ddb 2183 }
da080fe1 2184
2cdc342c
JB
2185 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2186
2187 bytes_to_set = min(end - offset, bytes);
2188
2189 bitmap_set_bits(ctl, info, offset, bytes_to_set);
2190
2191 return bytes_to_set;
2192
2193}
2194
34d52cb6
LZ
2195static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2196 struct btrfs_free_space *info)
96303081 2197{
364be842 2198 struct btrfs_block_group *block_group = ctl->block_group;
0b246afa 2199 struct btrfs_fs_info *fs_info = block_group->fs_info;
d0bd4560
JB
2200 bool forced = false;
2201
2202#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 2203 if (btrfs_should_fragment_free_space(block_group))
d0bd4560
JB
2204 forced = true;
2205#endif
96303081 2206
5d90c5c7
DZ
2207 /* This is a way to reclaim large regions from the bitmaps. */
2208 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2209 return false;
2210
96303081
JB
2211 /*
2212 * If we are below the extents threshold then we can add this as an
2213 * extent, and don't have to deal with the bitmap
2214 */
d0bd4560 2215 if (!forced && ctl->free_extents < ctl->extents_thresh) {
32cb0840
JB
2216 /*
2217 * If this block group has some small extents we don't want to
2218 * use up all of our free slots in the cache with them, we want
01327610 2219 * to reserve them to larger extents, however if we have plenty
32cb0840
JB
2220 * of cache left then go ahead an dadd them, no sense in adding
2221 * the overhead of a bitmap if we don't have to.
2222 */
f9bb615a
DZ
2223 if (info->bytes <= fs_info->sectorsize * 8) {
2224 if (ctl->free_extents * 3 <= ctl->extents_thresh)
34d52cb6 2225 return false;
32cb0840 2226 } else {
34d52cb6 2227 return false;
32cb0840
JB
2228 }
2229 }
96303081
JB
2230
2231 /*
dde5740f
JB
2232 * The original block groups from mkfs can be really small, like 8
2233 * megabytes, so don't bother with a bitmap for those entries. However
2234 * some block groups can be smaller than what a bitmap would cover but
2235 * are still large enough that they could overflow the 32k memory limit,
 2236 * so those block groups are still allowed to have a bitmap
 2237 * entry.
96303081 2238 */
b3470b5d 2239 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
34d52cb6
LZ
2240 return false;
2241
2242 return true;
2243}
2244
20e5506b 2245static const struct btrfs_free_space_op free_space_op = {
2cdc342c
JB
2246 .use_bitmap = use_bitmap,
2247};
2248
34d52cb6
LZ
2249static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2250 struct btrfs_free_space *info)
2251{
2252 struct btrfs_free_space *bitmap_info;
32da5386 2253 struct btrfs_block_group *block_group = NULL;
34d52cb6 2254 int added = 0;
2cdc342c 2255 u64 bytes, offset, bytes_added;
da080fe1 2256 enum btrfs_trim_state trim_state;
34d52cb6 2257 int ret;
96303081
JB
2258
2259 bytes = info->bytes;
2260 offset = info->offset;
da080fe1 2261 trim_state = info->trim_state;
96303081 2262
34d52cb6
LZ
2263 if (!ctl->op->use_bitmap(ctl, info))
2264 return 0;
2265
2cdc342c 2266 if (ctl->op == &free_space_op)
364be842 2267 block_group = ctl->block_group;
38e87880 2268again:
2cdc342c
JB
2269 /*
2270 * Since we link bitmaps right into the cluster we need to see if we
2271 * have a cluster here, and if so and it has our bitmap we need to add
2272 * the free space to that bitmap.
2273 */
2274 if (block_group && !list_empty(&block_group->cluster_list)) {
2275 struct btrfs_free_cluster *cluster;
2276 struct rb_node *node;
2277 struct btrfs_free_space *entry;
2278
2279 cluster = list_entry(block_group->cluster_list.next,
2280 struct btrfs_free_cluster,
2281 block_group_list);
2282 spin_lock(&cluster->lock);
2283 node = rb_first(&cluster->root);
2284 if (!node) {
2285 spin_unlock(&cluster->lock);
38e87880 2286 goto no_cluster_bitmap;
2cdc342c
JB
2287 }
2288
2289 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2290 if (!entry->bitmap) {
2291 spin_unlock(&cluster->lock);
38e87880 2292 goto no_cluster_bitmap;
2cdc342c
JB
2293 }
2294
2295 if (entry->offset == offset_to_bitmap(ctl, offset)) {
da080fe1
DZ
2296 bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2297 bytes, trim_state);
2cdc342c
JB
2298 bytes -= bytes_added;
2299 offset += bytes_added;
2300 }
2301 spin_unlock(&cluster->lock);
2302 if (!bytes) {
2303 ret = 1;
2304 goto out;
2305 }
2306 }
38e87880
CM
2307
2308no_cluster_bitmap:
34d52cb6 2309 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
96303081
JB
2310 1, 0);
2311 if (!bitmap_info) {
b12d6869 2312 ASSERT(added == 0);
96303081
JB
2313 goto new_bitmap;
2314 }
2315
da080fe1
DZ
2316 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2317 trim_state);
2cdc342c
JB
2318 bytes -= bytes_added;
2319 offset += bytes_added;
2320 added = 0;
96303081
JB
2321
2322 if (!bytes) {
2323 ret = 1;
2324 goto out;
2325 } else
2326 goto again;
2327
2328new_bitmap:
2329 if (info && info->bitmap) {
34d52cb6 2330 add_new_bitmap(ctl, info, offset);
96303081
JB
2331 added = 1;
2332 info = NULL;
2333 goto again;
2334 } else {
34d52cb6 2335 spin_unlock(&ctl->tree_lock);
96303081
JB
2336
2337 /* no pre-allocated info, allocate a new one */
2338 if (!info) {
dc89e982
JB
2339 info = kmem_cache_zalloc(btrfs_free_space_cachep,
2340 GFP_NOFS);
96303081 2341 if (!info) {
34d52cb6 2342 spin_lock(&ctl->tree_lock);
96303081
JB
2343 ret = -ENOMEM;
2344 goto out;
2345 }
2346 }
2347
2348 /* allocate the bitmap */
3acd4850
CL
2349 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2350 GFP_NOFS);
da080fe1 2351 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
34d52cb6 2352 spin_lock(&ctl->tree_lock);
96303081
JB
2353 if (!info->bitmap) {
2354 ret = -ENOMEM;
2355 goto out;
2356 }
2357 goto again;
2358 }
2359
2360out:
2361 if (info) {
3acd4850
CL
2362 if (info->bitmap)
2363 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2364 info->bitmap);
dc89e982 2365 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2366 }
0f9dd46c
JB
2367
2368 return ret;
2369}
2370
a7ccb255
DZ
2371/*
2372 * Free space merging rules:
2373 * 1) Merge trimmed areas together
2374 * 2) Let untrimmed areas coalesce with trimmed areas
2375 * 3) Always pull neighboring regions from bitmaps
2376 *
2377 * The above rules are for when we merge free space based on btrfs_trim_state.
2378 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2379 * same reason: to promote larger extent regions which makes life easier for
 2380 * find_free_extent(). Rule 2 enables coalescing because the common path is
 2381 * returning free space from btrfs_finish_extent_commit(). Without rule 2, once
 2382 * free space is trimmed it would prevent aggregating newly freed (untrimmed)
 2383 * regions with it in the rb_tree. Rule 3 is purely to obtain larger extents
2384 * and provide find_free_extent() with the largest extents possible hoping for
2385 * the reuse path.
2386 */
945d8962 2387static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5 2388 struct btrfs_free_space *info, bool update_stat)
0f9dd46c 2389{
bf53d468 2390 struct btrfs_free_space *left_info = NULL;
120d66ee
LZ
2391 struct btrfs_free_space *right_info;
2392 bool merged = false;
2393 u64 offset = info->offset;
2394 u64 bytes = info->bytes;
a7ccb255 2395 const bool is_trimmed = btrfs_free_space_trimmed(info);
6226cb0a 2396
0f9dd46c
JB
2397 /*
2398 * first we want to see if there is free space adjacent to the range we
2399 * are adding, if there is remove that struct and add a new one to
2400 * cover the entire range
2401 */
34d52cb6 2402 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
96303081
JB
2403 if (right_info && rb_prev(&right_info->offset_index))
2404 left_info = rb_entry(rb_prev(&right_info->offset_index),
2405 struct btrfs_free_space, offset_index);
bf53d468 2406 else if (!right_info)
34d52cb6 2407 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
0f9dd46c 2408
a7ccb255
DZ
2409 /* See try_merge_free_space() comment. */
2410 if (right_info && !right_info->bitmap &&
2411 (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
32e1649b 2412 unlink_free_space(ctl, right_info, update_stat);
6226cb0a 2413 info->bytes += right_info->bytes;
dc89e982 2414 kmem_cache_free(btrfs_free_space_cachep, right_info);
120d66ee 2415 merged = true;
0f9dd46c
JB
2416 }
2417
a7ccb255 2418 /* See try_merge_free_space() comment. */
96303081 2419 if (left_info && !left_info->bitmap &&
a7ccb255
DZ
2420 left_info->offset + left_info->bytes == offset &&
2421 (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
32e1649b 2422 unlink_free_space(ctl, left_info, update_stat);
6226cb0a
JB
2423 info->offset = left_info->offset;
2424 info->bytes += left_info->bytes;
dc89e982 2425 kmem_cache_free(btrfs_free_space_cachep, left_info);
120d66ee 2426 merged = true;
0f9dd46c
JB
2427 }
2428
120d66ee
LZ
2429 return merged;
2430}
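/*
 * Editor's note: a worked example (not kernel code) of the merging rules
 * documented above.  Suppose a trimmed extent entry [0, 4K) already sits in
 * the tree and an untrimmed 4K region at offset 4K is being added.  Here
 * is_trimmed is false for the new entry, so the trimmed left neighbour is
 * absorbed (rule 2): the new entry becomes [0, 8K) and keeps its UNTRIMMED
 * state, so a later async discard can re-trim the whole range.  If the new
 * region were instead already trimmed, it would only merge with trimmed
 * neighbours (rule 1) and leave untrimmed ones alone.
 */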
2431
20005523
FM
2432static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2433 struct btrfs_free_space *info,
2434 bool update_stat)
2435{
2436 struct btrfs_free_space *bitmap;
2437 unsigned long i;
2438 unsigned long j;
2439 const u64 end = info->offset + info->bytes;
2440 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2441 u64 bytes;
2442
2443 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2444 if (!bitmap)
2445 return false;
2446
2447 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2448 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2449 if (j == i)
2450 return false;
2451 bytes = (j - i) * ctl->unit;
2452 info->bytes += bytes;
2453
a7ccb255
DZ
2454 /* See try_merge_free_space() comment. */
2455 if (!btrfs_free_space_trimmed(bitmap))
2456 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2457
f594f13c 2458 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
20005523
FM
2459
2460 if (!bitmap->bytes)
2461 free_bitmap(ctl, bitmap);
2462
2463 return true;
2464}
2465
2466static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2467 struct btrfs_free_space *info,
2468 bool update_stat)
2469{
2470 struct btrfs_free_space *bitmap;
2471 u64 bitmap_offset;
2472 unsigned long i;
2473 unsigned long j;
2474 unsigned long prev_j;
2475 u64 bytes;
2476
2477 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2478 /* If we're on a boundary, try the previous logical bitmap. */
2479 if (bitmap_offset == info->offset) {
2480 if (info->offset == 0)
2481 return false;
2482 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2483 }
2484
2485 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2486 if (!bitmap)
2487 return false;
2488
2489 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2490 j = 0;
2491 prev_j = (unsigned long)-1;
2492 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2493 if (j > i)
2494 break;
2495 prev_j = j;
2496 }
2497 if (prev_j == i)
2498 return false;
2499
2500 if (prev_j == (unsigned long)-1)
2501 bytes = (i + 1) * ctl->unit;
2502 else
2503 bytes = (i - prev_j) * ctl->unit;
2504
2505 info->offset -= bytes;
2506 info->bytes += bytes;
2507
a7ccb255
DZ
2508 /* See try_merge_free_space() comment. */
2509 if (!btrfs_free_space_trimmed(bitmap))
2510 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2511
f594f13c 2512 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
20005523
FM
2513
2514 if (!bitmap->bytes)
2515 free_bitmap(ctl, bitmap);
2516
2517 return true;
2518}
2519
2520/*
2521 * We prefer always to allocate from extent entries, both for clustered and
2522 * non-clustered allocation requests. So when attempting to add a new extent
2523 * entry, try to see if there's adjacent free space in bitmap entries, and if
2524 * there is, migrate that space from the bitmaps to the extent.
2525 * Like this we get better chances of satisfying space allocation requests
2526 * because we attempt to satisfy them based on a single cache entry, and never
2527 * on 2 or more entries - even if the entries represent a contiguous free space
2528 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2529 * ends).
2530 */
2531static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2532 struct btrfs_free_space *info,
2533 bool update_stat)
2534{
2535 /*
2536 * Only work with disconnected entries, as we can change their offset,
2537 * and must be extent entries.
2538 */
2539 ASSERT(!info->bitmap);
2540 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2541
2542 if (ctl->total_bitmaps > 0) {
2543 bool stole_end;
2544 bool stole_front = false;
2545
2546 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2547 if (ctl->total_bitmaps > 0)
2548 stole_front = steal_from_bitmap_to_front(ctl, info,
2549 update_stat);
2550
2551 if (stole_end || stole_front)
2552 try_merge_free_space(ctl, info, update_stat);
2553 }
2554}
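/*
 * Editor's note: an illustrative scenario (not kernel code) for the helpers
 * above.  If the bits immediately after a new extent entry's end are set in
 * the bitmap covering that address, steal_from_bitmap_to_end() clears them
 * and grows the extent entry over them; steal_from_bitmap_to_front() does
 * the same backwards, over set bits that end right at the entry's start.
 * Either way the space migrates from bitmap granularity into a single
 * extent entry, and a bitmap left empty by the theft is freed via
 * free_bitmap().
 */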
2555
290ef19a 2556int __btrfs_add_free_space(struct btrfs_block_group *block_group,
a7ccb255
DZ
2557 u64 offset, u64 bytes,
2558 enum btrfs_trim_state trim_state)
120d66ee 2559{
290ef19a
NB
2560 struct btrfs_fs_info *fs_info = block_group->fs_info;
2561 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
120d66ee
LZ
2562 struct btrfs_free_space *info;
2563 int ret = 0;
7fe6d45e 2564 u64 filter_bytes = bytes;
120d66ee 2565
169e0da9
NA
2566 ASSERT(!btrfs_is_zoned(fs_info));
2567
dc89e982 2568 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
120d66ee
LZ
2569 if (!info)
2570 return -ENOMEM;
2571
2572 info->offset = offset;
2573 info->bytes = bytes;
a7ccb255 2574 info->trim_state = trim_state;
20005523 2575 RB_CLEAR_NODE(&info->offset_index);
59c7b566 2576 RB_CLEAR_NODE(&info->bytes_index);
120d66ee 2577
34d52cb6 2578 spin_lock(&ctl->tree_lock);
120d66ee 2579
34d52cb6 2580 if (try_merge_free_space(ctl, info, true))
120d66ee
LZ
2581 goto link;
2582
2583 /*
2584 * There was no extent directly to the left or right of this new
2585 * extent then we know we're going to have to allocate a new extent, so
2586 * before we do that see if we need to drop this into a bitmap
2587 */
34d52cb6 2588 ret = insert_into_bitmap(ctl, info);
120d66ee
LZ
2589 if (ret < 0) {
2590 goto out;
2591 } else if (ret) {
2592 ret = 0;
2593 goto out;
2594 }
2595link:
20005523
FM
2596 /*
2597 * Only steal free space from adjacent bitmaps if we're sure we're not
2598 * going to add the new free space to existing bitmap entries - because
2599 * that would mean unnecessary work that would be reverted. Therefore
2600 * attempt to steal space from bitmaps if we're adding an extent entry.
2601 */
2602 steal_from_bitmap(ctl, info, true);
2603
7fe6d45e
DZ
2604 filter_bytes = max(filter_bytes, info->bytes);
2605
34d52cb6 2606 ret = link_free_space(ctl, info);
0f9dd46c 2607 if (ret)
dc89e982 2608 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2609out:
66b53bae 2610 btrfs_discard_update_discardable(block_group);
34d52cb6 2611 spin_unlock(&ctl->tree_lock);
6226cb0a 2612
0f9dd46c 2613 if (ret) {
ab8d0fc4 2614 btrfs_crit(fs_info, "unable to add free space :%d", ret);
b12d6869 2615 ASSERT(ret != -EEXIST);
0f9dd46c
JB
2616 }
2617
7fe6d45e
DZ
2618 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2619 btrfs_discard_check_filter(block_group, filter_bytes);
b0643e59 2620 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
7fe6d45e 2621 }
b0643e59 2622
0f9dd46c
JB
2623 return ret;
2624}
2625
169e0da9
NA
2626static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2627 u64 bytenr, u64 size, bool used)
2628{
18bb8bbf 2629 struct btrfs_fs_info *fs_info = block_group->fs_info;
169e0da9
NA
2630 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2631 u64 offset = bytenr - block_group->start;
2632 u64 to_free, to_unusable;
77233c2d 2633 const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
98173255 2634 bool initial = (size == block_group->length);
d8da0e85 2635 u64 reclaimable_unusable;
98173255
NA
2636
2637 WARN_ON(!initial && offset + size > block_group->zone_capacity);
169e0da9
NA
2638
2639 spin_lock(&ctl->tree_lock);
2640 if (!used)
2641 to_free = size;
98173255
NA
2642 else if (initial)
2643 to_free = block_group->zone_capacity;
169e0da9
NA
2644 else if (offset >= block_group->alloc_offset)
2645 to_free = size;
2646 else if (offset + size <= block_group->alloc_offset)
2647 to_free = 0;
2648 else
2649 to_free = offset + size - block_group->alloc_offset;
2650 to_unusable = size - to_free;
2651
2652 ctl->free_space += to_free;
badae9c8
NA
2653 /*
2654 * If the block group is read-only, we should account freed space into
2655 * bytes_readonly.
2656 */
2657 if (!block_group->ro)
2658 block_group->zone_unusable += to_unusable;
169e0da9
NA
2659 spin_unlock(&ctl->tree_lock);
2660 if (!used) {
2661 spin_lock(&block_group->lock);
2662 block_group->alloc_offset -= size;
2663 spin_unlock(&block_group->lock);
2664 }
2665
d8da0e85
NA
2666 reclaimable_unusable = block_group->zone_unusable -
2667 (block_group->length - block_group->zone_capacity);
169e0da9 2668 /* All the region is now unusable. Mark it as unused and reclaim */
18bb8bbf 2669 if (block_group->zone_unusable == block_group->length) {
169e0da9 2670 btrfs_mark_bg_unused(block_group);
77233c2d 2671 } else if (bg_reclaim_threshold &&
d8da0e85
NA
2672 reclaimable_unusable >=
2673 div_factor_fine(block_group->zone_capacity,
2674 bg_reclaim_threshold)) {
18bb8bbf
JT
2675 btrfs_mark_bg_to_reclaim(block_group);
2676 }
169e0da9
NA
2677
2678 return 0;
2679}
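/*
 * Editor's note: a numeric example (not kernel code) of the accounting
 * above, using hypothetical values.  Take a zoned block group whose
 * alloc_offset is 100M and assume a 20M range starting at offset 90M is
 * added back with @used set: the 10M beyond the allocation pointer
 * (100M..110M) is counted as free space, while the 10M below it
 * (90M..100M) was already written and becomes zone_unusable, since a
 * sequential zone cannot be rewritten until it is reset.
 */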
2680
32da5386 2681int btrfs_add_free_space(struct btrfs_block_group *block_group,
478b4d9f
JB
2682 u64 bytenr, u64 size)
2683{
a7ccb255
DZ
2684 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2685
169e0da9
NA
2686 if (btrfs_is_zoned(block_group->fs_info))
2687 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2688 true);
2689
a7ccb255
DZ
2690 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2691 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2692
290ef19a 2693 return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
478b4d9f
JB
2694}
2695
169e0da9
NA
2696int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2697 u64 bytenr, u64 size)
2698{
2699 if (btrfs_is_zoned(block_group->fs_info))
2700 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2701 false);
2702
2703 return btrfs_add_free_space(block_group, bytenr, size);
2704}
2705
b0643e59
DZ
2706/*
2707 * This is a subtle distinction because when adding free space back in general,
 2708 * we want it to be added as untrimmed for async discard. But in the case where
 2709 * we add it on loading of a block group, we want to consider it trimmed.
2710 */
2711int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2712 u64 bytenr, u64 size)
2713{
2714 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2715
169e0da9
NA
2716 if (btrfs_is_zoned(block_group->fs_info))
2717 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2718 true);
2719
b0643e59
DZ
2720 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2721 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2722 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2723
290ef19a 2724 return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
b0643e59
DZ
2725}
2726
32da5386 2727int btrfs_remove_free_space(struct btrfs_block_group *block_group,
6226cb0a 2728 u64 offset, u64 bytes)
0f9dd46c 2729{
34d52cb6 2730 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c 2731 struct btrfs_free_space *info;
b0175117
JB
2732 int ret;
2733 bool re_search = false;
0f9dd46c 2734
011b41bf
NA
2735 if (btrfs_is_zoned(block_group->fs_info)) {
2736 /*
 2737 * This can happen with conventional zones when replaying the log.
 2738 * Since the allocation info of tree-log nodes is not recorded
 2739 * in the extent tree, calculate_alloc_pointer() fails to
 2740 * advance the allocation pointer past the last allocated tree-log
 2741 * node blocks.
2742 *
2743 * This function is called from
2744 * btrfs_pin_extent_for_log_replay() when replaying the log.
2745 * Advance the pointer not to overwrite the tree-log nodes.
2746 */
0ae79c6f
NA
2747 if (block_group->start + block_group->alloc_offset <
2748 offset + bytes) {
2749 block_group->alloc_offset =
2750 offset + bytes - block_group->start;
2751 }
169e0da9 2752 return 0;
011b41bf 2753 }
169e0da9 2754
34d52cb6 2755 spin_lock(&ctl->tree_lock);
6226cb0a 2756
96303081 2757again:
b0175117 2758 ret = 0;
bdb7d303
JB
2759 if (!bytes)
2760 goto out_lock;
2761
34d52cb6 2762 info = tree_search_offset(ctl, offset, 0, 0);
96303081 2763 if (!info) {
6606bb97
JB
2764 /*
2765 * oops didn't find an extent that matched the space we wanted
2766 * to remove, look for a bitmap instead
2767 */
34d52cb6 2768 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
6606bb97
JB
2769 1, 0);
2770 if (!info) {
b0175117
JB
2771 /*
2772 * If we found a partial bit of our free space in a
2773 * bitmap but then couldn't find the other part this may
2774 * be a problem, so WARN about it.
24a70313 2775 */
b0175117 2776 WARN_ON(re_search);
6606bb97
JB
2777 goto out_lock;
2778 }
96303081
JB
2779 }
2780
b0175117 2781 re_search = false;
bdb7d303 2782 if (!info->bitmap) {
32e1649b 2783 unlink_free_space(ctl, info, true);
bdb7d303
JB
2784 if (offset == info->offset) {
2785 u64 to_free = min(bytes, info->bytes);
2786
2787 info->bytes -= to_free;
2788 info->offset += to_free;
2789 if (info->bytes) {
2790 ret = link_free_space(ctl, info);
2791 WARN_ON(ret);
2792 } else {
2793 kmem_cache_free(btrfs_free_space_cachep, info);
2794 }
0f9dd46c 2795
bdb7d303
JB
2796 offset += to_free;
2797 bytes -= to_free;
2798 goto again;
2799 } else {
2800 u64 old_end = info->bytes + info->offset;
9b49c9b9 2801
bdb7d303 2802 info->bytes = offset - info->offset;
34d52cb6 2803 ret = link_free_space(ctl, info);
96303081
JB
2804 WARN_ON(ret);
2805 if (ret)
2806 goto out_lock;
96303081 2807
bdb7d303
JB
2808 /* Not enough bytes in this entry to satisfy us */
2809 if (old_end < offset + bytes) {
2810 bytes -= old_end - offset;
2811 offset = old_end;
2812 goto again;
2813 } else if (old_end == offset + bytes) {
2814 /* all done */
2815 goto out_lock;
2816 }
2817 spin_unlock(&ctl->tree_lock);
2818
290ef19a 2819 ret = __btrfs_add_free_space(block_group,
a7ccb255
DZ
2820 offset + bytes,
2821 old_end - (offset + bytes),
2822 info->trim_state);
bdb7d303
JB
2823 WARN_ON(ret);
2824 goto out;
2825 }
0f9dd46c 2826 }
96303081 2827
34d52cb6 2828 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
b0175117
JB
2829 if (ret == -EAGAIN) {
2830 re_search = true;
96303081 2831 goto again;
b0175117 2832 }
96303081 2833out_lock:
66b53bae 2834 btrfs_discard_update_discardable(block_group);
34d52cb6 2835 spin_unlock(&ctl->tree_lock);
0f9dd46c 2836out:
25179201
JB
2837 return ret;
2838}
2839
32da5386 2840void btrfs_dump_free_space(struct btrfs_block_group *block_group,
0f9dd46c
JB
2841 u64 bytes)
2842{
0b246afa 2843 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 2844 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c
JB
2845 struct btrfs_free_space *info;
2846 struct rb_node *n;
2847 int count = 0;
2848
169e0da9
NA
2849 /*
 2850 * Zoned btrfs does not use the free space tree or clusters. Just print
2851 * out the free space after the allocation offset.
2852 */
2853 if (btrfs_is_zoned(fs_info)) {
afba2bc0
NA
2854 btrfs_info(fs_info, "free space %llu active %d",
2855 block_group->zone_capacity - block_group->alloc_offset,
2856 block_group->zone_is_active);
169e0da9
NA
2857 return;
2858 }
2859
9084cb6a 2860 spin_lock(&ctl->tree_lock);
34d52cb6 2861 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
0f9dd46c 2862 info = rb_entry(n, struct btrfs_free_space, offset_index);
f6175efa 2863 if (info->bytes >= bytes && !block_group->ro)
0f9dd46c 2864 count++;
0b246afa 2865 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
efe120a0 2866 info->offset, info->bytes,
96303081 2867 (info->bitmap) ? "yes" : "no");
0f9dd46c 2868 }
9084cb6a 2869 spin_unlock(&ctl->tree_lock);
0b246afa 2870 btrfs_info(fs_info, "block group has cluster?: %s",
96303081 2871 list_empty(&block_group->cluster_list) ? "no" : "yes");
0b246afa 2872 btrfs_info(fs_info,
efe120a0 2873 "%d blocks of free space at or bigger than bytes is", count);
0f9dd46c
JB
2874}
2875
cd79909b
JB
2876void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2877 struct btrfs_free_space_ctl *ctl)
0f9dd46c 2878{
0b246afa 2879 struct btrfs_fs_info *fs_info = block_group->fs_info;
0f9dd46c 2880
34d52cb6 2881 spin_lock_init(&ctl->tree_lock);
0b246afa 2882 ctl->unit = fs_info->sectorsize;
b3470b5d 2883 ctl->start = block_group->start;
364be842 2884 ctl->block_group = block_group;
34d52cb6 2885 ctl->op = &free_space_op;
59c7b566 2886 ctl->free_space_bytes = RB_ROOT_CACHED;
55507ce3
FM
2887 INIT_LIST_HEAD(&ctl->trimming_ranges);
2888 mutex_init(&ctl->cache_writeout_mutex);
0f9dd46c 2889
34d52cb6
LZ
2890 /*
2891 * we only want to have 32k of ram per block group for keeping
2892 * track of free space, and if we pass 1/2 of that we want to
2893 * start converting things over to using bitmaps
2894 */
ee22184b 2895 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
0f9dd46c
JB
2896}
2897
fa9c0d79
CM
2898/*
2899 * for a given cluster, put all of its extents back into the free
2900 * space cache. If the block group passed doesn't match the block group
2901 * pointed to by the cluster, someone else raced in and freed the
2902 * cluster already. In that case, we just return without changing anything
2903 */
69b0e093 2904static void __btrfs_return_cluster_to_free_space(
32da5386 2905 struct btrfs_block_group *block_group,
fa9c0d79
CM
2906 struct btrfs_free_cluster *cluster)
2907{
34d52cb6 2908 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79
CM
2909 struct btrfs_free_space *entry;
2910 struct rb_node *node;
2911
2912 spin_lock(&cluster->lock);
95c85fba
JB
2913 if (cluster->block_group != block_group) {
2914 spin_unlock(&cluster->lock);
2915 return;
2916 }
fa9c0d79 2917
96303081 2918 cluster->block_group = NULL;
fa9c0d79 2919 cluster->window_start = 0;
96303081 2920 list_del_init(&cluster->block_group_list);
96303081 2921
fa9c0d79 2922 node = rb_first(&cluster->root);
96303081 2923 while (node) {
4e69b598
JB
2924 bool bitmap;
2925
fa9c0d79
CM
2926 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2927 node = rb_next(&entry->offset_index);
2928 rb_erase(&entry->offset_index, &cluster->root);
20005523 2929 RB_CLEAR_NODE(&entry->offset_index);
4e69b598
JB
2930
2931 bitmap = (entry->bitmap != NULL);
20005523 2932 if (!bitmap) {
dfb79ddb 2933 /* Merging treats extents as if they were new */
5dc7c10b 2934 if (!btrfs_free_space_trimmed(entry)) {
dfb79ddb 2935 ctl->discardable_extents[BTRFS_STAT_CURR]--;
5dc7c10b
DZ
2936 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2937 entry->bytes;
2938 }
dfb79ddb 2939
34d52cb6 2940 try_merge_free_space(ctl, entry, false);
20005523 2941 steal_from_bitmap(ctl, entry, false);
dfb79ddb
DZ
2942
2943 /* As we insert directly, update these statistics */
5dc7c10b 2944 if (!btrfs_free_space_trimmed(entry)) {
dfb79ddb 2945 ctl->discardable_extents[BTRFS_STAT_CURR]++;
5dc7c10b
DZ
2946 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2947 entry->bytes;
2948 }
20005523 2949 }
34d52cb6 2950 tree_insert_offset(&ctl->free_space_offset,
4e69b598 2951 entry->offset, &entry->offset_index, bitmap);
59c7b566
JB
2952 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
2953 entry_less);
fa9c0d79 2954 }
6bef4d31 2955 cluster->root = RB_ROOT;
fa9c0d79 2956 spin_unlock(&cluster->lock);
96303081 2957 btrfs_put_block_group(block_group);
fa9c0d79
CM
2958}
2959
48a3b636
ES
2960static void __btrfs_remove_free_space_cache_locked(
2961 struct btrfs_free_space_ctl *ctl)
0f9dd46c
JB
2962{
2963 struct btrfs_free_space *info;
2964 struct rb_node *node;
581bb050 2965
581bb050
LZ
2966 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2967 info = rb_entry(node, struct btrfs_free_space, offset_index);
9b90f513 2968 if (!info->bitmap) {
32e1649b 2969 unlink_free_space(ctl, info, true);
9b90f513
JB
2970 kmem_cache_free(btrfs_free_space_cachep, info);
2971 } else {
2972 free_bitmap(ctl, info);
2973 }
351810c1
DS
2974
2975 cond_resched_lock(&ctl->tree_lock);
581bb050 2976 }
09655373
CM
2977}
2978
2979void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2980{
2981 spin_lock(&ctl->tree_lock);
2982 __btrfs_remove_free_space_cache_locked(ctl);
364be842
NB
2983 if (ctl->block_group)
2984 btrfs_discard_update_discardable(ctl->block_group);
581bb050
LZ
2985 spin_unlock(&ctl->tree_lock);
2986}
2987
32da5386 2988void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
581bb050
LZ
2989{
2990 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79 2991 struct btrfs_free_cluster *cluster;
96303081 2992 struct list_head *head;
0f9dd46c 2993
34d52cb6 2994 spin_lock(&ctl->tree_lock);
96303081
JB
2995 while ((head = block_group->cluster_list.next) !=
2996 &block_group->cluster_list) {
2997 cluster = list_entry(head, struct btrfs_free_cluster,
2998 block_group_list);
fa9c0d79
CM
2999
3000 WARN_ON(cluster->block_group != block_group);
3001 __btrfs_return_cluster_to_free_space(block_group, cluster);
351810c1
DS
3002
3003 cond_resched_lock(&ctl->tree_lock);
fa9c0d79 3004 }
09655373 3005 __btrfs_remove_free_space_cache_locked(ctl);
66b53bae 3006 btrfs_discard_update_discardable(block_group);
34d52cb6 3007 spin_unlock(&ctl->tree_lock);
fa9c0d79 3008
0f9dd46c
JB
3009}
3010
6e80d4f8
DZ
3011/**
3012 * btrfs_is_free_space_trimmed - see if everything is trimmed
3013 * @block_group: block_group of interest
3014 *
3015 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
3016 */
3017bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
3018{
3019 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3020 struct btrfs_free_space *info;
3021 struct rb_node *node;
3022 bool ret = true;
3023
3024 spin_lock(&ctl->tree_lock);
3025 node = rb_first(&ctl->free_space_offset);
3026
3027 while (node) {
3028 info = rb_entry(node, struct btrfs_free_space, offset_index);
3029
3030 if (!btrfs_free_space_trimmed(info)) {
3031 ret = false;
3032 break;
3033 }
3034
3035 node = rb_next(node);
3036 }
3037
3038 spin_unlock(&ctl->tree_lock);
3039 return ret;
3040}
3041
32da5386 3042u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
a4820398
MX
3043 u64 offset, u64 bytes, u64 empty_size,
3044 u64 *max_extent_size)
0f9dd46c 3045{
34d52cb6 3046 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
9ddf648f
DZ
3047 struct btrfs_discard_ctl *discard_ctl =
3048 &block_group->fs_info->discard_ctl;
6226cb0a 3049 struct btrfs_free_space *entry = NULL;
96303081 3050 u64 bytes_search = bytes + empty_size;
6226cb0a 3051 u64 ret = 0;
53b381b3
DW
3052 u64 align_gap = 0;
3053 u64 align_gap_len = 0;
a7ccb255 3054 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
59c7b566 3055 bool use_bytes_index = (offset == block_group->start);
0f9dd46c 3056
2eda5708
NA
3057 ASSERT(!btrfs_is_zoned(block_group->fs_info));
3058
34d52cb6 3059 spin_lock(&ctl->tree_lock);
53b381b3 3060 entry = find_free_space(ctl, &offset, &bytes_search,
59c7b566
JB
3061 block_group->full_stripe_len, max_extent_size,
3062 use_bytes_index);
6226cb0a 3063 if (!entry)
96303081
JB
3064 goto out;
3065
3066 ret = offset;
3067 if (entry->bitmap) {
f594f13c 3068 bitmap_clear_bits(ctl, entry, offset, bytes, true);
9ddf648f
DZ
3069
3070 if (!btrfs_free_space_trimmed(entry))
3071 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3072
edf6e2d1 3073 if (!entry->bytes)
34d52cb6 3074 free_bitmap(ctl, entry);
96303081 3075 } else {
32e1649b 3076 unlink_free_space(ctl, entry, true);
53b381b3
DW
3077 align_gap_len = offset - entry->offset;
3078 align_gap = entry->offset;
a7ccb255 3079 align_gap_trim_state = entry->trim_state;
53b381b3 3080
9ddf648f
DZ
3081 if (!btrfs_free_space_trimmed(entry))
3082 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3083
53b381b3
DW
3084 entry->offset = offset + bytes;
3085 WARN_ON(entry->bytes < bytes + align_gap_len);
3086
3087 entry->bytes -= bytes + align_gap_len;
6226cb0a 3088 if (!entry->bytes)
dc89e982 3089 kmem_cache_free(btrfs_free_space_cachep, entry);
6226cb0a 3090 else
34d52cb6 3091 link_free_space(ctl, entry);
6226cb0a 3092 }
96303081 3093out:
66b53bae 3094 btrfs_discard_update_discardable(block_group);
34d52cb6 3095 spin_unlock(&ctl->tree_lock);
817d52f8 3096
53b381b3 3097 if (align_gap_len)
290ef19a 3098 __btrfs_add_free_space(block_group, align_gap, align_gap_len,
a7ccb255 3099 align_gap_trim_state);
0f9dd46c
JB
3100 return ret;
3101}
fa9c0d79
CM
3102
3103/*
3104 * given a cluster, put all of its extents back into the free space
3105 * cache. If a block group is passed, this function will only free
3106 * a cluster that belongs to the passed block group.
3107 *
3108 * Otherwise, it'll get a reference on the block group pointed to by the
3109 * cluster and remove the cluster from it.
3110 */
69b0e093 3111void btrfs_return_cluster_to_free_space(
32da5386 3112 struct btrfs_block_group *block_group,
fa9c0d79
CM
3113 struct btrfs_free_cluster *cluster)
3114{
34d52cb6 3115 struct btrfs_free_space_ctl *ctl;
fa9c0d79
CM
3116
3117 /* first, get a safe pointer to the block group */
3118 spin_lock(&cluster->lock);
3119 if (!block_group) {
3120 block_group = cluster->block_group;
3121 if (!block_group) {
3122 spin_unlock(&cluster->lock);
69b0e093 3123 return;
fa9c0d79
CM
3124 }
3125 } else if (cluster->block_group != block_group) {
3126 /* someone else has already freed it don't redo their work */
3127 spin_unlock(&cluster->lock);
69b0e093 3128 return;
fa9c0d79 3129 }
b5790d51 3130 btrfs_get_block_group(block_group);
fa9c0d79
CM
3131 spin_unlock(&cluster->lock);
3132
34d52cb6
LZ
3133 ctl = block_group->free_space_ctl;
3134
fa9c0d79 3135 /* now return any extents the cluster had on it */
34d52cb6 3136 spin_lock(&ctl->tree_lock);
69b0e093 3137 __btrfs_return_cluster_to_free_space(block_group, cluster);
34d52cb6 3138 spin_unlock(&ctl->tree_lock);
fa9c0d79 3139
6e80d4f8
DZ
3140 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3141
fa9c0d79
CM
3142 /* finally drop our ref */
3143 btrfs_put_block_group(block_group);
fa9c0d79
CM
3144}
3145
32da5386 3146static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
96303081 3147 struct btrfs_free_cluster *cluster,
4e69b598 3148 struct btrfs_free_space *entry,
a4820398
MX
3149 u64 bytes, u64 min_start,
3150 u64 *max_extent_size)
96303081 3151{
34d52cb6 3152 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
3153 int err;
3154 u64 search_start = cluster->window_start;
3155 u64 search_bytes = bytes;
3156 u64 ret = 0;
3157
96303081
JB
3158 search_start = min_start;
3159 search_bytes = bytes;
3160
0584f718 3161 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
a4820398 3162 if (err) {
ad22cf6e
JB
3163 *max_extent_size = max(get_max_extent_size(entry),
3164 *max_extent_size);
4e69b598 3165 return 0;
a4820398 3166 }
96303081
JB
3167
3168 ret = search_start;
f594f13c 3169 bitmap_clear_bits(ctl, entry, ret, bytes, false);
96303081
JB
3170
3171 return ret;
3172}
3173
fa9c0d79
CM
3174/*
3175 * given a cluster, try to allocate 'bytes' from it, returns 0
3176 * if it couldn't find anything suitably large, or a logical disk offset
3177 * if things worked out
3178 */
32da5386 3179u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
fa9c0d79 3180 struct btrfs_free_cluster *cluster, u64 bytes,
a4820398 3181 u64 min_start, u64 *max_extent_size)
fa9c0d79 3182{
34d52cb6 3183 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
9ddf648f
DZ
3184 struct btrfs_discard_ctl *discard_ctl =
3185 &block_group->fs_info->discard_ctl;
fa9c0d79
CM
3186 struct btrfs_free_space *entry = NULL;
3187 struct rb_node *node;
3188 u64 ret = 0;
3189
2eda5708
NA
3190 ASSERT(!btrfs_is_zoned(block_group->fs_info));
3191
fa9c0d79
CM
3192 spin_lock(&cluster->lock);
3193 if (bytes > cluster->max_size)
3194 goto out;
3195
3196 if (cluster->block_group != block_group)
3197 goto out;
3198
3199 node = rb_first(&cluster->root);
3200 if (!node)
3201 goto out;
3202
3203 entry = rb_entry(node, struct btrfs_free_space, offset_index);
67871254 3204 while (1) {
ad22cf6e
JB
3205 if (entry->bytes < bytes)
3206 *max_extent_size = max(get_max_extent_size(entry),
3207 *max_extent_size);
a4820398 3208
4e69b598
JB
3209 if (entry->bytes < bytes ||
3210 (!entry->bitmap && entry->offset < min_start)) {
fa9c0d79
CM
3211 node = rb_next(&entry->offset_index);
3212 if (!node)
3213 break;
3214 entry = rb_entry(node, struct btrfs_free_space,
3215 offset_index);
3216 continue;
3217 }
fa9c0d79 3218
4e69b598
JB
3219 if (entry->bitmap) {
3220 ret = btrfs_alloc_from_bitmap(block_group,
3221 cluster, entry, bytes,
a4820398
MX
3222 cluster->window_start,
3223 max_extent_size);
4e69b598 3224 if (ret == 0) {
4e69b598
JB
3225 node = rb_next(&entry->offset_index);
3226 if (!node)
3227 break;
3228 entry = rb_entry(node, struct btrfs_free_space,
3229 offset_index);
3230 continue;
3231 }
9b230628 3232 cluster->window_start += bytes;
4e69b598 3233 } else {
4e69b598
JB
3234 ret = entry->offset;
3235
3236 entry->offset += bytes;
3237 entry->bytes -= bytes;
3238 }
fa9c0d79 3239
fa9c0d79
CM
3240 break;
3241 }
3242out:
3243 spin_unlock(&cluster->lock);
96303081 3244
5e71b5d5
LZ
3245 if (!ret)
3246 return 0;
3247
34d52cb6 3248 spin_lock(&ctl->tree_lock);
5e71b5d5 3249
9ddf648f
DZ
3250 if (!btrfs_free_space_trimmed(entry))
3251 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3252
34d52cb6 3253 ctl->free_space -= bytes;
5dc7c10b
DZ
3254 if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3255 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3c179165
NB
3256
3257 spin_lock(&cluster->lock);
5e71b5d5 3258 if (entry->bytes == 0) {
3c179165 3259 rb_erase(&entry->offset_index, &cluster->root);
34d52cb6 3260 ctl->free_extents--;
4e69b598 3261 if (entry->bitmap) {
3acd4850
CL
3262 kmem_cache_free(btrfs_free_space_bitmap_cachep,
3263 entry->bitmap);
34d52cb6 3264 ctl->total_bitmaps--;
fa598b06 3265 recalculate_thresholds(ctl);
dfb79ddb
DZ
3266 } else if (!btrfs_free_space_trimmed(entry)) {
3267 ctl->discardable_extents[BTRFS_STAT_CURR]--;
4e69b598 3268 }
dc89e982 3269 kmem_cache_free(btrfs_free_space_cachep, entry);
5e71b5d5
LZ
3270 }
3271
3c179165 3272 spin_unlock(&cluster->lock);
34d52cb6 3273 spin_unlock(&ctl->tree_lock);
5e71b5d5 3274
fa9c0d79
CM
3275 return ret;
3276}
3277
32da5386 3278static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
96303081
JB
3279 struct btrfs_free_space *entry,
3280 struct btrfs_free_cluster *cluster,
1bb91902
AO
3281 u64 offset, u64 bytes,
3282 u64 cont1_bytes, u64 min_bytes)
96303081 3283{
34d52cb6 3284 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
3285 unsigned long next_zero;
3286 unsigned long i;
1bb91902
AO
3287 unsigned long want_bits;
3288 unsigned long min_bits;
96303081 3289 unsigned long found_bits;
cef40483 3290 unsigned long max_bits = 0;
96303081
JB
3291 unsigned long start = 0;
3292 unsigned long total_found = 0;
4e69b598 3293 int ret;
96303081 3294
96009762 3295 i = offset_to_bit(entry->offset, ctl->unit,
96303081 3296 max_t(u64, offset, entry->offset));
96009762
WSH
3297 want_bits = bytes_to_bits(bytes, ctl->unit);
3298 min_bits = bytes_to_bits(min_bytes, ctl->unit);
96303081 3299
cef40483
JB
3300 /*
3301 * Don't bother looking for a cluster in this bitmap if it's heavily
3302 * fragmented.
3303 */
3304 if (entry->max_extent_size &&
3305 entry->max_extent_size < cont1_bytes)
3306 return -ENOSPC;
96303081
JB
3307again:
3308 found_bits = 0;
ebb3dad4 3309 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
96303081
JB
3310 next_zero = find_next_zero_bit(entry->bitmap,
3311 BITS_PER_BITMAP, i);
1bb91902 3312 if (next_zero - i >= min_bits) {
96303081 3313 found_bits = next_zero - i;
3314 if (found_bits > max_bits)
3315 max_bits = found_bits;
3316 break;
3317 }
3318 if (next_zero - i > max_bits)
3319 max_bits = next_zero - i;
3320 i = next_zero;
3321 }
3322
3323 if (!found_bits) {
3324 entry->max_extent_size = (u64)max_bits * ctl->unit;
4e69b598 3325 return -ENOSPC;
cef40483 3326 }
96303081 3327
1bb91902 3328 if (!total_found) {
96303081 3329 start = i;
b78d09bc 3330 cluster->max_size = 0;
3331 }
3332
3333 total_found += found_bits;
3334
3335 if (cluster->max_size < found_bits * ctl->unit)
3336 cluster->max_size = found_bits * ctl->unit;
96303081 3337
3338 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3339 i = next_zero + 1;
3340 goto again;
3341 }
3342
96009762 3343 cluster->window_start = start * ctl->unit + entry->offset;
34d52cb6 3344 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3345 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3346
3347 /*
3348 * We need to know if we're currently on the normal space index when we
3349 * manipulate the bitmap so that we know we need to remove and re-insert
3350 * it into the space_index tree. Clear the bytes_index node here so the
3351 * bitmap manipulation helpers know not to mess with the space_index
3352 * until this bitmap entry is added back into the normal cache.
3353 */
3354 RB_CLEAR_NODE(&entry->bytes_index);
3355
3356 ret = tree_insert_offset(&cluster->root, entry->offset,
3357 &entry->offset_index, 1);
b12d6869 3358 ASSERT(!ret); /* -EEXIST; Logic error */
96303081 3359
3f7de037 3360 trace_btrfs_setup_cluster(block_group, cluster,
96009762 3361 total_found * ctl->unit, 1);
3362 return 0;
3363}
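/*
 * Editorial note, not part of the original source: some geometry that helps
 * when reading the bit arithmetic above.  Assuming 4 KiB pages,
 * BITS_PER_BITMAP is 32768, so with ctl->unit == 4096 a single bitmap entry
 * covers 128 MiB of the block group; offset_to_bit()/bytes_to_bits() simply
 * convert between byte offsets/lengths and bit positions in units of
 * ctl->unit.  The search keeps the largest free run it saw in
 * entry->max_extent_size so a heavily fragmented bitmap can be skipped
 * cheaply on the next attempt.
 */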
3364
3365/*
3366 * This searches the block group for just extents to fill the cluster with.
3367 * Try to find a cluster with at least bytes total bytes, at least one
 3368 * extent of cont1_bytes, and other extents of at least min_bytes.
4e69b598 3369 */
3de85bb9 3370static noinline int
32da5386 3371setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3372 struct btrfs_free_cluster *cluster,
3373 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3374 u64 cont1_bytes, u64 min_bytes)
4e69b598 3375{
34d52cb6 3376 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3377 struct btrfs_free_space *first = NULL;
3378 struct btrfs_free_space *entry = NULL;
3379 struct btrfs_free_space *last;
3380 struct rb_node *node;
3381 u64 window_free;
3382 u64 max_extent;
3f7de037 3383 u64 total_size = 0;
4e69b598 3384
34d52cb6 3385 entry = tree_search_offset(ctl, offset, 0, 1);
3386 if (!entry)
3387 return -ENOSPC;
3388
3389 /*
3390 * We don't want bitmaps, so just move along until we find a normal
3391 * extent entry.
3392 */
3393 while (entry->bitmap || entry->bytes < min_bytes) {
3394 if (entry->bitmap && list_empty(&entry->list))
86d4a77b 3395 list_add_tail(&entry->list, bitmaps);
3396 node = rb_next(&entry->offset_index);
3397 if (!node)
3398 return -ENOSPC;
3399 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3400 }
3401
3402 window_free = entry->bytes;
3403 max_extent = entry->bytes;
3404 first = entry;
3405 last = entry;
4e69b598 3406
3407 for (node = rb_next(&entry->offset_index); node;
3408 node = rb_next(&entry->offset_index)) {
3409 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3410
3411 if (entry->bitmap) {
3412 if (list_empty(&entry->list))
3413 list_add_tail(&entry->list, bitmaps);
4e69b598 3414 continue;
3415 }
3416
3417 if (entry->bytes < min_bytes)
3418 continue;
3419
3420 last = entry;
3421 window_free += entry->bytes;
3422 if (entry->bytes > max_extent)
4e69b598 3423 max_extent = entry->bytes;
3424 }
3425
3426 if (window_free < bytes || max_extent < cont1_bytes)
3427 return -ENOSPC;
3428
3429 cluster->window_start = first->offset;
3430
3431 node = &first->offset_index;
3432
3433 /*
3434 * now we've found our entries, pull them out of the free space
3435 * cache and put them into the cluster rbtree
3436 */
3437 do {
3438 int ret;
3439
3440 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3441 node = rb_next(&entry->offset_index);
1bb91902 3442 if (entry->bitmap || entry->bytes < min_bytes)
3443 continue;
3444
34d52cb6 3445 rb_erase(&entry->offset_index, &ctl->free_space_offset);
59c7b566 3446 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3447 ret = tree_insert_offset(&cluster->root, entry->offset,
3448 &entry->offset_index, 0);
3f7de037 3449 total_size += entry->bytes;
b12d6869 3450 ASSERT(!ret); /* -EEXIST; Logic error */
3451 } while (node && entry != last);
3452
3453 cluster->max_size = max_extent;
3f7de037 3454 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3455 return 0;
3456}
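/*
 * Editorial note, not part of the original source: this is the first of a
 * two-pass setup.  The extent-only walk above stashes every bitmap entry it
 * skips on the caller's temporary "bitmaps" list, so that if no suitable run
 * of plain extents is found, setup_cluster_bitmap() below can try those same
 * bitmaps without walking the offset tree again.
 */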
3457
3458/*
 3459 * This specifically looks for bitmaps that may work in the cluster; we assume
3460 * that we have already failed to find extents that will work.
3461 */
3de85bb9 3462static noinline int
32da5386 3463setup_cluster_bitmap(struct btrfs_block_group *block_group,
3464 struct btrfs_free_cluster *cluster,
3465 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3466 u64 cont1_bytes, u64 min_bytes)
4e69b598 3467{
34d52cb6 3468 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1b9b922a 3469 struct btrfs_free_space *entry = NULL;
4e69b598 3470 int ret = -ENOSPC;
0f0fbf1d 3471 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
4e69b598 3472
34d52cb6 3473 if (ctl->total_bitmaps == 0)
3474 return -ENOSPC;
3475
3476 /*
3477 * The bitmap that covers offset won't be in the list unless offset
3478 * is just its start offset.
3479 */
3480 if (!list_empty(bitmaps))
3481 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3482
3483 if (!entry || entry->offset != bitmap_offset) {
3484 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3485 if (entry && list_empty(&entry->list))
3486 list_add(&entry->list, bitmaps);
3487 }
3488
86d4a77b 3489 list_for_each_entry(entry, bitmaps, list) {
357b9784 3490 if (entry->bytes < bytes)
3491 continue;
3492 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
1bb91902 3493 bytes, cont1_bytes, min_bytes);
3494 if (!ret)
3495 return 0;
3496 }
3497
3498 /*
3499 * The bitmaps list has all the bitmaps that record free space
3500 * starting after offset, so no more search is required.
86d4a77b 3501 */
52621cb6 3502 return -ENOSPC;
3503}
3504
3505/*
3506 * here we try to find a cluster of blocks in a block group. The goal
1bb91902 3507 * is to find at least bytes+empty_size.
3508 * We might not find them all in one contiguous area.
3509 *
 3510 * Returns zero and sets up the cluster if things worked out, otherwise
 3511 * it returns -ENOSPC.
3512 */
32da5386 3513int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3514 struct btrfs_free_cluster *cluster,
3515 u64 offset, u64 bytes, u64 empty_size)
3516{
2ceeae2e 3517 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 3518 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
86d4a77b 3519 struct btrfs_free_space *entry, *tmp;
52621cb6 3520 LIST_HEAD(bitmaps);
fa9c0d79 3521 u64 min_bytes;
1bb91902 3522 u64 cont1_bytes;
3523 int ret;
3524
3525 /*
3526 * Choose the minimum extent size we'll require for this
3527 * cluster. For SSD_SPREAD, don't allow any fragmentation.
3528 * For metadata, allow allocates with smaller extents. For
3529 * data, keep it dense.
3530 */
0b246afa 3531 if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
1bb91902 3532 cont1_bytes = min_bytes = bytes + empty_size;
451d7585 3533 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1bb91902 3534 cont1_bytes = bytes;
0b246afa 3535 min_bytes = fs_info->sectorsize;
3536 } else {
3537 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
0b246afa 3538 min_bytes = fs_info->sectorsize;
1bb91902 3539 }
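	/*
	 * Editorial note, not part of the original source: a worked example of
	 * the sizing above, assuming a 4 KiB sectorsize data block group with
	 * bytes = 1 MiB and empty_size = 3 MiB.  The final branch gives
	 * cont1_bytes = max(1 MiB, (1 MiB + 3 MiB) >> 2) = 1 MiB and
	 * min_bytes = 4 KiB, i.e. the cluster must contain at least one 1 MiB
	 * contiguous extent but may be topped up with pieces as small as 4 KiB.
	 */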
fa9c0d79 3540
34d52cb6 3541 spin_lock(&ctl->tree_lock);
3542
3543 /*
3544 * If we know we don't have enough space to make a cluster don't even
3545 * bother doing all the work to try and find one.
3546 */
1bb91902 3547 if (ctl->free_space < bytes) {
34d52cb6 3548 spin_unlock(&ctl->tree_lock);
3549 return -ENOSPC;
3550 }
3551
3552 spin_lock(&cluster->lock);
3553
3554 /* someone already found a cluster, hooray */
3555 if (cluster->block_group) {
3556 ret = 0;
3557 goto out;
3558 }
fa9c0d79 3559
3560 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3561 min_bytes);
3562
86d4a77b 3563 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3564 bytes + empty_size,
3565 cont1_bytes, min_bytes);
4e69b598 3566 if (ret)
86d4a77b 3567 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3568 offset, bytes + empty_size,
3569 cont1_bytes, min_bytes);
3570
3571 /* Clear our temporary list */
3572 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3573 list_del_init(&entry->list);
fa9c0d79 3574
4e69b598 3575 if (!ret) {
b5790d51 3576 btrfs_get_block_group(block_group);
3577 list_add_tail(&cluster->block_group_list,
3578 &block_group->cluster_list);
3579 cluster->block_group = block_group;
3580 } else {
3581 trace_btrfs_failed_cluster_setup(block_group);
fa9c0d79 3582 }
3583out:
3584 spin_unlock(&cluster->lock);
34d52cb6 3585 spin_unlock(&ctl->tree_lock);
3586
3587 return ret;
3588}
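/*
 * Editorial note, not part of the original source: on success the entries
 * that were chosen have been moved out of the per-block-group free space tree
 * and into cluster->root, and the cluster pins the block group via
 * btrfs_get_block_group().  Allocations are then carved out of the cluster by
 * btrfs_alloc_from_cluster() above, and whatever is left is eventually handed
 * back to the free space cache by btrfs_return_cluster_to_free_space()
 * elsewhere in this file.
 */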
3589
3590/*
3591 * simple code to zero out a cluster
3592 */
3593void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3594{
3595 spin_lock_init(&cluster->lock);
3596 spin_lock_init(&cluster->refill_lock);
6bef4d31 3597 cluster->root = RB_ROOT;
fa9c0d79 3598 cluster->max_size = 0;
c759c4e1 3599 cluster->fragmented = false;
3600 INIT_LIST_HEAD(&cluster->block_group_list);
3601 cluster->block_group = NULL;
3602}
3603
32da5386 3604static int do_trimming(struct btrfs_block_group *block_group,
7fe1e641 3605 u64 *total_trimmed, u64 start, u64 bytes,
55507ce3 3606 u64 reserved_start, u64 reserved_bytes,
b0643e59 3607 enum btrfs_trim_state reserved_trim_state,
55507ce3 3608 struct btrfs_trim_range *trim_entry)
f7039b1d 3609{
7fe1e641 3610 struct btrfs_space_info *space_info = block_group->space_info;
f7039b1d 3611 struct btrfs_fs_info *fs_info = block_group->fs_info;
55507ce3 3612 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3613 int ret;
3614 int update = 0;
3615 const u64 end = start + bytes;
3616 const u64 reserved_end = reserved_start + reserved_bytes;
3617 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3618 u64 trimmed = 0;
f7039b1d 3619
3620 spin_lock(&space_info->lock);
3621 spin_lock(&block_group->lock);
3622 if (!block_group->ro) {
3623 block_group->reserved += reserved_bytes;
3624 space_info->bytes_reserved += reserved_bytes;
3625 update = 1;
3626 }
3627 spin_unlock(&block_group->lock);
3628 spin_unlock(&space_info->lock);
3629
2ff7e61e 3630 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
b0643e59 3631 if (!ret) {
7fe1e641 3632 *total_trimmed += trimmed;
3633 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3634 }
7fe1e641 3635
55507ce3 3636 mutex_lock(&ctl->cache_writeout_mutex);
b0643e59 3637 if (reserved_start < start)
290ef19a 3638 __btrfs_add_free_space(block_group, reserved_start,
3639 start - reserved_start,
3640 reserved_trim_state);
3641 if (start + bytes < reserved_start + reserved_bytes)
290ef19a 3642 __btrfs_add_free_space(block_group, end, reserved_end - end,
b0643e59 3643 reserved_trim_state);
290ef19a 3644 __btrfs_add_free_space(block_group, start, bytes, trim_state);
3645 list_del(&trim_entry->list);
3646 mutex_unlock(&ctl->cache_writeout_mutex);
3647
3648 if (update) {
3649 spin_lock(&space_info->lock);
3650 spin_lock(&block_group->lock);
3651 if (block_group->ro)
3652 space_info->bytes_readonly += reserved_bytes;
3653 block_group->reserved -= reserved_bytes;
3654 space_info->bytes_reserved -= reserved_bytes;
7fe1e641 3655 spin_unlock(&block_group->lock);
8f63a840 3656 spin_unlock(&space_info->lock);
3657 }
3658
3659 return ret;
3660}
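/*
 * Editorial note, not part of the original source: a concrete reading of the
 * re-add logic above.  The caller reserved [reserved_start, reserved_end) but
 * only [start, start + bytes) was actually discarded.  Any untrimmed head
 * (reserved_start < start) and tail (end < reserved_end) are returned to the
 * free space cache with the reservation's previous trim state, while the
 * discarded middle is re-added as BTRFS_TRIM_STATE_TRIMMED when the discard
 * succeeded.  All of this happens under cache_writeout_mutex, and the
 * trim_entry is removed from ctl->trimming_ranges at the same time.
 */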
3661
3662/*
3663 * If @async is set, then we will trim 1 region and return.
3664 */
32da5386 3665static int trim_no_bitmap(struct btrfs_block_group *block_group,
3666 u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3667 bool async)
7fe1e641 3668{
3669 struct btrfs_discard_ctl *discard_ctl =
3670 &block_group->fs_info->discard_ctl;
3671 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3672 struct btrfs_free_space *entry;
3673 struct rb_node *node;
3674 int ret = 0;
3675 u64 extent_start;
3676 u64 extent_bytes;
b0643e59 3677 enum btrfs_trim_state extent_trim_state;
7fe1e641 3678 u64 bytes;
19b2a2c7 3679 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3680
3681 while (start < end) {
3682 struct btrfs_trim_range trim_entry;
3683
3684 mutex_lock(&ctl->cache_writeout_mutex);
34d52cb6 3685 spin_lock(&ctl->tree_lock);
f7039b1d 3686
3687 if (ctl->free_space < minlen)
3688 goto out_unlock;
f7039b1d 3689
34d52cb6 3690 entry = tree_search_offset(ctl, start, 0, 1);
3691 if (!entry)
3692 goto out_unlock;
f7039b1d 3693
3694 /* Skip bitmaps and if async, already trimmed entries */
3695 while (entry->bitmap ||
3696 (async && btrfs_free_space_trimmed(entry))) {
7fe1e641 3697 node = rb_next(&entry->offset_index);
3698 if (!node)
3699 goto out_unlock;
3700 entry = rb_entry(node, struct btrfs_free_space,
3701 offset_index);
3702 }
3703
3704 if (entry->offset >= end)
3705 goto out_unlock;
f7039b1d 3706
3707 extent_start = entry->offset;
3708 extent_bytes = entry->bytes;
b0643e59 3709 extent_trim_state = entry->trim_state;
3710 if (async) {
3711 start = entry->offset;
3712 bytes = entry->bytes;
3713 if (bytes < minlen) {
3714 spin_unlock(&ctl->tree_lock);
3715 mutex_unlock(&ctl->cache_writeout_mutex);
3716 goto next;
3717 }
32e1649b 3718 unlink_free_space(ctl, entry, true);
3719 /*
3720 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3721 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3722 * X when we come back around. So trim it now.
3723 */
3724 if (max_discard_size &&
3725 bytes >= (max_discard_size +
3726 BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3727 bytes = max_discard_size;
3728 extent_bytes = max_discard_size;
3729 entry->offset += max_discard_size;
3730 entry->bytes -= max_discard_size;
3731 link_free_space(ctl, entry);
3732 } else {
3733 kmem_cache_free(btrfs_free_space_cachep, entry);
3734 }
3735 } else {
3736 start = max(start, extent_start);
3737 bytes = min(extent_start + extent_bytes, end) - start;
3738 if (bytes < minlen) {
3739 spin_unlock(&ctl->tree_lock);
3740 mutex_unlock(&ctl->cache_writeout_mutex);
3741 goto next;
3742 }
f7039b1d 3743
32e1649b 3744 unlink_free_space(ctl, entry, true);
3745 kmem_cache_free(btrfs_free_space_cachep, entry);
3746 }
7fe1e641 3747
34d52cb6 3748 spin_unlock(&ctl->tree_lock);
3749 trim_entry.start = extent_start;
3750 trim_entry.bytes = extent_bytes;
3751 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3752 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d 3753
7fe1e641 3754 ret = do_trimming(block_group, total_trimmed, start, bytes,
3755 extent_start, extent_bytes, extent_trim_state,
3756 &trim_entry);
3757 if (ret) {
3758 block_group->discard_cursor = start + bytes;
7fe1e641 3759 break;
2bee7eb8 3760 }
3761next:
3762 start += bytes;
3763 block_group->discard_cursor = start;
3764 if (async && *total_trimmed)
3765 break;
f7039b1d 3766
3767 if (fatal_signal_pending(current)) {
3768 ret = -ERESTARTSYS;
3769 break;
3770 }
3771
3772 cond_resched();
3773 }
3774
3775 return ret;
3776
3777out_unlock:
3778 block_group->discard_cursor = btrfs_block_group_end(block_group);
3779 spin_unlock(&ctl->tree_lock);
3780 mutex_unlock(&ctl->cache_writeout_mutex);
3781
3782 return ret;
3783}
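/*
 * Editorial note, not part of the original source: the two modes above differ
 * mainly in how much of an extent is consumed.  In the async (discard worker)
 * case the whole free extent is unlinked, capped at max_discard_size, and the
 * function returns after a single trimmed region; in the synchronous case the
 * extent is first clipped to the caller's [start, end) range.  Either way
 * block_group->discard_cursor records how far trimming got so a later pass
 * can resume from there.
 */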
3784
3785/*
3786 * If we break out of trimming a bitmap prematurely, we should reset the
 3787 * trimming bit. In a rather contrived case, it's possible to race here so
3788 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3789 *
3790 * start = start of bitmap
3791 * end = near end of bitmap
3792 *
3793 * Thread 1: Thread 2:
3794 * trim_bitmaps(start)
3795 * trim_bitmaps(end)
3796 * end_trimming_bitmap()
3797 * reset_trimming_bitmap()
3798 */
3799static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3800{
3801 struct btrfs_free_space *entry;
3802
3803 spin_lock(&ctl->tree_lock);
3804 entry = tree_search_offset(ctl, offset, 1, 0);
dfb79ddb 3805 if (entry) {
5dc7c10b 3806 if (btrfs_free_space_trimmed(entry)) {
3807 ctl->discardable_extents[BTRFS_STAT_CURR] +=
3808 entry->bitmap_extents;
3809 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3810 }
da080fe1 3811 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3812 }
3813
3814 spin_unlock(&ctl->tree_lock);
3815}
3816
3817static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3818 struct btrfs_free_space *entry)
da080fe1 3819{
dfb79ddb 3820 if (btrfs_free_space_trimming_bitmap(entry)) {
da080fe1 3821 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3822 ctl->discardable_extents[BTRFS_STAT_CURR] -=
3823 entry->bitmap_extents;
5dc7c10b 3824 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
dfb79ddb 3825 }
3826}
3827
3828/*
3829 * If @async is set, then we will trim 1 region and return.
3830 */
32da5386 3831static int trim_bitmaps(struct btrfs_block_group *block_group,
2bee7eb8 3832 u64 *total_trimmed, u64 start, u64 end, u64 minlen,
7fe6d45e 3833 u64 maxlen, bool async)
7fe1e641 3834{
3835 struct btrfs_discard_ctl *discard_ctl =
3836 &block_group->fs_info->discard_ctl;
3837 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3838 struct btrfs_free_space *entry;
3839 int ret = 0;
3840 int ret2;
3841 u64 bytes;
3842 u64 offset = offset_to_bitmap(ctl, start);
19b2a2c7 3843 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3844
3845 while (offset < end) {
3846 bool next_bitmap = false;
55507ce3 3847 struct btrfs_trim_range trim_entry;
7fe1e641 3848
55507ce3 3849 mutex_lock(&ctl->cache_writeout_mutex);
3850 spin_lock(&ctl->tree_lock);
3851
3852 if (ctl->free_space < minlen) {
3853 block_group->discard_cursor =
3854 btrfs_block_group_end(block_group);
7fe1e641 3855 spin_unlock(&ctl->tree_lock);
55507ce3 3856 mutex_unlock(&ctl->cache_writeout_mutex);
3857 break;
3858 }
3859
3860 entry = tree_search_offset(ctl, offset, 1, 0);
3861 /*
3862 * Bitmaps are marked trimmed lossily now to prevent constant
3863 * discarding of the same bitmap (the reason why we are bound
3864 * by the filters). So, retrim the block group bitmaps when we
3865 * are preparing to punt to the unused_bgs list. This uses
3866 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3867 * which is the only discard index which sets minlen to 0.
3868 */
3869 if (!entry || (async && minlen && start == offset &&
2bee7eb8 3870 btrfs_free_space_trimmed(entry))) {
7fe1e641 3871 spin_unlock(&ctl->tree_lock);
55507ce3 3872 mutex_unlock(&ctl->cache_writeout_mutex);
3873 next_bitmap = true;
3874 goto next;
3875 }
3876
3877 /*
 3878 * Async discard bitmap trimming begins by setting the start to
 3879 * be key.objectid, and offset_to_bitmap() aligns to the
3880 * start of the bitmap. This lets us know we are fully
3881 * scanning the bitmap rather than only some portion of it.
3882 */
3883 if (start == offset)
3884 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3885
7fe1e641 3886 bytes = minlen;
0584f718 3887 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
7fe1e641 3888 if (ret2 || start >= end) {
da080fe1 3889 /*
3890 * We lossily consider a bitmap trimmed if we only skip
3891 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
da080fe1 3892 */
7fe6d45e 3893 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
dfb79ddb 3894 end_trimming_bitmap(ctl, entry);
3895 else
3896 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3897 spin_unlock(&ctl->tree_lock);
55507ce3 3898 mutex_unlock(&ctl->cache_writeout_mutex);
3899 next_bitmap = true;
3900 goto next;
3901 }
3902
3903 /*
3904 * We already trimmed a region, but are using the locking above
3905 * to reset the trim_state.
3906 */
3907 if (async && *total_trimmed) {
3908 spin_unlock(&ctl->tree_lock);
3909 mutex_unlock(&ctl->cache_writeout_mutex);
3910 goto out;
3911 }
3912
7fe1e641 3913 bytes = min(bytes, end - start);
7fe6d45e 3914 if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
7fe1e641 3915 spin_unlock(&ctl->tree_lock);
55507ce3 3916 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3917 goto next;
3918 }
3919
3920 /*
3921 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3922 * If X < @minlen, we won't trim X when we come back around.
3923 * So trim it now. We differ here from trimming extents as we
3924 * don't keep individual state per bit.
3925 */
3926 if (async &&
3927 max_discard_size &&
3928 bytes > (max_discard_size + minlen))
19b2a2c7 3929 bytes = max_discard_size;
4aa9ad52 3930
f594f13c 3931 bitmap_clear_bits(ctl, entry, start, bytes, true);
3932 if (entry->bytes == 0)
3933 free_bitmap(ctl, entry);
3934
3935 spin_unlock(&ctl->tree_lock);
3936 trim_entry.start = start;
3937 trim_entry.bytes = bytes;
3938 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3939 mutex_unlock(&ctl->cache_writeout_mutex);
3940
3941 ret = do_trimming(block_group, total_trimmed, start, bytes,
b0643e59 3942 start, bytes, 0, &trim_entry);
3943 if (ret) {
3944 reset_trimming_bitmap(ctl, offset);
3945 block_group->discard_cursor =
3946 btrfs_block_group_end(block_group);
7fe1e641 3947 break;
da080fe1 3948 }
3949next:
3950 if (next_bitmap) {
3951 offset += BITS_PER_BITMAP * ctl->unit;
da080fe1 3952 start = offset;
3953 } else {
3954 start += bytes;
f7039b1d 3955 }
2bee7eb8 3956 block_group->discard_cursor = start;
3957
3958 if (fatal_signal_pending(current)) {
3959 if (start != offset)
3960 reset_trimming_bitmap(ctl, offset);
3961 ret = -ERESTARTSYS;
3962 break;
3963 }
3964
3965 cond_resched();
3966 }
3967
3968 if (offset >= end)
3969 block_group->discard_cursor = end;
3970
3971out:
3972 return ret;
3973}
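/*
 * Editorial note, not part of the original source: compared with
 * trim_no_bitmap() above, free ranges here are located with search_bitmap()
 * and cleared with bitmap_clear_bits() rather than by unlinking whole
 * entries, and a bitmap that has been scanned end to end is lossily marked
 * BTRFS_TRIM_STATE_TRIMMED (see end_trimming_bitmap()) so the async discard
 * path does not keep rescanning it for sub-minlen crumbs.
 */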
581bb050 3974
32da5386 3975int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3976 u64 *trimmed, u64 start, u64 end, u64 minlen)
3977{
da080fe1 3978 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
e33e17ee 3979 int ret;
da080fe1 3980 u64 rem = 0;
e33e17ee 3981
3982 ASSERT(!btrfs_is_zoned(block_group->fs_info));
3983
3984 *trimmed = 0;
3985
3986 spin_lock(&block_group->lock);
3987 if (block_group->removed) {
04216820 3988 spin_unlock(&block_group->lock);
e33e17ee 3989 return 0;
04216820 3990 }
6b7304af 3991 btrfs_freeze_block_group(block_group);
3992 spin_unlock(&block_group->lock);
3993
2bee7eb8 3994 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
3995 if (ret)
3996 goto out;
7fe1e641 3997
7fe6d45e 3998 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
3999 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
4000 /* If we ended in the middle of a bitmap, reset the trimming flag */
4001 if (rem)
4002 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
e33e17ee 4003out:
6b7304af 4004 btrfs_unfreeze_block_group(block_group);
4005 return ret;
4006}
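/*
 * Editorial note, not part of the original source: this is the synchronous
 * trim entry point (used for the whole-filesystem fstrim path), while the
 * _extents and _bitmaps variants below are the async discard entry points
 * that trim at most one region per call.  All three freeze the block group
 * around the work so it cannot be removed while a discard is in flight.
 */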
4007
4008int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
4009 u64 *trimmed, u64 start, u64 end, u64 minlen,
4010 bool async)
4011{
4012 int ret;
4013
4014 *trimmed = 0;
4015
4016 spin_lock(&block_group->lock);
4017 if (block_group->removed) {
4018 spin_unlock(&block_group->lock);
4019 return 0;
4020 }
6b7304af 4021 btrfs_freeze_block_group(block_group);
4022 spin_unlock(&block_group->lock);
4023
4024 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
6b7304af 4025 btrfs_unfreeze_block_group(block_group);
4026
4027 return ret;
4028}
4029
4030int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
4031 u64 *trimmed, u64 start, u64 end, u64 minlen,
7fe6d45e 4032 u64 maxlen, bool async)
4033{
4034 int ret;
4035
4036 *trimmed = 0;
4037
4038 spin_lock(&block_group->lock);
4039 if (block_group->removed) {
4040 spin_unlock(&block_group->lock);
4041 return 0;
4042 }
6b7304af 4043 btrfs_freeze_block_group(block_group);
4044 spin_unlock(&block_group->lock);
4045
4046 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
4047 async);
4048
6b7304af 4049 btrfs_unfreeze_block_group(block_group);
4050
4051 return ret;
4052}
4053
4054bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
4055{
4056 return btrfs_super_cache_generation(fs_info->super_copy);
4057}
4058
4059static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
4060 struct btrfs_trans_handle *trans)
4061{
4062 struct btrfs_block_group *block_group;
4063 struct rb_node *node;
77364faf 4064 int ret = 0;
4065
4066 btrfs_info(fs_info, "cleaning free space cache v1");
4067
4068 node = rb_first(&fs_info->block_group_cache_tree);
4069 while (node) {
4070 block_group = rb_entry(node, struct btrfs_block_group, cache_node);
4071 ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
4072 if (ret)
4073 goto out;
4074 node = rb_next(node);
4075 }
4076out:
4077 return ret;
4078}
4079
4080int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
4081{
4082 struct btrfs_trans_handle *trans;
4083 int ret;
4084
4085 /*
4086 * update_super_roots will appropriately set or unset
4087 * super_copy->cache_generation based on SPACE_CACHE and
4088 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
4089 * transaction commit whether we are enabling space cache v1 and don't
4090 * have any other work to do, or are disabling it and removing free
4091 * space inodes.
4092 */
4093 trans = btrfs_start_transaction(fs_info->tree_root, 0);
4094 if (IS_ERR(trans))
4095 return PTR_ERR(trans);
4096
36b216c8 4097 if (!active) {
94846229 4098 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4099 ret = cleanup_free_space_cache_v1(fs_info, trans);
4100 if (ret) {
4101 btrfs_abort_transaction(trans, ret);
4102 btrfs_end_transaction(trans);
4103 goto out;
4104 }
4105 }
4106
4107 ret = btrfs_commit_transaction(trans);
36b216c8 4108out:
4109 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4110
4111 return ret;
4112}
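/*
 * Editorial note, not part of the original source: this helper is what turns
 * the v1 space cache on or off at runtime (for example when the space_cache
 * mount option changes on remount; that caller is an assumption, not visible
 * here).  Disabling walks every block group and deletes its free space inode
 * via cleanup_free_space_cache_v1(); either way a transaction commit is
 * forced so cache_generation in the super block is brought up to date, as the
 * comment above explains.
 */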
4113
74255aa0 4114#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4115/*
 4116 * Use this if you need to make a bitmap or extent entry specifically. It
 4117 * doesn't do any of the merging that add_free_space does; it acts a lot like
 4118 * how the free space cache loading works, so you can get really weird
 4119 * configurations.
4120 */
32da5386 4121int test_add_free_space_entry(struct btrfs_block_group *cache,
dc11dd5d 4122 u64 offset, u64 bytes, bool bitmap)
74255aa0 4123{
dc11dd5d
JB
4124 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4125 struct btrfs_free_space *info = NULL, *bitmap_info;
4126 void *map = NULL;
da080fe1 4127 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4128 u64 bytes_added;
4129 int ret;
74255aa0 4130
4131again:
4132 if (!info) {
4133 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4134 if (!info)
4135 return -ENOMEM;
4136 }
4137
4138 if (!bitmap) {
4139 spin_lock(&ctl->tree_lock);
4140 info->offset = offset;
4141 info->bytes = bytes;
cef40483 4142 info->max_extent_size = 0;
4143 ret = link_free_space(ctl, info);
4144 spin_unlock(&ctl->tree_lock);
4145 if (ret)
4146 kmem_cache_free(btrfs_free_space_cachep, info);
4147 return ret;
4148 }
4149
4150 if (!map) {
3acd4850 4151 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4152 if (!map) {
4153 kmem_cache_free(btrfs_free_space_cachep, info);
4154 return -ENOMEM;
4155 }
4156 }
4157
4158 spin_lock(&ctl->tree_lock);
4159 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4160 1, 0);
4161 if (!bitmap_info) {
4162 info->bitmap = map;
4163 map = NULL;
4164 add_new_bitmap(ctl, info, offset);
4165 bitmap_info = info;
20005523 4166 info = NULL;
dc11dd5d 4167 }
74255aa0 4168
4169 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4170 trim_state);
cef40483 4171
4172 bytes -= bytes_added;
4173 offset += bytes_added;
4174 spin_unlock(&ctl->tree_lock);
74255aa0 4175
4176 if (bytes)
4177 goto again;
74255aa0 4178
4179 if (info)
4180 kmem_cache_free(btrfs_free_space_cachep, info);
4181 if (map)
4182 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
dc11dd5d 4183 return 0;
4184}
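/*
 * Editorial note, not part of the original source: an illustrative sketch of
 * how the sanity tests (fs/btrfs/tests/free-space-tests.c) use this helper;
 * the exact offsets here are made up for the example:
 *
 *	ret = test_add_free_space_entry(cache, 0, SZ_4M, 0);	  (extent entry)
 *	ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); (bitmap entry)
 *	if (!test_check_exists(cache, 0, SZ_4M))
 *		test_err("expected free space was not found");
 *
 * Because nothing is merged, mixed extent/bitmap layouts can be constructed
 * deliberately and then probed with test_check_exists() below.
 */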
4185
4186/*
4187 * Checks to see if the given range is in the free space cache. This is really
4188 * just used to check the absence of space, so if there is free space in the
4189 * range at all we will return 1.
4190 */
32da5386 4191int test_check_exists(struct btrfs_block_group *cache,
dc11dd5d 4192 u64 offset, u64 bytes)
4193{
4194 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4195 struct btrfs_free_space *info;
4196 int ret = 0;
4197
4198 spin_lock(&ctl->tree_lock);
4199 info = tree_search_offset(ctl, offset, 0, 0);
4200 if (!info) {
4201 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4202 1, 0);
4203 if (!info)
4204 goto out;
4205 }
4206
4207have_info:
4208 if (info->bitmap) {
4209 u64 bit_off, bit_bytes;
4210 struct rb_node *n;
4211 struct btrfs_free_space *tmp;
4212
4213 bit_off = offset;
4214 bit_bytes = ctl->unit;
0584f718 4215 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4216 if (!ret) {
4217 if (bit_off == offset) {
4218 ret = 1;
4219 goto out;
4220 } else if (bit_off > offset &&
4221 offset + bytes > bit_off) {
4222 ret = 1;
4223 goto out;
4224 }
4225 }
4226
4227 n = rb_prev(&info->offset_index);
4228 while (n) {
4229 tmp = rb_entry(n, struct btrfs_free_space,
4230 offset_index);
4231 if (tmp->offset + tmp->bytes < offset)
4232 break;
4233 if (offset + bytes < tmp->offset) {
5473e0c4 4234 n = rb_prev(&tmp->offset_index);
4235 continue;
4236 }
4237 info = tmp;
4238 goto have_info;
4239 }
4240
4241 n = rb_next(&info->offset_index);
4242 while (n) {
4243 tmp = rb_entry(n, struct btrfs_free_space,
4244 offset_index);
4245 if (offset + bytes < tmp->offset)
4246 break;
4247 if (tmp->offset + tmp->bytes < offset) {
5473e0c4 4248 n = rb_next(&tmp->offset_index);
4249 continue;
4250 }
4251 info = tmp;
4252 goto have_info;
4253 }
4254
20005523 4255 ret = 0;
4256 goto out;
4257 }
4258
4259 if (info->offset == offset) {
4260 ret = 1;
4261 goto out;
4262 }
4263
4264 if (offset > info->offset && offset < info->offset + info->bytes)
4265 ret = 1;
4266out:
4267 spin_unlock(&ctl->tree_lock);
4268 return ret;
4269}
dc11dd5d 4270#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */