btrfs: handle empty block_group removal for async discard
[linux-block.git] / fs / btrfs / free-space-cache.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
0f9dd46c
JB
2/*
3 * Copyright (C) 2008 Red Hat. All rights reserved.
0f9dd46c
JB
4 */
5
96303081 6#include <linux/pagemap.h>
0f9dd46c 7#include <linux/sched.h>
f361bf4a 8#include <linux/sched/signal.h>
5a0e3ad6 9#include <linux/slab.h>
96303081 10#include <linux/math64.h>
6ab60601 11#include <linux/ratelimit.h>
540adea3 12#include <linux/error-injection.h>
84de76a2 13#include <linux/sched/mm.h>
0f9dd46c 14#include "ctree.h"
fa9c0d79
CM
15#include "free-space-cache.h"
16#include "transaction.h"
0af3d00b 17#include "disk-io.h"
43be2146 18#include "extent_io.h"
581bb050 19#include "inode-map.h"
04216820 20#include "volumes.h"
8719aaae 21#include "space-info.h"
86736342 22#include "delalloc-space.h"
aac0023c 23#include "block-group.h"
b0643e59 24#include "discard.h"
fa9c0d79 25
0ef6447a 26#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
ee22184b 27#define MAX_CACHE_BYTES_PER_GIG SZ_32K
0f9dd46c 28
55507ce3
FM
29struct btrfs_trim_range {
30 u64 start;
31 u64 bytes;
32 struct list_head list;
33};
34
34d52cb6 35static int link_free_space(struct btrfs_free_space_ctl *ctl,
0cb59c99 36 struct btrfs_free_space *info);
cd023e7b
JB
37static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
38 struct btrfs_free_space *info);
afdb5718
JM
39static int btrfs_wait_cache_io_root(struct btrfs_root *root,
40 struct btrfs_trans_handle *trans,
41 struct btrfs_io_ctl *io_ctl,
42 struct btrfs_path *path);
0cb59c99 43
0414efae
LZ
44static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
45 struct btrfs_path *path,
46 u64 offset)
0af3d00b 47{
0b246afa 48 struct btrfs_fs_info *fs_info = root->fs_info;
0af3d00b
JB
49 struct btrfs_key key;
50 struct btrfs_key location;
51 struct btrfs_disk_key disk_key;
52 struct btrfs_free_space_header *header;
53 struct extent_buffer *leaf;
54 struct inode *inode = NULL;
84de76a2 55 unsigned nofs_flag;
0af3d00b
JB
56 int ret;
57
0af3d00b 58 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0414efae 59 key.offset = offset;
0af3d00b
JB
60 key.type = 0;
61
62 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
63 if (ret < 0)
64 return ERR_PTR(ret);
65 if (ret > 0) {
b3b4aa74 66 btrfs_release_path(path);
0af3d00b
JB
67 return ERR_PTR(-ENOENT);
68 }
69
70 leaf = path->nodes[0];
71 header = btrfs_item_ptr(leaf, path->slots[0],
72 struct btrfs_free_space_header);
73 btrfs_free_space_key(leaf, header, &disk_key);
74 btrfs_disk_key_to_cpu(&location, &disk_key);
b3b4aa74 75 btrfs_release_path(path);
0af3d00b 76
84de76a2
JB
77 /*
78 * We are often under a trans handle at this point, so we need to make
79 * sure NOFS is set to keep us from deadlocking.
80 */
81 nofs_flag = memalloc_nofs_save();
4c66e0d4 82 inode = btrfs_iget_path(fs_info->sb, &location, root, path);
4222ea71 83 btrfs_release_path(path);
84de76a2 84 memalloc_nofs_restore(nofs_flag);
0af3d00b
JB
85 if (IS_ERR(inode))
86 return inode;
0af3d00b 87
528c0327 88 mapping_set_gfp_mask(inode->i_mapping,
c62d2555
MH
89 mapping_gfp_constraint(inode->i_mapping,
90 ~(__GFP_FS | __GFP_HIGHMEM)));
adae52b9 91
0414efae
LZ
92 return inode;
93}
94
32da5386 95struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
7949f339 96 struct btrfs_path *path)
0414efae 97{
7949f339 98 struct btrfs_fs_info *fs_info = block_group->fs_info;
0414efae 99 struct inode *inode = NULL;
5b0e95bf 100 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
0414efae
LZ
101
102 spin_lock(&block_group->lock);
103 if (block_group->inode)
104 inode = igrab(block_group->inode);
105 spin_unlock(&block_group->lock);
106 if (inode)
107 return inode;
108
77ab86bf 109 inode = __lookup_free_space_inode(fs_info->tree_root, path,
b3470b5d 110 block_group->start);
0414efae
LZ
111 if (IS_ERR(inode))
112 return inode;
113
0af3d00b 114 spin_lock(&block_group->lock);
5b0e95bf 115 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
0b246afa 116 btrfs_info(fs_info, "Old style space inode found, converting.");
5b0e95bf
JB
117 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
118 BTRFS_INODE_NODATACOW;
2f356126
JB
119 block_group->disk_cache_state = BTRFS_DC_CLEAR;
120 }
121
300e4f8a 122 if (!block_group->iref) {
0af3d00b
JB
123 block_group->inode = igrab(inode);
124 block_group->iref = 1;
125 }
126 spin_unlock(&block_group->lock);
127
128 return inode;
129}
130
48a3b636
ES
131static int __create_free_space_inode(struct btrfs_root *root,
132 struct btrfs_trans_handle *trans,
133 struct btrfs_path *path,
134 u64 ino, u64 offset)
0af3d00b
JB
135{
136 struct btrfs_key key;
137 struct btrfs_disk_key disk_key;
138 struct btrfs_free_space_header *header;
139 struct btrfs_inode_item *inode_item;
140 struct extent_buffer *leaf;
5b0e95bf 141 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
0af3d00b
JB
142 int ret;
143
0414efae 144 ret = btrfs_insert_empty_inode(trans, root, path, ino);
0af3d00b
JB
145 if (ret)
146 return ret;
147
5b0e95bf
JB
148 /* We inline crc's for the free disk space cache */
149 if (ino != BTRFS_FREE_INO_OBJECTID)
150 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
151
0af3d00b
JB
152 leaf = path->nodes[0];
153 inode_item = btrfs_item_ptr(leaf, path->slots[0],
154 struct btrfs_inode_item);
155 btrfs_item_key(leaf, &disk_key, path->slots[0]);
b159fa28 156 memzero_extent_buffer(leaf, (unsigned long)inode_item,
0af3d00b
JB
157 sizeof(*inode_item));
158 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
159 btrfs_set_inode_size(leaf, inode_item, 0);
160 btrfs_set_inode_nbytes(leaf, inode_item, 0);
161 btrfs_set_inode_uid(leaf, inode_item, 0);
162 btrfs_set_inode_gid(leaf, inode_item, 0);
163 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
5b0e95bf 164 btrfs_set_inode_flags(leaf, inode_item, flags);
0af3d00b
JB
165 btrfs_set_inode_nlink(leaf, inode_item, 1);
166 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
0414efae 167 btrfs_set_inode_block_group(leaf, inode_item, offset);
0af3d00b 168 btrfs_mark_buffer_dirty(leaf);
b3b4aa74 169 btrfs_release_path(path);
0af3d00b
JB
170
171 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0414efae 172 key.offset = offset;
0af3d00b 173 key.type = 0;
0af3d00b
JB
174 ret = btrfs_insert_empty_item(trans, root, path, &key,
175 sizeof(struct btrfs_free_space_header));
176 if (ret < 0) {
b3b4aa74 177 btrfs_release_path(path);
0af3d00b
JB
178 return ret;
179 }
c9dc4c65 180
0af3d00b
JB
181 leaf = path->nodes[0];
182 header = btrfs_item_ptr(leaf, path->slots[0],
183 struct btrfs_free_space_header);
b159fa28 184 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
0af3d00b
JB
185 btrfs_set_free_space_key(leaf, header, &disk_key);
186 btrfs_mark_buffer_dirty(leaf);
b3b4aa74 187 btrfs_release_path(path);
0af3d00b
JB
188
189 return 0;
190}
191
4ca75f1b 192int create_free_space_inode(struct btrfs_trans_handle *trans,
32da5386 193 struct btrfs_block_group *block_group,
0414efae
LZ
194 struct btrfs_path *path)
195{
196 int ret;
197 u64 ino;
198
4ca75f1b 199 ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
0414efae
LZ
200 if (ret < 0)
201 return ret;
202
4ca75f1b 203 return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
b3470b5d 204 ino, block_group->start);
0414efae
LZ
205}
206
2ff7e61e 207int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
7b61cd92 208 struct btrfs_block_rsv *rsv)
0af3d00b 209{
c8174313 210 u64 needed_bytes;
7b61cd92 211 int ret;
c8174313
JB
212
213 /* 1 for slack space, 1 for updating the inode */
2bd36e7b
JB
214 needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
215 btrfs_calc_metadata_size(fs_info, 1);
c8174313 216
7b61cd92
MX
217 spin_lock(&rsv->lock);
218 if (rsv->reserved < needed_bytes)
219 ret = -ENOSPC;
220 else
221 ret = 0;
222 spin_unlock(&rsv->lock);
4b286cd1 223 return ret;
7b61cd92
MX
224}
225
77ab86bf 226int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
32da5386 227 struct btrfs_block_group *block_group,
7b61cd92
MX
228 struct inode *inode)
229{
77ab86bf 230 struct btrfs_root *root = BTRFS_I(inode)->root;
7b61cd92 231 int ret = 0;
35c76642 232 bool locked = false;
1bbc621e 233
1bbc621e 234 if (block_group) {
21e75ffe
JM
235 struct btrfs_path *path = btrfs_alloc_path();
236
237 if (!path) {
238 ret = -ENOMEM;
239 goto fail;
240 }
35c76642 241 locked = true;
1bbc621e
CM
242 mutex_lock(&trans->transaction->cache_write_mutex);
243 if (!list_empty(&block_group->io_list)) {
244 list_del_init(&block_group->io_list);
245
afdb5718 246 btrfs_wait_cache_io(trans, block_group, path);
1bbc621e
CM
247 btrfs_put_block_group(block_group);
248 }
249
250 /*
251 * now that we've truncated the cache away, its no longer
252 * setup or written
253 */
254 spin_lock(&block_group->lock);
255 block_group->disk_cache_state = BTRFS_DC_CLEAR;
256 spin_unlock(&block_group->lock);
21e75ffe 257 btrfs_free_path(path);
1bbc621e 258 }
0af3d00b 259
6ef06d27 260 btrfs_i_size_write(BTRFS_I(inode), 0);
7caef267 261 truncate_pagecache(inode, 0);
0af3d00b
JB
262
263 /*
f7e9e8fc
OS
264 * We skip the throttling logic for free space cache inodes, so we don't
265 * need to check for -EAGAIN.
0af3d00b
JB
266 */
267 ret = btrfs_truncate_inode_items(trans, root, inode,
268 0, BTRFS_EXTENT_DATA_KEY);
35c76642
FM
269 if (ret)
270 goto fail;
0af3d00b 271
82d5902d 272 ret = btrfs_update_inode(trans, root, inode);
1bbc621e 273
1bbc621e 274fail:
35c76642
FM
275 if (locked)
276 mutex_unlock(&trans->transaction->cache_write_mutex);
79787eaa 277 if (ret)
66642832 278 btrfs_abort_transaction(trans, ret);
c8174313 279
82d5902d 280 return ret;
0af3d00b
JB
281}
282
1d480538 283static void readahead_cache(struct inode *inode)
9d66e233
JB
284{
285 struct file_ra_state *ra;
286 unsigned long last_index;
287
288 ra = kzalloc(sizeof(*ra), GFP_NOFS);
289 if (!ra)
1d480538 290 return;
9d66e233
JB
291
292 file_ra_state_init(ra, inode->i_mapping);
09cbfeaf 293 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
9d66e233
JB
294
295 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
296
297 kfree(ra);
9d66e233
JB
298}
299
4c6d1d85 300static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
f15376df 301 int write)
a67509c3 302{
5349d6c3
MX
303 int num_pages;
304 int check_crcs = 0;
305
09cbfeaf 306 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
5349d6c3 307
4a0cc7ca 308 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
5349d6c3
MX
309 check_crcs = 1;
310
8f6c72a9 311 /* Make sure we can fit our crcs and generation into the first page */
5349d6c3 312 if (write && check_crcs &&
8f6c72a9 313 (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
5349d6c3
MX
314 return -ENOSPC;
315
4c6d1d85 316 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
5349d6c3 317
31e818fe 318 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
a67509c3
JB
319 if (!io_ctl->pages)
320 return -ENOMEM;
5349d6c3
MX
321
322 io_ctl->num_pages = num_pages;
f15376df 323 io_ctl->fs_info = btrfs_sb(inode->i_sb);
5349d6c3 324 io_ctl->check_crcs = check_crcs;
c9dc4c65 325 io_ctl->inode = inode;
5349d6c3 326
a67509c3
JB
327 return 0;
328}
663faf9f 329ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
a67509c3 330
4c6d1d85 331static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
a67509c3
JB
332{
333 kfree(io_ctl->pages);
c9dc4c65 334 io_ctl->pages = NULL;
a67509c3
JB
335}
336
4c6d1d85 337static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
a67509c3
JB
338{
339 if (io_ctl->cur) {
a67509c3
JB
340 io_ctl->cur = NULL;
341 io_ctl->orig = NULL;
342 }
343}
344
4c6d1d85 345static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
a67509c3 346{
b12d6869 347 ASSERT(io_ctl->index < io_ctl->num_pages);
a67509c3 348 io_ctl->page = io_ctl->pages[io_ctl->index++];
2b108268 349 io_ctl->cur = page_address(io_ctl->page);
a67509c3 350 io_ctl->orig = io_ctl->cur;
09cbfeaf 351 io_ctl->size = PAGE_SIZE;
a67509c3 352 if (clear)
619a9742 353 clear_page(io_ctl->cur);
a67509c3
JB
354}
355
4c6d1d85 356static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
a67509c3
JB
357{
358 int i;
359
360 io_ctl_unmap_page(io_ctl);
361
362 for (i = 0; i < io_ctl->num_pages; i++) {
a1ee5a45
LZ
363 if (io_ctl->pages[i]) {
364 ClearPageChecked(io_ctl->pages[i]);
365 unlock_page(io_ctl->pages[i]);
09cbfeaf 366 put_page(io_ctl->pages[i]);
a1ee5a45 367 }
a67509c3
JB
368 }
369}
370
4c6d1d85 371static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
a67509c3
JB
372 int uptodate)
373{
374 struct page *page;
375 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
376 int i;
377
378 for (i = 0; i < io_ctl->num_pages; i++) {
379 page = find_or_create_page(inode->i_mapping, i, mask);
380 if (!page) {
381 io_ctl_drop_pages(io_ctl);
382 return -ENOMEM;
383 }
384 io_ctl->pages[i] = page;
385 if (uptodate && !PageUptodate(page)) {
386 btrfs_readpage(NULL, page);
387 lock_page(page);
3797136b
JB
388 if (page->mapping != inode->i_mapping) {
389 btrfs_err(BTRFS_I(inode)->root->fs_info,
390 "free space cache page truncated");
391 io_ctl_drop_pages(io_ctl);
392 return -EIO;
393 }
a67509c3 394 if (!PageUptodate(page)) {
efe120a0
FH
395 btrfs_err(BTRFS_I(inode)->root->fs_info,
396 "error reading free space cache");
a67509c3
JB
397 io_ctl_drop_pages(io_ctl);
398 return -EIO;
399 }
400 }
401 }
402
f7d61dcd
JB
403 for (i = 0; i < io_ctl->num_pages; i++) {
404 clear_page_dirty_for_io(io_ctl->pages[i]);
405 set_page_extent_mapped(io_ctl->pages[i]);
406 }
407
a67509c3
JB
408 return 0;
409}
410
4c6d1d85 411static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
a67509c3 412{
528c0327 413 __le64 *val;
a67509c3
JB
414
415 io_ctl_map_page(io_ctl, 1);
416
417 /*
5b0e95bf
JB
418 * Skip the csum areas. If we don't check crcs then we just have a
419 * 64bit chunk at the front of the first page.
a67509c3 420 */
5b0e95bf
JB
421 if (io_ctl->check_crcs) {
422 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
423 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
424 } else {
425 io_ctl->cur += sizeof(u64);
426 io_ctl->size -= sizeof(u64) * 2;
427 }
a67509c3
JB
428
429 val = io_ctl->cur;
430 *val = cpu_to_le64(generation);
431 io_ctl->cur += sizeof(u64);
a67509c3
JB
432}
433
4c6d1d85 434static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
a67509c3 435{
528c0327 436 __le64 *gen;
a67509c3 437
5b0e95bf
JB
438 /*
439 * Skip the crc area. If we don't check crcs then we just have a 64bit
440 * chunk at the front of the first page.
441 */
442 if (io_ctl->check_crcs) {
443 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
444 io_ctl->size -= sizeof(u64) +
445 (sizeof(u32) * io_ctl->num_pages);
446 } else {
447 io_ctl->cur += sizeof(u64);
448 io_ctl->size -= sizeof(u64) * 2;
449 }
a67509c3 450
a67509c3
JB
451 gen = io_ctl->cur;
452 if (le64_to_cpu(*gen) != generation) {
f15376df 453 btrfs_err_rl(io_ctl->fs_info,
94647322
DS
454 "space cache generation (%llu) does not match inode (%llu)",
455 *gen, generation);
a67509c3
JB
456 io_ctl_unmap_page(io_ctl);
457 return -EIO;
458 }
459 io_ctl->cur += sizeof(u64);
5b0e95bf
JB
460 return 0;
461}
462
4c6d1d85 463static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
5b0e95bf
JB
464{
465 u32 *tmp;
466 u32 crc = ~(u32)0;
467 unsigned offset = 0;
468
469 if (!io_ctl->check_crcs) {
470 io_ctl_unmap_page(io_ctl);
471 return;
472 }
473
474 if (index == 0)
cb54f257 475 offset = sizeof(u32) * io_ctl->num_pages;
5b0e95bf 476
4bb3c2e2
JT
477 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
478 btrfs_crc32c_final(crc, (u8 *)&crc);
5b0e95bf 479 io_ctl_unmap_page(io_ctl);
2b108268 480 tmp = page_address(io_ctl->pages[0]);
5b0e95bf
JB
481 tmp += index;
482 *tmp = crc;
5b0e95bf
JB
483}
484
4c6d1d85 485static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
5b0e95bf
JB
486{
487 u32 *tmp, val;
488 u32 crc = ~(u32)0;
489 unsigned offset = 0;
490
491 if (!io_ctl->check_crcs) {
492 io_ctl_map_page(io_ctl, 0);
493 return 0;
494 }
495
496 if (index == 0)
497 offset = sizeof(u32) * io_ctl->num_pages;
498
2b108268 499 tmp = page_address(io_ctl->pages[0]);
5b0e95bf
JB
500 tmp += index;
501 val = *tmp;
5b0e95bf
JB
502
503 io_ctl_map_page(io_ctl, 0);
4bb3c2e2
JT
504 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
505 btrfs_crc32c_final(crc, (u8 *)&crc);
5b0e95bf 506 if (val != crc) {
f15376df 507 btrfs_err_rl(io_ctl->fs_info,
94647322 508 "csum mismatch on free space cache");
5b0e95bf
JB
509 io_ctl_unmap_page(io_ctl);
510 return -EIO;
511 }
512
a67509c3
JB
513 return 0;
514}
515
4c6d1d85 516static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
a67509c3
JB
517 void *bitmap)
518{
519 struct btrfs_free_space_entry *entry;
520
521 if (!io_ctl->cur)
522 return -ENOSPC;
523
524 entry = io_ctl->cur;
525 entry->offset = cpu_to_le64(offset);
526 entry->bytes = cpu_to_le64(bytes);
527 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
528 BTRFS_FREE_SPACE_EXTENT;
529 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
530 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
531
532 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
533 return 0;
534
5b0e95bf 535 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
536
537 /* No more pages to map */
538 if (io_ctl->index >= io_ctl->num_pages)
539 return 0;
540
541 /* map the next page */
542 io_ctl_map_page(io_ctl, 1);
543 return 0;
544}
545
4c6d1d85 546static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
a67509c3
JB
547{
548 if (!io_ctl->cur)
549 return -ENOSPC;
550
551 /*
552 * If we aren't at the start of the current page, unmap this one and
553 * map the next one if there is any left.
554 */
555 if (io_ctl->cur != io_ctl->orig) {
5b0e95bf 556 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
557 if (io_ctl->index >= io_ctl->num_pages)
558 return -ENOSPC;
559 io_ctl_map_page(io_ctl, 0);
560 }
561
69d24804 562 copy_page(io_ctl->cur, bitmap);
5b0e95bf 563 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
564 if (io_ctl->index < io_ctl->num_pages)
565 io_ctl_map_page(io_ctl, 0);
566 return 0;
567}
568
4c6d1d85 569static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
a67509c3 570{
5b0e95bf
JB
571 /*
572 * If we're not on the boundary we know we've modified the page and we
573 * need to crc the page.
574 */
575 if (io_ctl->cur != io_ctl->orig)
576 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
577 else
578 io_ctl_unmap_page(io_ctl);
a67509c3
JB
579
580 while (io_ctl->index < io_ctl->num_pages) {
581 io_ctl_map_page(io_ctl, 1);
5b0e95bf 582 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
a67509c3
JB
583 }
584}
585
4c6d1d85 586static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
5b0e95bf 587 struct btrfs_free_space *entry, u8 *type)
a67509c3
JB
588{
589 struct btrfs_free_space_entry *e;
2f120c05
JB
590 int ret;
591
592 if (!io_ctl->cur) {
593 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
594 if (ret)
595 return ret;
596 }
a67509c3
JB
597
598 e = io_ctl->cur;
599 entry->offset = le64_to_cpu(e->offset);
600 entry->bytes = le64_to_cpu(e->bytes);
5b0e95bf 601 *type = e->type;
a67509c3
JB
602 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
603 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
604
605 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
5b0e95bf 606 return 0;
a67509c3
JB
607
608 io_ctl_unmap_page(io_ctl);
609
2f120c05 610 return 0;
a67509c3
JB
611}
612
4c6d1d85 613static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
5b0e95bf 614 struct btrfs_free_space *entry)
a67509c3 615{
5b0e95bf
JB
616 int ret;
617
5b0e95bf
JB
618 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
619 if (ret)
620 return ret;
621
69d24804 622 copy_page(entry->bitmap, io_ctl->cur);
a67509c3 623 io_ctl_unmap_page(io_ctl);
5b0e95bf
JB
624
625 return 0;
a67509c3
JB
626}
627
cd023e7b
JB
628/*
629 * Since we attach pinned extents after the fact we can have contiguous sections
630 * of free space that are split up in entries. This poses a problem with the
631 * tree logging stuff since it could have allocated across what appears to be 2
632 * entries since we would have merged the entries when adding the pinned extents
633 * back to the free space cache. So run through the space cache that we just
634 * loaded and merge contiguous entries. This will make the log replay stuff not
635 * blow up and it will make for nicer allocator behavior.
636 */
637static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
638{
639 struct btrfs_free_space *e, *prev = NULL;
640 struct rb_node *n;
641
642again:
643 spin_lock(&ctl->tree_lock);
644 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
645 e = rb_entry(n, struct btrfs_free_space, offset_index);
646 if (!prev)
647 goto next;
648 if (e->bitmap || prev->bitmap)
649 goto next;
650 if (prev->offset + prev->bytes == e->offset) {
651 unlink_free_space(ctl, prev);
652 unlink_free_space(ctl, e);
653 prev->bytes += e->bytes;
654 kmem_cache_free(btrfs_free_space_cachep, e);
655 link_free_space(ctl, prev);
656 prev = NULL;
657 spin_unlock(&ctl->tree_lock);
658 goto again;
659 }
660next:
661 prev = e;
662 }
663 spin_unlock(&ctl->tree_lock);
664}
665
48a3b636
ES
666static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
667 struct btrfs_free_space_ctl *ctl,
668 struct btrfs_path *path, u64 offset)
9d66e233 669{
3ffbd68c 670 struct btrfs_fs_info *fs_info = root->fs_info;
9d66e233
JB
671 struct btrfs_free_space_header *header;
672 struct extent_buffer *leaf;
4c6d1d85 673 struct btrfs_io_ctl io_ctl;
9d66e233 674 struct btrfs_key key;
a67509c3 675 struct btrfs_free_space *e, *n;
b76808fc 676 LIST_HEAD(bitmaps);
9d66e233
JB
677 u64 num_entries;
678 u64 num_bitmaps;
679 u64 generation;
a67509c3 680 u8 type;
f6a39829 681 int ret = 0;
9d66e233 682
9d66e233 683 /* Nothing in the space cache, goodbye */
0414efae 684 if (!i_size_read(inode))
a67509c3 685 return 0;
9d66e233
JB
686
687 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0414efae 688 key.offset = offset;
9d66e233
JB
689 key.type = 0;
690
691 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
0414efae 692 if (ret < 0)
a67509c3 693 return 0;
0414efae 694 else if (ret > 0) {
945d8962 695 btrfs_release_path(path);
a67509c3 696 return 0;
9d66e233
JB
697 }
698
0414efae
LZ
699 ret = -1;
700
9d66e233
JB
701 leaf = path->nodes[0];
702 header = btrfs_item_ptr(leaf, path->slots[0],
703 struct btrfs_free_space_header);
704 num_entries = btrfs_free_space_entries(leaf, header);
705 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
706 generation = btrfs_free_space_generation(leaf, header);
945d8962 707 btrfs_release_path(path);
9d66e233 708
e570fd27 709 if (!BTRFS_I(inode)->generation) {
0b246afa 710 btrfs_info(fs_info,
913e1535 711 "the free space cache file (%llu) is invalid, skip it",
e570fd27
MX
712 offset);
713 return 0;
714 }
715
9d66e233 716 if (BTRFS_I(inode)->generation != generation) {
0b246afa
JM
717 btrfs_err(fs_info,
718 "free space inode generation (%llu) did not match free space cache generation (%llu)",
719 BTRFS_I(inode)->generation, generation);
a67509c3 720 return 0;
9d66e233
JB
721 }
722
723 if (!num_entries)
a67509c3 724 return 0;
9d66e233 725
f15376df 726 ret = io_ctl_init(&io_ctl, inode, 0);
706efc66
LZ
727 if (ret)
728 return ret;
729
1d480538 730 readahead_cache(inode);
9d66e233 731
a67509c3
JB
732 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
733 if (ret)
734 goto out;
9d66e233 735
5b0e95bf
JB
736 ret = io_ctl_check_crc(&io_ctl, 0);
737 if (ret)
738 goto free_cache;
739
a67509c3
JB
740 ret = io_ctl_check_generation(&io_ctl, generation);
741 if (ret)
742 goto free_cache;
9d66e233 743
a67509c3
JB
744 while (num_entries) {
745 e = kmem_cache_zalloc(btrfs_free_space_cachep,
746 GFP_NOFS);
747 if (!e)
9d66e233 748 goto free_cache;
9d66e233 749
5b0e95bf
JB
750 ret = io_ctl_read_entry(&io_ctl, e, &type);
751 if (ret) {
752 kmem_cache_free(btrfs_free_space_cachep, e);
753 goto free_cache;
754 }
755
a7ccb255
DZ
756 /*
757 * Sync discard ensures that the free space cache is always
758 * trimmed. So when reading this in, the state should reflect
b0643e59
DZ
759 * that. We also do this for async as a stop gap for lack of
760 * persistence.
a7ccb255 761 */
b0643e59
DZ
762 if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
763 btrfs_test_opt(fs_info, DISCARD_ASYNC))
a7ccb255
DZ
764 e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
765
a67509c3
JB
766 if (!e->bytes) {
767 kmem_cache_free(btrfs_free_space_cachep, e);
768 goto free_cache;
9d66e233 769 }
a67509c3
JB
770
771 if (type == BTRFS_FREE_SPACE_EXTENT) {
772 spin_lock(&ctl->tree_lock);
773 ret = link_free_space(ctl, e);
774 spin_unlock(&ctl->tree_lock);
775 if (ret) {
0b246afa 776 btrfs_err(fs_info,
c2cf52eb 777 "Duplicate entries in free space cache, dumping");
a67509c3 778 kmem_cache_free(btrfs_free_space_cachep, e);
9d66e233
JB
779 goto free_cache;
780 }
a67509c3 781 } else {
b12d6869 782 ASSERT(num_bitmaps);
a67509c3 783 num_bitmaps--;
3acd4850
CL
784 e->bitmap = kmem_cache_zalloc(
785 btrfs_free_space_bitmap_cachep, GFP_NOFS);
a67509c3
JB
786 if (!e->bitmap) {
787 kmem_cache_free(
788 btrfs_free_space_cachep, e);
9d66e233
JB
789 goto free_cache;
790 }
a67509c3
JB
791 spin_lock(&ctl->tree_lock);
792 ret = link_free_space(ctl, e);
793 ctl->total_bitmaps++;
794 ctl->op->recalc_thresholds(ctl);
795 spin_unlock(&ctl->tree_lock);
796 if (ret) {
0b246afa 797 btrfs_err(fs_info,
c2cf52eb 798 "Duplicate entries in free space cache, dumping");
dc89e982 799 kmem_cache_free(btrfs_free_space_cachep, e);
9d66e233
JB
800 goto free_cache;
801 }
a67509c3 802 list_add_tail(&e->list, &bitmaps);
9d66e233
JB
803 }
804
a67509c3
JB
805 num_entries--;
806 }
9d66e233 807
2f120c05
JB
808 io_ctl_unmap_page(&io_ctl);
809
a67509c3
JB
810 /*
811 * We add the bitmaps at the end of the entries in order that
812 * the bitmap entries are added to the cache.
813 */
814 list_for_each_entry_safe(e, n, &bitmaps, list) {
9d66e233 815 list_del_init(&e->list);
5b0e95bf
JB
816 ret = io_ctl_read_bitmap(&io_ctl, e);
817 if (ret)
818 goto free_cache;
9d66e233
JB
819 }
820
a67509c3 821 io_ctl_drop_pages(&io_ctl);
cd023e7b 822 merge_space_tree(ctl);
9d66e233
JB
823 ret = 1;
824out:
a67509c3 825 io_ctl_free(&io_ctl);
9d66e233 826 return ret;
9d66e233 827free_cache:
a67509c3 828 io_ctl_drop_pages(&io_ctl);
0414efae 829 __btrfs_remove_free_space_cache(ctl);
9d66e233
JB
830 goto out;
831}
832
32da5386 833int load_free_space_cache(struct btrfs_block_group *block_group)
0cb59c99 834{
bb6cb1c5 835 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 836 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0414efae
LZ
837 struct inode *inode;
838 struct btrfs_path *path;
5b0e95bf 839 int ret = 0;
0414efae 840 bool matched;
bf38be65 841 u64 used = block_group->used;
0414efae 842
0414efae
LZ
843 /*
844 * If this block group has been marked to be cleared for one reason or
845 * another then we can't trust the on disk cache, so just return.
846 */
9d66e233 847 spin_lock(&block_group->lock);
0414efae
LZ
848 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
849 spin_unlock(&block_group->lock);
850 return 0;
851 }
9d66e233 852 spin_unlock(&block_group->lock);
0414efae
LZ
853
854 path = btrfs_alloc_path();
855 if (!path)
856 return 0;
d53ba474
JB
857 path->search_commit_root = 1;
858 path->skip_locking = 1;
0414efae 859
4222ea71
FM
860 /*
861 * We must pass a path with search_commit_root set to btrfs_iget in
862 * order to avoid a deadlock when allocating extents for the tree root.
863 *
864 * When we are COWing an extent buffer from the tree root, when looking
865 * for a free extent, at extent-tree.c:find_free_extent(), we can find
866 * block group without its free space cache loaded. When we find one
867 * we must load its space cache which requires reading its free space
868 * cache's inode item from the root tree. If this inode item is located
869 * in the same leaf that we started COWing before, then we end up in
870 * deadlock on the extent buffer (trying to read lock it when we
871 * previously write locked it).
872 *
873 * It's safe to read the inode item using the commit root because
874 * block groups, once loaded, stay in memory forever (until they are
875 * removed) as well as their space caches once loaded. New block groups
876 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
877 * we will never try to read their inode item while the fs is mounted.
878 */
7949f339 879 inode = lookup_free_space_inode(block_group, path);
0414efae
LZ
880 if (IS_ERR(inode)) {
881 btrfs_free_path(path);
882 return 0;
883 }
884
5b0e95bf
JB
885 /* We may have converted the inode and made the cache invalid. */
886 spin_lock(&block_group->lock);
887 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
888 spin_unlock(&block_group->lock);
a7e221e9 889 btrfs_free_path(path);
5b0e95bf
JB
890 goto out;
891 }
892 spin_unlock(&block_group->lock);
893
0414efae 894 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
b3470b5d 895 path, block_group->start);
0414efae
LZ
896 btrfs_free_path(path);
897 if (ret <= 0)
898 goto out;
899
900 spin_lock(&ctl->tree_lock);
b3470b5d 901 matched = (ctl->free_space == (block_group->length - used -
0414efae
LZ
902 block_group->bytes_super));
903 spin_unlock(&ctl->tree_lock);
904
905 if (!matched) {
906 __btrfs_remove_free_space_cache(ctl);
5d163e0e
JM
907 btrfs_warn(fs_info,
908 "block group %llu has wrong amount of free space",
b3470b5d 909 block_group->start);
0414efae
LZ
910 ret = -1;
911 }
912out:
913 if (ret < 0) {
914 /* This cache is bogus, make sure it gets cleared */
915 spin_lock(&block_group->lock);
916 block_group->disk_cache_state = BTRFS_DC_CLEAR;
917 spin_unlock(&block_group->lock);
82d5902d 918 ret = 0;
0414efae 919
5d163e0e
JM
920 btrfs_warn(fs_info,
921 "failed to load free space cache for block group %llu, rebuilding it now",
b3470b5d 922 block_group->start);
0414efae
LZ
923 }
924
925 iput(inode);
926 return ret;
9d66e233
JB
927}
928
d4452bc5 929static noinline_for_stack
4c6d1d85 930int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
d4452bc5 931 struct btrfs_free_space_ctl *ctl,
32da5386 932 struct btrfs_block_group *block_group,
d4452bc5
CM
933 int *entries, int *bitmaps,
934 struct list_head *bitmap_list)
0cb59c99 935{
c09544e0 936 int ret;
d4452bc5 937 struct btrfs_free_cluster *cluster = NULL;
1bbc621e 938 struct btrfs_free_cluster *cluster_locked = NULL;
d4452bc5 939 struct rb_node *node = rb_first(&ctl->free_space_offset);
55507ce3 940 struct btrfs_trim_range *trim_entry;
be1a12a0 941
43be2146 942 /* Get the cluster for this block_group if it exists */
d4452bc5 943 if (block_group && !list_empty(&block_group->cluster_list)) {
43be2146
JB
944 cluster = list_entry(block_group->cluster_list.next,
945 struct btrfs_free_cluster,
946 block_group_list);
d4452bc5 947 }
43be2146 948
f75b130e 949 if (!node && cluster) {
1bbc621e
CM
950 cluster_locked = cluster;
951 spin_lock(&cluster_locked->lock);
f75b130e
JB
952 node = rb_first(&cluster->root);
953 cluster = NULL;
954 }
955
a67509c3
JB
956 /* Write out the extent entries */
957 while (node) {
958 struct btrfs_free_space *e;
0cb59c99 959
a67509c3 960 e = rb_entry(node, struct btrfs_free_space, offset_index);
d4452bc5 961 *entries += 1;
0cb59c99 962
d4452bc5 963 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
a67509c3
JB
964 e->bitmap);
965 if (ret)
d4452bc5 966 goto fail;
2f356126 967
a67509c3 968 if (e->bitmap) {
d4452bc5
CM
969 list_add_tail(&e->list, bitmap_list);
970 *bitmaps += 1;
2f356126 971 }
a67509c3
JB
972 node = rb_next(node);
973 if (!node && cluster) {
974 node = rb_first(&cluster->root);
1bbc621e
CM
975 cluster_locked = cluster;
976 spin_lock(&cluster_locked->lock);
a67509c3 977 cluster = NULL;
43be2146 978 }
a67509c3 979 }
1bbc621e
CM
980 if (cluster_locked) {
981 spin_unlock(&cluster_locked->lock);
982 cluster_locked = NULL;
983 }
55507ce3
FM
984
985 /*
986 * Make sure we don't miss any range that was removed from our rbtree
987 * because trimming is running. Otherwise after a umount+mount (or crash
988 * after committing the transaction) we would leak free space and get
989 * an inconsistent free space cache report from fsck.
990 */
991 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
992 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
993 trim_entry->bytes, NULL);
994 if (ret)
995 goto fail;
996 *entries += 1;
997 }
998
d4452bc5
CM
999 return 0;
1000fail:
1bbc621e
CM
1001 if (cluster_locked)
1002 spin_unlock(&cluster_locked->lock);
d4452bc5
CM
1003 return -ENOSPC;
1004}
1005
1006static noinline_for_stack int
1007update_cache_item(struct btrfs_trans_handle *trans,
1008 struct btrfs_root *root,
1009 struct inode *inode,
1010 struct btrfs_path *path, u64 offset,
1011 int entries, int bitmaps)
1012{
1013 struct btrfs_key key;
1014 struct btrfs_free_space_header *header;
1015 struct extent_buffer *leaf;
1016 int ret;
1017
1018 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1019 key.offset = offset;
1020 key.type = 0;
1021
1022 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1023 if (ret < 0) {
1024 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
e182163d 1025 EXTENT_DELALLOC, 0, 0, NULL);
d4452bc5
CM
1026 goto fail;
1027 }
1028 leaf = path->nodes[0];
1029 if (ret > 0) {
1030 struct btrfs_key found_key;
1031 ASSERT(path->slots[0]);
1032 path->slots[0]--;
1033 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1034 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1035 found_key.offset != offset) {
1036 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
e182163d
OS
1037 inode->i_size - 1, EXTENT_DELALLOC, 0,
1038 0, NULL);
d4452bc5
CM
1039 btrfs_release_path(path);
1040 goto fail;
1041 }
1042 }
1043
1044 BTRFS_I(inode)->generation = trans->transid;
1045 header = btrfs_item_ptr(leaf, path->slots[0],
1046 struct btrfs_free_space_header);
1047 btrfs_set_free_space_entries(leaf, header, entries);
1048 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1049 btrfs_set_free_space_generation(leaf, header, trans->transid);
1050 btrfs_mark_buffer_dirty(leaf);
1051 btrfs_release_path(path);
1052
1053 return 0;
1054
1055fail:
1056 return -1;
1057}
1058
6701bdb3 1059static noinline_for_stack int write_pinned_extent_entries(
32da5386 1060 struct btrfs_block_group *block_group,
4c6d1d85 1061 struct btrfs_io_ctl *io_ctl,
5349d6c3 1062 int *entries)
d4452bc5
CM
1063{
1064 u64 start, extent_start, extent_end, len;
d4452bc5
CM
1065 struct extent_io_tree *unpin = NULL;
1066 int ret;
43be2146 1067
5349d6c3
MX
1068 if (!block_group)
1069 return 0;
1070
a67509c3
JB
1071 /*
1072 * We want to add any pinned extents to our free space cache
1073 * so we don't leak the space
d4452bc5 1074 *
db804f23
LZ
1075 * We shouldn't have switched the pinned extents yet so this is the
1076 * right one
1077 */
6701bdb3 1078 unpin = block_group->fs_info->pinned_extents;
db804f23 1079
b3470b5d 1080 start = block_group->start;
db804f23 1081
b3470b5d 1082 while (start < block_group->start + block_group->length) {
db804f23
LZ
1083 ret = find_first_extent_bit(unpin, start,
1084 &extent_start, &extent_end,
e6138876 1085 EXTENT_DIRTY, NULL);
5349d6c3
MX
1086 if (ret)
1087 return 0;
0cb59c99 1088
a67509c3 1089 /* This pinned extent is out of our range */
b3470b5d 1090 if (extent_start >= block_group->start + block_group->length)
5349d6c3 1091 return 0;
2f356126 1092
db804f23 1093 extent_start = max(extent_start, start);
b3470b5d
DS
1094 extent_end = min(block_group->start + block_group->length,
1095 extent_end + 1);
db804f23 1096 len = extent_end - extent_start;
0cb59c99 1097
d4452bc5
CM
1098 *entries += 1;
1099 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
a67509c3 1100 if (ret)
5349d6c3 1101 return -ENOSPC;
0cb59c99 1102
db804f23 1103 start = extent_end;
a67509c3 1104 }
0cb59c99 1105
5349d6c3
MX
1106 return 0;
1107}
1108
1109static noinline_for_stack int
4c6d1d85 1110write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
5349d6c3 1111{
7ae1681e 1112 struct btrfs_free_space *entry, *next;
5349d6c3
MX
1113 int ret;
1114
0cb59c99 1115 /* Write out the bitmaps */
7ae1681e 1116 list_for_each_entry_safe(entry, next, bitmap_list, list) {
d4452bc5 1117 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
a67509c3 1118 if (ret)
5349d6c3 1119 return -ENOSPC;
0cb59c99 1120 list_del_init(&entry->list);
be1a12a0
JB
1121 }
1122
5349d6c3
MX
1123 return 0;
1124}
0cb59c99 1125
5349d6c3
MX
1126static int flush_dirty_cache(struct inode *inode)
1127{
1128 int ret;
be1a12a0 1129
0ef8b726 1130 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5349d6c3 1131 if (ret)
0ef8b726 1132 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
e182163d 1133 EXTENT_DELALLOC, 0, 0, NULL);
0cb59c99 1134
5349d6c3 1135 return ret;
d4452bc5
CM
1136}
1137
1138static void noinline_for_stack
a3bdccc4 1139cleanup_bitmap_list(struct list_head *bitmap_list)
d4452bc5 1140{
7ae1681e 1141 struct btrfs_free_space *entry, *next;
5349d6c3 1142
7ae1681e 1143 list_for_each_entry_safe(entry, next, bitmap_list, list)
d4452bc5 1144 list_del_init(&entry->list);
a3bdccc4
CM
1145}
1146
1147static void noinline_for_stack
1148cleanup_write_cache_enospc(struct inode *inode,
1149 struct btrfs_io_ctl *io_ctl,
7bf1a159 1150 struct extent_state **cached_state)
a3bdccc4 1151{
d4452bc5
CM
1152 io_ctl_drop_pages(io_ctl);
1153 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
e43bbe5e 1154 i_size_read(inode) - 1, cached_state);
d4452bc5 1155}
549b4fdb 1156
afdb5718
JM
1157static int __btrfs_wait_cache_io(struct btrfs_root *root,
1158 struct btrfs_trans_handle *trans,
32da5386 1159 struct btrfs_block_group *block_group,
afdb5718
JM
1160 struct btrfs_io_ctl *io_ctl,
1161 struct btrfs_path *path, u64 offset)
c9dc4c65
CM
1162{
1163 int ret;
1164 struct inode *inode = io_ctl->inode;
1165
1bbc621e
CM
1166 if (!inode)
1167 return 0;
1168
c9dc4c65
CM
1169 /* Flush the dirty pages in the cache file. */
1170 ret = flush_dirty_cache(inode);
1171 if (ret)
1172 goto out;
1173
1174 /* Update the cache item to tell everyone this cache file is valid. */
1175 ret = update_cache_item(trans, root, inode, path, offset,
1176 io_ctl->entries, io_ctl->bitmaps);
1177out:
1178 io_ctl_free(io_ctl);
1179 if (ret) {
1180 invalidate_inode_pages2(inode->i_mapping);
1181 BTRFS_I(inode)->generation = 0;
1182 if (block_group) {
1183#ifdef DEBUG
3ffbd68c 1184 btrfs_err(root->fs_info,
0b246afa 1185 "failed to write free space cache for block group %llu",
b3470b5d 1186 block_group->start);
c9dc4c65
CM
1187#endif
1188 }
1189 }
1190 btrfs_update_inode(trans, root, inode);
1191
1192 if (block_group) {
1bbc621e
CM
1193 /* the dirty list is protected by the dirty_bgs_lock */
1194 spin_lock(&trans->transaction->dirty_bgs_lock);
1195
1196 /* the disk_cache_state is protected by the block group lock */
c9dc4c65
CM
1197 spin_lock(&block_group->lock);
1198
1199 /*
1200 * only mark this as written if we didn't get put back on
1bbc621e
CM
1201 * the dirty list while waiting for IO. Otherwise our
1202 * cache state won't be right, and we won't get written again
c9dc4c65
CM
1203 */
1204 if (!ret && list_empty(&block_group->dirty_list))
1205 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1206 else if (ret)
1207 block_group->disk_cache_state = BTRFS_DC_ERROR;
1208
1209 spin_unlock(&block_group->lock);
1bbc621e 1210 spin_unlock(&trans->transaction->dirty_bgs_lock);
c9dc4c65
CM
1211 io_ctl->inode = NULL;
1212 iput(inode);
1213 }
1214
1215 return ret;
1216
1217}
1218
afdb5718
JM
1219static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1220 struct btrfs_trans_handle *trans,
1221 struct btrfs_io_ctl *io_ctl,
1222 struct btrfs_path *path)
1223{
1224 return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1225}
1226
1227int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
32da5386 1228 struct btrfs_block_group *block_group,
afdb5718
JM
1229 struct btrfs_path *path)
1230{
1231 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1232 block_group, &block_group->io_ctl,
b3470b5d 1233 path, block_group->start);
afdb5718
JM
1234}
1235
d4452bc5
CM
1236/**
1237 * __btrfs_write_out_cache - write out cached info to an inode
1238 * @root - the root the inode belongs to
1239 * @ctl - the free space cache we are going to write out
1240 * @block_group - the block_group for this cache if it belongs to a block_group
1241 * @trans - the trans handle
d4452bc5
CM
1242 *
1243 * This function writes out a free space cache struct to disk for quick recovery
8cd1e731 1244 * on mount. This will return 0 if it was successful in writing the cache out,
b8605454 1245 * or an errno if it was not.
d4452bc5
CM
1246 */
1247static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1248 struct btrfs_free_space_ctl *ctl,
32da5386 1249 struct btrfs_block_group *block_group,
c9dc4c65 1250 struct btrfs_io_ctl *io_ctl,
0e8d931a 1251 struct btrfs_trans_handle *trans)
d4452bc5
CM
1252{
1253 struct extent_state *cached_state = NULL;
5349d6c3 1254 LIST_HEAD(bitmap_list);
d4452bc5
CM
1255 int entries = 0;
1256 int bitmaps = 0;
1257 int ret;
c9dc4c65 1258 int must_iput = 0;
d4452bc5
CM
1259
1260 if (!i_size_read(inode))
b8605454 1261 return -EIO;
d4452bc5 1262
c9dc4c65 1263 WARN_ON(io_ctl->pages);
f15376df 1264 ret = io_ctl_init(io_ctl, inode, 1);
d4452bc5 1265 if (ret)
b8605454 1266 return ret;
d4452bc5 1267
e570fd27
MX
1268 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1269 down_write(&block_group->data_rwsem);
1270 spin_lock(&block_group->lock);
1271 if (block_group->delalloc_bytes) {
1272 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1273 spin_unlock(&block_group->lock);
1274 up_write(&block_group->data_rwsem);
1275 BTRFS_I(inode)->generation = 0;
1276 ret = 0;
c9dc4c65 1277 must_iput = 1;
e570fd27
MX
1278 goto out;
1279 }
1280 spin_unlock(&block_group->lock);
1281 }
1282
d4452bc5 1283 /* Lock all pages first so we can lock the extent safely. */
b8605454
OS
1284 ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1285 if (ret)
b77000ed 1286 goto out_unlock;
d4452bc5
CM
1287
1288 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
ff13db41 1289 &cached_state);
d4452bc5 1290
c9dc4c65 1291 io_ctl_set_generation(io_ctl, trans->transid);
d4452bc5 1292
55507ce3 1293 mutex_lock(&ctl->cache_writeout_mutex);
5349d6c3 1294 /* Write out the extent entries in the free space cache */
1bbc621e 1295 spin_lock(&ctl->tree_lock);
c9dc4c65 1296 ret = write_cache_extent_entries(io_ctl, ctl,
d4452bc5
CM
1297 block_group, &entries, &bitmaps,
1298 &bitmap_list);
a3bdccc4
CM
1299 if (ret)
1300 goto out_nospc_locked;
d4452bc5 1301
5349d6c3
MX
1302 /*
1303 * Some spaces that are freed in the current transaction are pinned,
1304 * they will be added into free space cache after the transaction is
1305 * committed, we shouldn't lose them.
1bbc621e
CM
1306 *
1307 * If this changes while we are working we'll get added back to
1308 * the dirty list and redo it. No locking needed
5349d6c3 1309 */
6701bdb3 1310 ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
a3bdccc4
CM
1311 if (ret)
1312 goto out_nospc_locked;
5349d6c3 1313
55507ce3
FM
1314 /*
1315 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1316 * locked while doing it because a concurrent trim can be manipulating
1317 * or freeing the bitmap.
1318 */
c9dc4c65 1319 ret = write_bitmap_entries(io_ctl, &bitmap_list);
1bbc621e 1320 spin_unlock(&ctl->tree_lock);
55507ce3 1321 mutex_unlock(&ctl->cache_writeout_mutex);
5349d6c3
MX
1322 if (ret)
1323 goto out_nospc;
1324
1325 /* Zero out the rest of the pages just to make sure */
c9dc4c65 1326 io_ctl_zero_remaining_pages(io_ctl);
d4452bc5 1327
5349d6c3 1328 /* Everything is written out, now we dirty the pages in the file. */
2ff7e61e
JM
1329 ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1330 i_size_read(inode), &cached_state);
5349d6c3 1331 if (ret)
d4452bc5 1332 goto out_nospc;
5349d6c3 1333
e570fd27
MX
1334 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1335 up_write(&block_group->data_rwsem);
5349d6c3
MX
1336 /*
1337 * Release the pages and unlock the extent, we will flush
1338 * them out later
1339 */
c9dc4c65 1340 io_ctl_drop_pages(io_ctl);
5349d6c3
MX
1341
1342 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
e43bbe5e 1343 i_size_read(inode) - 1, &cached_state);
5349d6c3 1344
c9dc4c65
CM
1345 /*
1346 * at this point the pages are under IO and we're happy,
1347 * The caller is responsible for waiting on them and updating the
1348 * the cache and the inode
1349 */
1350 io_ctl->entries = entries;
1351 io_ctl->bitmaps = bitmaps;
1352
1353 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
5349d6c3 1354 if (ret)
d4452bc5
CM
1355 goto out;
1356
c9dc4c65
CM
1357 return 0;
1358
2f356126 1359out:
c9dc4c65
CM
1360 io_ctl->inode = NULL;
1361 io_ctl_free(io_ctl);
5349d6c3 1362 if (ret) {
a67509c3 1363 invalidate_inode_pages2(inode->i_mapping);
0cb59c99
JB
1364 BTRFS_I(inode)->generation = 0;
1365 }
0cb59c99 1366 btrfs_update_inode(trans, root, inode);
c9dc4c65
CM
1367 if (must_iput)
1368 iput(inode);
5349d6c3 1369 return ret;
a67509c3 1370
a3bdccc4
CM
1371out_nospc_locked:
1372 cleanup_bitmap_list(&bitmap_list);
1373 spin_unlock(&ctl->tree_lock);
1374 mutex_unlock(&ctl->cache_writeout_mutex);
1375
a67509c3 1376out_nospc:
7bf1a159 1377 cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
e570fd27 1378
b77000ed 1379out_unlock:
e570fd27
MX
1380 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1381 up_write(&block_group->data_rwsem);
1382
a67509c3 1383 goto out;
0414efae
LZ
1384}
1385
fe041534 1386int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
32da5386 1387 struct btrfs_block_group *block_group,
0414efae
LZ
1388 struct btrfs_path *path)
1389{
fe041534 1390 struct btrfs_fs_info *fs_info = trans->fs_info;
0414efae
LZ
1391 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1392 struct inode *inode;
1393 int ret = 0;
1394
0414efae
LZ
1395 spin_lock(&block_group->lock);
1396 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1397 spin_unlock(&block_group->lock);
e570fd27
MX
1398 return 0;
1399 }
0414efae
LZ
1400 spin_unlock(&block_group->lock);
1401
7949f339 1402 inode = lookup_free_space_inode(block_group, path);
0414efae
LZ
1403 if (IS_ERR(inode))
1404 return 0;
1405
77ab86bf
JM
1406 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1407 block_group, &block_group->io_ctl, trans);
c09544e0 1408 if (ret) {
c09544e0 1409#ifdef DEBUG
0b246afa
JM
1410 btrfs_err(fs_info,
1411 "failed to write free space cache for block group %llu",
b3470b5d 1412 block_group->start);
c09544e0 1413#endif
c9dc4c65
CM
1414 spin_lock(&block_group->lock);
1415 block_group->disk_cache_state = BTRFS_DC_ERROR;
1416 spin_unlock(&block_group->lock);
1417
1418 block_group->io_ctl.inode = NULL;
1419 iput(inode);
0414efae
LZ
1420 }
1421
c9dc4c65
CM
1422 /*
1423 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1424 * to wait for IO and put the inode
1425 */
1426
0cb59c99
JB
1427 return ret;
1428}
1429
34d52cb6 1430static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
96303081 1431 u64 offset)
0f9dd46c 1432{
b12d6869 1433 ASSERT(offset >= bitmap_start);
96303081 1434 offset -= bitmap_start;
34d52cb6 1435 return (unsigned long)(div_u64(offset, unit));
96303081 1436}
0f9dd46c 1437
34d52cb6 1438static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
96303081 1439{
34d52cb6 1440 return (unsigned long)(div_u64(bytes, unit));
96303081 1441}
0f9dd46c 1442
34d52cb6 1443static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
1444 u64 offset)
1445{
1446 u64 bitmap_start;
0ef6447a 1447 u64 bytes_per_bitmap;
0f9dd46c 1448
34d52cb6
LZ
1449 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1450 bitmap_start = offset - ctl->start;
0ef6447a 1451 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
96303081 1452 bitmap_start *= bytes_per_bitmap;
34d52cb6 1453 bitmap_start += ctl->start;
0f9dd46c 1454
96303081 1455 return bitmap_start;
0f9dd46c
JB
1456}
1457
96303081
JB
1458static int tree_insert_offset(struct rb_root *root, u64 offset,
1459 struct rb_node *node, int bitmap)
0f9dd46c
JB
1460{
1461 struct rb_node **p = &root->rb_node;
1462 struct rb_node *parent = NULL;
1463 struct btrfs_free_space *info;
1464
1465 while (*p) {
1466 parent = *p;
96303081 1467 info = rb_entry(parent, struct btrfs_free_space, offset_index);
0f9dd46c 1468
96303081 1469 if (offset < info->offset) {
0f9dd46c 1470 p = &(*p)->rb_left;
96303081 1471 } else if (offset > info->offset) {
0f9dd46c 1472 p = &(*p)->rb_right;
96303081
JB
1473 } else {
1474 /*
1475 * we could have a bitmap entry and an extent entry
1476 * share the same offset. If this is the case, we want
1477 * the extent entry to always be found first if we do a
1478 * linear search through the tree, since we want to have
1479 * the quickest allocation time, and allocating from an
1480 * extent is faster than allocating from a bitmap. So
1481 * if we're inserting a bitmap and we find an entry at
1482 * this offset, we want to go right, or after this entry
1483 * logically. If we are inserting an extent and we've
1484 * found a bitmap, we want to go left, or before
1485 * logically.
1486 */
1487 if (bitmap) {
207dde82
JB
1488 if (info->bitmap) {
1489 WARN_ON_ONCE(1);
1490 return -EEXIST;
1491 }
96303081
JB
1492 p = &(*p)->rb_right;
1493 } else {
207dde82
JB
1494 if (!info->bitmap) {
1495 WARN_ON_ONCE(1);
1496 return -EEXIST;
1497 }
96303081
JB
1498 p = &(*p)->rb_left;
1499 }
1500 }
0f9dd46c
JB
1501 }
1502
1503 rb_link_node(node, parent, p);
1504 rb_insert_color(node, root);
1505
1506 return 0;
1507}
1508
1509/*
70cb0743
JB
1510 * searches the tree for the given offset.
1511 *
96303081
JB
1512 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1513 * want a section that has at least bytes size and comes at or after the given
1514 * offset.
0f9dd46c 1515 */
96303081 1516static struct btrfs_free_space *
34d52cb6 1517tree_search_offset(struct btrfs_free_space_ctl *ctl,
96303081 1518 u64 offset, int bitmap_only, int fuzzy)
0f9dd46c 1519{
34d52cb6 1520 struct rb_node *n = ctl->free_space_offset.rb_node;
96303081
JB
1521 struct btrfs_free_space *entry, *prev = NULL;
1522
1523 /* find entry that is closest to the 'offset' */
1524 while (1) {
1525 if (!n) {
1526 entry = NULL;
1527 break;
1528 }
0f9dd46c 1529
0f9dd46c 1530 entry = rb_entry(n, struct btrfs_free_space, offset_index);
96303081 1531 prev = entry;
0f9dd46c 1532
96303081 1533 if (offset < entry->offset)
0f9dd46c 1534 n = n->rb_left;
96303081 1535 else if (offset > entry->offset)
0f9dd46c 1536 n = n->rb_right;
96303081 1537 else
0f9dd46c 1538 break;
0f9dd46c
JB
1539 }
1540
96303081
JB
1541 if (bitmap_only) {
1542 if (!entry)
1543 return NULL;
1544 if (entry->bitmap)
1545 return entry;
0f9dd46c 1546
96303081
JB
1547 /*
1548 * bitmap entry and extent entry may share same offset,
1549 * in that case, bitmap entry comes after extent entry.
1550 */
1551 n = rb_next(n);
1552 if (!n)
1553 return NULL;
1554 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1555 if (entry->offset != offset)
1556 return NULL;
0f9dd46c 1557
96303081
JB
1558 WARN_ON(!entry->bitmap);
1559 return entry;
1560 } else if (entry) {
1561 if (entry->bitmap) {
0f9dd46c 1562 /*
96303081
JB
1563 * if previous extent entry covers the offset,
1564 * we should return it instead of the bitmap entry
0f9dd46c 1565 */
de6c4115
MX
1566 n = rb_prev(&entry->offset_index);
1567 if (n) {
96303081
JB
1568 prev = rb_entry(n, struct btrfs_free_space,
1569 offset_index);
de6c4115
MX
1570 if (!prev->bitmap &&
1571 prev->offset + prev->bytes > offset)
1572 entry = prev;
0f9dd46c 1573 }
96303081
JB
1574 }
1575 return entry;
1576 }
1577
1578 if (!prev)
1579 return NULL;
1580
1581 /* find last entry before the 'offset' */
1582 entry = prev;
1583 if (entry->offset > offset) {
1584 n = rb_prev(&entry->offset_index);
1585 if (n) {
1586 entry = rb_entry(n, struct btrfs_free_space,
1587 offset_index);
b12d6869 1588 ASSERT(entry->offset <= offset);
0f9dd46c 1589 } else {
96303081
JB
1590 if (fuzzy)
1591 return entry;
1592 else
1593 return NULL;
0f9dd46c
JB
1594 }
1595 }
1596
96303081 1597 if (entry->bitmap) {
de6c4115
MX
1598 n = rb_prev(&entry->offset_index);
1599 if (n) {
96303081
JB
1600 prev = rb_entry(n, struct btrfs_free_space,
1601 offset_index);
de6c4115
MX
1602 if (!prev->bitmap &&
1603 prev->offset + prev->bytes > offset)
1604 return prev;
96303081 1605 }
34d52cb6 1606 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
96303081
JB
1607 return entry;
1608 } else if (entry->offset + entry->bytes > offset)
1609 return entry;
1610
1611 if (!fuzzy)
1612 return NULL;
1613
1614 while (1) {
1615 if (entry->bitmap) {
1616 if (entry->offset + BITS_PER_BITMAP *
34d52cb6 1617 ctl->unit > offset)
96303081
JB
1618 break;
1619 } else {
1620 if (entry->offset + entry->bytes > offset)
1621 break;
1622 }
1623
1624 n = rb_next(&entry->offset_index);
1625 if (!n)
1626 return NULL;
1627 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1628 }
1629 return entry;
0f9dd46c
JB
1630}
1631
f333adb5 1632static inline void
34d52cb6 1633__unlink_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5 1634 struct btrfs_free_space *info)
0f9dd46c 1635{
34d52cb6
LZ
1636 rb_erase(&info->offset_index, &ctl->free_space_offset);
1637 ctl->free_extents--;
f333adb5
LZ
1638}
1639
34d52cb6 1640static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5
LZ
1641 struct btrfs_free_space *info)
1642{
34d52cb6
LZ
1643 __unlink_free_space(ctl, info);
1644 ctl->free_space -= info->bytes;
0f9dd46c
JB
1645}
1646
34d52cb6 1647static int link_free_space(struct btrfs_free_space_ctl *ctl,
0f9dd46c
JB
1648 struct btrfs_free_space *info)
1649{
1650 int ret = 0;
1651
b12d6869 1652 ASSERT(info->bytes || info->bitmap);
34d52cb6 1653 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
96303081 1654 &info->offset_index, (info->bitmap != NULL));
0f9dd46c
JB
1655 if (ret)
1656 return ret;
1657
34d52cb6
LZ
1658 ctl->free_space += info->bytes;
1659 ctl->free_extents++;
96303081
JB
1660 return ret;
1661}
1662
34d52cb6 1663static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
96303081 1664{
32da5386 1665 struct btrfs_block_group *block_group = ctl->private;
25891f79
JB
1666 u64 max_bytes;
1667 u64 bitmap_bytes;
1668 u64 extent_bytes;
b3470b5d 1669 u64 size = block_group->length;
0ef6447a
FX
1670 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1671 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
34d52cb6 1672
0ef6447a 1673 max_bitmaps = max_t(u64, max_bitmaps, 1);
dde5740f 1674
b12d6869 1675 ASSERT(ctl->total_bitmaps <= max_bitmaps);
96303081
JB
1676
1677 /*
1678 * The goal is to keep the total amount of memory used per 1gb of space
1679 * at or below 32k, so we need to adjust how much memory we allow to be
1680 * used by extent based free space tracking
1681 */
ee22184b 1682 if (size < SZ_1G)
8eb2d829
LZ
1683 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1684 else
ee22184b 1685 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
96303081 1686
25891f79
JB
1687 /*
1688 * we want to account for 1 more bitmap than what we have so we can make
1689 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1690 * we add more bitmaps.
1691 */
b9ef22de 1692 bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
96303081 1693
25891f79 1694 if (bitmap_bytes >= max_bytes) {
34d52cb6 1695 ctl->extents_thresh = 0;
25891f79
JB
1696 return;
1697 }
96303081 1698
25891f79 1699 /*
f8c269d7 1700 * we want the extent entry threshold to always be at most 1/2 the max
25891f79
JB
1701 * bytes we can have, or whatever is less than that.
1702 */
1703 extent_bytes = max_bytes - bitmap_bytes;
f8c269d7 1704 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
96303081 1705
34d52cb6 1706 ctl->extents_thresh =
f8c269d7 1707 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
96303081
JB
1708}
1709
bb3ac5a4
MX
1710static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1711 struct btrfs_free_space *info,
1712 u64 offset, u64 bytes)
96303081 1713{
f38b6e75 1714 unsigned long start, count;
96303081 1715
34d52cb6
LZ
1716 start = offset_to_bit(info->offset, ctl->unit, offset);
1717 count = bytes_to_bits(bytes, ctl->unit);
b12d6869 1718 ASSERT(start + count <= BITS_PER_BITMAP);
96303081 1719
f38b6e75 1720 bitmap_clear(info->bitmap, start, count);
96303081
JB
1721
1722 info->bytes -= bytes;
553cceb4
JB
1723 if (info->max_extent_size > ctl->unit)
1724 info->max_extent_size = 0;
bb3ac5a4
MX
1725}
1726
1727static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1728 struct btrfs_free_space *info, u64 offset,
1729 u64 bytes)
1730{
1731 __bitmap_clear_bits(ctl, info, offset, bytes);
34d52cb6 1732 ctl->free_space -= bytes;
96303081
JB
1733}
1734
34d52cb6 1735static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
817d52f8
JB
1736 struct btrfs_free_space *info, u64 offset,
1737 u64 bytes)
96303081 1738{
f38b6e75 1739 unsigned long start, count;
96303081 1740
34d52cb6
LZ
1741 start = offset_to_bit(info->offset, ctl->unit, offset);
1742 count = bytes_to_bits(bytes, ctl->unit);
b12d6869 1743 ASSERT(start + count <= BITS_PER_BITMAP);
96303081 1744
f38b6e75 1745 bitmap_set(info->bitmap, start, count);
96303081
JB
1746
1747 info->bytes += bytes;
34d52cb6 1748 ctl->free_space += bytes;
96303081
JB
1749}
1750
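/*
 * Illustrative example (not from the original source): with a 4KiB
 * unit, an offset 64KiB past the start of a bitmap entry maps to start
 * bit (64KiB / 4KiB) = 16, and a 32KiB range maps to count = 8 bits, so
 * bitmap_set_bits()/bitmap_clear_bits() flip bits 16..23 and adjust
 * info->bytes and ctl->free_space by 32KiB.
 */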
a4820398
MX
1751/*
 1752 * If we cannot find a suitable extent, we use *bytes to record
 1753 * the size of the largest free extent we did find.
1754 */
34d52cb6 1755static int search_bitmap(struct btrfs_free_space_ctl *ctl,
96303081 1756 struct btrfs_free_space *bitmap_info, u64 *offset,
0584f718 1757 u64 *bytes, bool for_alloc)
96303081
JB
1758{
1759 unsigned long found_bits = 0;
a4820398 1760 unsigned long max_bits = 0;
96303081
JB
1761 unsigned long bits, i;
1762 unsigned long next_zero;
a4820398 1763 unsigned long extent_bits;
96303081 1764
cef40483
JB
1765 /*
1766 * Skip searching the bitmap if we don't have a contiguous section that
1767 * is large enough for this allocation.
1768 */
0584f718
JB
1769 if (for_alloc &&
1770 bitmap_info->max_extent_size &&
cef40483
JB
1771 bitmap_info->max_extent_size < *bytes) {
1772 *bytes = bitmap_info->max_extent_size;
1773 return -1;
1774 }
1775
34d52cb6 1776 i = offset_to_bit(bitmap_info->offset, ctl->unit,
96303081 1777 max_t(u64, *offset, bitmap_info->offset));
34d52cb6 1778 bits = bytes_to_bits(*bytes, ctl->unit);
96303081 1779
ebb3dad4 1780 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
0584f718
JB
1781 if (for_alloc && bits == 1) {
1782 found_bits = 1;
1783 break;
1784 }
96303081
JB
1785 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1786 BITS_PER_BITMAP, i);
a4820398
MX
1787 extent_bits = next_zero - i;
1788 if (extent_bits >= bits) {
1789 found_bits = extent_bits;
96303081 1790 break;
a4820398
MX
1791 } else if (extent_bits > max_bits) {
1792 max_bits = extent_bits;
96303081
JB
1793 }
1794 i = next_zero;
1795 }
1796
1797 if (found_bits) {
34d52cb6
LZ
1798 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1799 *bytes = (u64)(found_bits) * ctl->unit;
96303081
JB
1800 return 0;
1801 }
1802
a4820398 1803 *bytes = (u64)(max_bits) * ctl->unit;
cef40483 1804 bitmap_info->max_extent_size = *bytes;
96303081
JB
1805 return -1;
1806}
1807
ad22cf6e
JB
1808static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1809{
1810 if (entry->bitmap)
1811 return entry->max_extent_size;
1812 return entry->bytes;
1813}
1814
a4820398 1815/* Cache the size of the max extent in bytes */
34d52cb6 1816static struct btrfs_free_space *
53b381b3 1817find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
a4820398 1818 unsigned long align, u64 *max_extent_size)
96303081
JB
1819{
1820 struct btrfs_free_space *entry;
1821 struct rb_node *node;
53b381b3
DW
1822 u64 tmp;
1823 u64 align_off;
96303081
JB
1824 int ret;
1825
34d52cb6 1826 if (!ctl->free_space_offset.rb_node)
a4820398 1827 goto out;
96303081 1828
34d52cb6 1829 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
96303081 1830 if (!entry)
a4820398 1831 goto out;
96303081
JB
1832
1833 for (node = &entry->offset_index; node; node = rb_next(node)) {
1834 entry = rb_entry(node, struct btrfs_free_space, offset_index);
a4820398 1835 if (entry->bytes < *bytes) {
ad22cf6e
JB
1836 *max_extent_size = max(get_max_extent_size(entry),
1837 *max_extent_size);
96303081 1838 continue;
a4820398 1839 }
96303081 1840
53b381b3
DW
1841 /* make sure the space returned is big enough
1842 * to match our requested alignment
1843 */
1844 if (*bytes >= align) {
a4820398 1845 tmp = entry->offset - ctl->start + align - 1;
47c5713f 1846 tmp = div64_u64(tmp, align);
53b381b3
DW
1847 tmp = tmp * align + ctl->start;
1848 align_off = tmp - entry->offset;
1849 } else {
1850 align_off = 0;
1851 tmp = entry->offset;
1852 }
1853
a4820398 1854 if (entry->bytes < *bytes + align_off) {
ad22cf6e
JB
1855 *max_extent_size = max(get_max_extent_size(entry),
1856 *max_extent_size);
53b381b3 1857 continue;
a4820398 1858 }
53b381b3 1859
96303081 1860 if (entry->bitmap) {
a4820398
MX
1861 u64 size = *bytes;
1862
0584f718 1863 ret = search_bitmap(ctl, entry, &tmp, &size, true);
53b381b3
DW
1864 if (!ret) {
1865 *offset = tmp;
a4820398 1866 *bytes = size;
96303081 1867 return entry;
ad22cf6e
JB
1868 } else {
1869 *max_extent_size =
1870 max(get_max_extent_size(entry),
1871 *max_extent_size);
53b381b3 1872 }
96303081
JB
1873 continue;
1874 }
1875
53b381b3
DW
1876 *offset = tmp;
1877 *bytes = entry->bytes - align_off;
96303081
JB
1878 return entry;
1879 }
a4820398 1880out:
96303081
JB
1881 return NULL;
1882}
1883
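/*
 * Illustrative example (not from the original source): the alignment
 * rounding above works relative to ctl->start.  With ctl->start at
 * 1GiB, an entry at 1GiB + 70KiB and a 64KiB alignment, tmp becomes
 * ((70KiB + 64KiB - 1) / 64KiB) * 64KiB + 1GiB = 1GiB + 128KiB and
 * align_off = 58KiB, so the entry must hold at least *bytes + 58KiB to
 * be usable.
 */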
34d52cb6 1884static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
1885 struct btrfs_free_space *info, u64 offset)
1886{
34d52cb6 1887 info->offset = offset_to_bitmap(ctl, offset);
f019f426 1888 info->bytes = 0;
f2d0f676 1889 INIT_LIST_HEAD(&info->list);
34d52cb6
LZ
1890 link_free_space(ctl, info);
1891 ctl->total_bitmaps++;
96303081 1892
34d52cb6 1893 ctl->op->recalc_thresholds(ctl);
96303081
JB
1894}
1895
34d52cb6 1896static void free_bitmap(struct btrfs_free_space_ctl *ctl,
edf6e2d1
LZ
1897 struct btrfs_free_space *bitmap_info)
1898{
34d52cb6 1899 unlink_free_space(ctl, bitmap_info);
3acd4850 1900 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
dc89e982 1901 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
34d52cb6
LZ
1902 ctl->total_bitmaps--;
1903 ctl->op->recalc_thresholds(ctl);
edf6e2d1
LZ
1904}
1905
34d52cb6 1906static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
1907 struct btrfs_free_space *bitmap_info,
1908 u64 *offset, u64 *bytes)
1909{
1910 u64 end;
6606bb97
JB
1911 u64 search_start, search_bytes;
1912 int ret;
96303081
JB
1913
1914again:
34d52cb6 1915 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
96303081 1916
6606bb97 1917 /*
bdb7d303
JB
 1918 * We need to search for bits in this bitmap. This bitmap may only cover
 1919 * part of the extent, thanks to how we add space, so we need to search
 1920 * for as much of it as we can and clear that amount, and then go
 1921 * searching for the next bit.
6606bb97
JB
1922 */
1923 search_start = *offset;
bdb7d303 1924 search_bytes = ctl->unit;
13dbc089 1925 search_bytes = min(search_bytes, end - search_start + 1);
0584f718
JB
1926 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1927 false);
b50c6e25
JB
1928 if (ret < 0 || search_start != *offset)
1929 return -EINVAL;
6606bb97 1930
bdb7d303
JB
1931 /* We may have found more bits than what we need */
1932 search_bytes = min(search_bytes, *bytes);
1933
1934 /* Cannot clear past the end of the bitmap */
1935 search_bytes = min(search_bytes, end - search_start + 1);
1936
1937 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1938 *offset += search_bytes;
1939 *bytes -= search_bytes;
96303081
JB
1940
1941 if (*bytes) {
6606bb97 1942 struct rb_node *next = rb_next(&bitmap_info->offset_index);
edf6e2d1 1943 if (!bitmap_info->bytes)
34d52cb6 1944 free_bitmap(ctl, bitmap_info);
96303081 1945
6606bb97
JB
1946 /*
1947 * no entry after this bitmap, but we still have bytes to
1948 * remove, so something has gone wrong.
1949 */
1950 if (!next)
96303081
JB
1951 return -EINVAL;
1952
6606bb97
JB
1953 bitmap_info = rb_entry(next, struct btrfs_free_space,
1954 offset_index);
1955
1956 /*
1957 * if the next entry isn't a bitmap we need to return to let the
1958 * extent stuff do its work.
1959 */
96303081
JB
1960 if (!bitmap_info->bitmap)
1961 return -EAGAIN;
1962
6606bb97
JB
1963 /*
1964 * Ok the next item is a bitmap, but it may not actually hold
1965 * the information for the rest of this free space stuff, so
1966 * look for it, and if we don't find it return so we can try
1967 * everything over again.
1968 */
1969 search_start = *offset;
bdb7d303 1970 search_bytes = ctl->unit;
34d52cb6 1971 ret = search_bitmap(ctl, bitmap_info, &search_start,
0584f718 1972 &search_bytes, false);
6606bb97
JB
1973 if (ret < 0 || search_start != *offset)
1974 return -EAGAIN;
1975
96303081 1976 goto again;
edf6e2d1 1977 } else if (!bitmap_info->bytes)
34d52cb6 1978 free_bitmap(ctl, bitmap_info);
96303081
JB
1979
1980 return 0;
1981}
1982
2cdc342c
JB
1983static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1984 struct btrfs_free_space *info, u64 offset,
da080fe1 1985 u64 bytes, enum btrfs_trim_state trim_state)
2cdc342c
JB
1986{
1987 u64 bytes_to_set = 0;
1988 u64 end;
1989
da080fe1
DZ
1990 /*
1991 * This is a tradeoff to make bitmap trim state minimal. We mark the
1992 * whole bitmap untrimmed if at any point we add untrimmed regions.
1993 */
1994 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED)
1995 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
1996
2cdc342c
JB
1997 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1998
1999 bytes_to_set = min(end - offset, bytes);
2000
2001 bitmap_set_bits(ctl, info, offset, bytes_to_set);
2002
cef40483
JB
2003 /*
2004 * We set some bytes, we have no idea what the max extent size is
2005 * anymore.
2006 */
2007 info->max_extent_size = 0;
2008
2cdc342c
JB
2009 return bytes_to_set;
2010
2011}
2012
34d52cb6
LZ
2013static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2014 struct btrfs_free_space *info)
96303081 2015{
32da5386 2016 struct btrfs_block_group *block_group = ctl->private;
0b246afa 2017 struct btrfs_fs_info *fs_info = block_group->fs_info;
d0bd4560
JB
2018 bool forced = false;
2019
2020#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 2021 if (btrfs_should_fragment_free_space(block_group))
d0bd4560
JB
2022 forced = true;
2023#endif
96303081
JB
2024
2025 /*
2026 * If we are below the extents threshold then we can add this as an
2027 * extent, and don't have to deal with the bitmap
2028 */
d0bd4560 2029 if (!forced && ctl->free_extents < ctl->extents_thresh) {
32cb0840
JB
2030 /*
 2031 * If this block group has some small extents we don't want to
 2032 * use up all of our free slots in the cache with them; we want
 01327610 2033 * to reserve them for larger extents. However, if we have plenty
 32cb0840
 JB
 2034 * of cache left then go ahead and add them, no sense in adding
 2035 * the overhead of a bitmap if we don't have to.
 2036 */
0b246afa 2037 if (info->bytes <= fs_info->sectorsize * 4) {
34d52cb6
LZ
2038 if (ctl->free_extents * 2 <= ctl->extents_thresh)
2039 return false;
32cb0840 2040 } else {
34d52cb6 2041 return false;
32cb0840
JB
2042 }
2043 }
96303081
JB
2044
2045 /*
dde5740f
JB
2046 * The original block groups from mkfs can be really small, like 8
2047 * megabytes, so don't bother with a bitmap for those entries. However
2048 * some block groups can be smaller than what a bitmap would cover but
2049 * are still large enough that they could overflow the 32k memory limit,
 2050 * so still allow those block groups to have a bitmap
 2051 * entry.
96303081 2052 */
b3470b5d 2053 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
34d52cb6
LZ
2054 return false;
2055
2056 return true;
2057}
2058
20e5506b 2059static const struct btrfs_free_space_op free_space_op = {
2cdc342c
JB
2060 .recalc_thresholds = recalculate_thresholds,
2061 .use_bitmap = use_bitmap,
2062};
2063
34d52cb6
LZ
2064static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2065 struct btrfs_free_space *info)
2066{
2067 struct btrfs_free_space *bitmap_info;
32da5386 2068 struct btrfs_block_group *block_group = NULL;
34d52cb6 2069 int added = 0;
2cdc342c 2070 u64 bytes, offset, bytes_added;
da080fe1 2071 enum btrfs_trim_state trim_state;
34d52cb6 2072 int ret;
96303081
JB
2073
2074 bytes = info->bytes;
2075 offset = info->offset;
da080fe1 2076 trim_state = info->trim_state;
96303081 2077
34d52cb6
LZ
2078 if (!ctl->op->use_bitmap(ctl, info))
2079 return 0;
2080
2cdc342c
JB
2081 if (ctl->op == &free_space_op)
2082 block_group = ctl->private;
38e87880 2083again:
2cdc342c
JB
2084 /*
2085 * Since we link bitmaps right into the cluster we need to see if we
2086 * have a cluster here, and if so and it has our bitmap we need to add
2087 * the free space to that bitmap.
2088 */
2089 if (block_group && !list_empty(&block_group->cluster_list)) {
2090 struct btrfs_free_cluster *cluster;
2091 struct rb_node *node;
2092 struct btrfs_free_space *entry;
2093
2094 cluster = list_entry(block_group->cluster_list.next,
2095 struct btrfs_free_cluster,
2096 block_group_list);
2097 spin_lock(&cluster->lock);
2098 node = rb_first(&cluster->root);
2099 if (!node) {
2100 spin_unlock(&cluster->lock);
38e87880 2101 goto no_cluster_bitmap;
2cdc342c
JB
2102 }
2103
2104 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2105 if (!entry->bitmap) {
2106 spin_unlock(&cluster->lock);
38e87880 2107 goto no_cluster_bitmap;
2cdc342c
JB
2108 }
2109
2110 if (entry->offset == offset_to_bitmap(ctl, offset)) {
da080fe1
DZ
2111 bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2112 bytes, trim_state);
2cdc342c
JB
2113 bytes -= bytes_added;
2114 offset += bytes_added;
2115 }
2116 spin_unlock(&cluster->lock);
2117 if (!bytes) {
2118 ret = 1;
2119 goto out;
2120 }
2121 }
38e87880
CM
2122
2123no_cluster_bitmap:
34d52cb6 2124 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
96303081
JB
2125 1, 0);
2126 if (!bitmap_info) {
b12d6869 2127 ASSERT(added == 0);
96303081
JB
2128 goto new_bitmap;
2129 }
2130
da080fe1
DZ
2131 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2132 trim_state);
2cdc342c
JB
2133 bytes -= bytes_added;
2134 offset += bytes_added;
2135 added = 0;
96303081
JB
2136
2137 if (!bytes) {
2138 ret = 1;
2139 goto out;
2140 } else
2141 goto again;
2142
2143new_bitmap:
2144 if (info && info->bitmap) {
34d52cb6 2145 add_new_bitmap(ctl, info, offset);
96303081
JB
2146 added = 1;
2147 info = NULL;
2148 goto again;
2149 } else {
34d52cb6 2150 spin_unlock(&ctl->tree_lock);
96303081
JB
2151
2152 /* no pre-allocated info, allocate a new one */
2153 if (!info) {
dc89e982
JB
2154 info = kmem_cache_zalloc(btrfs_free_space_cachep,
2155 GFP_NOFS);
96303081 2156 if (!info) {
34d52cb6 2157 spin_lock(&ctl->tree_lock);
96303081
JB
2158 ret = -ENOMEM;
2159 goto out;
2160 }
2161 }
2162
2163 /* allocate the bitmap */
3acd4850
CL
2164 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2165 GFP_NOFS);
da080fe1 2166 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
34d52cb6 2167 spin_lock(&ctl->tree_lock);
96303081
JB
2168 if (!info->bitmap) {
2169 ret = -ENOMEM;
2170 goto out;
2171 }
2172 goto again;
2173 }
2174
2175out:
2176 if (info) {
3acd4850
CL
2177 if (info->bitmap)
2178 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2179 info->bitmap);
dc89e982 2180 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2181 }
0f9dd46c
JB
2182
2183 return ret;
2184}
2185
a7ccb255
DZ
2186/*
2187 * Free space merging rules:
2188 * 1) Merge trimmed areas together
2189 * 2) Let untrimmed areas coalesce with trimmed areas
2190 * 3) Always pull neighboring regions from bitmaps
2191 *
2192 * The above rules are for when we merge free space based on btrfs_trim_state.
2193 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2194 * same reason: to promote larger extent regions which makes life easier for
2195 * find_free_extent(). Rule 2 enables coalescing based on the common path
 2196 * of returning free space from btrfs_finish_extent_commit(). So when free
 2197 * space is trimmed, it prevents aggregating newly trimmed regions with
2198 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
2199 * and provide find_free_extent() with the largest extents possible hoping for
2200 * the reuse path.
2201 */
945d8962 2202static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5 2203 struct btrfs_free_space *info, bool update_stat)
0f9dd46c 2204{
120d66ee
LZ
2205 struct btrfs_free_space *left_info;
2206 struct btrfs_free_space *right_info;
2207 bool merged = false;
2208 u64 offset = info->offset;
2209 u64 bytes = info->bytes;
a7ccb255 2210 const bool is_trimmed = btrfs_free_space_trimmed(info);
6226cb0a 2211
0f9dd46c
JB
2212 /*
2213 * first we want to see if there is free space adjacent to the range we
2214 * are adding, if there is remove that struct and add a new one to
2215 * cover the entire range
2216 */
34d52cb6 2217 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
96303081
JB
2218 if (right_info && rb_prev(&right_info->offset_index))
2219 left_info = rb_entry(rb_prev(&right_info->offset_index),
2220 struct btrfs_free_space, offset_index);
2221 else
34d52cb6 2222 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
0f9dd46c 2223
a7ccb255
DZ
2224 /* See try_merge_free_space() comment. */
2225 if (right_info && !right_info->bitmap &&
2226 (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
f333adb5 2227 if (update_stat)
34d52cb6 2228 unlink_free_space(ctl, right_info);
f333adb5 2229 else
34d52cb6 2230 __unlink_free_space(ctl, right_info);
6226cb0a 2231 info->bytes += right_info->bytes;
dc89e982 2232 kmem_cache_free(btrfs_free_space_cachep, right_info);
120d66ee 2233 merged = true;
0f9dd46c
JB
2234 }
2235
a7ccb255 2236 /* See try_merge_free_space() comment. */
96303081 2237 if (left_info && !left_info->bitmap &&
a7ccb255
DZ
2238 left_info->offset + left_info->bytes == offset &&
2239 (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
f333adb5 2240 if (update_stat)
34d52cb6 2241 unlink_free_space(ctl, left_info);
f333adb5 2242 else
34d52cb6 2243 __unlink_free_space(ctl, left_info);
6226cb0a
JB
2244 info->offset = left_info->offset;
2245 info->bytes += left_info->bytes;
dc89e982 2246 kmem_cache_free(btrfs_free_space_cachep, left_info);
120d66ee 2247 merged = true;
0f9dd46c
JB
2248 }
2249
120d66ee
LZ
2250 return merged;
2251}
2252
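/*
 * Illustrative example (not from the original source): freeing
 * [X, X + 4KiB) while extent entries [X - 4KiB, X) and
 * [X + 4KiB, X + 8KiB) already exist (and their trim states are
 * compatible per the rules above) collapses all three into a single
 * [X - 4KiB, X + 8KiB) entry; the two neighbours are unlinked and freed
 * and the new info absorbs their bytes before it is linked by the
 * caller.
 */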
20005523
FM
2253static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2254 struct btrfs_free_space *info,
2255 bool update_stat)
2256{
2257 struct btrfs_free_space *bitmap;
2258 unsigned long i;
2259 unsigned long j;
2260 const u64 end = info->offset + info->bytes;
2261 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2262 u64 bytes;
2263
2264 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2265 if (!bitmap)
2266 return false;
2267
2268 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2269 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2270 if (j == i)
2271 return false;
2272 bytes = (j - i) * ctl->unit;
2273 info->bytes += bytes;
2274
a7ccb255
DZ
2275 /* See try_merge_free_space() comment. */
2276 if (!btrfs_free_space_trimmed(bitmap))
2277 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2278
20005523
FM
2279 if (update_stat)
2280 bitmap_clear_bits(ctl, bitmap, end, bytes);
2281 else
2282 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2283
2284 if (!bitmap->bytes)
2285 free_bitmap(ctl, bitmap);
2286
2287 return true;
2288}
2289
2290static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2291 struct btrfs_free_space *info,
2292 bool update_stat)
2293{
2294 struct btrfs_free_space *bitmap;
2295 u64 bitmap_offset;
2296 unsigned long i;
2297 unsigned long j;
2298 unsigned long prev_j;
2299 u64 bytes;
2300
2301 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2302 /* If we're on a boundary, try the previous logical bitmap. */
2303 if (bitmap_offset == info->offset) {
2304 if (info->offset == 0)
2305 return false;
2306 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2307 }
2308
2309 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2310 if (!bitmap)
2311 return false;
2312
2313 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2314 j = 0;
2315 prev_j = (unsigned long)-1;
2316 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2317 if (j > i)
2318 break;
2319 prev_j = j;
2320 }
2321 if (prev_j == i)
2322 return false;
2323
2324 if (prev_j == (unsigned long)-1)
2325 bytes = (i + 1) * ctl->unit;
2326 else
2327 bytes = (i - prev_j) * ctl->unit;
2328
2329 info->offset -= bytes;
2330 info->bytes += bytes;
2331
a7ccb255
DZ
2332 /* See try_merge_free_space() comment. */
2333 if (!btrfs_free_space_trimmed(bitmap))
2334 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2335
20005523
FM
2336 if (update_stat)
2337 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2338 else
2339 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2340
2341 if (!bitmap->bytes)
2342 free_bitmap(ctl, bitmap);
2343
2344 return true;
2345}
2346
2347/*
2348 * We prefer always to allocate from extent entries, both for clustered and
2349 * non-clustered allocation requests. So when attempting to add a new extent
2350 * entry, try to see if there's adjacent free space in bitmap entries, and if
2351 * there is, migrate that space from the bitmaps to the extent.
2352 * Like this we get better chances of satisfying space allocation requests
2353 * because we attempt to satisfy them based on a single cache entry, and never
2354 * on 2 or more entries - even if the entries represent a contiguous free space
2355 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2356 * ends).
2357 */
2358static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2359 struct btrfs_free_space *info,
2360 bool update_stat)
2361{
2362 /*
2363 * Only work with disconnected entries, as we can change their offset,
2364 * and must be extent entries.
2365 */
2366 ASSERT(!info->bitmap);
2367 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2368
2369 if (ctl->total_bitmaps > 0) {
2370 bool stole_end;
2371 bool stole_front = false;
2372
2373 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2374 if (ctl->total_bitmaps > 0)
2375 stole_front = steal_from_bitmap_to_front(ctl, info,
2376 update_stat);
2377
2378 if (stole_end || stole_front)
2379 try_merge_free_space(ctl, info, update_stat);
2380 }
2381}
2382
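/*
 * Illustrative example (not from the original source): suppose a new
 * extent entry [X, X + 8KiB) is being added and the bitmap that covers
 * X + 8KiB has bits set for [X + 8KiB, X + 24KiB).
 * steal_from_bitmap_to_end() clears those bits and grows the entry to
 * [X, X + 24KiB); if the bitmap logically in front of X also ends with
 * set bits, steal_from_bitmap_to_front() extends the entry backwards in
 * the same way.
 */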
ab8d0fc4
JM
2383int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2384 struct btrfs_free_space_ctl *ctl,
a7ccb255
DZ
2385 u64 offset, u64 bytes,
2386 enum btrfs_trim_state trim_state)
120d66ee 2387{
b0643e59 2388 struct btrfs_block_group *block_group = ctl->private;
120d66ee
LZ
2389 struct btrfs_free_space *info;
2390 int ret = 0;
2391
dc89e982 2392 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
120d66ee
LZ
2393 if (!info)
2394 return -ENOMEM;
2395
2396 info->offset = offset;
2397 info->bytes = bytes;
a7ccb255 2398 info->trim_state = trim_state;
20005523 2399 RB_CLEAR_NODE(&info->offset_index);
120d66ee 2400
34d52cb6 2401 spin_lock(&ctl->tree_lock);
120d66ee 2402
34d52cb6 2403 if (try_merge_free_space(ctl, info, true))
120d66ee
LZ
2404 goto link;
2405
2406 /*
2407 * There was no extent directly to the left or right of this new
2408 * extent then we know we're going to have to allocate a new extent, so
2409 * before we do that see if we need to drop this into a bitmap
2410 */
34d52cb6 2411 ret = insert_into_bitmap(ctl, info);
120d66ee
LZ
2412 if (ret < 0) {
2413 goto out;
2414 } else if (ret) {
2415 ret = 0;
2416 goto out;
2417 }
2418link:
20005523
FM
2419 /*
2420 * Only steal free space from adjacent bitmaps if we're sure we're not
2421 * going to add the new free space to existing bitmap entries - because
2422 * that would mean unnecessary work that would be reverted. Therefore
2423 * attempt to steal space from bitmaps if we're adding an extent entry.
2424 */
2425 steal_from_bitmap(ctl, info, true);
2426
34d52cb6 2427 ret = link_free_space(ctl, info);
0f9dd46c 2428 if (ret)
dc89e982 2429 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2430out:
34d52cb6 2431 spin_unlock(&ctl->tree_lock);
6226cb0a 2432
0f9dd46c 2433 if (ret) {
ab8d0fc4 2434 btrfs_crit(fs_info, "unable to add free space :%d", ret);
b12d6869 2435 ASSERT(ret != -EEXIST);
0f9dd46c
JB
2436 }
2437
b0643e59
DZ
2438 if (trim_state != BTRFS_TRIM_STATE_TRIMMED)
2439 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2440
0f9dd46c
JB
2441 return ret;
2442}
2443
32da5386 2444int btrfs_add_free_space(struct btrfs_block_group *block_group,
478b4d9f
JB
2445 u64 bytenr, u64 size)
2446{
a7ccb255
DZ
2447 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2448
2449 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2450 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2451
478b4d9f
JB
2452 return __btrfs_add_free_space(block_group->fs_info,
2453 block_group->free_space_ctl,
a7ccb255 2454 bytenr, size, trim_state);
478b4d9f
JB
2455}
2456
b0643e59
DZ
2457/*
2458 * This is a subtle distinction because when adding free space back in general,
2459 * we want it to be added as untrimmed for async. But in the case where we add
2460 * it on loading of a block group, we want to consider it trimmed.
2461 */
2462int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2463 u64 bytenr, u64 size)
2464{
2465 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2466
2467 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2468 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2469 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2470
2471 return __btrfs_add_free_space(block_group->fs_info,
2472 block_group->free_space_ctl,
2473 bytenr, size, trim_state);
2474}
2475
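/*
 * Illustrative note (not from the original source): the difference
 * between the two helpers above only matters when DISCARD_ASYNC is
 * enabled without DISCARD_SYNC.  Space freed at runtime goes in as
 * untrimmed via btrfs_add_free_space(), which makes
 * __btrfs_add_free_space() queue the block group for async discard,
 * while space found already free when loading a block group can use
 * btrfs_add_free_space_async_trimmed() so it is not queued again.
 */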
32da5386 2476int btrfs_remove_free_space(struct btrfs_block_group *block_group,
6226cb0a 2477 u64 offset, u64 bytes)
0f9dd46c 2478{
34d52cb6 2479 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c 2480 struct btrfs_free_space *info;
b0175117
JB
2481 int ret;
2482 bool re_search = false;
0f9dd46c 2483
34d52cb6 2484 spin_lock(&ctl->tree_lock);
6226cb0a 2485
96303081 2486again:
b0175117 2487 ret = 0;
bdb7d303
JB
2488 if (!bytes)
2489 goto out_lock;
2490
34d52cb6 2491 info = tree_search_offset(ctl, offset, 0, 0);
96303081 2492 if (!info) {
6606bb97
JB
2493 /*
2494 * oops didn't find an extent that matched the space we wanted
2495 * to remove, look for a bitmap instead
2496 */
34d52cb6 2497 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
6606bb97
JB
2498 1, 0);
2499 if (!info) {
b0175117
JB
2500 /*
2501 * If we found a partial bit of our free space in a
2502 * bitmap but then couldn't find the other part this may
2503 * be a problem, so WARN about it.
24a70313 2504 */
b0175117 2505 WARN_ON(re_search);
6606bb97
JB
2506 goto out_lock;
2507 }
96303081
JB
2508 }
2509
b0175117 2510 re_search = false;
bdb7d303 2511 if (!info->bitmap) {
34d52cb6 2512 unlink_free_space(ctl, info);
bdb7d303
JB
2513 if (offset == info->offset) {
2514 u64 to_free = min(bytes, info->bytes);
2515
2516 info->bytes -= to_free;
2517 info->offset += to_free;
2518 if (info->bytes) {
2519 ret = link_free_space(ctl, info);
2520 WARN_ON(ret);
2521 } else {
2522 kmem_cache_free(btrfs_free_space_cachep, info);
2523 }
0f9dd46c 2524
bdb7d303
JB
2525 offset += to_free;
2526 bytes -= to_free;
2527 goto again;
2528 } else {
2529 u64 old_end = info->bytes + info->offset;
9b49c9b9 2530
bdb7d303 2531 info->bytes = offset - info->offset;
34d52cb6 2532 ret = link_free_space(ctl, info);
96303081
JB
2533 WARN_ON(ret);
2534 if (ret)
2535 goto out_lock;
96303081 2536
bdb7d303
JB
2537 /* Not enough bytes in this entry to satisfy us */
2538 if (old_end < offset + bytes) {
2539 bytes -= old_end - offset;
2540 offset = old_end;
2541 goto again;
2542 } else if (old_end == offset + bytes) {
2543 /* all done */
2544 goto out_lock;
2545 }
2546 spin_unlock(&ctl->tree_lock);
2547
a7ccb255
DZ
2548 ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2549 offset + bytes,
2550 old_end - (offset + bytes),
2551 info->trim_state);
bdb7d303
JB
2552 WARN_ON(ret);
2553 goto out;
2554 }
0f9dd46c 2555 }
96303081 2556
34d52cb6 2557 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
b0175117
JB
2558 if (ret == -EAGAIN) {
2559 re_search = true;
96303081 2560 goto again;
b0175117 2561 }
96303081 2562out_lock:
34d52cb6 2563 spin_unlock(&ctl->tree_lock);
0f9dd46c 2564out:
25179201
JB
2565 return ret;
2566}
2567
32da5386 2568void btrfs_dump_free_space(struct btrfs_block_group *block_group,
0f9dd46c
JB
2569 u64 bytes)
2570{
0b246afa 2571 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 2572 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c
JB
2573 struct btrfs_free_space *info;
2574 struct rb_node *n;
2575 int count = 0;
2576
9084cb6a 2577 spin_lock(&ctl->tree_lock);
34d52cb6 2578 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
0f9dd46c 2579 info = rb_entry(n, struct btrfs_free_space, offset_index);
f6175efa 2580 if (info->bytes >= bytes && !block_group->ro)
0f9dd46c 2581 count++;
0b246afa 2582 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
efe120a0 2583 info->offset, info->bytes,
96303081 2584 (info->bitmap) ? "yes" : "no");
0f9dd46c 2585 }
9084cb6a 2586 spin_unlock(&ctl->tree_lock);
0b246afa 2587 btrfs_info(fs_info, "block group has cluster?: %s",
96303081 2588 list_empty(&block_group->cluster_list) ? "no" : "yes");
0b246afa 2589 btrfs_info(fs_info,
efe120a0 2590 "%d blocks of free space at or bigger than bytes is", count);
0f9dd46c
JB
2591}
2592
32da5386 2593void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
0f9dd46c 2594{
0b246afa 2595 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 2596 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c 2597
34d52cb6 2598 spin_lock_init(&ctl->tree_lock);
0b246afa 2599 ctl->unit = fs_info->sectorsize;
b3470b5d 2600 ctl->start = block_group->start;
34d52cb6
LZ
2601 ctl->private = block_group;
2602 ctl->op = &free_space_op;
55507ce3
FM
2603 INIT_LIST_HEAD(&ctl->trimming_ranges);
2604 mutex_init(&ctl->cache_writeout_mutex);
0f9dd46c 2605
34d52cb6
LZ
2606 /*
2607 * we only want to have 32k of ram per block group for keeping
2608 * track of free space, and if we pass 1/2 of that we want to
2609 * start converting things over to using bitmaps
2610 */
ee22184b 2611 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
0f9dd46c
JB
2612}
2613
fa9c0d79
CM
2614/*
2615 * for a given cluster, put all of its extents back into the free
2616 * space cache. If the block group passed doesn't match the block group
2617 * pointed to by the cluster, someone else raced in and freed the
2618 * cluster already. In that case, we just return without changing anything
2619 */
2620static int
2621__btrfs_return_cluster_to_free_space(
32da5386 2622 struct btrfs_block_group *block_group,
fa9c0d79
CM
2623 struct btrfs_free_cluster *cluster)
2624{
34d52cb6 2625 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79
CM
2626 struct btrfs_free_space *entry;
2627 struct rb_node *node;
2628
2629 spin_lock(&cluster->lock);
2630 if (cluster->block_group != block_group)
2631 goto out;
2632
96303081 2633 cluster->block_group = NULL;
fa9c0d79 2634 cluster->window_start = 0;
96303081 2635 list_del_init(&cluster->block_group_list);
96303081 2636
fa9c0d79 2637 node = rb_first(&cluster->root);
96303081 2638 while (node) {
4e69b598
JB
2639 bool bitmap;
2640
fa9c0d79
CM
2641 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2642 node = rb_next(&entry->offset_index);
2643 rb_erase(&entry->offset_index, &cluster->root);
20005523 2644 RB_CLEAR_NODE(&entry->offset_index);
4e69b598
JB
2645
2646 bitmap = (entry->bitmap != NULL);
20005523 2647 if (!bitmap) {
34d52cb6 2648 try_merge_free_space(ctl, entry, false);
20005523
FM
2649 steal_from_bitmap(ctl, entry, false);
2650 }
34d52cb6 2651 tree_insert_offset(&ctl->free_space_offset,
4e69b598 2652 entry->offset, &entry->offset_index, bitmap);
fa9c0d79 2653 }
6bef4d31 2654 cluster->root = RB_ROOT;
96303081 2655
fa9c0d79
CM
2656out:
2657 spin_unlock(&cluster->lock);
96303081 2658 btrfs_put_block_group(block_group);
fa9c0d79
CM
2659 return 0;
2660}
2661
48a3b636
ES
2662static void __btrfs_remove_free_space_cache_locked(
2663 struct btrfs_free_space_ctl *ctl)
0f9dd46c
JB
2664{
2665 struct btrfs_free_space *info;
2666 struct rb_node *node;
581bb050 2667
581bb050
LZ
2668 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2669 info = rb_entry(node, struct btrfs_free_space, offset_index);
9b90f513
JB
2670 if (!info->bitmap) {
2671 unlink_free_space(ctl, info);
2672 kmem_cache_free(btrfs_free_space_cachep, info);
2673 } else {
2674 free_bitmap(ctl, info);
2675 }
351810c1
DS
2676
2677 cond_resched_lock(&ctl->tree_lock);
581bb050 2678 }
09655373
CM
2679}
2680
2681void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2682{
2683 spin_lock(&ctl->tree_lock);
2684 __btrfs_remove_free_space_cache_locked(ctl);
581bb050
LZ
2685 spin_unlock(&ctl->tree_lock);
2686}
2687
32da5386 2688void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
581bb050
LZ
2689{
2690 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79 2691 struct btrfs_free_cluster *cluster;
96303081 2692 struct list_head *head;
0f9dd46c 2693
34d52cb6 2694 spin_lock(&ctl->tree_lock);
96303081
JB
2695 while ((head = block_group->cluster_list.next) !=
2696 &block_group->cluster_list) {
2697 cluster = list_entry(head, struct btrfs_free_cluster,
2698 block_group_list);
fa9c0d79
CM
2699
2700 WARN_ON(cluster->block_group != block_group);
2701 __btrfs_return_cluster_to_free_space(block_group, cluster);
351810c1
DS
2702
2703 cond_resched_lock(&ctl->tree_lock);
fa9c0d79 2704 }
09655373 2705 __btrfs_remove_free_space_cache_locked(ctl);
34d52cb6 2706 spin_unlock(&ctl->tree_lock);
fa9c0d79 2707
0f9dd46c
JB
2708}
2709
6e80d4f8
DZ
2710/**
2711 * btrfs_is_free_space_trimmed - see if everything is trimmed
2712 * @block_group: block_group of interest
2713 *
2714 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
2715 */
2716bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
2717{
2718 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2719 struct btrfs_free_space *info;
2720 struct rb_node *node;
2721 bool ret = true;
2722
2723 spin_lock(&ctl->tree_lock);
2724 node = rb_first(&ctl->free_space_offset);
2725
2726 while (node) {
2727 info = rb_entry(node, struct btrfs_free_space, offset_index);
2728
2729 if (!btrfs_free_space_trimmed(info)) {
2730 ret = false;
2731 break;
2732 }
2733
2734 node = rb_next(node);
2735 }
2736
2737 spin_unlock(&ctl->tree_lock);
2738 return ret;
2739}
2740
32da5386 2741u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
a4820398
MX
2742 u64 offset, u64 bytes, u64 empty_size,
2743 u64 *max_extent_size)
0f9dd46c 2744{
34d52cb6 2745 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
6226cb0a 2746 struct btrfs_free_space *entry = NULL;
96303081 2747 u64 bytes_search = bytes + empty_size;
6226cb0a 2748 u64 ret = 0;
53b381b3
DW
2749 u64 align_gap = 0;
2750 u64 align_gap_len = 0;
a7ccb255 2751 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
0f9dd46c 2752
34d52cb6 2753 spin_lock(&ctl->tree_lock);
53b381b3 2754 entry = find_free_space(ctl, &offset, &bytes_search,
a4820398 2755 block_group->full_stripe_len, max_extent_size);
6226cb0a 2756 if (!entry)
96303081
JB
2757 goto out;
2758
2759 ret = offset;
2760 if (entry->bitmap) {
34d52cb6 2761 bitmap_clear_bits(ctl, entry, offset, bytes);
edf6e2d1 2762 if (!entry->bytes)
34d52cb6 2763 free_bitmap(ctl, entry);
96303081 2764 } else {
34d52cb6 2765 unlink_free_space(ctl, entry);
53b381b3
DW
2766 align_gap_len = offset - entry->offset;
2767 align_gap = entry->offset;
a7ccb255 2768 align_gap_trim_state = entry->trim_state;
53b381b3
DW
2769
2770 entry->offset = offset + bytes;
2771 WARN_ON(entry->bytes < bytes + align_gap_len);
2772
2773 entry->bytes -= bytes + align_gap_len;
6226cb0a 2774 if (!entry->bytes)
dc89e982 2775 kmem_cache_free(btrfs_free_space_cachep, entry);
6226cb0a 2776 else
34d52cb6 2777 link_free_space(ctl, entry);
6226cb0a 2778 }
96303081 2779out:
34d52cb6 2780 spin_unlock(&ctl->tree_lock);
817d52f8 2781
53b381b3 2782 if (align_gap_len)
ab8d0fc4 2783 __btrfs_add_free_space(block_group->fs_info, ctl,
a7ccb255
DZ
2784 align_gap, align_gap_len,
2785 align_gap_trim_state);
0f9dd46c
JB
2786 return ret;
2787}
fa9c0d79
CM
2788
2789/*
2790 * given a cluster, put all of its extents back into the free space
2791 * cache. If a block group is passed, this function will only free
2792 * a cluster that belongs to the passed block group.
2793 *
2794 * Otherwise, it'll get a reference on the block group pointed to by the
2795 * cluster and remove the cluster from it.
2796 */
2797int btrfs_return_cluster_to_free_space(
32da5386 2798 struct btrfs_block_group *block_group,
fa9c0d79
CM
2799 struct btrfs_free_cluster *cluster)
2800{
34d52cb6 2801 struct btrfs_free_space_ctl *ctl;
fa9c0d79
CM
2802 int ret;
2803
2804 /* first, get a safe pointer to the block group */
2805 spin_lock(&cluster->lock);
2806 if (!block_group) {
2807 block_group = cluster->block_group;
2808 if (!block_group) {
2809 spin_unlock(&cluster->lock);
2810 return 0;
2811 }
2812 } else if (cluster->block_group != block_group) {
2813 /* someone else has already freed it don't redo their work */
2814 spin_unlock(&cluster->lock);
2815 return 0;
2816 }
2817 atomic_inc(&block_group->count);
2818 spin_unlock(&cluster->lock);
2819
34d52cb6
LZ
2820 ctl = block_group->free_space_ctl;
2821
fa9c0d79 2822 /* now return any extents the cluster had on it */
34d52cb6 2823 spin_lock(&ctl->tree_lock);
fa9c0d79 2824 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
34d52cb6 2825 spin_unlock(&ctl->tree_lock);
fa9c0d79 2826
6e80d4f8
DZ
2827 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
2828
fa9c0d79
CM
2829 /* finally drop our ref */
2830 btrfs_put_block_group(block_group);
2831 return ret;
2832}
2833
32da5386 2834static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
96303081 2835 struct btrfs_free_cluster *cluster,
4e69b598 2836 struct btrfs_free_space *entry,
a4820398
MX
2837 u64 bytes, u64 min_start,
2838 u64 *max_extent_size)
96303081 2839{
34d52cb6 2840 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
2841 int err;
2842 u64 search_start = cluster->window_start;
2843 u64 search_bytes = bytes;
2844 u64 ret = 0;
2845
96303081
JB
2846 search_start = min_start;
2847 search_bytes = bytes;
2848
0584f718 2849 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
a4820398 2850 if (err) {
ad22cf6e
JB
2851 *max_extent_size = max(get_max_extent_size(entry),
2852 *max_extent_size);
4e69b598 2853 return 0;
a4820398 2854 }
96303081
JB
2855
2856 ret = search_start;
bb3ac5a4 2857 __bitmap_clear_bits(ctl, entry, ret, bytes);
96303081
JB
2858
2859 return ret;
2860}
2861
fa9c0d79
CM
2862/*
2863 * given a cluster, try to allocate 'bytes' from it, returns 0
2864 * if it couldn't find anything suitably large, or a logical disk offset
2865 * if things worked out
2866 */
32da5386 2867u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
fa9c0d79 2868 struct btrfs_free_cluster *cluster, u64 bytes,
a4820398 2869 u64 min_start, u64 *max_extent_size)
fa9c0d79 2870{
34d52cb6 2871 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79
CM
2872 struct btrfs_free_space *entry = NULL;
2873 struct rb_node *node;
2874 u64 ret = 0;
2875
2876 spin_lock(&cluster->lock);
2877 if (bytes > cluster->max_size)
2878 goto out;
2879
2880 if (cluster->block_group != block_group)
2881 goto out;
2882
2883 node = rb_first(&cluster->root);
2884 if (!node)
2885 goto out;
2886
2887 entry = rb_entry(node, struct btrfs_free_space, offset_index);
67871254 2888 while (1) {
ad22cf6e
JB
2889 if (entry->bytes < bytes)
2890 *max_extent_size = max(get_max_extent_size(entry),
2891 *max_extent_size);
a4820398 2892
4e69b598
JB
2893 if (entry->bytes < bytes ||
2894 (!entry->bitmap && entry->offset < min_start)) {
fa9c0d79
CM
2895 node = rb_next(&entry->offset_index);
2896 if (!node)
2897 break;
2898 entry = rb_entry(node, struct btrfs_free_space,
2899 offset_index);
2900 continue;
2901 }
fa9c0d79 2902
4e69b598
JB
2903 if (entry->bitmap) {
2904 ret = btrfs_alloc_from_bitmap(block_group,
2905 cluster, entry, bytes,
a4820398
MX
2906 cluster->window_start,
2907 max_extent_size);
4e69b598 2908 if (ret == 0) {
4e69b598
JB
2909 node = rb_next(&entry->offset_index);
2910 if (!node)
2911 break;
2912 entry = rb_entry(node, struct btrfs_free_space,
2913 offset_index);
2914 continue;
2915 }
9b230628 2916 cluster->window_start += bytes;
4e69b598 2917 } else {
4e69b598
JB
2918 ret = entry->offset;
2919
2920 entry->offset += bytes;
2921 entry->bytes -= bytes;
2922 }
fa9c0d79 2923
5e71b5d5 2924 if (entry->bytes == 0)
fa9c0d79 2925 rb_erase(&entry->offset_index, &cluster->root);
fa9c0d79
CM
2926 break;
2927 }
2928out:
2929 spin_unlock(&cluster->lock);
96303081 2930
5e71b5d5
LZ
2931 if (!ret)
2932 return 0;
2933
34d52cb6 2934 spin_lock(&ctl->tree_lock);
5e71b5d5 2935
34d52cb6 2936 ctl->free_space -= bytes;
5e71b5d5 2937 if (entry->bytes == 0) {
34d52cb6 2938 ctl->free_extents--;
4e69b598 2939 if (entry->bitmap) {
3acd4850
CL
2940 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2941 entry->bitmap);
34d52cb6
LZ
2942 ctl->total_bitmaps--;
2943 ctl->op->recalc_thresholds(ctl);
4e69b598 2944 }
dc89e982 2945 kmem_cache_free(btrfs_free_space_cachep, entry);
5e71b5d5
LZ
2946 }
2947
34d52cb6 2948 spin_unlock(&ctl->tree_lock);
5e71b5d5 2949
fa9c0d79
CM
2950 return ret;
2951}
2952
32da5386 2953static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
96303081
JB
2954 struct btrfs_free_space *entry,
2955 struct btrfs_free_cluster *cluster,
1bb91902
AO
2956 u64 offset, u64 bytes,
2957 u64 cont1_bytes, u64 min_bytes)
96303081 2958{
34d52cb6 2959 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
2960 unsigned long next_zero;
2961 unsigned long i;
1bb91902
AO
2962 unsigned long want_bits;
2963 unsigned long min_bits;
96303081 2964 unsigned long found_bits;
cef40483 2965 unsigned long max_bits = 0;
96303081
JB
2966 unsigned long start = 0;
2967 unsigned long total_found = 0;
4e69b598 2968 int ret;
96303081 2969
96009762 2970 i = offset_to_bit(entry->offset, ctl->unit,
96303081 2971 max_t(u64, offset, entry->offset));
96009762
WSH
2972 want_bits = bytes_to_bits(bytes, ctl->unit);
2973 min_bits = bytes_to_bits(min_bytes, ctl->unit);
96303081 2974
cef40483
JB
2975 /*
2976 * Don't bother looking for a cluster in this bitmap if it's heavily
2977 * fragmented.
2978 */
2979 if (entry->max_extent_size &&
2980 entry->max_extent_size < cont1_bytes)
2981 return -ENOSPC;
96303081
JB
2982again:
2983 found_bits = 0;
ebb3dad4 2984 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
96303081
JB
2985 next_zero = find_next_zero_bit(entry->bitmap,
2986 BITS_PER_BITMAP, i);
1bb91902 2987 if (next_zero - i >= min_bits) {
96303081 2988 found_bits = next_zero - i;
cef40483
JB
2989 if (found_bits > max_bits)
2990 max_bits = found_bits;
96303081
JB
2991 break;
2992 }
cef40483
JB
2993 if (next_zero - i > max_bits)
2994 max_bits = next_zero - i;
96303081
JB
2995 i = next_zero;
2996 }
2997
cef40483
JB
2998 if (!found_bits) {
2999 entry->max_extent_size = (u64)max_bits * ctl->unit;
4e69b598 3000 return -ENOSPC;
cef40483 3001 }
96303081 3002
1bb91902 3003 if (!total_found) {
96303081 3004 start = i;
b78d09bc 3005 cluster->max_size = 0;
96303081
JB
3006 }
3007
3008 total_found += found_bits;
3009
96009762
WSH
3010 if (cluster->max_size < found_bits * ctl->unit)
3011 cluster->max_size = found_bits * ctl->unit;
96303081 3012
1bb91902
AO
3013 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3014 i = next_zero + 1;
96303081
JB
3015 goto again;
3016 }
3017
96009762 3018 cluster->window_start = start * ctl->unit + entry->offset;
34d52cb6 3019 rb_erase(&entry->offset_index, &ctl->free_space_offset);
4e69b598
JB
3020 ret = tree_insert_offset(&cluster->root, entry->offset,
3021 &entry->offset_index, 1);
b12d6869 3022 ASSERT(!ret); /* -EEXIST; Logic error */
96303081 3023
3f7de037 3024 trace_btrfs_setup_cluster(block_group, cluster,
96009762 3025 total_found * ctl->unit, 1);
96303081
JB
3026 return 0;
3027}
3028
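/*
 * Illustrative example (not from the original source): with a 4KiB
 * unit, bytes = 1MiB, cont1_bytes = 256KiB and min_bytes = 4KiB, the
 * bitmap scan above needs want_bits = 256 set bits in total, only
 * counts runs of at least min_bits = 1 bit, and keeps scanning until
 * one run is at least 64 bits (256KiB) long so that cluster->max_size
 * satisfies cont1_bytes.
 */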
4e69b598
JB
3029/*
3030 * This searches the block group for just extents to fill the cluster with.
1bb91902
AO
3031 * Try to find a cluster with at least bytes total bytes, at least one
3032 * extent of cont1_bytes, and other clusters of at least min_bytes.
4e69b598 3033 */
3de85bb9 3034static noinline int
32da5386 3035setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3de85bb9
JB
3036 struct btrfs_free_cluster *cluster,
3037 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3038 u64 cont1_bytes, u64 min_bytes)
4e69b598 3039{
34d52cb6 3040 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
4e69b598
JB
3041 struct btrfs_free_space *first = NULL;
3042 struct btrfs_free_space *entry = NULL;
4e69b598
JB
3043 struct btrfs_free_space *last;
3044 struct rb_node *node;
4e69b598
JB
3045 u64 window_free;
3046 u64 max_extent;
3f7de037 3047 u64 total_size = 0;
4e69b598 3048
34d52cb6 3049 entry = tree_search_offset(ctl, offset, 0, 1);
4e69b598
JB
3050 if (!entry)
3051 return -ENOSPC;
3052
3053 /*
3054 * We don't want bitmaps, so just move along until we find a normal
3055 * extent entry.
3056 */
1bb91902
AO
3057 while (entry->bitmap || entry->bytes < min_bytes) {
3058 if (entry->bitmap && list_empty(&entry->list))
86d4a77b 3059 list_add_tail(&entry->list, bitmaps);
4e69b598
JB
3060 node = rb_next(&entry->offset_index);
3061 if (!node)
3062 return -ENOSPC;
3063 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3064 }
3065
4e69b598
JB
3066 window_free = entry->bytes;
3067 max_extent = entry->bytes;
3068 first = entry;
3069 last = entry;
4e69b598 3070
1bb91902
AO
3071 for (node = rb_next(&entry->offset_index); node;
3072 node = rb_next(&entry->offset_index)) {
4e69b598
JB
3073 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3074
86d4a77b
JB
3075 if (entry->bitmap) {
3076 if (list_empty(&entry->list))
3077 list_add_tail(&entry->list, bitmaps);
4e69b598 3078 continue;
86d4a77b
JB
3079 }
3080
1bb91902
AO
3081 if (entry->bytes < min_bytes)
3082 continue;
3083
3084 last = entry;
3085 window_free += entry->bytes;
3086 if (entry->bytes > max_extent)
4e69b598 3087 max_extent = entry->bytes;
4e69b598
JB
3088 }
3089
1bb91902
AO
3090 if (window_free < bytes || max_extent < cont1_bytes)
3091 return -ENOSPC;
3092
4e69b598
JB
3093 cluster->window_start = first->offset;
3094
3095 node = &first->offset_index;
3096
3097 /*
3098 * now we've found our entries, pull them out of the free space
3099 * cache and put them into the cluster rbtree
3100 */
3101 do {
3102 int ret;
3103
3104 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3105 node = rb_next(&entry->offset_index);
1bb91902 3106 if (entry->bitmap || entry->bytes < min_bytes)
4e69b598
JB
3107 continue;
3108
34d52cb6 3109 rb_erase(&entry->offset_index, &ctl->free_space_offset);
4e69b598
JB
3110 ret = tree_insert_offset(&cluster->root, entry->offset,
3111 &entry->offset_index, 0);
3f7de037 3112 total_size += entry->bytes;
b12d6869 3113 ASSERT(!ret); /* -EEXIST; Logic error */
4e69b598
JB
3114 } while (node && entry != last);
3115
3116 cluster->max_size = max_extent;
3f7de037 3117 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
4e69b598
JB
3118 return 0;
3119}
3120
3121/*
3122 * This specifically looks for bitmaps that may work in the cluster, we assume
3123 * that we have already failed to find extents that will work.
3124 */
3de85bb9 3125static noinline int
32da5386 3126setup_cluster_bitmap(struct btrfs_block_group *block_group,
3de85bb9
JB
3127 struct btrfs_free_cluster *cluster,
3128 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3129 u64 cont1_bytes, u64 min_bytes)
4e69b598 3130{
34d52cb6 3131 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1b9b922a 3132 struct btrfs_free_space *entry = NULL;
4e69b598 3133 int ret = -ENOSPC;
0f0fbf1d 3134 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
4e69b598 3135
34d52cb6 3136 if (ctl->total_bitmaps == 0)
4e69b598
JB
3137 return -ENOSPC;
3138
0f0fbf1d
LZ
3139 /*
3140 * The bitmap that covers offset won't be in the list unless offset
3141 * is just its start offset.
3142 */
1b9b922a
CM
3143 if (!list_empty(bitmaps))
3144 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3145
3146 if (!entry || entry->offset != bitmap_offset) {
0f0fbf1d
LZ
3147 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3148 if (entry && list_empty(&entry->list))
3149 list_add(&entry->list, bitmaps);
3150 }
3151
86d4a77b 3152 list_for_each_entry(entry, bitmaps, list) {
357b9784 3153 if (entry->bytes < bytes)
86d4a77b
JB
3154 continue;
3155 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
1bb91902 3156 bytes, cont1_bytes, min_bytes);
86d4a77b
JB
3157 if (!ret)
3158 return 0;
3159 }
3160
3161 /*
52621cb6
LZ
3162 * The bitmaps list has all the bitmaps that record free space
3163 * starting after offset, so no more search is required.
86d4a77b 3164 */
52621cb6 3165 return -ENOSPC;
4e69b598
JB
3166}
3167
fa9c0d79
CM
3168/*
3169 * here we try to find a cluster of blocks in a block group. The goal
1bb91902 3170 * is to find at least bytes+empty_size.
fa9c0d79
CM
3171 * We might not find them all in one contiguous area.
3172 *
3173 * returns zero and sets up cluster if things worked out, otherwise
 3174 * it returns -ENOSPC
3175 */
32da5386 3176int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
fa9c0d79
CM
3177 struct btrfs_free_cluster *cluster,
3178 u64 offset, u64 bytes, u64 empty_size)
3179{
2ceeae2e 3180 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 3181 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
86d4a77b 3182 struct btrfs_free_space *entry, *tmp;
52621cb6 3183 LIST_HEAD(bitmaps);
fa9c0d79 3184 u64 min_bytes;
1bb91902 3185 u64 cont1_bytes;
fa9c0d79
CM
3186 int ret;
3187
1bb91902
AO
3188 /*
3189 * Choose the minimum extent size we'll require for this
3190 * cluster. For SSD_SPREAD, don't allow any fragmentation.
 3191 * For metadata, allow allocations with smaller extents. For
3192 * data, keep it dense.
3193 */
0b246afa 3194 if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
1bb91902 3195 cont1_bytes = min_bytes = bytes + empty_size;
451d7585 3196 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1bb91902 3197 cont1_bytes = bytes;
0b246afa 3198 min_bytes = fs_info->sectorsize;
1bb91902
AO
3199 } else {
3200 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
0b246afa 3201 min_bytes = fs_info->sectorsize;
1bb91902 3202 }
fa9c0d79 3203
34d52cb6 3204 spin_lock(&ctl->tree_lock);
7d0d2e8e
JB
3205
3206 /*
3207 * If we know we don't have enough space to make a cluster don't even
3208 * bother doing all the work to try and find one.
3209 */
1bb91902 3210 if (ctl->free_space < bytes) {
34d52cb6 3211 spin_unlock(&ctl->tree_lock);
7d0d2e8e
JB
3212 return -ENOSPC;
3213 }
3214
fa9c0d79
CM
3215 spin_lock(&cluster->lock);
3216
3217 /* someone already found a cluster, hooray */
3218 if (cluster->block_group) {
3219 ret = 0;
3220 goto out;
3221 }
fa9c0d79 3222
3f7de037
JB
3223 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3224 min_bytes);
3225
86d4a77b 3226 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
1bb91902
AO
3227 bytes + empty_size,
3228 cont1_bytes, min_bytes);
4e69b598 3229 if (ret)
86d4a77b 3230 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
1bb91902
AO
3231 offset, bytes + empty_size,
3232 cont1_bytes, min_bytes);
86d4a77b
JB
3233
3234 /* Clear our temporary list */
3235 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3236 list_del_init(&entry->list);
fa9c0d79 3237
4e69b598
JB
3238 if (!ret) {
3239 atomic_inc(&block_group->count);
3240 list_add_tail(&cluster->block_group_list,
3241 &block_group->cluster_list);
3242 cluster->block_group = block_group;
3f7de037
JB
3243 } else {
3244 trace_btrfs_failed_cluster_setup(block_group);
fa9c0d79 3245 }
fa9c0d79
CM
3246out:
3247 spin_unlock(&cluster->lock);
34d52cb6 3248 spin_unlock(&ctl->tree_lock);
fa9c0d79
CM
3249
3250 return ret;
3251}
3252
3253/*
3254 * simple code to zero out a cluster
3255 */
3256void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3257{
3258 spin_lock_init(&cluster->lock);
3259 spin_lock_init(&cluster->refill_lock);
6bef4d31 3260 cluster->root = RB_ROOT;
fa9c0d79 3261 cluster->max_size = 0;
c759c4e1 3262 cluster->fragmented = false;
fa9c0d79
CM
3263 INIT_LIST_HEAD(&cluster->block_group_list);
3264 cluster->block_group = NULL;
3265}
3266
32da5386 3267static int do_trimming(struct btrfs_block_group *block_group,
7fe1e641 3268 u64 *total_trimmed, u64 start, u64 bytes,
55507ce3 3269 u64 reserved_start, u64 reserved_bytes,
b0643e59 3270 enum btrfs_trim_state reserved_trim_state,
55507ce3 3271 struct btrfs_trim_range *trim_entry)
f7039b1d 3272{
7fe1e641 3273 struct btrfs_space_info *space_info = block_group->space_info;
f7039b1d 3274 struct btrfs_fs_info *fs_info = block_group->fs_info;
55507ce3 3275 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
7fe1e641
LZ
3276 int ret;
3277 int update = 0;
b0643e59
DZ
3278 const u64 end = start + bytes;
3279 const u64 reserved_end = reserved_start + reserved_bytes;
3280 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3281 u64 trimmed = 0;
f7039b1d 3282
7fe1e641
LZ
3283 spin_lock(&space_info->lock);
3284 spin_lock(&block_group->lock);
3285 if (!block_group->ro) {
3286 block_group->reserved += reserved_bytes;
3287 space_info->bytes_reserved += reserved_bytes;
3288 update = 1;
3289 }
3290 spin_unlock(&block_group->lock);
3291 spin_unlock(&space_info->lock);
3292
2ff7e61e 3293 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
b0643e59 3294 if (!ret) {
7fe1e641 3295 *total_trimmed += trimmed;
b0643e59
DZ
3296 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3297 }
7fe1e641 3298
55507ce3 3299 mutex_lock(&ctl->cache_writeout_mutex);
b0643e59
DZ
3300 if (reserved_start < start)
3301 __btrfs_add_free_space(fs_info, ctl, reserved_start,
3302 start - reserved_start,
3303 reserved_trim_state);
3304 if (start + bytes < reserved_start + reserved_bytes)
3305 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3306 reserved_trim_state);
3307 __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
55507ce3
FM
3308 list_del(&trim_entry->list);
3309 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3310
3311 if (update) {
3312 spin_lock(&space_info->lock);
3313 spin_lock(&block_group->lock);
3314 if (block_group->ro)
3315 space_info->bytes_readonly += reserved_bytes;
3316 block_group->reserved -= reserved_bytes;
3317 space_info->bytes_reserved -= reserved_bytes;
7fe1e641 3318 spin_unlock(&block_group->lock);
8f63a840 3319 spin_unlock(&space_info->lock);
7fe1e641
LZ
3320 }
3321
3322 return ret;
3323}
3324
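/*
 * Illustrative example (not from the original source): if do_trimming()
 * is called with a reserved range [R, R + 128KiB) but only
 * [R + 16KiB, R + 96KiB) is actually discarded, the untouched head
 * [R, R + 16KiB) and tail [R + 96KiB, R + 128KiB) are re-added with the
 * trim state they were reserved with, while the discarded middle is
 * re-added as trimmed (or untrimmed if the discard failed).
 */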
32da5386 3325static int trim_no_bitmap(struct btrfs_block_group *block_group,
7fe1e641
LZ
3326 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3327{
3328 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3329 struct btrfs_free_space *entry;
3330 struct rb_node *node;
3331 int ret = 0;
3332 u64 extent_start;
3333 u64 extent_bytes;
b0643e59 3334 enum btrfs_trim_state extent_trim_state;
7fe1e641 3335 u64 bytes;
f7039b1d
LD
3336
3337 while (start < end) {
55507ce3
FM
3338 struct btrfs_trim_range trim_entry;
3339
3340 mutex_lock(&ctl->cache_writeout_mutex);
34d52cb6 3341 spin_lock(&ctl->tree_lock);
f7039b1d 3342
34d52cb6
LZ
3343 if (ctl->free_space < minlen) {
3344 spin_unlock(&ctl->tree_lock);
55507ce3 3345 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d
LD
3346 break;
3347 }
3348
34d52cb6 3349 entry = tree_search_offset(ctl, start, 0, 1);
7fe1e641 3350 if (!entry) {
34d52cb6 3351 spin_unlock(&ctl->tree_lock);
55507ce3 3352 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d
LD
3353 break;
3354 }
3355
7fe1e641
LZ
3356 /* skip bitmaps */
3357 while (entry->bitmap) {
3358 node = rb_next(&entry->offset_index);
3359 if (!node) {
34d52cb6 3360 spin_unlock(&ctl->tree_lock);
55507ce3 3361 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641 3362 goto out;
f7039b1d 3363 }
7fe1e641
LZ
3364 entry = rb_entry(node, struct btrfs_free_space,
3365 offset_index);
f7039b1d
LD
3366 }
3367
7fe1e641
LZ
3368 if (entry->offset >= end) {
3369 spin_unlock(&ctl->tree_lock);
55507ce3 3370 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641 3371 break;
f7039b1d
LD
3372 }
3373
7fe1e641
LZ
3374 extent_start = entry->offset;
3375 extent_bytes = entry->bytes;
b0643e59 3376 extent_trim_state = entry->trim_state;
7fe1e641
LZ
3377 start = max(start, extent_start);
3378 bytes = min(extent_start + extent_bytes, end) - start;
3379 if (bytes < minlen) {
3380 spin_unlock(&ctl->tree_lock);
55507ce3 3381 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641 3382 goto next;
f7039b1d
LD
3383 }
3384
7fe1e641
LZ
3385 unlink_free_space(ctl, entry);
3386 kmem_cache_free(btrfs_free_space_cachep, entry);
3387
34d52cb6 3388 spin_unlock(&ctl->tree_lock);
55507ce3
FM
3389 trim_entry.start = extent_start;
3390 trim_entry.bytes = extent_bytes;
3391 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3392 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d 3393
7fe1e641 3394 ret = do_trimming(block_group, total_trimmed, start, bytes,
b0643e59
DZ
3395 extent_start, extent_bytes, extent_trim_state,
3396 &trim_entry);
7fe1e641
LZ
3397 if (ret)
3398 break;
3399next:
3400 start += bytes;
f7039b1d 3401
7fe1e641
LZ
3402 if (fatal_signal_pending(current)) {
3403 ret = -ERESTARTSYS;
3404 break;
3405 }
3406
3407 cond_resched();
3408 }
3409out:
3410 return ret;
3411}
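/*
 * trim_no_bitmap() above and trim_bitmaps() below share one pattern: a range
 * is detached from the free space tree under tree_lock, published on
 * ctl->trimming_ranges while cache_writeout_mutex is held so that an
 * in-flight range is not lost to a concurrent free space cache write-out,
 * and then handed to do_trimming(), which issues the discard, re-adds the
 * space and removes the entry from that list.
 */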
3412
da080fe1
DZ
3413/*
3414 * If we break out of trimming a bitmap prematurely, we should reset the
3416 * trimming bit. In a rather contrived case, it's possible to race here, so
3416 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3417 *
3418 * start = start of bitmap
3419 * end = near end of bitmap
3420 *
3421 * Thread 1: Thread 2:
3422 * trim_bitmaps(start)
3423 * trim_bitmaps(end)
3424 * end_trimming_bitmap()
3425 * reset_trimming_bitmap()
3426 */
3427static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3428{
3429 struct btrfs_free_space *entry;
3430
3431 spin_lock(&ctl->tree_lock);
3432 entry = tree_search_offset(ctl, offset, 1, 0);
3433 if (entry)
3434 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3435 spin_unlock(&ctl->tree_lock);
3436}
3437
3438static void end_trimming_bitmap(struct btrfs_free_space *entry)
3439{
3440 if (btrfs_free_space_trimming_bitmap(entry))
3441 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3442}
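/*
 * Taken together, the helpers above give the bitmap trim state a simple
 * lifecycle: trim_bitmaps() below marks an entry BTRFS_TRIM_STATE_TRIMMING
 * when it starts scanning the bitmap from its first offset,
 * end_trimming_bitmap() promotes that to BTRFS_TRIM_STATE_TRIMMED once the
 * whole bitmap has been walked, and reset_trimming_bitmap() drops it back to
 * BTRFS_TRIM_STATE_UNTRIMMED whenever trimming stops partway through.
 */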
3443
32da5386 3444static int trim_bitmaps(struct btrfs_block_group *block_group,
7fe1e641
LZ
3445 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3446{
3447 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3448 struct btrfs_free_space *entry;
3449 int ret = 0;
3450 int ret2;
3451 u64 bytes;
3452 u64 offset = offset_to_bitmap(ctl, start);
3453
3454 while (offset < end) {
3455 bool next_bitmap = false;
55507ce3 3456 struct btrfs_trim_range trim_entry;
7fe1e641 3457
55507ce3 3458 mutex_lock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3459 spin_lock(&ctl->tree_lock);
3460
3461 if (ctl->free_space < minlen) {
3462 spin_unlock(&ctl->tree_lock);
55507ce3 3463 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3464 break;
3465 }
3466
3467 entry = tree_search_offset(ctl, offset, 1, 0);
da080fe1 3468 if (!entry || btrfs_free_space_trimmed(entry)) {
7fe1e641 3469 spin_unlock(&ctl->tree_lock);
55507ce3 3470 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3471 next_bitmap = true;
3472 goto next;
3473 }
3474
da080fe1
DZ
3475 /*
3476 * Async discard bitmap trimming begins by setting the start
3477 * to be key.objectid, and offset_to_bitmap() aligns to the
3478 * start of the bitmap. This lets us know we are fully
3479 * scanning the bitmap rather than only some portion of it.
3480 */
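		/*
		 * For example (illustrative numbers only, assuming 4KiB pages
		 * and a 4KiB ctl->unit): BITS_PER_BITMAP is 32768, so one
		 * bitmap covers 32768 * 4KiB = 128MiB and offset_to_bitmap()
		 * rounds start down to the beginning of the 128MiB window the
		 * containing bitmap describes. start == offset then means we
		 * entered at the very front of this bitmap.
		 */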
3481 if (start == offset)
3482 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3483
7fe1e641 3484 bytes = minlen;
0584f718 3485 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
7fe1e641 3486 if (ret2 || start >= end) {
da080fe1
DZ
3487 /*
3488 * This keeps the invariant that all bytes are trimmed
3489 * if BTRFS_TRIM_STATE_TRIMMED is set on a bitmap.
3490 */
3491 if (ret2 && !minlen)
3492 end_trimming_bitmap(entry);
3493 else
3494 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3495 spin_unlock(&ctl->tree_lock);
55507ce3 3496 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3497 next_bitmap = true;
3498 goto next;
3499 }
3500
3501 bytes = min(bytes, end - start);
3502 if (bytes < minlen) {
da080fe1 3503 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3504 spin_unlock(&ctl->tree_lock);
55507ce3 3505 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3506 goto next;
3507 }
3508
3509 bitmap_clear_bits(ctl, entry, start, bytes);
3510 if (entry->bytes == 0)
3511 free_bitmap(ctl, entry);
3512
3513 spin_unlock(&ctl->tree_lock);
55507ce3
FM
3514 trim_entry.start = start;
3515 trim_entry.bytes = bytes;
3516 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3517 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3518
3519 ret = do_trimming(block_group, total_trimmed, start, bytes,
b0643e59 3520 start, bytes, 0, &trim_entry);
da080fe1
DZ
3521 if (ret) {
3522 reset_trimming_bitmap(ctl, offset);
7fe1e641 3523 break;
da080fe1 3524 }
7fe1e641
LZ
3525next:
3526 if (next_bitmap) {
3527 offset += BITS_PER_BITMAP * ctl->unit;
da080fe1 3528 start = offset;
7fe1e641
LZ
3529 } else {
3530 start += bytes;
f7039b1d 3531 }
f7039b1d
LD
3532
3533 if (fatal_signal_pending(current)) {
da080fe1
DZ
3534 if (start != offset)
3535 reset_trimming_bitmap(ctl, offset);
f7039b1d
LD
3536 ret = -ERESTARTSYS;
3537 break;
3538 }
3539
3540 cond_resched();
3541 }
3542
3543 return ret;
3544}
581bb050 3545
32da5386 3546void btrfs_get_block_group_trimming(struct btrfs_block_group *cache)
7fe1e641 3547{
e33e17ee
JM
3548 atomic_inc(&cache->trimming);
3549}
7fe1e641 3550
32da5386 3551void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
e33e17ee 3552{
0b246afa 3553 struct btrfs_fs_info *fs_info = block_group->fs_info;
e33e17ee
JM
3554 struct extent_map_tree *em_tree;
3555 struct extent_map *em;
3556 bool cleanup;
7fe1e641 3557
04216820 3558 spin_lock(&block_group->lock);
e33e17ee
JM
3559 cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3560 block_group->removed);
04216820
FM
3561 spin_unlock(&block_group->lock);
3562
e33e17ee 3563 if (cleanup) {
34441361 3564 mutex_lock(&fs_info->chunk_mutex);
c8bf1b67 3565 em_tree = &fs_info->mapping_tree;
04216820 3566 write_lock(&em_tree->lock);
b3470b5d 3567 em = lookup_extent_mapping(em_tree, block_group->start,
04216820
FM
3568 1);
3569 BUG_ON(!em); /* logic error, can't happen */
3570 remove_extent_mapping(em_tree, em);
3571 write_unlock(&em_tree->lock);
34441361 3572 mutex_unlock(&fs_info->chunk_mutex);
04216820
FM
3573
3574 /* once for us and once for the tree */
3575 free_extent_map(em);
3576 free_extent_map(em);
946ddbe8
FM
3577
3578 /*
3579 * We've left one free space entry and other tasks trimming
3580 * this block group have each left one entry. Free them.
3581 */
3582 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
e33e17ee
JM
3583 }
3584}
3585
32da5386 3586int btrfs_trim_block_group(struct btrfs_block_group *block_group,
e33e17ee
JM
3587 u64 *trimmed, u64 start, u64 end, u64 minlen)
3588{
da080fe1 3589 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
e33e17ee 3590 int ret;
da080fe1 3591 u64 rem = 0;
e33e17ee
JM
3592
3593 *trimmed = 0;
3594
3595 spin_lock(&block_group->lock);
3596 if (block_group->removed) {
04216820 3597 spin_unlock(&block_group->lock);
e33e17ee 3598 return 0;
04216820 3599 }
e33e17ee
JM
3600 btrfs_get_block_group_trimming(block_group);
3601 spin_unlock(&block_group->lock);
3602
3603 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3604 if (ret)
3605 goto out;
7fe1e641 3606
e33e17ee 3607 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
da080fe1
DZ
3608 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3609 /* If we ended in the middle of a bitmap, reset the trimming flag */
3610 if (rem)
3611 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
e33e17ee
JM
3612out:
3613 btrfs_put_block_group_trimming(block_group);
7fe1e641
LZ
3614 return ret;
3615}
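/*
 * A minimal caller sketch (hypothetical, not code from this file): an
 * fstrim-style path is expected to pass a per-block-group window and
 * accumulate the per-call result, e.g.:
 *
 *	u64 trimmed = 0;
 *
 *	ret = btrfs_trim_block_group(block_group, &trimmed,
 *				     block_group->start,
 *				     block_group->start + block_group->length,
 *				     minlen);
 *	if (!ret)
 *		total_trimmed += trimmed;
 */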
3616
581bb050
LZ
3617/*
3618 * Find the left-most item in the cache tree, and then return the
3619 * smallest inode number in the item.
3620 *
3621 * Note: the returned inode number may not be the smallest one in
3622 * the tree, if the left-most item is a bitmap.
3623 */
3624u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3625{
3626 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3627 struct btrfs_free_space *entry = NULL;
3628 u64 ino = 0;
3629
3630 spin_lock(&ctl->tree_lock);
3631
3632 if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3633 goto out;
3634
3635 entry = rb_entry(rb_first(&ctl->free_space_offset),
3636 struct btrfs_free_space, offset_index);
3637
3638 if (!entry->bitmap) {
3639 ino = entry->offset;
3640
3641 unlink_free_space(ctl, entry);
3642 entry->offset++;
3643 entry->bytes--;
3644 if (!entry->bytes)
3645 kmem_cache_free(btrfs_free_space_cachep, entry);
3646 else
3647 link_free_space(ctl, entry);
3648 } else {
3649 u64 offset = 0;
3650 u64 count = 1;
3651 int ret;
3652
0584f718 3653 ret = search_bitmap(ctl, entry, &offset, &count, true);
79787eaa 3654 /* Logic error; should be empty if it can't find anything */
b12d6869 3655 ASSERT(!ret);
581bb050
LZ
3656
3657 ino = offset;
3658 bitmap_clear_bits(ctl, entry, offset, 1);
3659 if (entry->bytes == 0)
3660 free_bitmap(ctl, entry);
3661 }
3662out:
3663 spin_unlock(&ctl->tree_lock);
3664
3665 return ino;
3666}
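/*
 * For example: if the leftmost entry is the extent [256, 261), the caller
 * gets ino 256 and the entry shrinks to [257, 261); if the leftmost entry is
 * a bitmap, the first set bit is cleared and its offset is returned instead.
 */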
82d5902d
LZ
3667
3668struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3669 struct btrfs_path *path)
3670{
3671 struct inode *inode = NULL;
3672
57cdc8db
DS
3673 spin_lock(&root->ino_cache_lock);
3674 if (root->ino_cache_inode)
3675 inode = igrab(root->ino_cache_inode);
3676 spin_unlock(&root->ino_cache_lock);
82d5902d
LZ
3677 if (inode)
3678 return inode;
3679
3680 inode = __lookup_free_space_inode(root, path, 0);
3681 if (IS_ERR(inode))
3682 return inode;
3683
57cdc8db 3684 spin_lock(&root->ino_cache_lock);
7841cb28 3685 if (!btrfs_fs_closing(root->fs_info))
57cdc8db
DS
3686 root->ino_cache_inode = igrab(inode);
3687 spin_unlock(&root->ino_cache_lock);
82d5902d
LZ
3688
3689 return inode;
3690}
3691
3692int create_free_ino_inode(struct btrfs_root *root,
3693 struct btrfs_trans_handle *trans,
3694 struct btrfs_path *path)
3695{
3696 return __create_free_space_inode(root, trans, path,
3697 BTRFS_FREE_INO_OBJECTID, 0);
3698}
3699
3700int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3701{
3702 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3703 struct btrfs_path *path;
3704 struct inode *inode;
3705 int ret = 0;
3706 u64 root_gen = btrfs_root_generation(&root->root_item);
3707
0b246afa 3708 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
4b9465cb
CM
3709 return 0;
3710
82d5902d
LZ
3711 /*
3712 * If we're unmounting then just return, since this does a search on the
3713 * normal root and not the commit root, and we could deadlock.
3714 */
7841cb28 3715 if (btrfs_fs_closing(fs_info))
82d5902d
LZ
3716 return 0;
3717
3718 path = btrfs_alloc_path();
3719 if (!path)
3720 return 0;
3721
3722 inode = lookup_free_ino_inode(root, path);
3723 if (IS_ERR(inode))
3724 goto out;
3725
3726 if (root_gen != BTRFS_I(inode)->generation)
3727 goto out_put;
3728
3729 ret = __load_free_space_cache(root, inode, ctl, path, 0);
3730
3731 if (ret < 0)
c2cf52eb
SK
3732 btrfs_err(fs_info,
3733 "failed to load free ino cache for root %llu",
3734 root->root_key.objectid);
82d5902d
LZ
3735out_put:
3736 iput(inode);
3737out:
3738 btrfs_free_path(path);
3739 return ret;
3740}
3741
3742int btrfs_write_out_ino_cache(struct btrfs_root *root,
3743 struct btrfs_trans_handle *trans,
53645a91
FDBM
3744 struct btrfs_path *path,
3745 struct inode *inode)
82d5902d 3746{
0b246afa 3747 struct btrfs_fs_info *fs_info = root->fs_info;
82d5902d 3748 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
82d5902d 3749 int ret;
c9dc4c65 3750 struct btrfs_io_ctl io_ctl;
e43699d4 3751 bool release_metadata = true;
82d5902d 3752
0b246afa 3753 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
4b9465cb
CM
3754 return 0;
3755
85db36cf 3756 memset(&io_ctl, 0, sizeof(io_ctl));
0e8d931a 3757 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
e43699d4
FM
3758 if (!ret) {
3759 /*
3760 * At this point writepages() didn't error out, so our metadata
3761 * reservation is released when the writeback finishes, at
3762 * inode.c:btrfs_finish_ordered_io(), regardless of whether it
3763 * finishes with an error.
3764 */
3765 release_metadata = false;
afdb5718 3766 ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
e43699d4 3767 }
85db36cf 3768
c09544e0 3769 if (ret) {
e43699d4 3770 if (release_metadata)
691fa059 3771 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3772 inode->i_size, true);
c09544e0 3773#ifdef DEBUG
0b246afa
JM
3774 btrfs_err(fs_info,
3775 "failed to write free ino cache for root %llu",
3776 root->root_key.objectid);
c09544e0
JB
3777#endif
3778 }
82d5902d 3779
82d5902d
LZ
3780 return ret;
3781}
74255aa0
JB
3782
3783#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
dc11dd5d
JB
3784/*
3785 * Use this if you need to make a bitmap or extent entry specifically. It
3786 * doesn't do any of the merging that add_free_space does; this acts a lot like
3787 * how the free space cache loading stuff works, so you can get really weird
3788 * configurations.
3789 */
32da5386 3790int test_add_free_space_entry(struct btrfs_block_group *cache,
dc11dd5d 3791 u64 offset, u64 bytes, bool bitmap)
74255aa0 3792{
dc11dd5d
JB
3793 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3794 struct btrfs_free_space *info = NULL, *bitmap_info;
3795 void *map = NULL;
da080fe1 3796 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
dc11dd5d
JB
3797 u64 bytes_added;
3798 int ret;
74255aa0 3799
dc11dd5d
JB
3800again:
3801 if (!info) {
3802 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3803 if (!info)
3804 return -ENOMEM;
74255aa0
JB
3805 }
3806
dc11dd5d
JB
3807 if (!bitmap) {
3808 spin_lock(&ctl->tree_lock);
3809 info->offset = offset;
3810 info->bytes = bytes;
cef40483 3811 info->max_extent_size = 0;
dc11dd5d
JB
3812 ret = link_free_space(ctl, info);
3813 spin_unlock(&ctl->tree_lock);
3814 if (ret)
3815 kmem_cache_free(btrfs_free_space_cachep, info);
3816 return ret;
3817 }
3818
3819 if (!map) {
3acd4850 3820 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
dc11dd5d
JB
3821 if (!map) {
3822 kmem_cache_free(btrfs_free_space_cachep, info);
3823 return -ENOMEM;
3824 }
3825 }
3826
3827 spin_lock(&ctl->tree_lock);
3828 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3829 1, 0);
3830 if (!bitmap_info) {
3831 info->bitmap = map;
3832 map = NULL;
3833 add_new_bitmap(ctl, info, offset);
3834 bitmap_info = info;
20005523 3835 info = NULL;
dc11dd5d 3836 }
74255aa0 3837
da080fe1
DZ
3838 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
3839 trim_state);
cef40483 3840
dc11dd5d
JB
3841 bytes -= bytes_added;
3842 offset += bytes_added;
3843 spin_unlock(&ctl->tree_lock);
74255aa0 3844
dc11dd5d
JB
3845 if (bytes)
3846 goto again;
74255aa0 3847
20005523
FM
3848 if (info)
3849 kmem_cache_free(btrfs_free_space_cachep, info);
3acd4850
CL
3850 if (map)
3851 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
dc11dd5d 3852 return 0;
74255aa0
JB
3853}
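/*
 * A minimal usage sketch (hypothetical, assuming a block group handle from
 * the btrfs test infrastructure): stage one extent entry and one bitmap
 * entry, then probe the result with test_check_exists() below, e.g.:
 *
 *	ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
 *	if (!ret)
 *		ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, true);
 *	if (!ret && !test_check_exists(cache, SZ_4M, SZ_1M))
 *		ret = -EINVAL;
 */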
3854
3855/*
3856 * Checks to see if the given range is in the free space cache. This is really
3857 * just used to check the absence of space, so if there is free space in the
3858 * range at all we will return 1.
3859 */
32da5386 3860int test_check_exists(struct btrfs_block_group *cache,
dc11dd5d 3861 u64 offset, u64 bytes)
74255aa0
JB
3862{
3863 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3864 struct btrfs_free_space *info;
3865 int ret = 0;
3866
3867 spin_lock(&ctl->tree_lock);
3868 info = tree_search_offset(ctl, offset, 0, 0);
3869 if (!info) {
3870 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3871 1, 0);
3872 if (!info)
3873 goto out;
3874 }
3875
3876have_info:
3877 if (info->bitmap) {
3878 u64 bit_off, bit_bytes;
3879 struct rb_node *n;
3880 struct btrfs_free_space *tmp;
3881
3882 bit_off = offset;
3883 bit_bytes = ctl->unit;
0584f718 3884 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
74255aa0
JB
3885 if (!ret) {
3886 if (bit_off == offset) {
3887 ret = 1;
3888 goto out;
3889 } else if (bit_off > offset &&
3890 offset + bytes > bit_off) {
3891 ret = 1;
3892 goto out;
3893 }
3894 }
3895
3896 n = rb_prev(&info->offset_index);
3897 while (n) {
3898 tmp = rb_entry(n, struct btrfs_free_space,
3899 offset_index);
3900 if (tmp->offset + tmp->bytes < offset)
3901 break;
3902 if (offset + bytes < tmp->offset) {
5473e0c4 3903 n = rb_prev(&tmp->offset_index);
74255aa0
JB
3904 continue;
3905 }
3906 info = tmp;
3907 goto have_info;
3908 }
3909
3910 n = rb_next(&info->offset_index);
3911 while (n) {
3912 tmp = rb_entry(n, struct btrfs_free_space,
3913 offset_index);
3914 if (offset + bytes < tmp->offset)
3915 break;
3916 if (tmp->offset + tmp->bytes < offset) {
5473e0c4 3917 n = rb_next(&tmp->offset_index);
74255aa0
JB
3918 continue;
3919 }
3920 info = tmp;
3921 goto have_info;
3922 }
3923
20005523 3924 ret = 0;
74255aa0
JB
3925 goto out;
3926 }
3927
3928 if (info->offset == offset) {
3929 ret = 1;
3930 goto out;
3931 }
3932
3933 if (offset > info->offset && offset < info->offset + info->bytes)
3934 ret = 1;
3935out:
3936 spin_unlock(&ctl->tree_lock);
3937 return ret;
3938}
dc11dd5d 3939#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */