btrfs: add the beginning of async discard, discard workqueue
[linux-block.git] / fs / btrfs / free-space-cache.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "inode-map.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_32K

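/*
 * Descriptive note (not in the original source): a btrfs_trim_range records a
 * range that is currently being trimmed and has therefore been temporarily
 * removed from the free space rbtree. Such ranges are kept on
 * ctl->trimming_ranges so that cache writeout does not lose them (see
 * write_cache_extent_entries()).
 */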
struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info);
static int btrfs_wait_cache_io_root(struct btrfs_root *root,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_io_ctl *io_ctl,
				    struct btrfs_path *path);

static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, &location, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
				      struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!block_group->iref) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	/* We inline crc's for the free disk space cache */
	if (ino != BTRFS_FREE_INO_OBJECTID)
		flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			      sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;
}

int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *rsv)
{
	u64 needed_bytes;
	int ret;

	/* 1 for slack space, 1 for updating the inode */
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		btrfs_calc_metadata_size(fs_info, 1);

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
	return ret;
}

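/*
 * Descriptive note (not in the original source): the function below empties
 * the free space cache inode so the cache is rewritten from scratch. When a
 * block group is supplied it first waits for any cache IO already in flight
 * and then marks the group's cache state as BTRFS_DC_CLEAR.
 */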
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * setup or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(BTRFS_I(inode), 0);
	truncate_pagecache(inode, 0);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, inode,
					 0, BTRFS_EXTENT_DATA_KEY);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, root, inode);

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return;

	file_ra_state_init(ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

	kfree(ra);
}

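/*
 * Descriptive note (not in the original source): the io_ctl helpers below
 * serialize the free space cache file. As implemented by
 * io_ctl_set_generation(), io_ctl_add_entry() and io_ctl_add_bitmap(), the
 * layout is roughly:
 *
 *   page 0:    u32 crc[num_pages] (only when check_crcs), u64 generation,
 *              then packed btrfs_free_space_entry items
 *   page 1..N: more packed entries, followed by one full page per bitmap
 */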
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;
	int check_crcs = 0;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
		check_crcs = 1;

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && check_crcs &&
	    (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->check_crcs = check_crcs;
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

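/*
 * Descriptive note (not in the original source): io_ctl walks the cache pages
 * with a simple cursor. io_ctl_map_page() points cur/orig at the page at
 * io_ctl->index and resets size to PAGE_SIZE; writers consume cur/size until
 * the page fills up, crc it via io_ctl_set_crc() and then map the next page.
 */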
static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			ClearPageChecked(io_ctl->pages[i]);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
				int uptodate)
{
	struct page *page;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}
		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++) {
		clear_page_dirty_for_io(io_ctl->pages[i]);
		set_page_extent_mapped(io_ctl->pages[i]);
	}

	return 0;
}

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	__le64 *val;

	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas. If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	if (io_ctl->check_crcs) {
		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
	} else {
		io_ctl->cur += sizeof(u64);
		io_ctl->size -= sizeof(u64) * 2;
	}

	val = io_ctl->cur;
	*val = cpu_to_le64(generation);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	__le64 *gen;

	/*
	 * Skip the crc area. If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	if (io_ctl->check_crcs) {
		io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
		io_ctl->size -= sizeof(u64) +
			(sizeof(u32) * io_ctl->num_pages);
	} else {
		io_ctl->cur += sizeof(u64);
		io_ctl->size -= sizeof(u64) * 2;
	}

	gen = io_ctl->cur;
	if (le64_to_cpu(*gen) != generation) {
		btrfs_err_rl(io_ctl->fs_info,
			"space cache generation (%llu) does not match inode (%llu)",
			*gen, generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (!io_ctl->check_crcs) {
		io_ctl_unmap_page(io_ctl);
		return;
	}

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (!io_ctl->check_crcs) {
		io_ctl_map_page(io_ctl, 0);
		return 0;
	}

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	if (val != crc) {
		btrfs_err_rl(io_ctl->fs_info,
			"csum mismatch on free space cache");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	entry->offset = cpu_to_le64(offset);
	entry->bytes = cpu_to_le64(bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			     struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = le64_to_cpu(e->offset);
	entry->bytes = le64_to_cpu(e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

/*
 * Since we attach pinned extents after the fact we can have contiguous sections
 * of free space that are split up in entries. This poses a problem with the
 * tree logging stuff since it could have allocated across what appears to be 2
 * entries since we would have merged the entries when adding the pinned extents
 * back to the free space cache. So run through the space cache that we just
 * loaded and merge contiguous entries. This will make the log replay stuff not
 * blow up and it will make for nicer allocator behavior.
 */
static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *e, *prev = NULL;
	struct rb_node *n;

again:
	spin_lock(&ctl->tree_lock);
	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
		e = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!prev)
			goto next;
		if (e->bitmap || prev->bitmap)
			goto next;
		if (prev->offset + prev->bytes == e->offset) {
			unlink_free_space(ctl, prev);
			unlink_free_space(ctl, e);
			prev->bytes += e->bytes;
			kmem_cache_free(btrfs_free_space_cachep, e);
			link_free_space(ctl, prev);
			prev = NULL;
			spin_unlock(&ctl->tree_lock);
			goto again;
		}
next:
		prev = e;
	}
	spin_unlock(&ctl->tree_lock);
}

48a3b636
ES
666static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
667 struct btrfs_free_space_ctl *ctl,
668 struct btrfs_path *path, u64 offset)
9d66e233 669{
3ffbd68c 670 struct btrfs_fs_info *fs_info = root->fs_info;
9d66e233
JB
671 struct btrfs_free_space_header *header;
672 struct extent_buffer *leaf;
4c6d1d85 673 struct btrfs_io_ctl io_ctl;
9d66e233 674 struct btrfs_key key;
a67509c3 675 struct btrfs_free_space *e, *n;
b76808fc 676 LIST_HEAD(bitmaps);
9d66e233
JB
677 u64 num_entries;
678 u64 num_bitmaps;
679 u64 generation;
a67509c3 680 u8 type;
f6a39829 681 int ret = 0;
9d66e233 682
9d66e233 683 /* Nothing in the space cache, goodbye */
0414efae 684 if (!i_size_read(inode))
a67509c3 685 return 0;
9d66e233
JB
686
687 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
0414efae 688 key.offset = offset;
9d66e233
JB
689 key.type = 0;
690
691 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
0414efae 692 if (ret < 0)
a67509c3 693 return 0;
0414efae 694 else if (ret > 0) {
945d8962 695 btrfs_release_path(path);
a67509c3 696 return 0;
9d66e233
JB
697 }
698
0414efae
LZ
699 ret = -1;
700
9d66e233
JB
701 leaf = path->nodes[0];
702 header = btrfs_item_ptr(leaf, path->slots[0],
703 struct btrfs_free_space_header);
704 num_entries = btrfs_free_space_entries(leaf, header);
705 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
706 generation = btrfs_free_space_generation(leaf, header);
945d8962 707 btrfs_release_path(path);
9d66e233 708
e570fd27 709 if (!BTRFS_I(inode)->generation) {
0b246afa 710 btrfs_info(fs_info,
913e1535 711 "the free space cache file (%llu) is invalid, skip it",
e570fd27
MX
712 offset);
713 return 0;
714 }
715
9d66e233 716 if (BTRFS_I(inode)->generation != generation) {
0b246afa
JM
717 btrfs_err(fs_info,
718 "free space inode generation (%llu) did not match free space cache generation (%llu)",
719 BTRFS_I(inode)->generation, generation);
a67509c3 720 return 0;
9d66e233
JB
721 }
722
723 if (!num_entries)
a67509c3 724 return 0;
9d66e233 725
f15376df 726 ret = io_ctl_init(&io_ctl, inode, 0);
706efc66
LZ
727 if (ret)
728 return ret;
729
1d480538 730 readahead_cache(inode);
9d66e233 731
a67509c3
JB
732 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
733 if (ret)
734 goto out;
9d66e233 735
5b0e95bf
JB
736 ret = io_ctl_check_crc(&io_ctl, 0);
737 if (ret)
738 goto free_cache;
739
a67509c3
JB
740 ret = io_ctl_check_generation(&io_ctl, generation);
741 if (ret)
742 goto free_cache;
9d66e233 743
a67509c3
JB
744 while (num_entries) {
745 e = kmem_cache_zalloc(btrfs_free_space_cachep,
746 GFP_NOFS);
747 if (!e)
9d66e233 748 goto free_cache;
9d66e233 749
5b0e95bf
JB
750 ret = io_ctl_read_entry(&io_ctl, e, &type);
751 if (ret) {
752 kmem_cache_free(btrfs_free_space_cachep, e);
753 goto free_cache;
754 }
755
a7ccb255
DZ
756 /*
757 * Sync discard ensures that the free space cache is always
758 * trimmed. So when reading this in, the state should reflect
b0643e59
DZ
759 * that. We also do this for async as a stop gap for lack of
760 * persistence.
a7ccb255 761 */
b0643e59
DZ
762 if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
763 btrfs_test_opt(fs_info, DISCARD_ASYNC))
a7ccb255
DZ
764 e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
765
a67509c3
JB
766 if (!e->bytes) {
767 kmem_cache_free(btrfs_free_space_cachep, e);
768 goto free_cache;
9d66e233 769 }
a67509c3
JB
770
771 if (type == BTRFS_FREE_SPACE_EXTENT) {
772 spin_lock(&ctl->tree_lock);
773 ret = link_free_space(ctl, e);
774 spin_unlock(&ctl->tree_lock);
775 if (ret) {
0b246afa 776 btrfs_err(fs_info,
c2cf52eb 777 "Duplicate entries in free space cache, dumping");
a67509c3 778 kmem_cache_free(btrfs_free_space_cachep, e);
9d66e233
JB
779 goto free_cache;
780 }
a67509c3 781 } else {
b12d6869 782 ASSERT(num_bitmaps);
a67509c3 783 num_bitmaps--;
3acd4850
CL
784 e->bitmap = kmem_cache_zalloc(
785 btrfs_free_space_bitmap_cachep, GFP_NOFS);
a67509c3
JB
786 if (!e->bitmap) {
787 kmem_cache_free(
788 btrfs_free_space_cachep, e);
9d66e233
JB
789 goto free_cache;
790 }
a67509c3
JB
791 spin_lock(&ctl->tree_lock);
792 ret = link_free_space(ctl, e);
793 ctl->total_bitmaps++;
794 ctl->op->recalc_thresholds(ctl);
795 spin_unlock(&ctl->tree_lock);
796 if (ret) {
0b246afa 797 btrfs_err(fs_info,
c2cf52eb 798 "Duplicate entries in free space cache, dumping");
dc89e982 799 kmem_cache_free(btrfs_free_space_cachep, e);
9d66e233
JB
800 goto free_cache;
801 }
a67509c3 802 list_add_tail(&e->list, &bitmaps);
9d66e233
JB
803 }
804
a67509c3
JB
805 num_entries--;
806 }
9d66e233 807
2f120c05
JB
808 io_ctl_unmap_page(&io_ctl);
809
a67509c3
JB
810 /*
811 * We add the bitmaps at the end of the entries in order that
812 * the bitmap entries are added to the cache.
813 */
814 list_for_each_entry_safe(e, n, &bitmaps, list) {
9d66e233 815 list_del_init(&e->list);
5b0e95bf
JB
816 ret = io_ctl_read_bitmap(&io_ctl, e);
817 if (ret)
818 goto free_cache;
9d66e233
JB
819 }
820
a67509c3 821 io_ctl_drop_pages(&io_ctl);
cd023e7b 822 merge_space_tree(ctl);
9d66e233
JB
823 ret = 1;
824out:
a67509c3 825 io_ctl_free(&io_ctl);
9d66e233 826 return ret;
9d66e233 827free_cache:
a67509c3 828 io_ctl_drop_pages(&io_ctl);
0414efae 829 __btrfs_remove_free_space_cache(ctl);
9d66e233
JB
830 goto out;
831}
832
32da5386 833int load_free_space_cache(struct btrfs_block_group *block_group)
0cb59c99 834{
bb6cb1c5 835 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 836 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0414efae
LZ
837 struct inode *inode;
838 struct btrfs_path *path;
5b0e95bf 839 int ret = 0;
0414efae 840 bool matched;
bf38be65 841 u64 used = block_group->used;
0414efae 842
0414efae
LZ
843 /*
844 * If this block group has been marked to be cleared for one reason or
845 * another then we can't trust the on disk cache, so just return.
846 */
9d66e233 847 spin_lock(&block_group->lock);
0414efae
LZ
848 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
849 spin_unlock(&block_group->lock);
850 return 0;
851 }
9d66e233 852 spin_unlock(&block_group->lock);
0414efae
LZ
853
854 path = btrfs_alloc_path();
855 if (!path)
856 return 0;
d53ba474
JB
857 path->search_commit_root = 1;
858 path->skip_locking = 1;
0414efae 859
4222ea71
FM
860 /*
861 * We must pass a path with search_commit_root set to btrfs_iget in
862 * order to avoid a deadlock when allocating extents for the tree root.
863 *
864 * When we are COWing an extent buffer from the tree root, when looking
865 * for a free extent, at extent-tree.c:find_free_extent(), we can find
866 * block group without its free space cache loaded. When we find one
867 * we must load its space cache which requires reading its free space
868 * cache's inode item from the root tree. If this inode item is located
869 * in the same leaf that we started COWing before, then we end up in
870 * deadlock on the extent buffer (trying to read lock it when we
871 * previously write locked it).
872 *
873 * It's safe to read the inode item using the commit root because
874 * block groups, once loaded, stay in memory forever (until they are
875 * removed) as well as their space caches once loaded. New block groups
876 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
877 * we will never try to read their inode item while the fs is mounted.
878 */
7949f339 879 inode = lookup_free_space_inode(block_group, path);
0414efae
LZ
880 if (IS_ERR(inode)) {
881 btrfs_free_path(path);
882 return 0;
883 }
884
5b0e95bf
JB
885 /* We may have converted the inode and made the cache invalid. */
886 spin_lock(&block_group->lock);
887 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
888 spin_unlock(&block_group->lock);
a7e221e9 889 btrfs_free_path(path);
5b0e95bf
JB
890 goto out;
891 }
892 spin_unlock(&block_group->lock);
893
0414efae 894 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
b3470b5d 895 path, block_group->start);
0414efae
LZ
896 btrfs_free_path(path);
897 if (ret <= 0)
898 goto out;
899
900 spin_lock(&ctl->tree_lock);
b3470b5d 901 matched = (ctl->free_space == (block_group->length - used -
0414efae
LZ
902 block_group->bytes_super));
903 spin_unlock(&ctl->tree_lock);
904
905 if (!matched) {
906 __btrfs_remove_free_space_cache(ctl);
5d163e0e
JM
907 btrfs_warn(fs_info,
908 "block group %llu has wrong amount of free space",
b3470b5d 909 block_group->start);
0414efae
LZ
910 ret = -1;
911 }
912out:
913 if (ret < 0) {
914 /* This cache is bogus, make sure it gets cleared */
915 spin_lock(&block_group->lock);
916 block_group->disk_cache_state = BTRFS_DC_CLEAR;
917 spin_unlock(&block_group->lock);
82d5902d 918 ret = 0;
0414efae 919
5d163e0e
JM
920 btrfs_warn(fs_info,
921 "failed to load free space cache for block group %llu, rebuilding it now",
b3470b5d 922 block_group->start);
0414efae
LZ
923 }
924
925 iput(inode);
926 return ret;
9d66e233
JB
927}
928
d4452bc5 929static noinline_for_stack
4c6d1d85 930int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
d4452bc5 931 struct btrfs_free_space_ctl *ctl,
32da5386 932 struct btrfs_block_group *block_group,
d4452bc5
CM
933 int *entries, int *bitmaps,
934 struct list_head *bitmap_list)
0cb59c99 935{
c09544e0 936 int ret;
d4452bc5 937 struct btrfs_free_cluster *cluster = NULL;
1bbc621e 938 struct btrfs_free_cluster *cluster_locked = NULL;
d4452bc5 939 struct rb_node *node = rb_first(&ctl->free_space_offset);
55507ce3 940 struct btrfs_trim_range *trim_entry;
be1a12a0 941
43be2146 942 /* Get the cluster for this block_group if it exists */
d4452bc5 943 if (block_group && !list_empty(&block_group->cluster_list)) {
43be2146
JB
944 cluster = list_entry(block_group->cluster_list.next,
945 struct btrfs_free_cluster,
946 block_group_list);
d4452bc5 947 }
43be2146 948
f75b130e 949 if (!node && cluster) {
1bbc621e
CM
950 cluster_locked = cluster;
951 spin_lock(&cluster_locked->lock);
f75b130e
JB
952 node = rb_first(&cluster->root);
953 cluster = NULL;
954 }
955
a67509c3
JB
956 /* Write out the extent entries */
957 while (node) {
958 struct btrfs_free_space *e;
0cb59c99 959
a67509c3 960 e = rb_entry(node, struct btrfs_free_space, offset_index);
d4452bc5 961 *entries += 1;
0cb59c99 962
d4452bc5 963 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
a67509c3
JB
964 e->bitmap);
965 if (ret)
d4452bc5 966 goto fail;
2f356126 967
a67509c3 968 if (e->bitmap) {
d4452bc5
CM
969 list_add_tail(&e->list, bitmap_list);
970 *bitmaps += 1;
2f356126 971 }
a67509c3
JB
972 node = rb_next(node);
973 if (!node && cluster) {
974 node = rb_first(&cluster->root);
1bbc621e
CM
975 cluster_locked = cluster;
976 spin_lock(&cluster_locked->lock);
a67509c3 977 cluster = NULL;
43be2146 978 }
a67509c3 979 }
1bbc621e
CM
980 if (cluster_locked) {
981 spin_unlock(&cluster_locked->lock);
982 cluster_locked = NULL;
983 }
55507ce3
FM
984
985 /*
986 * Make sure we don't miss any range that was removed from our rbtree
987 * because trimming is running. Otherwise after a umount+mount (or crash
988 * after committing the transaction) we would leak free space and get
989 * an inconsistent free space cache report from fsck.
990 */
991 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
992 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
993 trim_entry->bytes, NULL);
994 if (ret)
995 goto fail;
996 *entries += 1;
997 }
998
d4452bc5
CM
999 return 0;
1000fail:
1bbc621e
CM
1001 if (cluster_locked)
1002 spin_unlock(&cluster_locked->lock);
d4452bc5
CM
1003 return -ENOSPC;
1004}
1005
1006static noinline_for_stack int
1007update_cache_item(struct btrfs_trans_handle *trans,
1008 struct btrfs_root *root,
1009 struct inode *inode,
1010 struct btrfs_path *path, u64 offset,
1011 int entries, int bitmaps)
1012{
1013 struct btrfs_key key;
1014 struct btrfs_free_space_header *header;
1015 struct extent_buffer *leaf;
1016 int ret;
1017
1018 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1019 key.offset = offset;
1020 key.type = 0;
1021
1022 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1023 if (ret < 0) {
1024 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
e182163d 1025 EXTENT_DELALLOC, 0, 0, NULL);
d4452bc5
CM
1026 goto fail;
1027 }
1028 leaf = path->nodes[0];
1029 if (ret > 0) {
1030 struct btrfs_key found_key;
1031 ASSERT(path->slots[0]);
1032 path->slots[0]--;
1033 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1034 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1035 found_key.offset != offset) {
1036 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
e182163d
OS
1037 inode->i_size - 1, EXTENT_DELALLOC, 0,
1038 0, NULL);
d4452bc5
CM
1039 btrfs_release_path(path);
1040 goto fail;
1041 }
1042 }
1043
1044 BTRFS_I(inode)->generation = trans->transid;
1045 header = btrfs_item_ptr(leaf, path->slots[0],
1046 struct btrfs_free_space_header);
1047 btrfs_set_free_space_entries(leaf, header, entries);
1048 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1049 btrfs_set_free_space_generation(leaf, header, trans->transid);
1050 btrfs_mark_buffer_dirty(leaf);
1051 btrfs_release_path(path);
1052
1053 return 0;
1054
1055fail:
1056 return -1;
1057}
1058
6701bdb3 1059static noinline_for_stack int write_pinned_extent_entries(
32da5386 1060 struct btrfs_block_group *block_group,
4c6d1d85 1061 struct btrfs_io_ctl *io_ctl,
5349d6c3 1062 int *entries)
d4452bc5
CM
1063{
1064 u64 start, extent_start, extent_end, len;
d4452bc5
CM
1065 struct extent_io_tree *unpin = NULL;
1066 int ret;
43be2146 1067
5349d6c3
MX
1068 if (!block_group)
1069 return 0;
1070
a67509c3
JB
1071 /*
1072 * We want to add any pinned extents to our free space cache
1073 * so we don't leak the space
d4452bc5 1074 *
db804f23
LZ
1075 * We shouldn't have switched the pinned extents yet so this is the
1076 * right one
1077 */
6701bdb3 1078 unpin = block_group->fs_info->pinned_extents;
db804f23 1079
b3470b5d 1080 start = block_group->start;
db804f23 1081
b3470b5d 1082 while (start < block_group->start + block_group->length) {
db804f23
LZ
1083 ret = find_first_extent_bit(unpin, start,
1084 &extent_start, &extent_end,
e6138876 1085 EXTENT_DIRTY, NULL);
5349d6c3
MX
1086 if (ret)
1087 return 0;
0cb59c99 1088
a67509c3 1089 /* This pinned extent is out of our range */
b3470b5d 1090 if (extent_start >= block_group->start + block_group->length)
5349d6c3 1091 return 0;
2f356126 1092
db804f23 1093 extent_start = max(extent_start, start);
b3470b5d
DS
1094 extent_end = min(block_group->start + block_group->length,
1095 extent_end + 1);
db804f23 1096 len = extent_end - extent_start;
0cb59c99 1097
d4452bc5
CM
1098 *entries += 1;
1099 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
a67509c3 1100 if (ret)
5349d6c3 1101 return -ENOSPC;
0cb59c99 1102
db804f23 1103 start = extent_end;
a67509c3 1104 }
0cb59c99 1105
5349d6c3
MX
1106 return 0;
1107}
1108
1109static noinline_for_stack int
4c6d1d85 1110write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
5349d6c3 1111{
7ae1681e 1112 struct btrfs_free_space *entry, *next;
5349d6c3
MX
1113 int ret;
1114
0cb59c99 1115 /* Write out the bitmaps */
7ae1681e 1116 list_for_each_entry_safe(entry, next, bitmap_list, list) {
d4452bc5 1117 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
a67509c3 1118 if (ret)
5349d6c3 1119 return -ENOSPC;
0cb59c99 1120 list_del_init(&entry->list);
be1a12a0
JB
1121 }
1122
5349d6c3
MX
1123 return 0;
1124}
0cb59c99 1125
5349d6c3
MX
1126static int flush_dirty_cache(struct inode *inode)
1127{
1128 int ret;
be1a12a0 1129
0ef8b726 1130 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5349d6c3 1131 if (ret)
0ef8b726 1132 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
e182163d 1133 EXTENT_DELALLOC, 0, 0, NULL);
0cb59c99 1134
5349d6c3 1135 return ret;
d4452bc5
CM
1136}
1137
1138static void noinline_for_stack
a3bdccc4 1139cleanup_bitmap_list(struct list_head *bitmap_list)
d4452bc5 1140{
7ae1681e 1141 struct btrfs_free_space *entry, *next;
5349d6c3 1142
7ae1681e 1143 list_for_each_entry_safe(entry, next, bitmap_list, list)
d4452bc5 1144 list_del_init(&entry->list);
a3bdccc4
CM
1145}
1146
1147static void noinline_for_stack
1148cleanup_write_cache_enospc(struct inode *inode,
1149 struct btrfs_io_ctl *io_ctl,
7bf1a159 1150 struct extent_state **cached_state)
a3bdccc4 1151{
d4452bc5
CM
1152 io_ctl_drop_pages(io_ctl);
1153 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
e43bbe5e 1154 i_size_read(inode) - 1, cached_state);
d4452bc5 1155}
549b4fdb 1156
afdb5718
JM
1157static int __btrfs_wait_cache_io(struct btrfs_root *root,
1158 struct btrfs_trans_handle *trans,
32da5386 1159 struct btrfs_block_group *block_group,
afdb5718
JM
1160 struct btrfs_io_ctl *io_ctl,
1161 struct btrfs_path *path, u64 offset)
c9dc4c65
CM
1162{
1163 int ret;
1164 struct inode *inode = io_ctl->inode;
1165
1bbc621e
CM
1166 if (!inode)
1167 return 0;
1168
c9dc4c65
CM
1169 /* Flush the dirty pages in the cache file. */
1170 ret = flush_dirty_cache(inode);
1171 if (ret)
1172 goto out;
1173
1174 /* Update the cache item to tell everyone this cache file is valid. */
1175 ret = update_cache_item(trans, root, inode, path, offset,
1176 io_ctl->entries, io_ctl->bitmaps);
1177out:
1178 io_ctl_free(io_ctl);
1179 if (ret) {
1180 invalidate_inode_pages2(inode->i_mapping);
1181 BTRFS_I(inode)->generation = 0;
1182 if (block_group) {
1183#ifdef DEBUG
3ffbd68c 1184 btrfs_err(root->fs_info,
0b246afa 1185 "failed to write free space cache for block group %llu",
b3470b5d 1186 block_group->start);
c9dc4c65
CM
1187#endif
1188 }
1189 }
1190 btrfs_update_inode(trans, root, inode);
1191
1192 if (block_group) {
1bbc621e
CM
1193 /* the dirty list is protected by the dirty_bgs_lock */
1194 spin_lock(&trans->transaction->dirty_bgs_lock);
1195
1196 /* the disk_cache_state is protected by the block group lock */
c9dc4c65
CM
1197 spin_lock(&block_group->lock);
1198
1199 /*
1200 * only mark this as written if we didn't get put back on
1bbc621e
CM
1201 * the dirty list while waiting for IO. Otherwise our
1202 * cache state won't be right, and we won't get written again
c9dc4c65
CM
1203 */
1204 if (!ret && list_empty(&block_group->dirty_list))
1205 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1206 else if (ret)
1207 block_group->disk_cache_state = BTRFS_DC_ERROR;
1208
1209 spin_unlock(&block_group->lock);
1bbc621e 1210 spin_unlock(&trans->transaction->dirty_bgs_lock);
c9dc4c65
CM
1211 io_ctl->inode = NULL;
1212 iput(inode);
1213 }
1214
1215 return ret;
1216
1217}
1218
afdb5718
JM
1219static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1220 struct btrfs_trans_handle *trans,
1221 struct btrfs_io_ctl *io_ctl,
1222 struct btrfs_path *path)
1223{
1224 return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1225}
1226
1227int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
32da5386 1228 struct btrfs_block_group *block_group,
afdb5718
JM
1229 struct btrfs_path *path)
1230{
1231 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1232 block_group, &block_group->io_ctl,
b3470b5d 1233 path, block_group->start);
afdb5718
JM
1234}
1235
d4452bc5
CM
1236/**
1237 * __btrfs_write_out_cache - write out cached info to an inode
1238 * @root - the root the inode belongs to
1239 * @ctl - the free space cache we are going to write out
1240 * @block_group - the block_group for this cache if it belongs to a block_group
1241 * @trans - the trans handle
d4452bc5
CM
1242 *
1243 * This function writes out a free space cache struct to disk for quick recovery
8cd1e731 1244 * on mount. This will return 0 if it was successful in writing the cache out,
b8605454 1245 * or an errno if it was not.
d4452bc5
CM
1246 */
1247static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1248 struct btrfs_free_space_ctl *ctl,
32da5386 1249 struct btrfs_block_group *block_group,
c9dc4c65 1250 struct btrfs_io_ctl *io_ctl,
0e8d931a 1251 struct btrfs_trans_handle *trans)
d4452bc5
CM
1252{
1253 struct extent_state *cached_state = NULL;
5349d6c3 1254 LIST_HEAD(bitmap_list);
d4452bc5
CM
1255 int entries = 0;
1256 int bitmaps = 0;
1257 int ret;
c9dc4c65 1258 int must_iput = 0;
d4452bc5
CM
1259
1260 if (!i_size_read(inode))
b8605454 1261 return -EIO;
d4452bc5 1262
c9dc4c65 1263 WARN_ON(io_ctl->pages);
f15376df 1264 ret = io_ctl_init(io_ctl, inode, 1);
d4452bc5 1265 if (ret)
b8605454 1266 return ret;
d4452bc5 1267
e570fd27
MX
1268 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1269 down_write(&block_group->data_rwsem);
1270 spin_lock(&block_group->lock);
1271 if (block_group->delalloc_bytes) {
1272 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1273 spin_unlock(&block_group->lock);
1274 up_write(&block_group->data_rwsem);
1275 BTRFS_I(inode)->generation = 0;
1276 ret = 0;
c9dc4c65 1277 must_iput = 1;
e570fd27
MX
1278 goto out;
1279 }
1280 spin_unlock(&block_group->lock);
1281 }
1282
d4452bc5 1283 /* Lock all pages first so we can lock the extent safely. */
b8605454
OS
1284 ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1285 if (ret)
b77000ed 1286 goto out_unlock;
d4452bc5
CM
1287
1288 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
ff13db41 1289 &cached_state);
d4452bc5 1290
c9dc4c65 1291 io_ctl_set_generation(io_ctl, trans->transid);
d4452bc5 1292
55507ce3 1293 mutex_lock(&ctl->cache_writeout_mutex);
5349d6c3 1294 /* Write out the extent entries in the free space cache */
1bbc621e 1295 spin_lock(&ctl->tree_lock);
c9dc4c65 1296 ret = write_cache_extent_entries(io_ctl, ctl,
d4452bc5
CM
1297 block_group, &entries, &bitmaps,
1298 &bitmap_list);
a3bdccc4
CM
1299 if (ret)
1300 goto out_nospc_locked;
d4452bc5 1301
5349d6c3
MX
1302 /*
1303 * Some spaces that are freed in the current transaction are pinned,
1304 * they will be added into free space cache after the transaction is
1305 * committed, we shouldn't lose them.
1bbc621e
CM
1306 *
1307 * If this changes while we are working we'll get added back to
1308 * the dirty list and redo it. No locking needed
5349d6c3 1309 */
6701bdb3 1310 ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
a3bdccc4
CM
1311 if (ret)
1312 goto out_nospc_locked;
5349d6c3 1313
55507ce3
FM
1314 /*
1315 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1316 * locked while doing it because a concurrent trim can be manipulating
1317 * or freeing the bitmap.
1318 */
c9dc4c65 1319 ret = write_bitmap_entries(io_ctl, &bitmap_list);
1bbc621e 1320 spin_unlock(&ctl->tree_lock);
55507ce3 1321 mutex_unlock(&ctl->cache_writeout_mutex);
5349d6c3
MX
1322 if (ret)
1323 goto out_nospc;
1324
1325 /* Zero out the rest of the pages just to make sure */
c9dc4c65 1326 io_ctl_zero_remaining_pages(io_ctl);
d4452bc5 1327
5349d6c3 1328 /* Everything is written out, now we dirty the pages in the file. */
2ff7e61e
JM
1329 ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1330 i_size_read(inode), &cached_state);
5349d6c3 1331 if (ret)
d4452bc5 1332 goto out_nospc;
5349d6c3 1333
e570fd27
MX
1334 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1335 up_write(&block_group->data_rwsem);
5349d6c3
MX
1336 /*
1337 * Release the pages and unlock the extent, we will flush
1338 * them out later
1339 */
c9dc4c65 1340 io_ctl_drop_pages(io_ctl);
5349d6c3
MX
1341
1342 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
e43bbe5e 1343 i_size_read(inode) - 1, &cached_state);
5349d6c3 1344
c9dc4c65
CM
1345 /*
1346 * at this point the pages are under IO and we're happy,
1347 * The caller is responsible for waiting on them and updating the
1348 * the cache and the inode
1349 */
1350 io_ctl->entries = entries;
1351 io_ctl->bitmaps = bitmaps;
1352
1353 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
5349d6c3 1354 if (ret)
d4452bc5
CM
1355 goto out;
1356
c9dc4c65
CM
1357 return 0;
1358
2f356126 1359out:
c9dc4c65
CM
1360 io_ctl->inode = NULL;
1361 io_ctl_free(io_ctl);
5349d6c3 1362 if (ret) {
a67509c3 1363 invalidate_inode_pages2(inode->i_mapping);
0cb59c99
JB
1364 BTRFS_I(inode)->generation = 0;
1365 }
0cb59c99 1366 btrfs_update_inode(trans, root, inode);
c9dc4c65
CM
1367 if (must_iput)
1368 iput(inode);
5349d6c3 1369 return ret;
a67509c3 1370
a3bdccc4
CM
1371out_nospc_locked:
1372 cleanup_bitmap_list(&bitmap_list);
1373 spin_unlock(&ctl->tree_lock);
1374 mutex_unlock(&ctl->cache_writeout_mutex);
1375
a67509c3 1376out_nospc:
7bf1a159 1377 cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
e570fd27 1378
b77000ed 1379out_unlock:
e570fd27
MX
1380 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1381 up_write(&block_group->data_rwsem);
1382
a67509c3 1383 goto out;
0414efae
LZ
1384}
1385
fe041534 1386int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
32da5386 1387 struct btrfs_block_group *block_group,
0414efae
LZ
1388 struct btrfs_path *path)
1389{
fe041534 1390 struct btrfs_fs_info *fs_info = trans->fs_info;
0414efae
LZ
1391 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1392 struct inode *inode;
1393 int ret = 0;
1394
0414efae
LZ
1395 spin_lock(&block_group->lock);
1396 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1397 spin_unlock(&block_group->lock);
e570fd27
MX
1398 return 0;
1399 }
0414efae
LZ
1400 spin_unlock(&block_group->lock);
1401
7949f339 1402 inode = lookup_free_space_inode(block_group, path);
0414efae
LZ
1403 if (IS_ERR(inode))
1404 return 0;
1405
77ab86bf
JM
1406 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1407 block_group, &block_group->io_ctl, trans);
c09544e0 1408 if (ret) {
c09544e0 1409#ifdef DEBUG
0b246afa
JM
1410 btrfs_err(fs_info,
1411 "failed to write free space cache for block group %llu",
b3470b5d 1412 block_group->start);
c09544e0 1413#endif
c9dc4c65
CM
1414 spin_lock(&block_group->lock);
1415 block_group->disk_cache_state = BTRFS_DC_ERROR;
1416 spin_unlock(&block_group->lock);
1417
1418 block_group->io_ctl.inode = NULL;
1419 iput(inode);
0414efae
LZ
1420 }
1421
c9dc4c65
CM
1422 /*
1423 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1424 * to wait for IO and put the inode
1425 */
1426
0cb59c99
JB
1427 return ret;
1428}
1429
34d52cb6 1430static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
96303081 1431 u64 offset)
0f9dd46c 1432{
b12d6869 1433 ASSERT(offset >= bitmap_start);
96303081 1434 offset -= bitmap_start;
34d52cb6 1435 return (unsigned long)(div_u64(offset, unit));
96303081 1436}
0f9dd46c 1437
34d52cb6 1438static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
96303081 1439{
34d52cb6 1440 return (unsigned long)(div_u64(bytes, unit));
96303081 1441}
0f9dd46c 1442
34d52cb6 1443static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
1444 u64 offset)
1445{
1446 u64 bitmap_start;
0ef6447a 1447 u64 bytes_per_bitmap;
0f9dd46c 1448
34d52cb6
LZ
1449 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1450 bitmap_start = offset - ctl->start;
0ef6447a 1451 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
96303081 1452 bitmap_start *= bytes_per_bitmap;
34d52cb6 1453 bitmap_start += ctl->start;
0f9dd46c 1454
96303081 1455 return bitmap_start;
0f9dd46c
JB
1456}
1457
96303081
JB
1458static int tree_insert_offset(struct rb_root *root, u64 offset,
1459 struct rb_node *node, int bitmap)
0f9dd46c
JB
1460{
1461 struct rb_node **p = &root->rb_node;
1462 struct rb_node *parent = NULL;
1463 struct btrfs_free_space *info;
1464
1465 while (*p) {
1466 parent = *p;
96303081 1467 info = rb_entry(parent, struct btrfs_free_space, offset_index);
0f9dd46c 1468
96303081 1469 if (offset < info->offset) {
0f9dd46c 1470 p = &(*p)->rb_left;
96303081 1471 } else if (offset > info->offset) {
0f9dd46c 1472 p = &(*p)->rb_right;
96303081
JB
1473 } else {
1474 /*
1475 * we could have a bitmap entry and an extent entry
1476 * share the same offset. If this is the case, we want
1477 * the extent entry to always be found first if we do a
1478 * linear search through the tree, since we want to have
1479 * the quickest allocation time, and allocating from an
1480 * extent is faster than allocating from a bitmap. So
1481 * if we're inserting a bitmap and we find an entry at
1482 * this offset, we want to go right, or after this entry
1483 * logically. If we are inserting an extent and we've
1484 * found a bitmap, we want to go left, or before
1485 * logically.
1486 */
1487 if (bitmap) {
207dde82
JB
1488 if (info->bitmap) {
1489 WARN_ON_ONCE(1);
1490 return -EEXIST;
1491 }
96303081
JB
1492 p = &(*p)->rb_right;
1493 } else {
207dde82
JB
1494 if (!info->bitmap) {
1495 WARN_ON_ONCE(1);
1496 return -EEXIST;
1497 }
96303081
JB
1498 p = &(*p)->rb_left;
1499 }
1500 }
0f9dd46c
JB
1501 }
1502
1503 rb_link_node(node, parent, p);
1504 rb_insert_color(node, root);
1505
1506 return 0;
1507}
1508
1509/*
70cb0743
JB
1510 * searches the tree for the given offset.
1511 *
96303081
JB
1512 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1513 * want a section that has at least bytes size and comes at or after the given
1514 * offset.
0f9dd46c 1515 */
96303081 1516static struct btrfs_free_space *
34d52cb6 1517tree_search_offset(struct btrfs_free_space_ctl *ctl,
96303081 1518 u64 offset, int bitmap_only, int fuzzy)
0f9dd46c 1519{
34d52cb6 1520 struct rb_node *n = ctl->free_space_offset.rb_node;
96303081
JB
1521 struct btrfs_free_space *entry, *prev = NULL;
1522
1523 /* find entry that is closest to the 'offset' */
1524 while (1) {
1525 if (!n) {
1526 entry = NULL;
1527 break;
1528 }
0f9dd46c 1529
0f9dd46c 1530 entry = rb_entry(n, struct btrfs_free_space, offset_index);
96303081 1531 prev = entry;
0f9dd46c 1532
96303081 1533 if (offset < entry->offset)
0f9dd46c 1534 n = n->rb_left;
96303081 1535 else if (offset > entry->offset)
0f9dd46c 1536 n = n->rb_right;
96303081 1537 else
0f9dd46c 1538 break;
0f9dd46c
JB
1539 }
1540
96303081
JB
1541 if (bitmap_only) {
1542 if (!entry)
1543 return NULL;
1544 if (entry->bitmap)
1545 return entry;
0f9dd46c 1546
96303081
JB
1547 /*
1548 * bitmap entry and extent entry may share same offset,
1549 * in that case, bitmap entry comes after extent entry.
1550 */
1551 n = rb_next(n);
1552 if (!n)
1553 return NULL;
1554 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1555 if (entry->offset != offset)
1556 return NULL;
0f9dd46c 1557
96303081
JB
1558 WARN_ON(!entry->bitmap);
1559 return entry;
1560 } else if (entry) {
1561 if (entry->bitmap) {
0f9dd46c 1562 /*
96303081
JB
1563 * if previous extent entry covers the offset,
1564 * we should return it instead of the bitmap entry
0f9dd46c 1565 */
de6c4115
MX
1566 n = rb_prev(&entry->offset_index);
1567 if (n) {
96303081
JB
1568 prev = rb_entry(n, struct btrfs_free_space,
1569 offset_index);
de6c4115
MX
1570 if (!prev->bitmap &&
1571 prev->offset + prev->bytes > offset)
1572 entry = prev;
0f9dd46c 1573 }
96303081
JB
1574 }
1575 return entry;
1576 }
1577
1578 if (!prev)
1579 return NULL;
1580
1581 /* find last entry before the 'offset' */
1582 entry = prev;
1583 if (entry->offset > offset) {
1584 n = rb_prev(&entry->offset_index);
1585 if (n) {
1586 entry = rb_entry(n, struct btrfs_free_space,
1587 offset_index);
b12d6869 1588 ASSERT(entry->offset <= offset);
0f9dd46c 1589 } else {
96303081
JB
1590 if (fuzzy)
1591 return entry;
1592 else
1593 return NULL;
0f9dd46c
JB
1594 }
1595 }
1596
96303081 1597 if (entry->bitmap) {
de6c4115
MX
1598 n = rb_prev(&entry->offset_index);
1599 if (n) {
96303081
JB
1600 prev = rb_entry(n, struct btrfs_free_space,
1601 offset_index);
de6c4115
MX
1602 if (!prev->bitmap &&
1603 prev->offset + prev->bytes > offset)
1604 return prev;
96303081 1605 }
34d52cb6 1606 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
96303081
JB
1607 return entry;
1608 } else if (entry->offset + entry->bytes > offset)
1609 return entry;
1610
1611 if (!fuzzy)
1612 return NULL;
1613
1614 while (1) {
1615 if (entry->bitmap) {
1616 if (entry->offset + BITS_PER_BITMAP *
34d52cb6 1617 ctl->unit > offset)
96303081
JB
1618 break;
1619 } else {
1620 if (entry->offset + entry->bytes > offset)
1621 break;
1622 }
1623
1624 n = rb_next(&entry->offset_index);
1625 if (!n)
1626 return NULL;
1627 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1628 }
1629 return entry;
0f9dd46c
JB
1630}
1631
f333adb5 1632static inline void
34d52cb6 1633__unlink_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5 1634 struct btrfs_free_space *info)
0f9dd46c 1635{
34d52cb6
LZ
1636 rb_erase(&info->offset_index, &ctl->free_space_offset);
1637 ctl->free_extents--;
f333adb5
LZ
1638}
1639
34d52cb6 1640static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5
LZ
1641 struct btrfs_free_space *info)
1642{
34d52cb6
LZ
1643 __unlink_free_space(ctl, info);
1644 ctl->free_space -= info->bytes;
0f9dd46c
JB
1645}
1646
34d52cb6 1647static int link_free_space(struct btrfs_free_space_ctl *ctl,
0f9dd46c
JB
1648 struct btrfs_free_space *info)
1649{
1650 int ret = 0;
1651
b12d6869 1652 ASSERT(info->bytes || info->bitmap);
34d52cb6 1653 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
96303081 1654 &info->offset_index, (info->bitmap != NULL));
0f9dd46c
JB
1655 if (ret)
1656 return ret;
1657
34d52cb6
LZ
1658 ctl->free_space += info->bytes;
1659 ctl->free_extents++;
96303081
JB
1660 return ret;
1661}
1662
34d52cb6 1663static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
96303081 1664{
32da5386 1665 struct btrfs_block_group *block_group = ctl->private;
25891f79
JB
1666 u64 max_bytes;
1667 u64 bitmap_bytes;
1668 u64 extent_bytes;
b3470b5d 1669 u64 size = block_group->length;
0ef6447a
FX
1670 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1671 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
34d52cb6 1672
0ef6447a 1673 max_bitmaps = max_t(u64, max_bitmaps, 1);
dde5740f 1674
b12d6869 1675 ASSERT(ctl->total_bitmaps <= max_bitmaps);
96303081
JB
1676
1677 /*
1678 * The goal is to keep the total amount of memory used per 1gb of space
1679 * at or below 32k, so we need to adjust how much memory we allow to be
1680 * used by extent based free space tracking
1681 */
ee22184b 1682 if (size < SZ_1G)
8eb2d829
LZ
1683 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1684 else
ee22184b 1685 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
96303081 1686
25891f79
JB
1687 /*
1688 * we want to account for 1 more bitmap than what we have so we can make
1689 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1690 * we add more bitmaps.
1691 */
b9ef22de 1692 bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
96303081 1693
25891f79 1694 if (bitmap_bytes >= max_bytes) {
34d52cb6 1695 ctl->extents_thresh = 0;
25891f79
JB
1696 return;
1697 }
96303081 1698
25891f79 1699 /*
f8c269d7 1700 * we want the extent entry threshold to always be at most 1/2 the max
25891f79
JB
1701 * bytes we can have, or whatever is less than that.
1702 */
1703 extent_bytes = max_bytes - bitmap_bytes;
f8c269d7 1704 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
96303081 1705
34d52cb6 1706 ctl->extents_thresh =
f8c269d7 1707 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
96303081
JB
1708}
1709
bb3ac5a4
MX
1710static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1711 struct btrfs_free_space *info,
1712 u64 offset, u64 bytes)
96303081 1713{
f38b6e75 1714 unsigned long start, count;
96303081 1715
34d52cb6
LZ
1716 start = offset_to_bit(info->offset, ctl->unit, offset);
1717 count = bytes_to_bits(bytes, ctl->unit);
b12d6869 1718 ASSERT(start + count <= BITS_PER_BITMAP);
96303081 1719
f38b6e75 1720 bitmap_clear(info->bitmap, start, count);
96303081
JB
1721
1722 info->bytes -= bytes;
553cceb4
JB
1723 if (info->max_extent_size > ctl->unit)
1724 info->max_extent_size = 0;
bb3ac5a4
MX
1725}
1726
1727static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1728 struct btrfs_free_space *info, u64 offset,
1729 u64 bytes)
1730{
1731 __bitmap_clear_bits(ctl, info, offset, bytes);
34d52cb6 1732 ctl->free_space -= bytes;
96303081
JB
1733}
1734
34d52cb6 1735static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
817d52f8
JB
1736 struct btrfs_free_space *info, u64 offset,
1737 u64 bytes)
96303081 1738{
f38b6e75 1739 unsigned long start, count;
96303081 1740
34d52cb6
LZ
1741 start = offset_to_bit(info->offset, ctl->unit, offset);
1742 count = bytes_to_bits(bytes, ctl->unit);
b12d6869 1743 ASSERT(start + count <= BITS_PER_BITMAP);
96303081 1744
f38b6e75 1745 bitmap_set(info->bitmap, start, count);
96303081
JB
1746
1747 info->bytes += bytes;
34d52cb6 1748 ctl->free_space += bytes;
96303081
JB
1749}
1750
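For a concrete feel of the bit/byte conversions used by the two helpers above, here is a small sketch with hypothetical numbers, assuming a 4 KiB unit and that offset_to_bit() and bytes_to_bits() are the divide-by-unit helpers defined earlier in this file:

	/* Hypothetical values, for illustration only. */
	u64 unit = 4096;				/* ctl->unit */
	u64 bitmap_start = 1ULL << 30;			/* info->offset */
	u64 offset = bitmap_start + (1ULL << 20);	/* clear starting 1 MiB into the bitmap */
	u64 bytes = 64 * 1024;

	unsigned long start = (offset - bitmap_start) / unit;	/* bit 256 */
	unsigned long count = bytes / unit;			/* 16 bits */
	/* i.e. bitmap_clear(info->bitmap, 256, 16) and info->bytes -= 65536 */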
a4820398
MX
1751/*
 1752 * If we cannot find a suitable extent, we will use *bytes to record
 1753 * the size of the largest extent we found.
1754 */
34d52cb6 1755static int search_bitmap(struct btrfs_free_space_ctl *ctl,
96303081 1756 struct btrfs_free_space *bitmap_info, u64 *offset,
0584f718 1757 u64 *bytes, bool for_alloc)
96303081
JB
1758{
1759 unsigned long found_bits = 0;
a4820398 1760 unsigned long max_bits = 0;
96303081
JB
1761 unsigned long bits, i;
1762 unsigned long next_zero;
a4820398 1763 unsigned long extent_bits;
96303081 1764
cef40483
JB
1765 /*
1766 * Skip searching the bitmap if we don't have a contiguous section that
1767 * is large enough for this allocation.
1768 */
0584f718
JB
1769 if (for_alloc &&
1770 bitmap_info->max_extent_size &&
cef40483
JB
1771 bitmap_info->max_extent_size < *bytes) {
1772 *bytes = bitmap_info->max_extent_size;
1773 return -1;
1774 }
1775
34d52cb6 1776 i = offset_to_bit(bitmap_info->offset, ctl->unit,
96303081 1777 max_t(u64, *offset, bitmap_info->offset));
34d52cb6 1778 bits = bytes_to_bits(*bytes, ctl->unit);
96303081 1779
ebb3dad4 1780 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
0584f718
JB
1781 if (for_alloc && bits == 1) {
1782 found_bits = 1;
1783 break;
1784 }
96303081
JB
1785 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1786 BITS_PER_BITMAP, i);
a4820398
MX
1787 extent_bits = next_zero - i;
1788 if (extent_bits >= bits) {
1789 found_bits = extent_bits;
96303081 1790 break;
a4820398
MX
1791 } else if (extent_bits > max_bits) {
1792 max_bits = extent_bits;
96303081
JB
1793 }
1794 i = next_zero;
1795 }
1796
1797 if (found_bits) {
34d52cb6
LZ
1798 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1799 *bytes = (u64)(found_bits) * ctl->unit;
96303081
JB
1800 return 0;
1801 }
1802
a4820398 1803 *bytes = (u64)(max_bits) * ctl->unit;
cef40483 1804 bitmap_info->max_extent_size = *bytes;
96303081
JB
1805 return -1;
1806}
1807
ad22cf6e
JB
1808static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1809{
1810 if (entry->bitmap)
1811 return entry->max_extent_size;
1812 return entry->bytes;
1813}
1814
a4820398 1815/* Cache the size of the max extent in bytes */
34d52cb6 1816static struct btrfs_free_space *
53b381b3 1817find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
a4820398 1818 unsigned long align, u64 *max_extent_size)
96303081
JB
1819{
1820 struct btrfs_free_space *entry;
1821 struct rb_node *node;
53b381b3
DW
1822 u64 tmp;
1823 u64 align_off;
96303081
JB
1824 int ret;
1825
34d52cb6 1826 if (!ctl->free_space_offset.rb_node)
a4820398 1827 goto out;
96303081 1828
34d52cb6 1829 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
96303081 1830 if (!entry)
a4820398 1831 goto out;
96303081
JB
1832
1833 for (node = &entry->offset_index; node; node = rb_next(node)) {
1834 entry = rb_entry(node, struct btrfs_free_space, offset_index);
a4820398 1835 if (entry->bytes < *bytes) {
ad22cf6e
JB
1836 *max_extent_size = max(get_max_extent_size(entry),
1837 *max_extent_size);
96303081 1838 continue;
a4820398 1839 }
96303081 1840
53b381b3
DW
1841 /* make sure the space returned is big enough
1842 * to match our requested alignment
1843 */
1844 if (*bytes >= align) {
a4820398 1845 tmp = entry->offset - ctl->start + align - 1;
47c5713f 1846 tmp = div64_u64(tmp, align);
53b381b3
DW
1847 tmp = tmp * align + ctl->start;
1848 align_off = tmp - entry->offset;
1849 } else {
1850 align_off = 0;
1851 tmp = entry->offset;
1852 }
1853
a4820398 1854 if (entry->bytes < *bytes + align_off) {
ad22cf6e
JB
1855 *max_extent_size = max(get_max_extent_size(entry),
1856 *max_extent_size);
53b381b3 1857 continue;
a4820398 1858 }
53b381b3 1859
96303081 1860 if (entry->bitmap) {
a4820398
MX
1861 u64 size = *bytes;
1862
0584f718 1863 ret = search_bitmap(ctl, entry, &tmp, &size, true);
53b381b3
DW
1864 if (!ret) {
1865 *offset = tmp;
a4820398 1866 *bytes = size;
96303081 1867 return entry;
ad22cf6e
JB
1868 } else {
1869 *max_extent_size =
1870 max(get_max_extent_size(entry),
1871 *max_extent_size);
53b381b3 1872 }
96303081
JB
1873 continue;
1874 }
1875
53b381b3
DW
1876 *offset = tmp;
1877 *bytes = entry->bytes - align_off;
96303081
JB
1878 return entry;
1879 }
a4820398 1880out:
96303081
JB
1881 return NULL;
1882}
1883
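The alignment rounding in find_free_space() is easier to follow with numbers. A sketch under hypothetical values: ctl->start at 1 GiB, a free extent starting 70 KiB into the block group, and a 64 KiB alignment requirement (e.g. a full stripe length):

	/* Worked example of the round-up above; all values are made up. */
	u64 start = 1ULL << 30;				/* ctl->start */
	u64 entry_offset = start + 70 * 1024;		/* free extent begins 70 KiB in */
	u64 align = 64 * 1024;

	u64 tmp = entry_offset - start + align - 1;	/* 70 KiB + 64 KiB - 1 */
	tmp = tmp / align;				/* 2 */
	tmp = tmp * align + start;			/* start + 128 KiB, first aligned offset */
	u64 align_off = tmp - entry_offset;		/* 58 KiB skipped at the front */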
34d52cb6 1884static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
1885 struct btrfs_free_space *info, u64 offset)
1886{
34d52cb6 1887 info->offset = offset_to_bitmap(ctl, offset);
f019f426 1888 info->bytes = 0;
f2d0f676 1889 INIT_LIST_HEAD(&info->list);
34d52cb6
LZ
1890 link_free_space(ctl, info);
1891 ctl->total_bitmaps++;
96303081 1892
34d52cb6 1893 ctl->op->recalc_thresholds(ctl);
96303081
JB
1894}
1895
34d52cb6 1896static void free_bitmap(struct btrfs_free_space_ctl *ctl,
edf6e2d1
LZ
1897 struct btrfs_free_space *bitmap_info)
1898{
34d52cb6 1899 unlink_free_space(ctl, bitmap_info);
3acd4850 1900 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
dc89e982 1901 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
34d52cb6
LZ
1902 ctl->total_bitmaps--;
1903 ctl->op->recalc_thresholds(ctl);
edf6e2d1
LZ
1904}
1905
34d52cb6 1906static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
1907 struct btrfs_free_space *bitmap_info,
1908 u64 *offset, u64 *bytes)
1909{
1910 u64 end;
6606bb97
JB
1911 u64 search_start, search_bytes;
1912 int ret;
96303081
JB
1913
1914again:
34d52cb6 1915 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
96303081 1916
6606bb97 1917 /*
bdb7d303
JB
1918 * We need to search for bits in this bitmap. We could only cover some
1919 * of the extent in this bitmap thanks to how we add space, so we need
 1920	 * to search for as much of it as we can and clear that amount, and then
1921 * go searching for the next bit.
6606bb97
JB
1922 */
1923 search_start = *offset;
bdb7d303 1924 search_bytes = ctl->unit;
13dbc089 1925 search_bytes = min(search_bytes, end - search_start + 1);
0584f718
JB
1926 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1927 false);
b50c6e25
JB
1928 if (ret < 0 || search_start != *offset)
1929 return -EINVAL;
6606bb97 1930
bdb7d303
JB
1931 /* We may have found more bits than what we need */
1932 search_bytes = min(search_bytes, *bytes);
1933
1934 /* Cannot clear past the end of the bitmap */
1935 search_bytes = min(search_bytes, end - search_start + 1);
1936
1937 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1938 *offset += search_bytes;
1939 *bytes -= search_bytes;
96303081
JB
1940
1941 if (*bytes) {
6606bb97 1942 struct rb_node *next = rb_next(&bitmap_info->offset_index);
edf6e2d1 1943 if (!bitmap_info->bytes)
34d52cb6 1944 free_bitmap(ctl, bitmap_info);
96303081 1945
6606bb97
JB
1946 /*
1947 * no entry after this bitmap, but we still have bytes to
1948 * remove, so something has gone wrong.
1949 */
1950 if (!next)
96303081
JB
1951 return -EINVAL;
1952
6606bb97
JB
1953 bitmap_info = rb_entry(next, struct btrfs_free_space,
1954 offset_index);
1955
1956 /*
1957 * if the next entry isn't a bitmap we need to return to let the
1958 * extent stuff do its work.
1959 */
96303081
JB
1960 if (!bitmap_info->bitmap)
1961 return -EAGAIN;
1962
6606bb97
JB
1963 /*
1964 * Ok the next item is a bitmap, but it may not actually hold
1965 * the information for the rest of this free space stuff, so
1966 * look for it, and if we don't find it return so we can try
1967 * everything over again.
1968 */
1969 search_start = *offset;
bdb7d303 1970 search_bytes = ctl->unit;
34d52cb6 1971 ret = search_bitmap(ctl, bitmap_info, &search_start,
0584f718 1972 &search_bytes, false);
6606bb97
JB
1973 if (ret < 0 || search_start != *offset)
1974 return -EAGAIN;
1975
96303081 1976 goto again;
edf6e2d1 1977 } else if (!bitmap_info->bytes)
34d52cb6 1978 free_bitmap(ctl, bitmap_info);
96303081
JB
1979
1980 return 0;
1981}
1982
2cdc342c
JB
1983static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1984 struct btrfs_free_space *info, u64 offset,
da080fe1 1985 u64 bytes, enum btrfs_trim_state trim_state)
2cdc342c
JB
1986{
1987 u64 bytes_to_set = 0;
1988 u64 end;
1989
da080fe1
DZ
1990 /*
1991 * This is a tradeoff to make bitmap trim state minimal. We mark the
1992 * whole bitmap untrimmed if at any point we add untrimmed regions.
1993 */
1994 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED)
1995 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
1996
2cdc342c
JB
1997 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1998
1999 bytes_to_set = min(end - offset, bytes);
2000
2001 bitmap_set_bits(ctl, info, offset, bytes_to_set);
2002
cef40483
JB
2003 /*
 2004	 * We set some bytes, so we have no idea what the max extent size is
2005 * anymore.
2006 */
2007 info->max_extent_size = 0;
2008
2cdc342c
JB
2009 return bytes_to_set;
2010
2011}
2012
34d52cb6
LZ
2013static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2014 struct btrfs_free_space *info)
96303081 2015{
32da5386 2016 struct btrfs_block_group *block_group = ctl->private;
0b246afa 2017 struct btrfs_fs_info *fs_info = block_group->fs_info;
d0bd4560
JB
2018 bool forced = false;
2019
2020#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 2021 if (btrfs_should_fragment_free_space(block_group))
d0bd4560
JB
2022 forced = true;
2023#endif
96303081
JB
2024
2025 /*
2026 * If we are below the extents threshold then we can add this as an
2027 * extent, and don't have to deal with the bitmap
2028 */
d0bd4560 2029 if (!forced && ctl->free_extents < ctl->extents_thresh) {
32cb0840
JB
2030 /*
2031 * If this block group has some small extents we don't want to
2032 * use up all of our free slots in the cache with them, we want
01327610 2033		 * to reserve them for larger extents; however, if we have plenty
32cb0840
JB
 2034		 * of cache left then go ahead and add them, no sense in adding
2035 * the overhead of a bitmap if we don't have to.
2036 */
0b246afa 2037 if (info->bytes <= fs_info->sectorsize * 4) {
34d52cb6
LZ
2038 if (ctl->free_extents * 2 <= ctl->extents_thresh)
2039 return false;
32cb0840 2040 } else {
34d52cb6 2041 return false;
32cb0840
JB
2042 }
2043 }
96303081
JB
2044
2045 /*
dde5740f
JB
2046 * The original block groups from mkfs can be really small, like 8
2047 * megabytes, so don't bother with a bitmap for those entries. However
2048 * some block groups can be smaller than what a bitmap would cover but
2049 * are still large enough that they could overflow the 32k memory limit,
 2050	 * so still allow those block groups to have a bitmap
 2051	 * entry.
96303081 2052 */
b3470b5d 2053 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
34d52cb6
LZ
2054 return false;
2055
2056 return true;
2057}
2058
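Flattening the branching above may make the policy clearer. A condensed sketch of the use_bitmap() decision (ignoring the CONFIG_BTRFS_DEBUG forcing), assuming a 4 KiB sectorsize:

	/* Decision sketch only; mirrors the checks above. */
	bool small = info->bytes <= 4 * 4096;	/* "small" means at most 4 sectors */

	if (ctl->free_extents < ctl->extents_thresh) {
		/*
		 * Plenty of extent slots left: keep everything as extent
		 * entries, except small extents once at least half of the
		 * slots are already in use.
		 */
		if (!small || ctl->free_extents * 2 <= ctl->extents_thresh)
			return false;
	}
	/* Block groups smaller than half a bitmap's coverage never get bitmaps. */
	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
		return false;
	return true;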
20e5506b 2059static const struct btrfs_free_space_op free_space_op = {
2cdc342c
JB
2060 .recalc_thresholds = recalculate_thresholds,
2061 .use_bitmap = use_bitmap,
2062};
2063
34d52cb6
LZ
2064static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2065 struct btrfs_free_space *info)
2066{
2067 struct btrfs_free_space *bitmap_info;
32da5386 2068 struct btrfs_block_group *block_group = NULL;
34d52cb6 2069 int added = 0;
2cdc342c 2070 u64 bytes, offset, bytes_added;
da080fe1 2071 enum btrfs_trim_state trim_state;
34d52cb6 2072 int ret;
96303081
JB
2073
2074 bytes = info->bytes;
2075 offset = info->offset;
da080fe1 2076 trim_state = info->trim_state;
96303081 2077
34d52cb6
LZ
2078 if (!ctl->op->use_bitmap(ctl, info))
2079 return 0;
2080
2cdc342c
JB
2081 if (ctl->op == &free_space_op)
2082 block_group = ctl->private;
38e87880 2083again:
2cdc342c
JB
2084 /*
2085 * Since we link bitmaps right into the cluster we need to see if we
2086 * have a cluster here, and if so and it has our bitmap we need to add
2087 * the free space to that bitmap.
2088 */
2089 if (block_group && !list_empty(&block_group->cluster_list)) {
2090 struct btrfs_free_cluster *cluster;
2091 struct rb_node *node;
2092 struct btrfs_free_space *entry;
2093
2094 cluster = list_entry(block_group->cluster_list.next,
2095 struct btrfs_free_cluster,
2096 block_group_list);
2097 spin_lock(&cluster->lock);
2098 node = rb_first(&cluster->root);
2099 if (!node) {
2100 spin_unlock(&cluster->lock);
38e87880 2101 goto no_cluster_bitmap;
2cdc342c
JB
2102 }
2103
2104 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2105 if (!entry->bitmap) {
2106 spin_unlock(&cluster->lock);
38e87880 2107 goto no_cluster_bitmap;
2cdc342c
JB
2108 }
2109
2110 if (entry->offset == offset_to_bitmap(ctl, offset)) {
da080fe1
DZ
2111 bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2112 bytes, trim_state);
2cdc342c
JB
2113 bytes -= bytes_added;
2114 offset += bytes_added;
2115 }
2116 spin_unlock(&cluster->lock);
2117 if (!bytes) {
2118 ret = 1;
2119 goto out;
2120 }
2121 }
38e87880
CM
2122
2123no_cluster_bitmap:
34d52cb6 2124 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
96303081
JB
2125 1, 0);
2126 if (!bitmap_info) {
b12d6869 2127 ASSERT(added == 0);
96303081
JB
2128 goto new_bitmap;
2129 }
2130
da080fe1
DZ
2131 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2132 trim_state);
2cdc342c
JB
2133 bytes -= bytes_added;
2134 offset += bytes_added;
2135 added = 0;
96303081
JB
2136
2137 if (!bytes) {
2138 ret = 1;
2139 goto out;
2140 } else
2141 goto again;
2142
2143new_bitmap:
2144 if (info && info->bitmap) {
34d52cb6 2145 add_new_bitmap(ctl, info, offset);
96303081
JB
2146 added = 1;
2147 info = NULL;
2148 goto again;
2149 } else {
34d52cb6 2150 spin_unlock(&ctl->tree_lock);
96303081
JB
2151
2152 /* no pre-allocated info, allocate a new one */
2153 if (!info) {
dc89e982
JB
2154 info = kmem_cache_zalloc(btrfs_free_space_cachep,
2155 GFP_NOFS);
96303081 2156 if (!info) {
34d52cb6 2157 spin_lock(&ctl->tree_lock);
96303081
JB
2158 ret = -ENOMEM;
2159 goto out;
2160 }
2161 }
2162
2163 /* allocate the bitmap */
3acd4850
CL
2164 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2165 GFP_NOFS);
da080fe1 2166 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
34d52cb6 2167 spin_lock(&ctl->tree_lock);
96303081
JB
2168 if (!info->bitmap) {
2169 ret = -ENOMEM;
2170 goto out;
2171 }
2172 goto again;
2173 }
2174
2175out:
2176 if (info) {
3acd4850
CL
2177 if (info->bitmap)
2178 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2179 info->bitmap);
dc89e982 2180 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2181 }
0f9dd46c
JB
2182
2183 return ret;
2184}
2185
a7ccb255
DZ
2186/*
2187 * Free space merging rules:
2188 * 1) Merge trimmed areas together
2189 * 2) Let untrimmed areas coalesce with trimmed areas
2190 * 3) Always pull neighboring regions from bitmaps
2191 *
2192 * The above rules are for when we merge free space based on btrfs_trim_state.
2193 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2194 * same reason: to promote larger extent regions which makes life easier for
2195 * find_free_extent(). Rule 2 enables coalescing based on the common path
 2196 * being the return of free space from btrfs_finish_extent_commit(). So when
 2197 * free space is trimmed, it will prevent aggregating a newly trimmed region and
2198 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
2199 * and provide find_free_extent() with the largest extents possible hoping for
2200 * the reuse path.
2201 */
945d8962 2202static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5 2203 struct btrfs_free_space *info, bool update_stat)
0f9dd46c 2204{
120d66ee
LZ
2205 struct btrfs_free_space *left_info;
2206 struct btrfs_free_space *right_info;
2207 bool merged = false;
2208 u64 offset = info->offset;
2209 u64 bytes = info->bytes;
a7ccb255 2210 const bool is_trimmed = btrfs_free_space_trimmed(info);
6226cb0a 2211
0f9dd46c
JB
2212 /*
2213 * first we want to see if there is free space adjacent to the range we
 2214	 * are adding; if there is, remove that struct and add a new one to
2215 * cover the entire range
2216 */
34d52cb6 2217 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
96303081
JB
2218 if (right_info && rb_prev(&right_info->offset_index))
2219 left_info = rb_entry(rb_prev(&right_info->offset_index),
2220 struct btrfs_free_space, offset_index);
2221 else
34d52cb6 2222 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
0f9dd46c 2223
a7ccb255
DZ
2224 /* See try_merge_free_space() comment. */
2225 if (right_info && !right_info->bitmap &&
2226 (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
f333adb5 2227 if (update_stat)
34d52cb6 2228 unlink_free_space(ctl, right_info);
f333adb5 2229 else
34d52cb6 2230 __unlink_free_space(ctl, right_info);
6226cb0a 2231 info->bytes += right_info->bytes;
dc89e982 2232 kmem_cache_free(btrfs_free_space_cachep, right_info);
120d66ee 2233 merged = true;
0f9dd46c
JB
2234 }
2235
a7ccb255 2236 /* See try_merge_free_space() comment. */
96303081 2237 if (left_info && !left_info->bitmap &&
a7ccb255
DZ
2238 left_info->offset + left_info->bytes == offset &&
2239 (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
f333adb5 2240 if (update_stat)
34d52cb6 2241 unlink_free_space(ctl, left_info);
f333adb5 2242 else
34d52cb6 2243 __unlink_free_space(ctl, left_info);
6226cb0a
JB
2244 info->offset = left_info->offset;
2245 info->bytes += left_info->bytes;
dc89e982 2246 kmem_cache_free(btrfs_free_space_cachep, left_info);
120d66ee 2247 merged = true;
0f9dd46c
JB
2248 }
2249
120d66ee
LZ
2250 return merged;
2251}
2252
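A short scenario may help with rules 1 and 2 above; the offsets below are made up, and only the right-neighbour predicate is shown (the left-neighbour case is symmetric):

	/*
	 * Tree holds a trimmed extent [128 MiB, 128 MiB + 64 KiB).
	 *
	 * 1) Adding an untrimmed [128 MiB - 64 KiB, 128 MiB): is_trimmed is
	 *    false, so the predicate below is true and the two merge into a
	 *    single untrimmed 128 KiB extent (rule 2).
	 * 2) Adding a trimmed [128 MiB - 64 KiB, 128 MiB) instead: both sides
	 *    are trimmed, they merge and the result stays trimmed (rule 1).
	 * 3) A trimmed extent never merges into an untrimmed neighbour, which
	 *    is what keeps the trimmed state meaningful.
	 */
	bool can_merge_right = right_info && !right_info->bitmap &&
			       (!is_trimmed || btrfs_free_space_trimmed(right_info));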
20005523
FM
2253static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2254 struct btrfs_free_space *info,
2255 bool update_stat)
2256{
2257 struct btrfs_free_space *bitmap;
2258 unsigned long i;
2259 unsigned long j;
2260 const u64 end = info->offset + info->bytes;
2261 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2262 u64 bytes;
2263
2264 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2265 if (!bitmap)
2266 return false;
2267
2268 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2269 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2270 if (j == i)
2271 return false;
2272 bytes = (j - i) * ctl->unit;
2273 info->bytes += bytes;
2274
a7ccb255
DZ
2275 /* See try_merge_free_space() comment. */
2276 if (!btrfs_free_space_trimmed(bitmap))
2277 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2278
20005523
FM
2279 if (update_stat)
2280 bitmap_clear_bits(ctl, bitmap, end, bytes);
2281 else
2282 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2283
2284 if (!bitmap->bytes)
2285 free_bitmap(ctl, bitmap);
2286
2287 return true;
2288}
2289
2290static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2291 struct btrfs_free_space *info,
2292 bool update_stat)
2293{
2294 struct btrfs_free_space *bitmap;
2295 u64 bitmap_offset;
2296 unsigned long i;
2297 unsigned long j;
2298 unsigned long prev_j;
2299 u64 bytes;
2300
2301 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2302 /* If we're on a boundary, try the previous logical bitmap. */
2303 if (bitmap_offset == info->offset) {
2304 if (info->offset == 0)
2305 return false;
2306 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2307 }
2308
2309 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2310 if (!bitmap)
2311 return false;
2312
2313 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2314 j = 0;
2315 prev_j = (unsigned long)-1;
2316 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2317 if (j > i)
2318 break;
2319 prev_j = j;
2320 }
2321 if (prev_j == i)
2322 return false;
2323
2324 if (prev_j == (unsigned long)-1)
2325 bytes = (i + 1) * ctl->unit;
2326 else
2327 bytes = (i - prev_j) * ctl->unit;
2328
2329 info->offset -= bytes;
2330 info->bytes += bytes;
2331
a7ccb255
DZ
2332 /* See try_merge_free_space() comment. */
2333 if (!btrfs_free_space_trimmed(bitmap))
2334 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2335
20005523
FM
2336 if (update_stat)
2337 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2338 else
2339 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2340
2341 if (!bitmap->bytes)
2342 free_bitmap(ctl, bitmap);
2343
2344 return true;
2345}
2346
2347/*
2348 * We prefer always to allocate from extent entries, both for clustered and
2349 * non-clustered allocation requests. So when attempting to add a new extent
2350 * entry, try to see if there's adjacent free space in bitmap entries, and if
2351 * there is, migrate that space from the bitmaps to the extent.
 2352 * This way we get better chances of satisfying space allocation requests
2353 * because we attempt to satisfy them based on a single cache entry, and never
2354 * on 2 or more entries - even if the entries represent a contiguous free space
2355 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2356 * ends).
2357 */
2358static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2359 struct btrfs_free_space *info,
2360 bool update_stat)
2361{
2362 /*
2363 * Only work with disconnected entries, as we can change their offset,
 2364	 * and they must be extent entries.
2365 */
2366 ASSERT(!info->bitmap);
2367 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2368
2369 if (ctl->total_bitmaps > 0) {
2370 bool stole_end;
2371 bool stole_front = false;
2372
2373 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2374 if (ctl->total_bitmaps > 0)
2375 stole_front = steal_from_bitmap_to_front(ctl, info,
2376 update_stat);
2377
2378 if (stole_end || stole_front)
2379 try_merge_free_space(ctl, info, update_stat);
2380 }
2381}
2382
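As a worked illustration of the migration described in the comment above (all offsets hypothetical, unit == 4 KiB):

	/*
	 * Extent entry being linked:  [X, X + 16 KiB)
	 * Bitmap entry after it:      bits set for [X + 16 KiB, X + 48 KiB)
	 *
	 * steal_from_bitmap_to_end() finds 8 consecutive set bits starting at
	 * the bit for X + 16 KiB, clears them from the bitmap and grows the
	 * extent entry to [X, X + 48 KiB); if the bitmap was untrimmed, the
	 * grown extent becomes untrimmed as well.
	 * steal_from_bitmap_to_front() does the same for set bits that end
	 * exactly at X, moving info->offset backwards.
	 */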
ab8d0fc4
JM
2383int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2384 struct btrfs_free_space_ctl *ctl,
a7ccb255
DZ
2385 u64 offset, u64 bytes,
2386 enum btrfs_trim_state trim_state)
120d66ee 2387{
b0643e59 2388 struct btrfs_block_group *block_group = ctl->private;
120d66ee
LZ
2389 struct btrfs_free_space *info;
2390 int ret = 0;
2391
dc89e982 2392 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
120d66ee
LZ
2393 if (!info)
2394 return -ENOMEM;
2395
2396 info->offset = offset;
2397 info->bytes = bytes;
a7ccb255 2398 info->trim_state = trim_state;
20005523 2399 RB_CLEAR_NODE(&info->offset_index);
120d66ee 2400
34d52cb6 2401 spin_lock(&ctl->tree_lock);
120d66ee 2402
34d52cb6 2403 if (try_merge_free_space(ctl, info, true))
120d66ee
LZ
2404 goto link;
2405
2406 /*
2407 * There was no extent directly to the left or right of this new
2408 * extent then we know we're going to have to allocate a new extent, so
2409 * before we do that see if we need to drop this into a bitmap
2410 */
34d52cb6 2411 ret = insert_into_bitmap(ctl, info);
120d66ee
LZ
2412 if (ret < 0) {
2413 goto out;
2414 } else if (ret) {
2415 ret = 0;
2416 goto out;
2417 }
2418link:
20005523
FM
2419 /*
2420 * Only steal free space from adjacent bitmaps if we're sure we're not
2421 * going to add the new free space to existing bitmap entries - because
2422 * that would mean unnecessary work that would be reverted. Therefore
2423 * attempt to steal space from bitmaps if we're adding an extent entry.
2424 */
2425 steal_from_bitmap(ctl, info, true);
2426
34d52cb6 2427 ret = link_free_space(ctl, info);
0f9dd46c 2428 if (ret)
dc89e982 2429 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2430out:
34d52cb6 2431 spin_unlock(&ctl->tree_lock);
6226cb0a 2432
0f9dd46c 2433 if (ret) {
ab8d0fc4 2434 btrfs_crit(fs_info, "unable to add free space :%d", ret);
b12d6869 2435 ASSERT(ret != -EEXIST);
0f9dd46c
JB
2436 }
2437
b0643e59
DZ
2438 if (trim_state != BTRFS_TRIM_STATE_TRIMMED)
2439 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2440
0f9dd46c
JB
2441 return ret;
2442}
2443
32da5386 2444int btrfs_add_free_space(struct btrfs_block_group *block_group,
478b4d9f
JB
2445 u64 bytenr, u64 size)
2446{
a7ccb255
DZ
2447 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2448
2449 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2450 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2451
478b4d9f
JB
2452 return __btrfs_add_free_space(block_group->fs_info,
2453 block_group->free_space_ctl,
a7ccb255 2454 bytenr, size, trim_state);
478b4d9f
JB
2455}
2456
b0643e59
DZ
2457/*
2458 * This is a subtle distinction because when adding free space back in general,
2459 * we want it to be added as untrimmed for async. But in the case where we add
2460 * it on loading of a block group, we want to consider it trimmed.
2461 */
2462int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2463 u64 bytenr, u64 size)
2464{
2465 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2466
2467 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2468 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2469 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2470
2471 return __btrfs_add_free_space(block_group->fs_info,
2472 block_group->free_space_ctl,
2473 bytenr, size, trim_state);
2474}
2475
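A hedged usage sketch for the two helpers above; the call sites are illustrative, not taken from this patch. Space returned at run time is added untrimmed, which makes __btrfs_add_free_space() queue the block group on the discard workqueue, while space discovered when populating a block group is treated as already trimmed if sync or async discard is enabled:

	/* Hypothetical call sites, shown only to contrast the two entry points. */

	/* e.g. an extent freed during normal operation */
	ret = btrfs_add_free_space(block_group, extent_start, extent_len);

	/* e.g. a hole found while caching a block group's free space */
	ret = btrfs_add_free_space_async_trimmed(block_group, hole_start, hole_len);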
32da5386 2476int btrfs_remove_free_space(struct btrfs_block_group *block_group,
6226cb0a 2477 u64 offset, u64 bytes)
0f9dd46c 2478{
34d52cb6 2479 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c 2480 struct btrfs_free_space *info;
b0175117
JB
2481 int ret;
2482 bool re_search = false;
0f9dd46c 2483
34d52cb6 2484 spin_lock(&ctl->tree_lock);
6226cb0a 2485
96303081 2486again:
b0175117 2487 ret = 0;
bdb7d303
JB
2488 if (!bytes)
2489 goto out_lock;
2490
34d52cb6 2491 info = tree_search_offset(ctl, offset, 0, 0);
96303081 2492 if (!info) {
6606bb97
JB
2493 /*
2494 * oops didn't find an extent that matched the space we wanted
2495 * to remove, look for a bitmap instead
2496 */
34d52cb6 2497 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
6606bb97
JB
2498 1, 0);
2499 if (!info) {
b0175117
JB
2500 /*
2501 * If we found a partial bit of our free space in a
2502 * bitmap but then couldn't find the other part this may
2503 * be a problem, so WARN about it.
24a70313 2504 */
b0175117 2505 WARN_ON(re_search);
6606bb97
JB
2506 goto out_lock;
2507 }
96303081
JB
2508 }
2509
b0175117 2510 re_search = false;
bdb7d303 2511 if (!info->bitmap) {
34d52cb6 2512 unlink_free_space(ctl, info);
bdb7d303
JB
2513 if (offset == info->offset) {
2514 u64 to_free = min(bytes, info->bytes);
2515
2516 info->bytes -= to_free;
2517 info->offset += to_free;
2518 if (info->bytes) {
2519 ret = link_free_space(ctl, info);
2520 WARN_ON(ret);
2521 } else {
2522 kmem_cache_free(btrfs_free_space_cachep, info);
2523 }
0f9dd46c 2524
bdb7d303
JB
2525 offset += to_free;
2526 bytes -= to_free;
2527 goto again;
2528 } else {
2529 u64 old_end = info->bytes + info->offset;
9b49c9b9 2530
bdb7d303 2531 info->bytes = offset - info->offset;
34d52cb6 2532 ret = link_free_space(ctl, info);
96303081
JB
2533 WARN_ON(ret);
2534 if (ret)
2535 goto out_lock;
96303081 2536
bdb7d303
JB
2537 /* Not enough bytes in this entry to satisfy us */
2538 if (old_end < offset + bytes) {
2539 bytes -= old_end - offset;
2540 offset = old_end;
2541 goto again;
2542 } else if (old_end == offset + bytes) {
2543 /* all done */
2544 goto out_lock;
2545 }
2546 spin_unlock(&ctl->tree_lock);
2547
a7ccb255
DZ
2548 ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2549 offset + bytes,
2550 old_end - (offset + bytes),
2551 info->trim_state);
bdb7d303
JB
2552 WARN_ON(ret);
2553 goto out;
2554 }
0f9dd46c 2555 }
96303081 2556
34d52cb6 2557 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
b0175117
JB
2558 if (ret == -EAGAIN) {
2559 re_search = true;
96303081 2560 goto again;
b0175117 2561 }
96303081 2562out_lock:
34d52cb6 2563 spin_unlock(&ctl->tree_lock);
0f9dd46c 2564out:
25179201
JB
2565 return ret;
2566}
2567
32da5386 2568void btrfs_dump_free_space(struct btrfs_block_group *block_group,
0f9dd46c
JB
2569 u64 bytes)
2570{
0b246afa 2571 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 2572 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c
JB
2573 struct btrfs_free_space *info;
2574 struct rb_node *n;
2575 int count = 0;
2576
9084cb6a 2577 spin_lock(&ctl->tree_lock);
34d52cb6 2578 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
0f9dd46c 2579 info = rb_entry(n, struct btrfs_free_space, offset_index);
f6175efa 2580 if (info->bytes >= bytes && !block_group->ro)
0f9dd46c 2581 count++;
0b246afa 2582 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
efe120a0 2583 info->offset, info->bytes,
96303081 2584 (info->bitmap) ? "yes" : "no");
0f9dd46c 2585 }
9084cb6a 2586 spin_unlock(&ctl->tree_lock);
0b246afa 2587 btrfs_info(fs_info, "block group has cluster?: %s",
96303081 2588 list_empty(&block_group->cluster_list) ? "no" : "yes");
0b246afa 2589 btrfs_info(fs_info,
efe120a0 2590 "%d blocks of free space at or bigger than bytes is", count);
0f9dd46c
JB
2591}
2592
32da5386 2593void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
0f9dd46c 2594{
0b246afa 2595 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 2596 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c 2597
34d52cb6 2598 spin_lock_init(&ctl->tree_lock);
0b246afa 2599 ctl->unit = fs_info->sectorsize;
b3470b5d 2600 ctl->start = block_group->start;
34d52cb6
LZ
2601 ctl->private = block_group;
2602 ctl->op = &free_space_op;
55507ce3
FM
2603 INIT_LIST_HEAD(&ctl->trimming_ranges);
2604 mutex_init(&ctl->cache_writeout_mutex);
0f9dd46c 2605
34d52cb6
LZ
2606 /*
2607 * we only want to have 32k of ram per block group for keeping
2608 * track of free space, and if we pass 1/2 of that we want to
2609 * start converting things over to using bitmaps
2610 */
ee22184b 2611 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
0f9dd46c
JB
2612}
2613
fa9c0d79
CM
2614/*
2615 * for a given cluster, put all of its extents back into the free
2616 * space cache. If the block group passed doesn't match the block group
2617 * pointed to by the cluster, someone else raced in and freed the
2618 * cluster already. In that case, we just return without changing anything
2619 */
2620static int
2621__btrfs_return_cluster_to_free_space(
32da5386 2622 struct btrfs_block_group *block_group,
fa9c0d79
CM
2623 struct btrfs_free_cluster *cluster)
2624{
34d52cb6 2625 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79
CM
2626 struct btrfs_free_space *entry;
2627 struct rb_node *node;
2628
2629 spin_lock(&cluster->lock);
2630 if (cluster->block_group != block_group)
2631 goto out;
2632
96303081 2633 cluster->block_group = NULL;
fa9c0d79 2634 cluster->window_start = 0;
96303081 2635 list_del_init(&cluster->block_group_list);
96303081 2636
fa9c0d79 2637 node = rb_first(&cluster->root);
96303081 2638 while (node) {
4e69b598
JB
2639 bool bitmap;
2640
fa9c0d79
CM
2641 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2642 node = rb_next(&entry->offset_index);
2643 rb_erase(&entry->offset_index, &cluster->root);
20005523 2644 RB_CLEAR_NODE(&entry->offset_index);
4e69b598
JB
2645
2646 bitmap = (entry->bitmap != NULL);
20005523 2647 if (!bitmap) {
34d52cb6 2648 try_merge_free_space(ctl, entry, false);
20005523
FM
2649 steal_from_bitmap(ctl, entry, false);
2650 }
34d52cb6 2651 tree_insert_offset(&ctl->free_space_offset,
4e69b598 2652 entry->offset, &entry->offset_index, bitmap);
fa9c0d79 2653 }
6bef4d31 2654 cluster->root = RB_ROOT;
96303081 2655
fa9c0d79
CM
2656out:
2657 spin_unlock(&cluster->lock);
96303081 2658 btrfs_put_block_group(block_group);
fa9c0d79
CM
2659 return 0;
2660}
2661
48a3b636
ES
2662static void __btrfs_remove_free_space_cache_locked(
2663 struct btrfs_free_space_ctl *ctl)
0f9dd46c
JB
2664{
2665 struct btrfs_free_space *info;
2666 struct rb_node *node;
581bb050 2667
581bb050
LZ
2668 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2669 info = rb_entry(node, struct btrfs_free_space, offset_index);
9b90f513
JB
2670 if (!info->bitmap) {
2671 unlink_free_space(ctl, info);
2672 kmem_cache_free(btrfs_free_space_cachep, info);
2673 } else {
2674 free_bitmap(ctl, info);
2675 }
351810c1
DS
2676
2677 cond_resched_lock(&ctl->tree_lock);
581bb050 2678 }
09655373
CM
2679}
2680
2681void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2682{
2683 spin_lock(&ctl->tree_lock);
2684 __btrfs_remove_free_space_cache_locked(ctl);
581bb050
LZ
2685 spin_unlock(&ctl->tree_lock);
2686}
2687
32da5386 2688void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
581bb050
LZ
2689{
2690 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79 2691 struct btrfs_free_cluster *cluster;
96303081 2692 struct list_head *head;
0f9dd46c 2693
34d52cb6 2694 spin_lock(&ctl->tree_lock);
96303081
JB
2695 while ((head = block_group->cluster_list.next) !=
2696 &block_group->cluster_list) {
2697 cluster = list_entry(head, struct btrfs_free_cluster,
2698 block_group_list);
fa9c0d79
CM
2699
2700 WARN_ON(cluster->block_group != block_group);
2701 __btrfs_return_cluster_to_free_space(block_group, cluster);
351810c1
DS
2702
2703 cond_resched_lock(&ctl->tree_lock);
fa9c0d79 2704 }
09655373 2705 __btrfs_remove_free_space_cache_locked(ctl);
34d52cb6 2706 spin_unlock(&ctl->tree_lock);
fa9c0d79 2707
0f9dd46c
JB
2708}
2709
32da5386 2710u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
a4820398
MX
2711 u64 offset, u64 bytes, u64 empty_size,
2712 u64 *max_extent_size)
0f9dd46c 2713{
34d52cb6 2714 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
6226cb0a 2715 struct btrfs_free_space *entry = NULL;
96303081 2716 u64 bytes_search = bytes + empty_size;
6226cb0a 2717 u64 ret = 0;
53b381b3
DW
2718 u64 align_gap = 0;
2719 u64 align_gap_len = 0;
a7ccb255 2720 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
0f9dd46c 2721
34d52cb6 2722 spin_lock(&ctl->tree_lock);
53b381b3 2723 entry = find_free_space(ctl, &offset, &bytes_search,
a4820398 2724 block_group->full_stripe_len, max_extent_size);
6226cb0a 2725 if (!entry)
96303081
JB
2726 goto out;
2727
2728 ret = offset;
2729 if (entry->bitmap) {
34d52cb6 2730 bitmap_clear_bits(ctl, entry, offset, bytes);
edf6e2d1 2731 if (!entry->bytes)
34d52cb6 2732 free_bitmap(ctl, entry);
96303081 2733 } else {
34d52cb6 2734 unlink_free_space(ctl, entry);
53b381b3
DW
2735 align_gap_len = offset - entry->offset;
2736 align_gap = entry->offset;
a7ccb255 2737 align_gap_trim_state = entry->trim_state;
53b381b3
DW
2738
2739 entry->offset = offset + bytes;
2740 WARN_ON(entry->bytes < bytes + align_gap_len);
2741
2742 entry->bytes -= bytes + align_gap_len;
6226cb0a 2743 if (!entry->bytes)
dc89e982 2744 kmem_cache_free(btrfs_free_space_cachep, entry);
6226cb0a 2745 else
34d52cb6 2746 link_free_space(ctl, entry);
6226cb0a 2747 }
96303081 2748out:
34d52cb6 2749 spin_unlock(&ctl->tree_lock);
817d52f8 2750
53b381b3 2751 if (align_gap_len)
ab8d0fc4 2752 __btrfs_add_free_space(block_group->fs_info, ctl,
a7ccb255
DZ
2753 align_gap, align_gap_len,
2754 align_gap_trim_state);
0f9dd46c
JB
2755 return ret;
2756}
fa9c0d79
CM
2757
2758/*
2759 * given a cluster, put all of its extents back into the free space
2760 * cache. If a block group is passed, this function will only free
2761 * a cluster that belongs to the passed block group.
2762 *
2763 * Otherwise, it'll get a reference on the block group pointed to by the
2764 * cluster and remove the cluster from it.
2765 */
2766int btrfs_return_cluster_to_free_space(
32da5386 2767 struct btrfs_block_group *block_group,
fa9c0d79
CM
2768 struct btrfs_free_cluster *cluster)
2769{
34d52cb6 2770 struct btrfs_free_space_ctl *ctl;
fa9c0d79
CM
2771 int ret;
2772
2773 /* first, get a safe pointer to the block group */
2774 spin_lock(&cluster->lock);
2775 if (!block_group) {
2776 block_group = cluster->block_group;
2777 if (!block_group) {
2778 spin_unlock(&cluster->lock);
2779 return 0;
2780 }
2781 } else if (cluster->block_group != block_group) {
2782 /* someone else has already freed it don't redo their work */
2783 spin_unlock(&cluster->lock);
2784 return 0;
2785 }
2786 atomic_inc(&block_group->count);
2787 spin_unlock(&cluster->lock);
2788
34d52cb6
LZ
2789 ctl = block_group->free_space_ctl;
2790
fa9c0d79 2791 /* now return any extents the cluster had on it */
34d52cb6 2792 spin_lock(&ctl->tree_lock);
fa9c0d79 2793 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
34d52cb6 2794 spin_unlock(&ctl->tree_lock);
fa9c0d79
CM
2795
2796 /* finally drop our ref */
2797 btrfs_put_block_group(block_group);
2798 return ret;
2799}
2800
32da5386 2801static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
96303081 2802 struct btrfs_free_cluster *cluster,
4e69b598 2803 struct btrfs_free_space *entry,
a4820398
MX
2804 u64 bytes, u64 min_start,
2805 u64 *max_extent_size)
96303081 2806{
34d52cb6 2807 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
2808 int err;
2809 u64 search_start = cluster->window_start;
2810 u64 search_bytes = bytes;
2811 u64 ret = 0;
2812
96303081
JB
2813 search_start = min_start;
2814 search_bytes = bytes;
2815
0584f718 2816 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
a4820398 2817 if (err) {
ad22cf6e
JB
2818 *max_extent_size = max(get_max_extent_size(entry),
2819 *max_extent_size);
4e69b598 2820 return 0;
a4820398 2821 }
96303081
JB
2822
2823 ret = search_start;
bb3ac5a4 2824 __bitmap_clear_bits(ctl, entry, ret, bytes);
96303081
JB
2825
2826 return ret;
2827}
2828
fa9c0d79
CM
2829/*
2830 * given a cluster, try to allocate 'bytes' from it, returns 0
2831 * if it couldn't find anything suitably large, or a logical disk offset
2832 * if things worked out
2833 */
32da5386 2834u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
fa9c0d79 2835 struct btrfs_free_cluster *cluster, u64 bytes,
a4820398 2836 u64 min_start, u64 *max_extent_size)
fa9c0d79 2837{
34d52cb6 2838 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79
CM
2839 struct btrfs_free_space *entry = NULL;
2840 struct rb_node *node;
2841 u64 ret = 0;
2842
2843 spin_lock(&cluster->lock);
2844 if (bytes > cluster->max_size)
2845 goto out;
2846
2847 if (cluster->block_group != block_group)
2848 goto out;
2849
2850 node = rb_first(&cluster->root);
2851 if (!node)
2852 goto out;
2853
2854 entry = rb_entry(node, struct btrfs_free_space, offset_index);
67871254 2855 while (1) {
ad22cf6e
JB
2856 if (entry->bytes < bytes)
2857 *max_extent_size = max(get_max_extent_size(entry),
2858 *max_extent_size);
a4820398 2859
4e69b598
JB
2860 if (entry->bytes < bytes ||
2861 (!entry->bitmap && entry->offset < min_start)) {
fa9c0d79
CM
2862 node = rb_next(&entry->offset_index);
2863 if (!node)
2864 break;
2865 entry = rb_entry(node, struct btrfs_free_space,
2866 offset_index);
2867 continue;
2868 }
fa9c0d79 2869
4e69b598
JB
2870 if (entry->bitmap) {
2871 ret = btrfs_alloc_from_bitmap(block_group,
2872 cluster, entry, bytes,
a4820398
MX
2873 cluster->window_start,
2874 max_extent_size);
4e69b598 2875 if (ret == 0) {
4e69b598
JB
2876 node = rb_next(&entry->offset_index);
2877 if (!node)
2878 break;
2879 entry = rb_entry(node, struct btrfs_free_space,
2880 offset_index);
2881 continue;
2882 }
9b230628 2883 cluster->window_start += bytes;
4e69b598 2884 } else {
4e69b598
JB
2885 ret = entry->offset;
2886
2887 entry->offset += bytes;
2888 entry->bytes -= bytes;
2889 }
fa9c0d79 2890
5e71b5d5 2891 if (entry->bytes == 0)
fa9c0d79 2892 rb_erase(&entry->offset_index, &cluster->root);
fa9c0d79
CM
2893 break;
2894 }
2895out:
2896 spin_unlock(&cluster->lock);
96303081 2897
5e71b5d5
LZ
2898 if (!ret)
2899 return 0;
2900
34d52cb6 2901 spin_lock(&ctl->tree_lock);
5e71b5d5 2902
34d52cb6 2903 ctl->free_space -= bytes;
5e71b5d5 2904 if (entry->bytes == 0) {
34d52cb6 2905 ctl->free_extents--;
4e69b598 2906 if (entry->bitmap) {
3acd4850
CL
2907 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2908 entry->bitmap);
34d52cb6
LZ
2909 ctl->total_bitmaps--;
2910 ctl->op->recalc_thresholds(ctl);
4e69b598 2911 }
dc89e982 2912 kmem_cache_free(btrfs_free_space_cachep, entry);
5e71b5d5
LZ
2913 }
2914
34d52cb6 2915 spin_unlock(&ctl->tree_lock);
5e71b5d5 2916
fa9c0d79
CM
2917 return ret;
2918}
2919
32da5386 2920static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
96303081
JB
2921 struct btrfs_free_space *entry,
2922 struct btrfs_free_cluster *cluster,
1bb91902
AO
2923 u64 offset, u64 bytes,
2924 u64 cont1_bytes, u64 min_bytes)
96303081 2925{
34d52cb6 2926 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
2927 unsigned long next_zero;
2928 unsigned long i;
1bb91902
AO
2929 unsigned long want_bits;
2930 unsigned long min_bits;
96303081 2931 unsigned long found_bits;
cef40483 2932 unsigned long max_bits = 0;
96303081
JB
2933 unsigned long start = 0;
2934 unsigned long total_found = 0;
4e69b598 2935 int ret;
96303081 2936
96009762 2937 i = offset_to_bit(entry->offset, ctl->unit,
96303081 2938 max_t(u64, offset, entry->offset));
96009762
WSH
2939 want_bits = bytes_to_bits(bytes, ctl->unit);
2940 min_bits = bytes_to_bits(min_bytes, ctl->unit);
96303081 2941
cef40483
JB
2942 /*
2943 * Don't bother looking for a cluster in this bitmap if it's heavily
2944 * fragmented.
2945 */
2946 if (entry->max_extent_size &&
2947 entry->max_extent_size < cont1_bytes)
2948 return -ENOSPC;
96303081
JB
2949again:
2950 found_bits = 0;
ebb3dad4 2951 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
96303081
JB
2952 next_zero = find_next_zero_bit(entry->bitmap,
2953 BITS_PER_BITMAP, i);
1bb91902 2954 if (next_zero - i >= min_bits) {
96303081 2955 found_bits = next_zero - i;
cef40483
JB
2956 if (found_bits > max_bits)
2957 max_bits = found_bits;
96303081
JB
2958 break;
2959 }
cef40483
JB
2960 if (next_zero - i > max_bits)
2961 max_bits = next_zero - i;
96303081
JB
2962 i = next_zero;
2963 }
2964
cef40483
JB
2965 if (!found_bits) {
2966 entry->max_extent_size = (u64)max_bits * ctl->unit;
4e69b598 2967 return -ENOSPC;
cef40483 2968 }
96303081 2969
1bb91902 2970 if (!total_found) {
96303081 2971 start = i;
b78d09bc 2972 cluster->max_size = 0;
96303081
JB
2973 }
2974
2975 total_found += found_bits;
2976
96009762
WSH
2977 if (cluster->max_size < found_bits * ctl->unit)
2978 cluster->max_size = found_bits * ctl->unit;
96303081 2979
1bb91902
AO
2980 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2981 i = next_zero + 1;
96303081
JB
2982 goto again;
2983 }
2984
96009762 2985 cluster->window_start = start * ctl->unit + entry->offset;
34d52cb6 2986 rb_erase(&entry->offset_index, &ctl->free_space_offset);
4e69b598
JB
2987 ret = tree_insert_offset(&cluster->root, entry->offset,
2988 &entry->offset_index, 1);
b12d6869 2989 ASSERT(!ret); /* -EEXIST; Logic error */
96303081 2990
3f7de037 2991 trace_btrfs_setup_cluster(block_group, cluster,
96009762 2992 total_found * ctl->unit, 1);
96303081
JB
2993 return 0;
2994}
2995
4e69b598
JB
2996/*
2997 * This searches the block group for just extents to fill the cluster with.
1bb91902
AO
2998 * Try to find a cluster with at least bytes total bytes, at least one
 2999 * extent of cont1_bytes, and other extents of at least min_bytes.
4e69b598 3000 */
3de85bb9 3001static noinline int
32da5386 3002setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3de85bb9
JB
3003 struct btrfs_free_cluster *cluster,
3004 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3005 u64 cont1_bytes, u64 min_bytes)
4e69b598 3006{
34d52cb6 3007 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
4e69b598
JB
3008 struct btrfs_free_space *first = NULL;
3009 struct btrfs_free_space *entry = NULL;
4e69b598
JB
3010 struct btrfs_free_space *last;
3011 struct rb_node *node;
4e69b598
JB
3012 u64 window_free;
3013 u64 max_extent;
3f7de037 3014 u64 total_size = 0;
4e69b598 3015
34d52cb6 3016 entry = tree_search_offset(ctl, offset, 0, 1);
4e69b598
JB
3017 if (!entry)
3018 return -ENOSPC;
3019
3020 /*
3021 * We don't want bitmaps, so just move along until we find a normal
3022 * extent entry.
3023 */
1bb91902
AO
3024 while (entry->bitmap || entry->bytes < min_bytes) {
3025 if (entry->bitmap && list_empty(&entry->list))
86d4a77b 3026 list_add_tail(&entry->list, bitmaps);
4e69b598
JB
3027 node = rb_next(&entry->offset_index);
3028 if (!node)
3029 return -ENOSPC;
3030 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3031 }
3032
4e69b598
JB
3033 window_free = entry->bytes;
3034 max_extent = entry->bytes;
3035 first = entry;
3036 last = entry;
4e69b598 3037
1bb91902
AO
3038 for (node = rb_next(&entry->offset_index); node;
3039 node = rb_next(&entry->offset_index)) {
4e69b598
JB
3040 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3041
86d4a77b
JB
3042 if (entry->bitmap) {
3043 if (list_empty(&entry->list))
3044 list_add_tail(&entry->list, bitmaps);
4e69b598 3045 continue;
86d4a77b
JB
3046 }
3047
1bb91902
AO
3048 if (entry->bytes < min_bytes)
3049 continue;
3050
3051 last = entry;
3052 window_free += entry->bytes;
3053 if (entry->bytes > max_extent)
4e69b598 3054 max_extent = entry->bytes;
4e69b598
JB
3055 }
3056
1bb91902
AO
3057 if (window_free < bytes || max_extent < cont1_bytes)
3058 return -ENOSPC;
3059
4e69b598
JB
3060 cluster->window_start = first->offset;
3061
3062 node = &first->offset_index;
3063
3064 /*
3065 * now we've found our entries, pull them out of the free space
3066 * cache and put them into the cluster rbtree
3067 */
3068 do {
3069 int ret;
3070
3071 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3072 node = rb_next(&entry->offset_index);
1bb91902 3073 if (entry->bitmap || entry->bytes < min_bytes)
4e69b598
JB
3074 continue;
3075
34d52cb6 3076 rb_erase(&entry->offset_index, &ctl->free_space_offset);
4e69b598
JB
3077 ret = tree_insert_offset(&cluster->root, entry->offset,
3078 &entry->offset_index, 0);
3f7de037 3079 total_size += entry->bytes;
b12d6869 3080 ASSERT(!ret); /* -EEXIST; Logic error */
4e69b598
JB
3081 } while (node && entry != last);
3082
3083 cluster->max_size = max_extent;
3f7de037 3084 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
4e69b598
JB
3085 return 0;
3086}
3087
3088/*
 3089 * This specifically looks for bitmaps that may work in the cluster; we assume
3090 * that we have already failed to find extents that will work.
3091 */
3de85bb9 3092static noinline int
32da5386 3093setup_cluster_bitmap(struct btrfs_block_group *block_group,
3de85bb9
JB
3094 struct btrfs_free_cluster *cluster,
3095 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3096 u64 cont1_bytes, u64 min_bytes)
4e69b598 3097{
34d52cb6 3098 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1b9b922a 3099 struct btrfs_free_space *entry = NULL;
4e69b598 3100 int ret = -ENOSPC;
0f0fbf1d 3101 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
4e69b598 3102
34d52cb6 3103 if (ctl->total_bitmaps == 0)
4e69b598
JB
3104 return -ENOSPC;
3105
0f0fbf1d
LZ
3106 /*
3107 * The bitmap that covers offset won't be in the list unless offset
3108 * is just its start offset.
3109 */
1b9b922a
CM
3110 if (!list_empty(bitmaps))
3111 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3112
3113 if (!entry || entry->offset != bitmap_offset) {
0f0fbf1d
LZ
3114 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3115 if (entry && list_empty(&entry->list))
3116 list_add(&entry->list, bitmaps);
3117 }
3118
86d4a77b 3119 list_for_each_entry(entry, bitmaps, list) {
357b9784 3120 if (entry->bytes < bytes)
86d4a77b
JB
3121 continue;
3122 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
1bb91902 3123 bytes, cont1_bytes, min_bytes);
86d4a77b
JB
3124 if (!ret)
3125 return 0;
3126 }
3127
3128 /*
52621cb6
LZ
3129 * The bitmaps list has all the bitmaps that record free space
3130 * starting after offset, so no more search is required.
86d4a77b 3131 */
52621cb6 3132 return -ENOSPC;
4e69b598
JB
3133}
3134
fa9c0d79
CM
3135/*
3136 * here we try to find a cluster of blocks in a block group. The goal
1bb91902 3137 * is to find at least bytes+empty_size.
fa9c0d79
CM
3138 * We might not find them all in one contiguous area.
3139 *
3140 * returns zero and sets up cluster if things worked out, otherwise
3141 * it returns -enospc
3142 */
32da5386 3143int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
fa9c0d79
CM
3144 struct btrfs_free_cluster *cluster,
3145 u64 offset, u64 bytes, u64 empty_size)
3146{
2ceeae2e 3147 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 3148 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
86d4a77b 3149 struct btrfs_free_space *entry, *tmp;
52621cb6 3150 LIST_HEAD(bitmaps);
fa9c0d79 3151 u64 min_bytes;
1bb91902 3152 u64 cont1_bytes;
fa9c0d79
CM
3153 int ret;
3154
1bb91902
AO
3155 /*
3156 * Choose the minimum extent size we'll require for this
3157 * cluster. For SSD_SPREAD, don't allow any fragmentation.
 3158	 * cluster. For SSD_SPREAD, don't allow any fragmentation.
 3159	 * For metadata, allow allocations with smaller extents. For
3159 * data, keep it dense.
3160 */
0b246afa 3161 if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
1bb91902 3162 cont1_bytes = min_bytes = bytes + empty_size;
451d7585 3163 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1bb91902 3164 cont1_bytes = bytes;
0b246afa 3165 min_bytes = fs_info->sectorsize;
1bb91902
AO
3166 } else {
3167 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
0b246afa 3168 min_bytes = fs_info->sectorsize;
1bb91902 3169 }
fa9c0d79 3170
34d52cb6 3171 spin_lock(&ctl->tree_lock);
7d0d2e8e
JB
3172
3173 /*
3174 * If we know we don't have enough space to make a cluster don't even
3175 * bother doing all the work to try and find one.
3176 */
1bb91902 3177 if (ctl->free_space < bytes) {
34d52cb6 3178 spin_unlock(&ctl->tree_lock);
7d0d2e8e
JB
3179 return -ENOSPC;
3180 }
3181
fa9c0d79
CM
3182 spin_lock(&cluster->lock);
3183
3184 /* someone already found a cluster, hooray */
3185 if (cluster->block_group) {
3186 ret = 0;
3187 goto out;
3188 }
fa9c0d79 3189
3f7de037
JB
3190 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3191 min_bytes);
3192
86d4a77b 3193 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
1bb91902
AO
3194 bytes + empty_size,
3195 cont1_bytes, min_bytes);
4e69b598 3196 if (ret)
86d4a77b 3197 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
1bb91902
AO
3198 offset, bytes + empty_size,
3199 cont1_bytes, min_bytes);
86d4a77b
JB
3200
3201 /* Clear our temporary list */
3202 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3203 list_del_init(&entry->list);
fa9c0d79 3204
4e69b598
JB
3205 if (!ret) {
3206 atomic_inc(&block_group->count);
3207 list_add_tail(&cluster->block_group_list,
3208 &block_group->cluster_list);
3209 cluster->block_group = block_group;
3f7de037
JB
3210 } else {
3211 trace_btrfs_failed_cluster_setup(block_group);
fa9c0d79 3212 }
fa9c0d79
CM
3213out:
3214 spin_unlock(&cluster->lock);
34d52cb6 3215 spin_unlock(&ctl->tree_lock);
fa9c0d79
CM
3216
3217 return ret;
3218}
3219
3220/*
3221 * simple code to zero out a cluster
3222 */
3223void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3224{
3225 spin_lock_init(&cluster->lock);
3226 spin_lock_init(&cluster->refill_lock);
6bef4d31 3227 cluster->root = RB_ROOT;
fa9c0d79 3228 cluster->max_size = 0;
c759c4e1 3229 cluster->fragmented = false;
fa9c0d79
CM
3230 INIT_LIST_HEAD(&cluster->block_group_list);
3231 cluster->block_group = NULL;
3232}
3233
32da5386 3234static int do_trimming(struct btrfs_block_group *block_group,
7fe1e641 3235 u64 *total_trimmed, u64 start, u64 bytes,
55507ce3 3236 u64 reserved_start, u64 reserved_bytes,
b0643e59 3237 enum btrfs_trim_state reserved_trim_state,
55507ce3 3238 struct btrfs_trim_range *trim_entry)
f7039b1d 3239{
7fe1e641 3240 struct btrfs_space_info *space_info = block_group->space_info;
f7039b1d 3241 struct btrfs_fs_info *fs_info = block_group->fs_info;
55507ce3 3242 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
7fe1e641
LZ
3243 int ret;
3244 int update = 0;
b0643e59
DZ
3245 const u64 end = start + bytes;
3246 const u64 reserved_end = reserved_start + reserved_bytes;
3247 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3248 u64 trimmed = 0;
f7039b1d 3249
7fe1e641
LZ
3250 spin_lock(&space_info->lock);
3251 spin_lock(&block_group->lock);
3252 if (!block_group->ro) {
3253 block_group->reserved += reserved_bytes;
3254 space_info->bytes_reserved += reserved_bytes;
3255 update = 1;
3256 }
3257 spin_unlock(&block_group->lock);
3258 spin_unlock(&space_info->lock);
3259
2ff7e61e 3260 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
b0643e59 3261 if (!ret) {
7fe1e641 3262 *total_trimmed += trimmed;
b0643e59
DZ
3263 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3264 }
7fe1e641 3265
55507ce3 3266 mutex_lock(&ctl->cache_writeout_mutex);
b0643e59
DZ
3267 if (reserved_start < start)
3268 __btrfs_add_free_space(fs_info, ctl, reserved_start,
3269 start - reserved_start,
3270 reserved_trim_state);
3271 if (start + bytes < reserved_start + reserved_bytes)
3272 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3273 reserved_trim_state);
3274 __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
55507ce3
FM
3275 list_del(&trim_entry->list);
3276 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3277
3278 if (update) {
3279 spin_lock(&space_info->lock);
3280 spin_lock(&block_group->lock);
3281 if (block_group->ro)
3282 space_info->bytes_readonly += reserved_bytes;
3283 block_group->reserved -= reserved_bytes;
3284 space_info->bytes_reserved -= reserved_bytes;
7fe1e641 3285 spin_unlock(&block_group->lock);
8f63a840 3286 spin_unlock(&space_info->lock);
7fe1e641
LZ
3287 }
3288
3289 return ret;
3290}
3291
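The re-add bookkeeping at the end of do_trimming() is easier to see with numbers. A hypothetical call where the reserved range (already pulled out of the tree by the caller) is larger than the piece being discarded right now:

	/*
	 * reserved_start = 1 MiB, reserved_bytes = 1 MiB   (range held reserved)
	 * start = 1 MiB + 256 KiB, bytes = 512 KiB          (piece discarded now)
	 *
	 * After btrfs_discard_extent():
	 *   [1 MiB, 1 MiB + 256 KiB)            re-added with reserved_trim_state
	 *   [1 MiB + 768 KiB, 2 MiB)            re-added with reserved_trim_state
	 *   [1 MiB + 256 KiB, 1 MiB + 768 KiB)  re-added as BTRFS_TRIM_STATE_TRIMMED,
	 *                                       or untrimmed if the discard failed
	 */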
32da5386 3292static int trim_no_bitmap(struct btrfs_block_group *block_group,
7fe1e641
LZ
3293 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3294{
3295 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3296 struct btrfs_free_space *entry;
3297 struct rb_node *node;
3298 int ret = 0;
3299 u64 extent_start;
3300 u64 extent_bytes;
b0643e59 3301 enum btrfs_trim_state extent_trim_state;
7fe1e641 3302 u64 bytes;
f7039b1d
LD
3303
3304 while (start < end) {
55507ce3
FM
3305 struct btrfs_trim_range trim_entry;
3306
3307 mutex_lock(&ctl->cache_writeout_mutex);
34d52cb6 3308 spin_lock(&ctl->tree_lock);
f7039b1d 3309
34d52cb6
LZ
3310 if (ctl->free_space < minlen) {
3311 spin_unlock(&ctl->tree_lock);
55507ce3 3312 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d
LD
3313 break;
3314 }
3315
34d52cb6 3316 entry = tree_search_offset(ctl, start, 0, 1);
7fe1e641 3317 if (!entry) {
34d52cb6 3318 spin_unlock(&ctl->tree_lock);
55507ce3 3319 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d
LD
3320 break;
3321 }
3322
7fe1e641
LZ
3323 /* skip bitmaps */
3324 while (entry->bitmap) {
3325 node = rb_next(&entry->offset_index);
3326 if (!node) {
34d52cb6 3327 spin_unlock(&ctl->tree_lock);
55507ce3 3328 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641 3329 goto out;
f7039b1d 3330 }
7fe1e641
LZ
3331 entry = rb_entry(node, struct btrfs_free_space,
3332 offset_index);
f7039b1d
LD
3333 }
3334
7fe1e641
LZ
3335 if (entry->offset >= end) {
3336 spin_unlock(&ctl->tree_lock);
55507ce3 3337 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641 3338 break;
f7039b1d
LD
3339 }
3340
7fe1e641
LZ
3341 extent_start = entry->offset;
3342 extent_bytes = entry->bytes;
b0643e59 3343 extent_trim_state = entry->trim_state;
7fe1e641
LZ
3344 start = max(start, extent_start);
3345 bytes = min(extent_start + extent_bytes, end) - start;
3346 if (bytes < minlen) {
3347 spin_unlock(&ctl->tree_lock);
55507ce3 3348 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641 3349 goto next;
f7039b1d
LD
3350 }
3351
7fe1e641
LZ
3352 unlink_free_space(ctl, entry);
3353 kmem_cache_free(btrfs_free_space_cachep, entry);
3354
34d52cb6 3355 spin_unlock(&ctl->tree_lock);
55507ce3
FM
3356 trim_entry.start = extent_start;
3357 trim_entry.bytes = extent_bytes;
3358 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3359 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d 3360
7fe1e641 3361 ret = do_trimming(block_group, total_trimmed, start, bytes,
b0643e59
DZ
3362 extent_start, extent_bytes, extent_trim_state,
3363 &trim_entry);
7fe1e641
LZ
3364 if (ret)
3365 break;
3366next:
3367 start += bytes;
f7039b1d 3368
7fe1e641
LZ
3369 if (fatal_signal_pending(current)) {
3370 ret = -ERESTARTSYS;
3371 break;
3372 }
3373
3374 cond_resched();
3375 }
3376out:
3377 return ret;
3378}
3379
da080fe1
DZ
3380/*
3381 * If we break out of trimming a bitmap prematurely, we should reset the
3382 * trimming bit. In a rather contrived case, it's possible to race here so
3383 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3384 *
3385 * start = start of bitmap
3386 * end = near end of bitmap
3387 *
3388 * Thread 1: Thread 2:
3389 * trim_bitmaps(start)
3390 * trim_bitmaps(end)
3391 * end_trimming_bitmap()
3392 * reset_trimming_bitmap()
3393 */
3394static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3395{
3396 struct btrfs_free_space *entry;
3397
3398 spin_lock(&ctl->tree_lock);
3399 entry = tree_search_offset(ctl, offset, 1, 0);
3400 if (entry)
3401 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3402 spin_unlock(&ctl->tree_lock);
3403}
3404
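/*
 * Mark a bitmap as fully trimmed, but only if it is still flagged as being
 * trimmed by us (see the race described above).
 */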
3405static void end_trimming_bitmap(struct btrfs_free_space *entry)
3406{
3407 if (btrfs_free_space_trimming_bitmap(entry))
3408 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3409}
3410
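/*
 * Discard the free ranges tracked in bitmap entries that overlap
 * [start, end), honouring minlen.  A bitmap is flagged
 * BTRFS_TRIM_STATE_TRIMMING only when the scan starts at the bitmap's first
 * byte, so only a fully scanned bitmap is later promoted to
 * BTRFS_TRIM_STATE_TRIMMED by end_trimming_bitmap().
 */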
32da5386 3411static int trim_bitmaps(struct btrfs_block_group *block_group,
3412 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3413{
3414 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3415 struct btrfs_free_space *entry;
3416 int ret = 0;
3417 int ret2;
3418 u64 bytes;
3419 u64 offset = offset_to_bitmap(ctl, start);
3420
3421 while (offset < end) {
3422 bool next_bitmap = false;
55507ce3 3423 struct btrfs_trim_range trim_entry;
7fe1e641 3424
55507ce3 3425 mutex_lock(&ctl->cache_writeout_mutex);
3426 spin_lock(&ctl->tree_lock);
3427
3428 if (ctl->free_space < minlen) {
3429 spin_unlock(&ctl->tree_lock);
55507ce3 3430 mutex_unlock(&ctl->cache_writeout_mutex);
3431 break;
3432 }
3433
3434 entry = tree_search_offset(ctl, offset, 1, 0);
da080fe1 3435 if (!entry || btrfs_free_space_trimmed(entry)) {
7fe1e641 3436 spin_unlock(&ctl->tree_lock);
55507ce3 3437 mutex_unlock(&ctl->cache_writeout_mutex);
3438 next_bitmap = true;
3439 goto next;
3440 }
3441
3442 /*
3443 * Async discard bitmap trimming begins by setting the start to
3444 * key.objectid, and offset_to_bitmap() aligns to the start of the
3445 * bitmap. This lets us know we are fully scanning the bitmap
3446 * rather than only some portion of it.
3447 */
3448 if (start == offset)
3449 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3450
7fe1e641 3451 bytes = minlen;
0584f718 3452 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
7fe1e641 3453 if (ret2 || start >= end) {
3454 /*
3455 * This keeps the invariant that all bytes are trimmed
3456 * if BTRFS_TRIM_STATE_TRIMMED is set on a bitmap.
3457 */
3458 if (ret2 && !minlen)
3459 end_trimming_bitmap(entry);
3460 else
3461 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3462 spin_unlock(&ctl->tree_lock);
55507ce3 3463 mutex_unlock(&ctl->cache_writeout_mutex);
3464 next_bitmap = true;
3465 goto next;
3466 }
3467
3468 bytes = min(bytes, end - start);
3469 if (bytes < minlen) {
da080fe1 3470 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3471 spin_unlock(&ctl->tree_lock);
55507ce3 3472 mutex_unlock(&ctl->cache_writeout_mutex);
3473 goto next;
3474 }
3475
3476 bitmap_clear_bits(ctl, entry, start, bytes);
3477 if (entry->bytes == 0)
3478 free_bitmap(ctl, entry);
3479
3480 spin_unlock(&ctl->tree_lock);
3481 trim_entry.start = start;
3482 trim_entry.bytes = bytes;
3483 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3484 mutex_unlock(&ctl->cache_writeout_mutex);
3485
3486 ret = do_trimming(block_group, total_trimmed, start, bytes,
b0643e59 3487 start, bytes, 0, &trim_entry);
3488 if (ret) {
3489 reset_trimming_bitmap(ctl, offset);
7fe1e641 3490 break;
da080fe1 3491 }
3492next:
3493 if (next_bitmap) {
3494 offset += BITS_PER_BITMAP * ctl->unit;
da080fe1 3495 start = offset;
3496 } else {
3497 start += bytes;
f7039b1d 3498 }
3499
3500 if (fatal_signal_pending(current)) {
3501 if (start != offset)
3502 reset_trimming_bitmap(ctl, offset);
3503 ret = -ERESTARTSYS;
3504 break;
3505 }
3506
3507 cond_resched();
3508 }
3509
3510 return ret;
3511}
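
/*
 * Worked example of the bitmap granularity used above, assuming 4 KiB pages
 * and a 4 KiB sectorsize (so ctl->unit == 4096):
 *
 *	BITS_PER_BITMAP             = PAGE_SIZE * 8 = 32768 bits
 *	BITS_PER_BITMAP * ctl->unit = 32768 * 4096  = 128 MiB per bitmap
 *
 * offset_to_bitmap() rounds an offset down to such a boundary, which is why
 * trim_bitmaps() advances "offset" in BITS_PER_BITMAP * ctl->unit steps and
 * why a trim range that ends mid-bitmap leaves a remainder that
 * btrfs_trim_block_group() below must handle by resetting the trim state.
 */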
581bb050 3512
32da5386 3513void btrfs_get_block_group_trimming(struct btrfs_block_group *cache)
7fe1e641 3514{
3515 atomic_inc(&cache->trimming);
3516}
7fe1e641 3517
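/*
 * Drop the trimming reference taken above.  If the block group was removed
 * while a trim was still running, the last reference is responsible for
 * deleting the chunk mapping and freeing the leftover free space entries.
 */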
32da5386 3518void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
e33e17ee 3519{
0b246afa 3520 struct btrfs_fs_info *fs_info = block_group->fs_info;
3521 struct extent_map_tree *em_tree;
3522 struct extent_map *em;
3523 bool cleanup;
7fe1e641 3524
04216820 3525 spin_lock(&block_group->lock);
3526 cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3527 block_group->removed);
3528 spin_unlock(&block_group->lock);
3529
e33e17ee 3530 if (cleanup) {
34441361 3531 mutex_lock(&fs_info->chunk_mutex);
c8bf1b67 3532 em_tree = &fs_info->mapping_tree;
04216820 3533 write_lock(&em_tree->lock);
b3470b5d 3534 em = lookup_extent_mapping(em_tree, block_group->start,
3535 1);
3536 BUG_ON(!em); /* logic error, can't happen */
3537 remove_extent_mapping(em_tree, em);
3538 write_unlock(&em_tree->lock);
34441361 3539 mutex_unlock(&fs_info->chunk_mutex);
3540
3541 /* once for us and once for the tree */
3542 free_extent_map(em);
3543 free_extent_map(em);
3544
3545 /*
3546 * We've left one free space entry and other tasks trimming
3547 * this block group have each left one entry. Free them.
3548 */
3549 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3550 }
3551}
3552
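/*
 * Discard the free space in [start, end) of a block group: first the plain
 * extent entries, then the bitmaps.  The trimming reference pins the block
 * group's mapping so that removal is deferred until we are done.
 */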
32da5386 3553int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3554 u64 *trimmed, u64 start, u64 end, u64 minlen)
3555{
da080fe1 3556 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
e33e17ee 3557 int ret;
da080fe1 3558 u64 rem = 0;
3559
3560 *trimmed = 0;
3561
3562 spin_lock(&block_group->lock);
3563 if (block_group->removed) {
04216820 3564 spin_unlock(&block_group->lock);
e33e17ee 3565 return 0;
04216820 3566 }
3567 btrfs_get_block_group_trimming(block_group);
3568 spin_unlock(&block_group->lock);
3569
3570 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3571 if (ret)
3572 goto out;
7fe1e641 3573
e33e17ee 3574 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3575 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3576 /* If we ended in the middle of a bitmap, reset the trimming flag */
3577 if (rem)
3578 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
3579out:
3580 btrfs_put_block_group_trimming(block_group);
3581 return ret;
3582}
3583
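
/*
 * Illustrative sketch (not part of this file) of how a FITRIM-style caller
 * might drive btrfs_trim_block_group() across a byte range.  The iteration
 * helpers and field names below are assumptions for the example only; the
 * real caller lives outside this file.
 *
 *	u64 trimmed = 0, group_trimmed;
 *	struct btrfs_block_group *bg;
 *	int ret = 0;
 *
 *	for (bg = btrfs_lookup_first_block_group(fs_info, range_start);
 *	     bg; bg = btrfs_next_block_group(bg)) {
 *		u64 start = max(range_start, bg->start);
 *		u64 end = min(range_end, bg->start + bg->length);
 *
 *		if (start >= range_end) {
 *			btrfs_put_block_group(bg);
 *			break;
 *		}
 *		if (start < end) {
 *			ret = btrfs_trim_block_group(bg, &group_trimmed,
 *						     start, end, minlen);
 *			trimmed += group_trimmed;
 *		}
 *		if (ret) {
 *			btrfs_put_block_group(bg);
 *			break;
 *		}
 *	}
 */
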
3584/*
3585 * Find the left-most item in the cache tree, and then return the
3586 * smallest inode number in the item.
3587 *
3588 * Note: the returned inode number may not be the smallest one in
3589 * the tree, if the left-most item is a bitmap.
3590 */
3591u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3592{
3593 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3594 struct btrfs_free_space *entry = NULL;
3595 u64 ino = 0;
3596
3597 spin_lock(&ctl->tree_lock);
3598
3599 if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3600 goto out;
3601
3602 entry = rb_entry(rb_first(&ctl->free_space_offset),
3603 struct btrfs_free_space, offset_index);
3604
3605 if (!entry->bitmap) {
3606 ino = entry->offset;
3607
3608 unlink_free_space(ctl, entry);
3609 entry->offset++;
3610 entry->bytes--;
3611 if (!entry->bytes)
3612 kmem_cache_free(btrfs_free_space_cachep, entry);
3613 else
3614 link_free_space(ctl, entry);
3615 } else {
3616 u64 offset = 0;
3617 u64 count = 1;
3618 int ret;
3619
0584f718 3620 ret = search_bitmap(ctl, entry, &offset, &count, true);
79787eaa 3621 /* Logic error; should be empty if it can't find anything */
b12d6869 3622 ASSERT(!ret);
3623
3624 ino = offset;
3625 bitmap_clear_bits(ctl, entry, offset, 1);
3626 if (entry->bytes == 0)
3627 free_bitmap(ctl, entry);
3628 }
3629out:
3630 spin_unlock(&ctl->tree_lock);
3631
3632 return ino;
3633}
3634
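/*
 * Return the per-root inode that backs the free inode number cache: grab a
 * reference to the cached copy if we have one, otherwise look it up on disk
 * and cache it, unless the filesystem is closing.
 */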
3635struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3636 struct btrfs_path *path)
3637{
3638 struct inode *inode = NULL;
3639
3640 spin_lock(&root->ino_cache_lock);
3641 if (root->ino_cache_inode)
3642 inode = igrab(root->ino_cache_inode);
3643 spin_unlock(&root->ino_cache_lock);
3644 if (inode)
3645 return inode;
3646
3647 inode = __lookup_free_space_inode(root, path, 0);
3648 if (IS_ERR(inode))
3649 return inode;
3650
57cdc8db 3651 spin_lock(&root->ino_cache_lock);
7841cb28 3652 if (!btrfs_fs_closing(root->fs_info))
3653 root->ino_cache_inode = igrab(inode);
3654 spin_unlock(&root->ino_cache_lock);
3655
3656 return inode;
3657}
3658
3659int create_free_ino_inode(struct btrfs_root *root,
3660 struct btrfs_trans_handle *trans,
3661 struct btrfs_path *path)
3662{
3663 return __create_free_space_inode(root, trans, path,
3664 BTRFS_FREE_INO_OBJECTID, 0);
3665}
3666
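/*
 * Load the free inode number cache for a root from its cache inode.  The
 * load is skipped if the INODE_MAP_CACHE option is not set, the filesystem
 * is closing, or the cached generation no longer matches the root's
 * generation.
 */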
3667int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3668{
3669 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3670 struct btrfs_path *path;
3671 struct inode *inode;
3672 int ret = 0;
3673 u64 root_gen = btrfs_root_generation(&root->root_item);
3674
0b246afa 3675 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3676 return 0;
3677
3678 /*
3679 * If we're unmounting then just return, since this does a search on the
3680 * normal root and not the commit root and we could deadlock.
3681 */
7841cb28 3682 if (btrfs_fs_closing(fs_info))
3683 return 0;
3684
3685 path = btrfs_alloc_path();
3686 if (!path)
3687 return 0;
3688
3689 inode = lookup_free_ino_inode(root, path);
3690 if (IS_ERR(inode))
3691 goto out;
3692
3693 if (root_gen != BTRFS_I(inode)->generation)
3694 goto out_put;
3695
3696 ret = __load_free_space_cache(root, inode, ctl, path, 0);
3697
3698 if (ret < 0)
3699 btrfs_err(fs_info,
3700 "failed to load free ino cache for root %llu",
3701 root->root_key.objectid);
3702out_put:
3703 iput(inode);
3704out:
3705 btrfs_free_path(path);
3706 return ret;
3707}
3708
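/*
 * Write the root's free inode number cache out through its cache inode.
 * Once writepages() has succeeded, the metadata reservation is released by
 * ordered io completion, so it is only released explicitly here when the
 * write fails early.
 */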
3709int btrfs_write_out_ino_cache(struct btrfs_root *root,
3710 struct btrfs_trans_handle *trans,
3711 struct btrfs_path *path,
3712 struct inode *inode)
82d5902d 3713{
0b246afa 3714 struct btrfs_fs_info *fs_info = root->fs_info;
82d5902d 3715 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
82d5902d 3716 int ret;
c9dc4c65 3717 struct btrfs_io_ctl io_ctl;
e43699d4 3718 bool release_metadata = true;
82d5902d 3719
0b246afa 3720 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3721 return 0;
3722
85db36cf 3723 memset(&io_ctl, 0, sizeof(io_ctl));
0e8d931a 3724 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
3725 if (!ret) {
3726 /*
3727 * At this point writepages() didn't error out, so our metadata
3728 * reservation is released when the writeback finishes, at
3729 * inode.c:btrfs_finish_ordered_io(), regardless of whether it
3730 * finishes with an error or not.
3731 */
3732 release_metadata = false;
afdb5718 3733 ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
e43699d4 3734 }
85db36cf 3735
c09544e0 3736 if (ret) {
e43699d4 3737 if (release_metadata)
691fa059 3738 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3739 inode->i_size, true);
c09544e0 3740#ifdef DEBUG
3741 btrfs_err(fs_info,
3742 "failed to write free ino cache for root %llu",
3743 root->root_key.objectid);
3744#endif
3745 }
82d5902d 3746
3747 return ret;
3748}
3749
3750#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3751/*
3752 * Use this if you need to make a bitmap or extent entry specifically. It
3753 * doesn't do any of the merging that add_free_space does; this acts a lot
3754 * like how the free space cache loading works, so you can get really
3755 * weird configurations.
3756 */
32da5386 3757int test_add_free_space_entry(struct btrfs_block_group *cache,
dc11dd5d 3758 u64 offset, u64 bytes, bool bitmap)
74255aa0 3759{
3760 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3761 struct btrfs_free_space *info = NULL, *bitmap_info;
3762 void *map = NULL;
da080fe1 3763 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
3764 u64 bytes_added;
3765 int ret;
74255aa0 3766
3767again:
3768 if (!info) {
3769 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3770 if (!info)
3771 return -ENOMEM;
3772 }
3773
3774 if (!bitmap) {
3775 spin_lock(&ctl->tree_lock);
3776 info->offset = offset;
3777 info->bytes = bytes;
cef40483 3778 info->max_extent_size = 0;
3779 ret = link_free_space(ctl, info);
3780 spin_unlock(&ctl->tree_lock);
3781 if (ret)
3782 kmem_cache_free(btrfs_free_space_cachep, info);
3783 return ret;
3784 }
3785
3786 if (!map) {
3acd4850 3787 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
3788 if (!map) {
3789 kmem_cache_free(btrfs_free_space_cachep, info);
3790 return -ENOMEM;
3791 }
3792 }
3793
3794 spin_lock(&ctl->tree_lock);
3795 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3796 1, 0);
3797 if (!bitmap_info) {
3798 info->bitmap = map;
3799 map = NULL;
3800 add_new_bitmap(ctl, info, offset);
3801 bitmap_info = info;
20005523 3802 info = NULL;
dc11dd5d 3803 }
74255aa0 3804
3805 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
3806 trim_state);
cef40483 3807
3808 bytes -= bytes_added;
3809 offset += bytes_added;
3810 spin_unlock(&ctl->tree_lock);
74255aa0 3811
3812 if (bytes)
3813 goto again;
74255aa0 3814
3815 if (info)
3816 kmem_cache_free(btrfs_free_space_cachep, info);
3817 if (map)
3818 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
dc11dd5d 3819 return 0;
3820}
3821
3822/*
3823 * Checks to see if the given range is in the free space cache. This is really
3824 * just used to check the absence of space, so if there is free space in the
3825 * range at all we will return 1.
3826 */
32da5386 3827int test_check_exists(struct btrfs_block_group *cache,
dc11dd5d 3828 u64 offset, u64 bytes)
3829{
3830 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3831 struct btrfs_free_space *info;
3832 int ret = 0;
3833
3834 spin_lock(&ctl->tree_lock);
3835 info = tree_search_offset(ctl, offset, 0, 0);
3836 if (!info) {
3837 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3838 1, 0);
3839 if (!info)
3840 goto out;
3841 }
3842
3843have_info:
3844 if (info->bitmap) {
3845 u64 bit_off, bit_bytes;
3846 struct rb_node *n;
3847 struct btrfs_free_space *tmp;
3848
3849 bit_off = offset;
3850 bit_bytes = ctl->unit;
0584f718 3851 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3852 if (!ret) {
3853 if (bit_off == offset) {
3854 ret = 1;
3855 goto out;
3856 } else if (bit_off > offset &&
3857 offset + bytes > bit_off) {
3858 ret = 1;
3859 goto out;
3860 }
3861 }
3862
3863 n = rb_prev(&info->offset_index);
3864 while (n) {
3865 tmp = rb_entry(n, struct btrfs_free_space,
3866 offset_index);
3867 if (tmp->offset + tmp->bytes < offset)
3868 break;
3869 if (offset + bytes < tmp->offset) {
5473e0c4 3870 n = rb_prev(&tmp->offset_index);
3871 continue;
3872 }
3873 info = tmp;
3874 goto have_info;
3875 }
3876
3877 n = rb_next(&info->offset_index);
3878 while (n) {
3879 tmp = rb_entry(n, struct btrfs_free_space,
3880 offset_index);
3881 if (offset + bytes < tmp->offset)
3882 break;
3883 if (tmp->offset + tmp->bytes < offset) {
5473e0c4 3884 n = rb_next(&tmp->offset_index);
3885 continue;
3886 }
3887 info = tmp;
3888 goto have_info;
3889 }
3890
20005523 3891 ret = 0;
3892 goto out;
3893 }
3894
3895 if (info->offset == offset) {
3896 ret = 1;
3897 goto out;
3898 }
3899
3900 if (offset > info->offset && offset < info->offset + info->bytes)
3901 ret = 1;
3902out:
3903 spin_unlock(&ctl->tree_lock);
3904 return ret;
3905}
dc11dd5d 3906#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */