1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Red Hat.  All rights reserved.
4  */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "ctree.h"
15 #include "free-space-cache.h"
16 #include "transaction.h"
17 #include "disk-io.h"
18 #include "extent_io.h"
19 #include "inode-map.h"
20 #include "volumes.h"
21 #include "space-info.h"
22 #include "delalloc-space.h"
23 #include "block-group.h"
24
25 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
26 #define MAX_CACHE_BYTES_PER_GIG SZ_32K
27
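/*
 * Rough geometry: each bitmap occupies one page, so it covers
 * BITS_PER_BITMAP units of free space.  Assuming 4KiB pages and a 4KiB
 * unit (the usual sector size), that is 32768 bits * 4KiB = 128MiB of
 * space per bitmap.  MAX_CACHE_BYTES_PER_GIG roughly caps how much
 * bitmap memory we are willing to spend per 1GiB of block group space.
 */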
28 struct btrfs_trim_range {
29         u64 start;
30         u64 bytes;
31         struct list_head list;
32 };
33
34 static int link_free_space(struct btrfs_free_space_ctl *ctl,
35                            struct btrfs_free_space *info);
36 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
37                               struct btrfs_free_space *info);
38 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
39                              struct btrfs_trans_handle *trans,
40                              struct btrfs_io_ctl *io_ctl,
41                              struct btrfs_path *path);
42
43 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
44                                                struct btrfs_path *path,
45                                                u64 offset)
46 {
47         struct btrfs_fs_info *fs_info = root->fs_info;
48         struct btrfs_key key;
49         struct btrfs_key location;
50         struct btrfs_disk_key disk_key;
51         struct btrfs_free_space_header *header;
52         struct extent_buffer *leaf;
53         struct inode *inode = NULL;
54         unsigned nofs_flag;
55         int ret;
56
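        /*
         * The free space cache header lives in the root tree, keyed by
         * (BTRFS_FREE_SPACE_OBJECTID, 0, block group start), so the
         * 'offset' argument is the start offset of the block group.
         */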
57         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
58         key.offset = offset;
59         key.type = 0;
60
61         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
62         if (ret < 0)
63                 return ERR_PTR(ret);
64         if (ret > 0) {
65                 btrfs_release_path(path);
66                 return ERR_PTR(-ENOENT);
67         }
68
69         leaf = path->nodes[0];
70         header = btrfs_item_ptr(leaf, path->slots[0],
71                                 struct btrfs_free_space_header);
72         btrfs_free_space_key(leaf, header, &disk_key);
73         btrfs_disk_key_to_cpu(&location, &disk_key);
74         btrfs_release_path(path);
75
76         /*
77          * We are often under a trans handle at this point, so we need to make
78          * sure NOFS is set to keep us from deadlocking.
79          */
80         nofs_flag = memalloc_nofs_save();
81         inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
82         btrfs_release_path(path);
83         memalloc_nofs_restore(nofs_flag);
84         if (IS_ERR(inode))
85                 return inode;
86
87         mapping_set_gfp_mask(inode->i_mapping,
88                         mapping_gfp_constraint(inode->i_mapping,
89                         ~(__GFP_FS | __GFP_HIGHMEM)));
90
91         return inode;
92 }
93
94 struct inode *lookup_free_space_inode(
95                 struct btrfs_block_group_cache *block_group,
96                 struct btrfs_path *path)
97 {
98         struct btrfs_fs_info *fs_info = block_group->fs_info;
99         struct inode *inode = NULL;
100         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
101
102         spin_lock(&block_group->lock);
103         if (block_group->inode)
104                 inode = igrab(block_group->inode);
105         spin_unlock(&block_group->lock);
106         if (inode)
107                 return inode;
108
109         inode = __lookup_free_space_inode(fs_info->tree_root, path,
110                                           block_group->key.objectid);
111         if (IS_ERR(inode))
112                 return inode;
113
114         spin_lock(&block_group->lock);
115         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
116                 btrfs_info(fs_info, "Old style space inode found, converting.");
117                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
118                         BTRFS_INODE_NODATACOW;
119                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
120         }
121
122         if (!block_group->iref) {
123                 block_group->inode = igrab(inode);
124                 block_group->iref = 1;
125         }
126         spin_unlock(&block_group->lock);
127
128         return inode;
129 }
130
131 static int __create_free_space_inode(struct btrfs_root *root,
132                                      struct btrfs_trans_handle *trans,
133                                      struct btrfs_path *path,
134                                      u64 ino, u64 offset)
135 {
136         struct btrfs_key key;
137         struct btrfs_disk_key disk_key;
138         struct btrfs_free_space_header *header;
139         struct btrfs_inode_item *inode_item;
140         struct extent_buffer *leaf;
141         u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
142         int ret;
143
144         ret = btrfs_insert_empty_inode(trans, root, path, ino);
145         if (ret)
146                 return ret;
147
148         /* We inline crc's for the free disk space cache */
149         if (ino != BTRFS_FREE_INO_OBJECTID)
150                 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
151
152         leaf = path->nodes[0];
153         inode_item = btrfs_item_ptr(leaf, path->slots[0],
154                                     struct btrfs_inode_item);
155         btrfs_item_key(leaf, &disk_key, path->slots[0]);
156         memzero_extent_buffer(leaf, (unsigned long)inode_item,
157                              sizeof(*inode_item));
158         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
159         btrfs_set_inode_size(leaf, inode_item, 0);
160         btrfs_set_inode_nbytes(leaf, inode_item, 0);
161         btrfs_set_inode_uid(leaf, inode_item, 0);
162         btrfs_set_inode_gid(leaf, inode_item, 0);
163         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
164         btrfs_set_inode_flags(leaf, inode_item, flags);
165         btrfs_set_inode_nlink(leaf, inode_item, 1);
166         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
167         btrfs_set_inode_block_group(leaf, inode_item, offset);
168         btrfs_mark_buffer_dirty(leaf);
169         btrfs_release_path(path);
170
171         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
172         key.offset = offset;
173         key.type = 0;
174         ret = btrfs_insert_empty_item(trans, root, path, &key,
175                                       sizeof(struct btrfs_free_space_header));
176         if (ret < 0) {
177                 btrfs_release_path(path);
178                 return ret;
179         }
180
181         leaf = path->nodes[0];
182         header = btrfs_item_ptr(leaf, path->slots[0],
183                                 struct btrfs_free_space_header);
184         memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
185         btrfs_set_free_space_key(leaf, header, &disk_key);
186         btrfs_mark_buffer_dirty(leaf);
187         btrfs_release_path(path);
188
189         return 0;
190 }
191
192 int create_free_space_inode(struct btrfs_trans_handle *trans,
193                             struct btrfs_block_group_cache *block_group,
194                             struct btrfs_path *path)
195 {
196         int ret;
197         u64 ino;
198
199         ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
200         if (ret < 0)
201                 return ret;
202
203         return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
204                                          ino, block_group->key.objectid);
205 }
206
207 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
208                                        struct btrfs_block_rsv *rsv)
209 {
210         u64 needed_bytes;
211         int ret;
212
213         /* 1 for slack space, 1 for updating the inode */
214         needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
215                 btrfs_calc_metadata_size(fs_info, 1);
216
217         spin_lock(&rsv->lock);
218         if (rsv->reserved < needed_bytes)
219                 ret = -ENOSPC;
220         else
221                 ret = 0;
222         spin_unlock(&rsv->lock);
223         return ret;
224 }
225
226 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
227                                     struct btrfs_block_group_cache *block_group,
228                                     struct inode *inode)
229 {
230         struct btrfs_root *root = BTRFS_I(inode)->root;
231         int ret = 0;
232         bool locked = false;
233
234         if (block_group) {
235                 struct btrfs_path *path = btrfs_alloc_path();
236
237                 if (!path) {
238                         ret = -ENOMEM;
239                         goto fail;
240                 }
241                 locked = true;
242                 mutex_lock(&trans->transaction->cache_write_mutex);
243                 if (!list_empty(&block_group->io_list)) {
244                         list_del_init(&block_group->io_list);
245
246                         btrfs_wait_cache_io(trans, block_group, path);
247                         btrfs_put_block_group(block_group);
248                 }
249
250                 /*
 251                  * now that we've truncated the cache away, it's no longer
 252                  * set up or written
253                  */
254                 spin_lock(&block_group->lock);
255                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
256                 spin_unlock(&block_group->lock);
257                 btrfs_free_path(path);
258         }
259
260         btrfs_i_size_write(BTRFS_I(inode), 0);
261         truncate_pagecache(inode, 0);
262
263         /*
264          * We skip the throttling logic for free space cache inodes, so we don't
265          * need to check for -EAGAIN.
266          */
267         ret = btrfs_truncate_inode_items(trans, root, inode,
268                                          0, BTRFS_EXTENT_DATA_KEY);
269         if (ret)
270                 goto fail;
271
272         ret = btrfs_update_inode(trans, root, inode);
273
274 fail:
275         if (locked)
276                 mutex_unlock(&trans->transaction->cache_write_mutex);
277         if (ret)
278                 btrfs_abort_transaction(trans, ret);
279
280         return ret;
281 }
282
283 static void readahead_cache(struct inode *inode)
284 {
285         struct file_ra_state *ra;
286         unsigned long last_index;
287
288         ra = kzalloc(sizeof(*ra), GFP_NOFS);
289         if (!ra)
290                 return;
291
292         file_ra_state_init(ra, inode->i_mapping);
293         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
294
295         page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
296
297         kfree(ra);
298 }
299
300 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
301                        int write)
302 {
303         int num_pages;
304         int check_crcs = 0;
305
306         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
307
308         if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
309                 check_crcs = 1;
310
311         /* Make sure we can fit our crcs and generation into the first page */
312         if (write && check_crcs &&
313             (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
314                 return -ENOSPC;
315
316         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
317
318         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
319         if (!io_ctl->pages)
320                 return -ENOMEM;
321
322         io_ctl->num_pages = num_pages;
323         io_ctl->fs_info = btrfs_sb(inode->i_sb);
324         io_ctl->check_crcs = check_crcs;
325         io_ctl->inode = inode;
326
327         return 0;
328 }
329 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
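/*
 * When crcs are inlined (every cache inode except the free ino cache), the
 * first page starts with an array of one u32 crc per page, followed by the
 * u64 generation, and only then the free space entries.  That is why
 * io_ctl_init() above rejects a write if num_pages * sizeof(u32) +
 * sizeof(u64) does not fit in a single page.
 */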
330
331 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
332 {
333         kfree(io_ctl->pages);
334         io_ctl->pages = NULL;
335 }
336
337 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
338 {
339         if (io_ctl->cur) {
340                 io_ctl->cur = NULL;
341                 io_ctl->orig = NULL;
342         }
343 }
344
345 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
346 {
347         ASSERT(io_ctl->index < io_ctl->num_pages);
348         io_ctl->page = io_ctl->pages[io_ctl->index++];
349         io_ctl->cur = page_address(io_ctl->page);
350         io_ctl->orig = io_ctl->cur;
351         io_ctl->size = PAGE_SIZE;
352         if (clear)
353                 clear_page(io_ctl->cur);
354 }
355
356 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
357 {
358         int i;
359
360         io_ctl_unmap_page(io_ctl);
361
362         for (i = 0; i < io_ctl->num_pages; i++) {
363                 if (io_ctl->pages[i]) {
364                         ClearPageChecked(io_ctl->pages[i]);
365                         unlock_page(io_ctl->pages[i]);
366                         put_page(io_ctl->pages[i]);
367                 }
368         }
369 }
370
371 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
372                                 int uptodate)
373 {
374         struct page *page;
375         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
376         int i;
377
378         for (i = 0; i < io_ctl->num_pages; i++) {
379                 page = find_or_create_page(inode->i_mapping, i, mask);
380                 if (!page) {
381                         io_ctl_drop_pages(io_ctl);
382                         return -ENOMEM;
383                 }
384                 io_ctl->pages[i] = page;
385                 if (uptodate && !PageUptodate(page)) {
386                         btrfs_readpage(NULL, page);
387                         lock_page(page);
388                         if (!PageUptodate(page)) {
389                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
390                                            "error reading free space cache");
391                                 io_ctl_drop_pages(io_ctl);
392                                 return -EIO;
393                         }
394                 }
395         }
396
397         for (i = 0; i < io_ctl->num_pages; i++) {
398                 clear_page_dirty_for_io(io_ctl->pages[i]);
399                 set_page_extent_mapped(io_ctl->pages[i]);
400         }
401
402         return 0;
403 }
404
405 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
406 {
407         __le64 *val;
408
409         io_ctl_map_page(io_ctl, 1);
410
411         /*
412          * Skip the csum areas.  If we don't check crcs then we just have a
413          * 64bit chunk at the front of the first page.
414          */
415         if (io_ctl->check_crcs) {
416                 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
417                 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
418         } else {
419                 io_ctl->cur += sizeof(u64);
420                 io_ctl->size -= sizeof(u64) * 2;
421         }
422
423         val = io_ctl->cur;
424         *val = cpu_to_le64(generation);
425         io_ctl->cur += sizeof(u64);
426 }
427
428 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
429 {
430         __le64 *gen;
431
432         /*
433          * Skip the crc area.  If we don't check crcs then we just have a 64bit
434          * chunk at the front of the first page.
435          */
436         if (io_ctl->check_crcs) {
437                 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
438                 io_ctl->size -= sizeof(u64) +
439                         (sizeof(u32) * io_ctl->num_pages);
440         } else {
441                 io_ctl->cur += sizeof(u64);
442                 io_ctl->size -= sizeof(u64) * 2;
443         }
444
445         gen = io_ctl->cur;
446         if (le64_to_cpu(*gen) != generation) {
447                 btrfs_err_rl(io_ctl->fs_info,
448                         "space cache generation (%llu) does not match inode (%llu)",
449                                 *gen, generation);
450                 io_ctl_unmap_page(io_ctl);
451                 return -EIO;
452         }
453         io_ctl->cur += sizeof(u64);
454         return 0;
455 }
456
457 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
458 {
459         u32 *tmp;
460         u32 crc = ~(u32)0;
461         unsigned offset = 0;
462
463         if (!io_ctl->check_crcs) {
464                 io_ctl_unmap_page(io_ctl);
465                 return;
466         }
467
468         if (index == 0)
469                 offset = sizeof(u32) * io_ctl->num_pages;
470
471         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
472         btrfs_crc32c_final(crc, (u8 *)&crc);
473         io_ctl_unmap_page(io_ctl);
474         tmp = page_address(io_ctl->pages[0]);
475         tmp += index;
476         *tmp = crc;
477 }
478
479 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
480 {
481         u32 *tmp, val;
482         u32 crc = ~(u32)0;
483         unsigned offset = 0;
484
485         if (!io_ctl->check_crcs) {
486                 io_ctl_map_page(io_ctl, 0);
487                 return 0;
488         }
489
490         if (index == 0)
491                 offset = sizeof(u32) * io_ctl->num_pages;
492
493         tmp = page_address(io_ctl->pages[0]);
494         tmp += index;
495         val = *tmp;
496
497         io_ctl_map_page(io_ctl, 0);
498         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
499         btrfs_crc32c_final(crc, (u8 *)&crc);
500         if (val != crc) {
501                 btrfs_err_rl(io_ctl->fs_info,
502                         "csum mismatch on free space cache");
503                 io_ctl_unmap_page(io_ctl);
504                 return -EIO;
505         }
506
507         return 0;
508 }
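/*
 * Note the asymmetry above: the crc of page 0 only covers the part after
 * the crc array itself (offset = num_pages * sizeof(u32)), while the crcs
 * of all other pages cover the full page.
 */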
509
510 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
511                             void *bitmap)
512 {
513         struct btrfs_free_space_entry *entry;
514
515         if (!io_ctl->cur)
516                 return -ENOSPC;
517
518         entry = io_ctl->cur;
519         entry->offset = cpu_to_le64(offset);
520         entry->bytes = cpu_to_le64(bytes);
521         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
522                 BTRFS_FREE_SPACE_EXTENT;
523         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
524         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
525
526         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
527                 return 0;
528
529         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
530
531         /* No more pages to map */
532         if (io_ctl->index >= io_ctl->num_pages)
533                 return 0;
534
535         /* map the next page */
536         io_ctl_map_page(io_ctl, 1);
537         return 0;
538 }
539
540 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
541 {
542         if (!io_ctl->cur)
543                 return -ENOSPC;
544
545         /*
546          * If we aren't at the start of the current page, unmap this one and
547          * map the next one if there is any left.
548          */
549         if (io_ctl->cur != io_ctl->orig) {
550                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
551                 if (io_ctl->index >= io_ctl->num_pages)
552                         return -ENOSPC;
553                 io_ctl_map_page(io_ctl, 0);
554         }
555
556         copy_page(io_ctl->cur, bitmap);
557         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
558         if (io_ctl->index < io_ctl->num_pages)
559                 io_ctl_map_page(io_ctl, 0);
560         return 0;
561 }
562
563 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
564 {
565         /*
566          * If we're not on the boundary we know we've modified the page and we
567          * need to crc the page.
568          */
569         if (io_ctl->cur != io_ctl->orig)
570                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
571         else
572                 io_ctl_unmap_page(io_ctl);
573
574         while (io_ctl->index < io_ctl->num_pages) {
575                 io_ctl_map_page(io_ctl, 1);
576                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
577         }
578 }
579
580 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
581                             struct btrfs_free_space *entry, u8 *type)
582 {
583         struct btrfs_free_space_entry *e;
584         int ret;
585
586         if (!io_ctl->cur) {
587                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
588                 if (ret)
589                         return ret;
590         }
591
592         e = io_ctl->cur;
593         entry->offset = le64_to_cpu(e->offset);
594         entry->bytes = le64_to_cpu(e->bytes);
595         *type = e->type;
596         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
597         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
598
599         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
600                 return 0;
601
602         io_ctl_unmap_page(io_ctl);
603
604         return 0;
605 }
606
607 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
608                               struct btrfs_free_space *entry)
609 {
610         int ret;
611
612         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
613         if (ret)
614                 return ret;
615
616         copy_page(entry->bitmap, io_ctl->cur);
617         io_ctl_unmap_page(io_ctl);
618
619         return 0;
620 }
621
622 /*
623  * Since we attach pinned extents after the fact we can have contiguous sections
624  * of free space that are split up in entries.  This poses a problem with the
 625  * tree logging stuff since it could have allocated across what appears to be
 626  * 2 entries, because we would have merged the entries when adding the pinned
 627  * extents back to the free space cache.  So run through the space cache we just
628  * loaded and merge contiguous entries.  This will make the log replay stuff not
629  * blow up and it will make for nicer allocator behavior.
630  */
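/*
 * For example, two adjacent extent entries such as [0, 512KiB) and
 * [512KiB, 1MiB) read from disk are relinked below as a single
 * [0, 1MiB) entry; bitmap entries are left untouched.
 */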
631 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
632 {
633         struct btrfs_free_space *e, *prev = NULL;
634         struct rb_node *n;
635
636 again:
637         spin_lock(&ctl->tree_lock);
638         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
639                 e = rb_entry(n, struct btrfs_free_space, offset_index);
640                 if (!prev)
641                         goto next;
642                 if (e->bitmap || prev->bitmap)
643                         goto next;
644                 if (prev->offset + prev->bytes == e->offset) {
645                         unlink_free_space(ctl, prev);
646                         unlink_free_space(ctl, e);
647                         prev->bytes += e->bytes;
648                         kmem_cache_free(btrfs_free_space_cachep, e);
649                         link_free_space(ctl, prev);
650                         prev = NULL;
651                         spin_unlock(&ctl->tree_lock);
652                         goto again;
653                 }
654 next:
655                 prev = e;
656         }
657         spin_unlock(&ctl->tree_lock);
658 }
659
660 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
661                                    struct btrfs_free_space_ctl *ctl,
662                                    struct btrfs_path *path, u64 offset)
663 {
664         struct btrfs_fs_info *fs_info = root->fs_info;
665         struct btrfs_free_space_header *header;
666         struct extent_buffer *leaf;
667         struct btrfs_io_ctl io_ctl;
668         struct btrfs_key key;
669         struct btrfs_free_space *e, *n;
670         LIST_HEAD(bitmaps);
671         u64 num_entries;
672         u64 num_bitmaps;
673         u64 generation;
674         u8 type;
675         int ret = 0;
676
677         /* Nothing in the space cache, goodbye */
678         if (!i_size_read(inode))
679                 return 0;
680
681         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
682         key.offset = offset;
683         key.type = 0;
684
685         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
686         if (ret < 0)
687                 return 0;
688         else if (ret > 0) {
689                 btrfs_release_path(path);
690                 return 0;
691         }
692
693         ret = -1;
694
695         leaf = path->nodes[0];
696         header = btrfs_item_ptr(leaf, path->slots[0],
697                                 struct btrfs_free_space_header);
698         num_entries = btrfs_free_space_entries(leaf, header);
699         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
700         generation = btrfs_free_space_generation(leaf, header);
701         btrfs_release_path(path);
702
703         if (!BTRFS_I(inode)->generation) {
704                 btrfs_info(fs_info,
705                            "the free space cache file (%llu) is invalid, skip it",
706                            offset);
707                 return 0;
708         }
709
710         if (BTRFS_I(inode)->generation != generation) {
711                 btrfs_err(fs_info,
712                           "free space inode generation (%llu) did not match free space cache generation (%llu)",
713                           BTRFS_I(inode)->generation, generation);
714                 return 0;
715         }
716
717         if (!num_entries)
718                 return 0;
719
720         ret = io_ctl_init(&io_ctl, inode, 0);
721         if (ret)
722                 return ret;
723
724         readahead_cache(inode);
725
726         ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
727         if (ret)
728                 goto out;
729
730         ret = io_ctl_check_crc(&io_ctl, 0);
731         if (ret)
732                 goto free_cache;
733
734         ret = io_ctl_check_generation(&io_ctl, generation);
735         if (ret)
736                 goto free_cache;
737
738         while (num_entries) {
739                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
740                                       GFP_NOFS);
741                 if (!e)
742                         goto free_cache;
743
744                 ret = io_ctl_read_entry(&io_ctl, e, &type);
745                 if (ret) {
746                         kmem_cache_free(btrfs_free_space_cachep, e);
747                         goto free_cache;
748                 }
749
750                 if (!e->bytes) {
751                         kmem_cache_free(btrfs_free_space_cachep, e);
752                         goto free_cache;
753                 }
754
755                 if (type == BTRFS_FREE_SPACE_EXTENT) {
756                         spin_lock(&ctl->tree_lock);
757                         ret = link_free_space(ctl, e);
758                         spin_unlock(&ctl->tree_lock);
759                         if (ret) {
760                                 btrfs_err(fs_info,
761                                         "Duplicate entries in free space cache, dumping");
762                                 kmem_cache_free(btrfs_free_space_cachep, e);
763                                 goto free_cache;
764                         }
765                 } else {
766                         ASSERT(num_bitmaps);
767                         num_bitmaps--;
768                         e->bitmap = kmem_cache_zalloc(
769                                         btrfs_free_space_bitmap_cachep, GFP_NOFS);
770                         if (!e->bitmap) {
771                                 kmem_cache_free(
772                                         btrfs_free_space_cachep, e);
773                                 goto free_cache;
774                         }
775                         spin_lock(&ctl->tree_lock);
776                         ret = link_free_space(ctl, e);
777                         ctl->total_bitmaps++;
778                         ctl->op->recalc_thresholds(ctl);
779                         spin_unlock(&ctl->tree_lock);
780                         if (ret) {
781                                 btrfs_err(fs_info,
782                                         "Duplicate entries in free space cache, dumping");
783                                 kmem_cache_free(btrfs_free_space_cachep, e);
784                                 goto free_cache;
785                         }
786                         list_add_tail(&e->list, &bitmaps);
787                 }
788
789                 num_entries--;
790         }
791
792         io_ctl_unmap_page(&io_ctl);
793
794         /*
 795          * The bitmap pages are stored after all the entries, in the same
 796          * order that their bitmap entries were added to the cache above.
797          */
798         list_for_each_entry_safe(e, n, &bitmaps, list) {
799                 list_del_init(&e->list);
800                 ret = io_ctl_read_bitmap(&io_ctl, e);
801                 if (ret)
802                         goto free_cache;
803         }
804
805         io_ctl_drop_pages(&io_ctl);
806         merge_space_tree(ctl);
807         ret = 1;
808 out:
809         io_ctl_free(&io_ctl);
810         return ret;
811 free_cache:
812         io_ctl_drop_pages(&io_ctl);
813         __btrfs_remove_free_space_cache(ctl);
814         goto out;
815 }
816
817 int load_free_space_cache(struct btrfs_block_group_cache *block_group)
818 {
819         struct btrfs_fs_info *fs_info = block_group->fs_info;
820         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
821         struct inode *inode;
822         struct btrfs_path *path;
823         int ret = 0;
824         bool matched;
825         u64 used = btrfs_block_group_used(&block_group->item);
826
827         /*
828          * If this block group has been marked to be cleared for one reason or
829          * another then we can't trust the on disk cache, so just return.
830          */
831         spin_lock(&block_group->lock);
832         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
833                 spin_unlock(&block_group->lock);
834                 return 0;
835         }
836         spin_unlock(&block_group->lock);
837
838         path = btrfs_alloc_path();
839         if (!path)
840                 return 0;
841         path->search_commit_root = 1;
842         path->skip_locking = 1;
843
844         /*
845          * We must pass a path with search_commit_root set to btrfs_iget in
846          * order to avoid a deadlock when allocating extents for the tree root.
847          *
848          * When we are COWing an extent buffer from the tree root, when looking
849          * for a free extent, at extent-tree.c:find_free_extent(), we can find
 850          * for a free extent, at extent-tree.c:find_free_extent(), we can find a
 851          * block group without its free space cache loaded. When we find one
 852          * we must load its space cache, which requires reading its free space
 853          * cache's inode item from the root tree. If this inode item is located
 854          * in the same leaf that we started COWing before, then we end up in a
855          * previously write locked it).
856          *
857          * It's safe to read the inode item using the commit root because
858          * block groups, once loaded, stay in memory forever (until they are
859          * removed) as well as their space caches once loaded. New block groups
860          * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
861          * we will never try to read their inode item while the fs is mounted.
862          */
863         inode = lookup_free_space_inode(block_group, path);
864         if (IS_ERR(inode)) {
865                 btrfs_free_path(path);
866                 return 0;
867         }
868
869         /* We may have converted the inode and made the cache invalid. */
870         spin_lock(&block_group->lock);
871         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
872                 spin_unlock(&block_group->lock);
873                 btrfs_free_path(path);
874                 goto out;
875         }
876         spin_unlock(&block_group->lock);
877
878         ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
879                                       path, block_group->key.objectid);
880         btrfs_free_path(path);
881         if (ret <= 0)
882                 goto out;
883
884         spin_lock(&ctl->tree_lock);
885         matched = (ctl->free_space == (block_group->key.offset - used -
886                                        block_group->bytes_super));
887         spin_unlock(&ctl->tree_lock);
888
889         if (!matched) {
890                 __btrfs_remove_free_space_cache(ctl);
891                 btrfs_warn(fs_info,
892                            "block group %llu has wrong amount of free space",
893                            block_group->key.objectid);
894                 ret = -1;
895         }
896 out:
897         if (ret < 0) {
898                 /* This cache is bogus, make sure it gets cleared */
899                 spin_lock(&block_group->lock);
900                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
901                 spin_unlock(&block_group->lock);
902                 ret = 0;
903
904                 btrfs_warn(fs_info,
905                            "failed to load free space cache for block group %llu, rebuilding it now",
906                            block_group->key.objectid);
907         }
908
909         iput(inode);
910         return ret;
911 }
912
913 static noinline_for_stack
914 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
915                               struct btrfs_free_space_ctl *ctl,
916                               struct btrfs_block_group_cache *block_group,
917                               int *entries, int *bitmaps,
918                               struct list_head *bitmap_list)
919 {
920         int ret;
921         struct btrfs_free_cluster *cluster = NULL;
922         struct btrfs_free_cluster *cluster_locked = NULL;
923         struct rb_node *node = rb_first(&ctl->free_space_offset);
924         struct btrfs_trim_range *trim_entry;
925
926         /* Get the cluster for this block_group if it exists */
927         if (block_group && !list_empty(&block_group->cluster_list)) {
928                 cluster = list_entry(block_group->cluster_list.next,
929                                      struct btrfs_free_cluster,
930                                      block_group_list);
931         }
932
933         if (!node && cluster) {
934                 cluster_locked = cluster;
935                 spin_lock(&cluster_locked->lock);
936                 node = rb_first(&cluster->root);
937                 cluster = NULL;
938         }
939
940         /* Write out the extent entries */
941         while (node) {
942                 struct btrfs_free_space *e;
943
944                 e = rb_entry(node, struct btrfs_free_space, offset_index);
945                 *entries += 1;
946
947                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
948                                        e->bitmap);
949                 if (ret)
950                         goto fail;
951
952                 if (e->bitmap) {
953                         list_add_tail(&e->list, bitmap_list);
954                         *bitmaps += 1;
955                 }
956                 node = rb_next(node);
957                 if (!node && cluster) {
958                         node = rb_first(&cluster->root);
959                         cluster_locked = cluster;
960                         spin_lock(&cluster_locked->lock);
961                         cluster = NULL;
962                 }
963         }
964         if (cluster_locked) {
965                 spin_unlock(&cluster_locked->lock);
966                 cluster_locked = NULL;
967         }
968
969         /*
970          * Make sure we don't miss any range that was removed from our rbtree
971          * because trimming is running. Otherwise after a umount+mount (or crash
972          * after committing the transaction) we would leak free space and get
973          * an inconsistent free space cache report from fsck.
974          */
975         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
976                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
977                                        trim_entry->bytes, NULL);
978                 if (ret)
979                         goto fail;
980                 *entries += 1;
981         }
982
983         return 0;
984 fail:
985         if (cluster_locked)
986                 spin_unlock(&cluster_locked->lock);
987         return -ENOSPC;
988 }
989
990 static noinline_for_stack int
991 update_cache_item(struct btrfs_trans_handle *trans,
992                   struct btrfs_root *root,
993                   struct inode *inode,
994                   struct btrfs_path *path, u64 offset,
995                   int entries, int bitmaps)
996 {
997         struct btrfs_key key;
998         struct btrfs_free_space_header *header;
999         struct extent_buffer *leaf;
1000         int ret;
1001
1002         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1003         key.offset = offset;
1004         key.type = 0;
1005
1006         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1007         if (ret < 0) {
1008                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1009                                  EXTENT_DELALLOC, 0, 0, NULL);
1010                 goto fail;
1011         }
1012         leaf = path->nodes[0];
1013         if (ret > 0) {
1014                 struct btrfs_key found_key;
1015                 ASSERT(path->slots[0]);
1016                 path->slots[0]--;
1017                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1018                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1019                     found_key.offset != offset) {
1020                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1021                                          inode->i_size - 1, EXTENT_DELALLOC, 0,
1022                                          0, NULL);
1023                         btrfs_release_path(path);
1024                         goto fail;
1025                 }
1026         }
1027
1028         BTRFS_I(inode)->generation = trans->transid;
1029         header = btrfs_item_ptr(leaf, path->slots[0],
1030                                 struct btrfs_free_space_header);
1031         btrfs_set_free_space_entries(leaf, header, entries);
1032         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1033         btrfs_set_free_space_generation(leaf, header, trans->transid);
1034         btrfs_mark_buffer_dirty(leaf);
1035         btrfs_release_path(path);
1036
1037         return 0;
1038
1039 fail:
1040         return -1;
1041 }
1042
1043 static noinline_for_stack int write_pinned_extent_entries(
1044                             struct btrfs_block_group_cache *block_group,
1045                             struct btrfs_io_ctl *io_ctl,
1046                             int *entries)
1047 {
1048         u64 start, extent_start, extent_end, len;
1049         struct extent_io_tree *unpin = NULL;
1050         int ret;
1051
1052         if (!block_group)
1053                 return 0;
1054
1055         /*
1056          * We want to add any pinned extents to our free space cache
1057          * so we don't leak the space
1058          *
1059          * We shouldn't have switched the pinned extents yet so this is the
1060          * right one
1061          */
1062         unpin = block_group->fs_info->pinned_extents;
1063
1064         start = block_group->key.objectid;
1065
1066         while (start < block_group->key.objectid + block_group->key.offset) {
1067                 ret = find_first_extent_bit(unpin, start,
1068                                             &extent_start, &extent_end,
1069                                             EXTENT_DIRTY, NULL);
1070                 if (ret)
1071                         return 0;
1072
1073                 /* This pinned extent is out of our range */
1074                 if (extent_start >= block_group->key.objectid +
1075                     block_group->key.offset)
1076                         return 0;
1077
1078                 extent_start = max(extent_start, start);
1079                 extent_end = min(block_group->key.objectid +
1080                                  block_group->key.offset, extent_end + 1);
1081                 len = extent_end - extent_start;
1082
1083                 *entries += 1;
1084                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1085                 if (ret)
1086                         return -ENOSPC;
1087
1088                 start = extent_end;
1089         }
1090
1091         return 0;
1092 }
1093
1094 static noinline_for_stack int
1095 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1096 {
1097         struct btrfs_free_space *entry, *next;
1098         int ret;
1099
1100         /* Write out the bitmaps */
1101         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1102                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1103                 if (ret)
1104                         return -ENOSPC;
1105                 list_del_init(&entry->list);
1106         }
1107
1108         return 0;
1109 }
1110
1111 static int flush_dirty_cache(struct inode *inode)
1112 {
1113         int ret;
1114
1115         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1116         if (ret)
1117                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1118                                  EXTENT_DELALLOC, 0, 0, NULL);
1119
1120         return ret;
1121 }
1122
1123 static void noinline_for_stack
1124 cleanup_bitmap_list(struct list_head *bitmap_list)
1125 {
1126         struct btrfs_free_space *entry, *next;
1127
1128         list_for_each_entry_safe(entry, next, bitmap_list, list)
1129                 list_del_init(&entry->list);
1130 }
1131
1132 static void noinline_for_stack
1133 cleanup_write_cache_enospc(struct inode *inode,
1134                            struct btrfs_io_ctl *io_ctl,
1135                            struct extent_state **cached_state)
1136 {
1137         io_ctl_drop_pages(io_ctl);
1138         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1139                              i_size_read(inode) - 1, cached_state);
1140 }
1141
1142 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1143                                  struct btrfs_trans_handle *trans,
1144                                  struct btrfs_block_group_cache *block_group,
1145                                  struct btrfs_io_ctl *io_ctl,
1146                                  struct btrfs_path *path, u64 offset)
1147 {
1148         int ret;
1149         struct inode *inode = io_ctl->inode;
1150
1151         if (!inode)
1152                 return 0;
1153
1154         /* Flush the dirty pages in the cache file. */
1155         ret = flush_dirty_cache(inode);
1156         if (ret)
1157                 goto out;
1158
1159         /* Update the cache item to tell everyone this cache file is valid. */
1160         ret = update_cache_item(trans, root, inode, path, offset,
1161                                 io_ctl->entries, io_ctl->bitmaps);
1162 out:
1163         io_ctl_free(io_ctl);
1164         if (ret) {
1165                 invalidate_inode_pages2(inode->i_mapping);
1166                 BTRFS_I(inode)->generation = 0;
1167                 if (block_group) {
1168 #ifdef DEBUG
1169                         btrfs_err(root->fs_info,
1170                                   "failed to write free space cache for block group %llu",
1171                                   block_group->key.objectid);
1172 #endif
1173                 }
1174         }
1175         btrfs_update_inode(trans, root, inode);
1176
1177         if (block_group) {
1178                 /* the dirty list is protected by the dirty_bgs_lock */
1179                 spin_lock(&trans->transaction->dirty_bgs_lock);
1180
1181                 /* the disk_cache_state is protected by the block group lock */
1182                 spin_lock(&block_group->lock);
1183
1184                 /*
1185                  * only mark this as written if we didn't get put back on
1186                  * the dirty list while waiting for IO.   Otherwise our
1187                  * cache state won't be right, and we won't get written again
1188                  */
1189                 if (!ret && list_empty(&block_group->dirty_list))
1190                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1191                 else if (ret)
1192                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1193
1194                 spin_unlock(&block_group->lock);
1195                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1196                 io_ctl->inode = NULL;
1197                 iput(inode);
1198         }
1199
1200         return ret;
1201
1202 }
1203
1204 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1205                                     struct btrfs_trans_handle *trans,
1206                                     struct btrfs_io_ctl *io_ctl,
1207                                     struct btrfs_path *path)
1208 {
1209         return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1210 }
1211
1212 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1213                         struct btrfs_block_group_cache *block_group,
1214                         struct btrfs_path *path)
1215 {
1216         return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1217                                      block_group, &block_group->io_ctl,
1218                                      path, block_group->key.objectid);
1219 }
1220
1221 /**
1222  * __btrfs_write_out_cache - write out cached info to an inode
 1223  * @root - the root the inode belongs to
 1224  * @inode - the inode the free space cache is written to
 1225  * @ctl - the free space cache we are going to write out
 1226  * @block_group - the block_group for this cache if it belongs to a block_group
 1227  * @io_ctl - the io_ctl struct that tracks the in-flight pages and entry counts
 1228  * @trans - the trans handle
1227  *
1228  * This function writes out a free space cache struct to disk for quick recovery
1229  * on mount.  This will return 0 if it was successful in writing the cache out,
1230  * or an errno if it was not.
1231  */
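/*
 * Rough order of operations below: lock the cache pages and the extent
 * range, write the generation, the extent entries (including any cluster
 * and trimming ranges), the pinned extents and finally the bitmap pages,
 * zero whatever is left, then dirty the pages and kick off writeback.
 * btrfs_wait_cache_io() later waits for the IO and updates the cache item.
 */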
1232 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1233                                    struct btrfs_free_space_ctl *ctl,
1234                                    struct btrfs_block_group_cache *block_group,
1235                                    struct btrfs_io_ctl *io_ctl,
1236                                    struct btrfs_trans_handle *trans)
1237 {
1238         struct extent_state *cached_state = NULL;
1239         LIST_HEAD(bitmap_list);
1240         int entries = 0;
1241         int bitmaps = 0;
1242         int ret;
1243         int must_iput = 0;
1244
1245         if (!i_size_read(inode))
1246                 return -EIO;
1247
1248         WARN_ON(io_ctl->pages);
1249         ret = io_ctl_init(io_ctl, inode, 1);
1250         if (ret)
1251                 return ret;
1252
1253         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1254                 down_write(&block_group->data_rwsem);
1255                 spin_lock(&block_group->lock);
1256                 if (block_group->delalloc_bytes) {
1257                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1258                         spin_unlock(&block_group->lock);
1259                         up_write(&block_group->data_rwsem);
1260                         BTRFS_I(inode)->generation = 0;
1261                         ret = 0;
1262                         must_iput = 1;
1263                         goto out;
1264                 }
1265                 spin_unlock(&block_group->lock);
1266         }
1267
1268         /* Lock all pages first so we can lock the extent safely. */
1269         ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1270         if (ret)
1271                 goto out_unlock;
1272
1273         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1274                          &cached_state);
1275
1276         io_ctl_set_generation(io_ctl, trans->transid);
1277
1278         mutex_lock(&ctl->cache_writeout_mutex);
1279         /* Write out the extent entries in the free space cache */
1280         spin_lock(&ctl->tree_lock);
1281         ret = write_cache_extent_entries(io_ctl, ctl,
1282                                          block_group, &entries, &bitmaps,
1283                                          &bitmap_list);
1284         if (ret)
1285                 goto out_nospc_locked;
1286
1287         /*
 1288          * Some spaces that are freed in the current transaction are pinned;
 1289          * they will be added into the free space cache after the transaction
 1290          * is committed, so we shouldn't lose them.
1291          *
1292          * If this changes while we are working we'll get added back to
1293          * the dirty list and redo it.  No locking needed
1294          */
1295         ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
1296         if (ret)
1297                 goto out_nospc_locked;
1298
1299         /*
1300          * At last, we write out all the bitmaps and keep cache_writeout_mutex
1301          * locked while doing it because a concurrent trim can be manipulating
1302          * or freeing the bitmap.
1303          */
1304         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1305         spin_unlock(&ctl->tree_lock);
1306         mutex_unlock(&ctl->cache_writeout_mutex);
1307         if (ret)
1308                 goto out_nospc;
1309
1310         /* Zero out the rest of the pages just to make sure */
1311         io_ctl_zero_remaining_pages(io_ctl);
1312
1313         /* Everything is written out, now we dirty the pages in the file. */
1314         ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1315                                 i_size_read(inode), &cached_state);
1316         if (ret)
1317                 goto out_nospc;
1318
1319         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1320                 up_write(&block_group->data_rwsem);
1321         /*
 1322          * Release the pages and unlock the extent; we will flush
1323          * them out later
1324          */
1325         io_ctl_drop_pages(io_ctl);
1326
1327         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1328                              i_size_read(inode) - 1, &cached_state);
1329
1330         /*
 1331          * At this point the pages are under IO and we're happy.
 1332          * The caller is responsible for waiting on them and updating
 1333          * the cache and the inode.
1334          */
1335         io_ctl->entries = entries;
1336         io_ctl->bitmaps = bitmaps;
1337
1338         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1339         if (ret)
1340                 goto out;
1341
1342         return 0;
1343
1344 out:
1345         io_ctl->inode = NULL;
1346         io_ctl_free(io_ctl);
1347         if (ret) {
1348                 invalidate_inode_pages2(inode->i_mapping);
1349                 BTRFS_I(inode)->generation = 0;
1350         }
1351         btrfs_update_inode(trans, root, inode);
1352         if (must_iput)
1353                 iput(inode);
1354         return ret;
1355
1356 out_nospc_locked:
1357         cleanup_bitmap_list(&bitmap_list);
1358         spin_unlock(&ctl->tree_lock);
1359         mutex_unlock(&ctl->cache_writeout_mutex);
1360
1361 out_nospc:
1362         cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1363
1364 out_unlock:
1365         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1366                 up_write(&block_group->data_rwsem);
1367
1368         goto out;
1369 }
1370
1371 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1372                           struct btrfs_block_group_cache *block_group,
1373                           struct btrfs_path *path)
1374 {
1375         struct btrfs_fs_info *fs_info = trans->fs_info;
1376         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1377         struct inode *inode;
1378         int ret = 0;
1379
1380         spin_lock(&block_group->lock);
1381         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1382                 spin_unlock(&block_group->lock);
1383                 return 0;
1384         }
1385         spin_unlock(&block_group->lock);
1386
1387         inode = lookup_free_space_inode(block_group, path);
1388         if (IS_ERR(inode))
1389                 return 0;
1390
1391         ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1392                                 block_group, &block_group->io_ctl, trans);
1393         if (ret) {
1394 #ifdef DEBUG
1395                 btrfs_err(fs_info,
1396                           "failed to write free space cache for block group %llu",
1397                           block_group->key.objectid);
1398 #endif
1399                 spin_lock(&block_group->lock);
1400                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1401                 spin_unlock(&block_group->lock);
1402
1403                 block_group->io_ctl.inode = NULL;
1404                 iput(inode);
1405         }
1406
1407         /*
1408          * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1409          * to wait for IO and put the inode
1410          */
1411
1412         return ret;
1413 }
1414
1415 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1416                                           u64 offset)
1417 {
1418         ASSERT(offset >= bitmap_start);
1419         offset -= bitmap_start;
1420         return (unsigned long)(div_u64(offset, unit));
1421 }
1422
1423 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1424 {
1425         return (unsigned long)(div_u64(bytes, unit));
1426 }
1427
1428 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1429                                    u64 offset)
1430 {
1431         u64 bitmap_start;
1432         u64 bytes_per_bitmap;
1433
1434         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1435         bitmap_start = offset - ctl->start;
1436         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1437         bitmap_start *= bytes_per_bitmap;
1438         bitmap_start += ctl->start;
1439
1440         return bitmap_start;
1441 }
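/*
 * Example of the rounding above, assuming a 4KiB unit and 4KiB pages:
 * bytes_per_bitmap = 32768 * 4KiB = 128MiB, so an offset 200MiB past
 * ctl->start rounds down to the bitmap that starts at ctl->start + 128MiB.
 */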
1442
1443 static int tree_insert_offset(struct rb_root *root, u64 offset,
1444                               struct rb_node *node, int bitmap)
1445 {
1446         struct rb_node **p = &root->rb_node;
1447         struct rb_node *parent = NULL;
1448         struct btrfs_free_space *info;
1449
1450         while (*p) {
1451                 parent = *p;
1452                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1453
1454                 if (offset < info->offset) {
1455                         p = &(*p)->rb_left;
1456                 } else if (offset > info->offset) {
1457                         p = &(*p)->rb_right;
1458                 } else {
1459                         /*
1460                          * we could have a bitmap entry and an extent entry
1461                          * share the same offset.  If this is the case, we want
1462                          * the extent entry to always be found first if we do a
1463                          * linear search through the tree, since we want to have
1464                          * the quickest allocation time, and allocating from an
1465                          * extent is faster than allocating from a bitmap.  So
1466                          * if we're inserting a bitmap and we find an entry at
1467                          * this offset, we want to go right, or after this entry
1468                          * logically.  If we are inserting an extent and we've
1469                          * found a bitmap, we want to go left, or before
1470                          * logically.
1471                          */
1472                         if (bitmap) {
1473                                 if (info->bitmap) {
1474                                         WARN_ON_ONCE(1);
1475                                         return -EEXIST;
1476                                 }
1477                                 p = &(*p)->rb_right;
1478                         } else {
1479                                 if (!info->bitmap) {
1480                                         WARN_ON_ONCE(1);
1481                                         return -EEXIST;
1482                                 }
1483                                 p = &(*p)->rb_left;
1484                         }
1485                 }
1486         }
1487
1488         rb_link_node(node, parent, p);
1489         rb_insert_color(node, root);
1490
1491         return 0;
1492 }
1493
1494 /*
1495  * Search the tree for the given offset.
1496  *
1497  * fuzzy - If this is set, then we are trying to make an allocation; if no
1498  * entry contains the offset, return the first entry that starts or extends
1499  * past the given offset instead of NULL.
1500  */
1501 static struct btrfs_free_space *
1502 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1503                    u64 offset, int bitmap_only, int fuzzy)
1504 {
1505         struct rb_node *n = ctl->free_space_offset.rb_node;
1506         struct btrfs_free_space *entry, *prev = NULL;
1507
1508         /* find entry that is closest to the 'offset' */
1509         while (1) {
1510                 if (!n) {
1511                         entry = NULL;
1512                         break;
1513                 }
1514
1515                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1516                 prev = entry;
1517
1518                 if (offset < entry->offset)
1519                         n = n->rb_left;
1520                 else if (offset > entry->offset)
1521                         n = n->rb_right;
1522                 else
1523                         break;
1524         }
1525
1526         if (bitmap_only) {
1527                 if (!entry)
1528                         return NULL;
1529                 if (entry->bitmap)
1530                         return entry;
1531
1532                 /*
1533                  * A bitmap entry and an extent entry may share the same offset;
1534                  * in that case, the bitmap entry comes after the extent entry.
1535                  */
1536                 n = rb_next(n);
1537                 if (!n)
1538                         return NULL;
1539                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1540                 if (entry->offset != offset)
1541                         return NULL;
1542
1543                 WARN_ON(!entry->bitmap);
1544                 return entry;
1545         } else if (entry) {
1546                 if (entry->bitmap) {
1547                         /*
1548                          * if previous extent entry covers the offset,
1549                          * we should return it instead of the bitmap entry
1550                          */
1551                         n = rb_prev(&entry->offset_index);
1552                         if (n) {
1553                                 prev = rb_entry(n, struct btrfs_free_space,
1554                                                 offset_index);
1555                                 if (!prev->bitmap &&
1556                                     prev->offset + prev->bytes > offset)
1557                                         entry = prev;
1558                         }
1559                 }
1560                 return entry;
1561         }
1562
1563         if (!prev)
1564                 return NULL;
1565
1566         /* find last entry before the 'offset' */
1567         entry = prev;
1568         if (entry->offset > offset) {
1569                 n = rb_prev(&entry->offset_index);
1570                 if (n) {
1571                         entry = rb_entry(n, struct btrfs_free_space,
1572                                         offset_index);
1573                         ASSERT(entry->offset <= offset);
1574                 } else {
1575                         if (fuzzy)
1576                                 return entry;
1577                         else
1578                                 return NULL;
1579                 }
1580         }
1581
1582         if (entry->bitmap) {
1583                 n = rb_prev(&entry->offset_index);
1584                 if (n) {
1585                         prev = rb_entry(n, struct btrfs_free_space,
1586                                         offset_index);
1587                         if (!prev->bitmap &&
1588                             prev->offset + prev->bytes > offset)
1589                                 return prev;
1590                 }
1591                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1592                         return entry;
1593         } else if (entry->offset + entry->bytes > offset)
1594                 return entry;
1595
1596         if (!fuzzy)
1597                 return NULL;
1598
1599         while (1) {
1600                 if (entry->bitmap) {
1601                         if (entry->offset + BITS_PER_BITMAP *
1602                             ctl->unit > offset)
1603                                 break;
1604                 } else {
1605                         if (entry->offset + entry->bytes > offset)
1606                                 break;
1607                 }
1608
1609                 n = rb_next(&entry->offset_index);
1610                 if (!n)
1611                         return NULL;
1612                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1613         }
1614         return entry;
1615 }
1616
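/*
 * The double-underscore variants below (__unlink_free_space() and, further
 * down, __bitmap_clear_bits()) update the rbtree/bitmap but deliberately do
 * not touch ctl->free_space; they are used (via the update_stat flags) when
 * the bytes are already accounted for in ctl->free_space and are only being
 * moved between entries rather than removed from the cache.
 */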
1617 static inline void
1618 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1619                     struct btrfs_free_space *info)
1620 {
1621         rb_erase(&info->offset_index, &ctl->free_space_offset);
1622         ctl->free_extents--;
1623 }
1624
1625 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1626                               struct btrfs_free_space *info)
1627 {
1628         __unlink_free_space(ctl, info);
1629         ctl->free_space -= info->bytes;
1630 }
1631
1632 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1633                            struct btrfs_free_space *info)
1634 {
1635         int ret = 0;
1636
1637         ASSERT(info->bytes || info->bitmap);
1638         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1639                                  &info->offset_index, (info->bitmap != NULL));
1640         if (ret)
1641                 return ret;
1642
1643         ctl->free_space += info->bytes;
1644         ctl->free_extents++;
1645         return ret;
1646 }
1647
1648 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1649 {
1650         struct btrfs_block_group_cache *block_group = ctl->private;
1651         u64 max_bytes;
1652         u64 bitmap_bytes;
1653         u64 extent_bytes;
1654         u64 size = block_group->key.offset;
1655         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1656         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1657
1658         max_bitmaps = max_t(u64, max_bitmaps, 1);
1659
1660         ASSERT(ctl->total_bitmaps <= max_bitmaps);
1661
1662         /*
1663          * The goal is to keep the total amount of memory used per 1GiB of space
1664          * at or below 32KiB, so we need to adjust how much memory we allow to be
1665          * used by extent-based free space tracking.
1666          */
1667         if (size < SZ_1G)
1668                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1669         else
1670                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1671
1672         /*
1673          * we want to account for 1 more bitmap than what we have so we can make
1674          * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1675          * we add more bitmaps.
1676          */
1677         bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
1678
1679         if (bitmap_bytes >= max_bytes) {
1680                 ctl->extents_thresh = 0;
1681                 return;
1682         }
1683
1684         /*
1685          * We want the extent entry threshold to use at most half of max_bytes,
1686          * or whatever is left over after the bitmaps, whichever is smaller.
1687          */
1688         extent_bytes = max_bytes - bitmap_bytes;
1689         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1690
1691         ctl->extents_thresh =
1692                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1693 }
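/*
 * Illustrative numbers for the above, assuming a 4 KiB sectorsize and a
 * 1 GiB block group: max_bytes = 32 KiB; with one bitmap already in use,
 * bitmap_bytes = (1 + 1) * 4 KiB = 8 KiB, extent_bytes = min(32 KiB - 8 KiB,
 * 32 KiB / 2) = 16 KiB, and extents_thresh becomes 16 KiB divided by
 * sizeof(struct btrfs_free_space).
 */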
1694
1695 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1696                                        struct btrfs_free_space *info,
1697                                        u64 offset, u64 bytes)
1698 {
1699         unsigned long start, count;
1700
1701         start = offset_to_bit(info->offset, ctl->unit, offset);
1702         count = bytes_to_bits(bytes, ctl->unit);
1703         ASSERT(start + count <= BITS_PER_BITMAP);
1704
1705         bitmap_clear(info->bitmap, start, count);
1706
1707         info->bytes -= bytes;
1708         if (info->max_extent_size > ctl->unit)
1709                 info->max_extent_size = 0;
1710 }
1711
1712 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1713                               struct btrfs_free_space *info, u64 offset,
1714                               u64 bytes)
1715 {
1716         __bitmap_clear_bits(ctl, info, offset, bytes);
1717         ctl->free_space -= bytes;
1718 }
1719
1720 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1721                             struct btrfs_free_space *info, u64 offset,
1722                             u64 bytes)
1723 {
1724         unsigned long start, count;
1725
1726         start = offset_to_bit(info->offset, ctl->unit, offset);
1727         count = bytes_to_bits(bytes, ctl->unit);
1728         ASSERT(start + count <= BITS_PER_BITMAP);
1729
1730         bitmap_set(info->bitmap, start, count);
1731
1732         info->bytes += bytes;
1733         ctl->free_space += bytes;
1734 }
1735
1736 /*
1737  * If we cannot find a suitable extent, use *bytes to record the size of
1738  * the largest free run found in the bitmap.
1739  */
1740 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1741                          struct btrfs_free_space *bitmap_info, u64 *offset,
1742                          u64 *bytes, bool for_alloc)
1743 {
1744         unsigned long found_bits = 0;
1745         unsigned long max_bits = 0;
1746         unsigned long bits, i;
1747         unsigned long next_zero;
1748         unsigned long extent_bits;
1749
1750         /*
1751          * Skip searching the bitmap if we don't have a contiguous section that
1752          * is large enough for this allocation.
1753          */
1754         if (for_alloc &&
1755             bitmap_info->max_extent_size &&
1756             bitmap_info->max_extent_size < *bytes) {
1757                 *bytes = bitmap_info->max_extent_size;
1758                 return -1;
1759         }
1760
1761         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1762                           max_t(u64, *offset, bitmap_info->offset));
1763         bits = bytes_to_bits(*bytes, ctl->unit);
1764
1765         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1766                 if (for_alloc && bits == 1) {
1767                         found_bits = 1;
1768                         break;
1769                 }
1770                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1771                                                BITS_PER_BITMAP, i);
1772                 extent_bits = next_zero - i;
1773                 if (extent_bits >= bits) {
1774                         found_bits = extent_bits;
1775                         break;
1776                 } else if (extent_bits > max_bits) {
1777                         max_bits = extent_bits;
1778                 }
1779                 i = next_zero;
1780         }
1781
1782         if (found_bits) {
1783                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1784                 *bytes = (u64)(found_bits) * ctl->unit;
1785                 return 0;
1786         }
1787
1788         *bytes = (u64)(max_bits) * ctl->unit;
1789         bitmap_info->max_extent_size = *bytes;
1790         return -1;
1791 }
1792
1793 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1794 {
1795         if (entry->bitmap)
1796                 return entry->max_extent_size;
1797         return entry->bytes;
1798 }
1799
1800 /* Cache the size (in bytes) of the largest free extent seen in *max_extent_size */
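/*
 * When an alignment is requested, find_free_space() rounds entry->offset up
 * to the next @align boundary measured from ctl->start and treats the skipped
 * bytes as align_off.  E.g. (illustrative) with ctl->start = 1 GiB,
 * align = 64 KiB and an entry at ctl->start + 200 KiB, the candidate offset
 * becomes ctl->start + 256 KiB and align_off = 56 KiB; the entry only
 * qualifies if it still has *bytes free past that point.
 */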
1801 static struct btrfs_free_space *
1802 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1803                 unsigned long align, u64 *max_extent_size)
1804 {
1805         struct btrfs_free_space *entry;
1806         struct rb_node *node;
1807         u64 tmp;
1808         u64 align_off;
1809         int ret;
1810
1811         if (!ctl->free_space_offset.rb_node)
1812                 goto out;
1813
1814         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1815         if (!entry)
1816                 goto out;
1817
1818         for (node = &entry->offset_index; node; node = rb_next(node)) {
1819                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1820                 if (entry->bytes < *bytes) {
1821                         *max_extent_size = max(get_max_extent_size(entry),
1822                                                *max_extent_size);
1823                         continue;
1824                 }
1825
1826                 /* make sure the space returned is big enough
1827                  * to match our requested alignment
1828                  */
1829                 if (*bytes >= align) {
1830                         tmp = entry->offset - ctl->start + align - 1;
1831                         tmp = div64_u64(tmp, align);
1832                         tmp = tmp * align + ctl->start;
1833                         align_off = tmp - entry->offset;
1834                 } else {
1835                         align_off = 0;
1836                         tmp = entry->offset;
1837                 }
1838
1839                 if (entry->bytes < *bytes + align_off) {
1840                         *max_extent_size = max(get_max_extent_size(entry),
1841                                                *max_extent_size);
1842                         continue;
1843                 }
1844
1845                 if (entry->bitmap) {
1846                         u64 size = *bytes;
1847
1848                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
1849                         if (!ret) {
1850                                 *offset = tmp;
1851                                 *bytes = size;
1852                                 return entry;
1853                         } else {
1854                                 *max_extent_size =
1855                                         max(get_max_extent_size(entry),
1856                                             *max_extent_size);
1857                         }
1858                         continue;
1859                 }
1860
1861                 *offset = tmp;
1862                 *bytes = entry->bytes - align_off;
1863                 return entry;
1864         }
1865 out:
1866         return NULL;
1867 }
1868
1869 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1870                            struct btrfs_free_space *info, u64 offset)
1871 {
1872         info->offset = offset_to_bitmap(ctl, offset);
1873         info->bytes = 0;
1874         INIT_LIST_HEAD(&info->list);
1875         link_free_space(ctl, info);
1876         ctl->total_bitmaps++;
1877
1878         ctl->op->recalc_thresholds(ctl);
1879 }
1880
1881 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1882                         struct btrfs_free_space *bitmap_info)
1883 {
1884         unlink_free_space(ctl, bitmap_info);
1885         kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1886         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1887         ctl->total_bitmaps--;
1888         ctl->op->recalc_thresholds(ctl);
1889 }
1890
1891 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1892                               struct btrfs_free_space *bitmap_info,
1893                               u64 *offset, u64 *bytes)
1894 {
1895         u64 end;
1896         u64 search_start, search_bytes;
1897         int ret;
1898
1899 again:
1900         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1901
1902         /*
1903          * We need to search for bits in this bitmap.  We could only cover some
1904          * of the extent in this bitmap thanks to how we add space, so we need
1905          * to search for as much of it as we can and clear that amount, and then
1906          * go searching for the next bit.
1907          */
1908         search_start = *offset;
1909         search_bytes = ctl->unit;
1910         search_bytes = min(search_bytes, end - search_start + 1);
1911         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1912                             false);
1913         if (ret < 0 || search_start != *offset)
1914                 return -EINVAL;
1915
1916         /* We may have found more bits than what we need */
1917         search_bytes = min(search_bytes, *bytes);
1918
1919         /* Cannot clear past the end of the bitmap */
1920         search_bytes = min(search_bytes, end - search_start + 1);
1921
1922         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1923         *offset += search_bytes;
1924         *bytes -= search_bytes;
1925
1926         if (*bytes) {
1927                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1928                 if (!bitmap_info->bytes)
1929                         free_bitmap(ctl, bitmap_info);
1930
1931                 /*
1932                  * no entry after this bitmap, but we still have bytes to
1933                  * remove, so something has gone wrong.
1934                  */
1935                 if (!next)
1936                         return -EINVAL;
1937
1938                 bitmap_info = rb_entry(next, struct btrfs_free_space,
1939                                        offset_index);
1940
1941                 /*
1942                  * if the next entry isn't a bitmap we need to return to let the
1943                  * extent stuff do its work.
1944                  */
1945                 if (!bitmap_info->bitmap)
1946                         return -EAGAIN;
1947
1948                 /*
1949                  * Ok the next item is a bitmap, but it may not actually hold
1950                  * the information for the rest of this free space stuff, so
1951                  * look for it, and if we don't find it return so we can try
1952                  * everything over again.
1953                  */
1954                 search_start = *offset;
1955                 search_bytes = ctl->unit;
1956                 ret = search_bitmap(ctl, bitmap_info, &search_start,
1957                                     &search_bytes, false);
1958                 if (ret < 0 || search_start != *offset)
1959                         return -EAGAIN;
1960
1961                 goto again;
1962         } else if (!bitmap_info->bytes)
1963                 free_bitmap(ctl, bitmap_info);
1964
1965         return 0;
1966 }
1967
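/*
 * Set as many of the requested bytes as fit inside this bitmap's window and
 * return the number of bytes actually set; the caller loops, advancing the
 * offset into the next bitmap, if anything is left over.
 */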
1968 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1969                                struct btrfs_free_space *info, u64 offset,
1970                                u64 bytes)
1971 {
1972         u64 bytes_to_set = 0;
1973         u64 end;
1974
1975         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1976
1977         bytes_to_set = min(end - offset, bytes);
1978
1979         bitmap_set_bits(ctl, info, offset, bytes_to_set);
1980
1981         /*
1982          * We set some bytes, we have no idea what the max extent size is
1983          * anymore.
1984          */
1985         info->max_extent_size = 0;
1986
1987         return bytes_to_set;
1988
1989 }
1990
1991 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1992                       struct btrfs_free_space *info)
1993 {
1994         struct btrfs_block_group_cache *block_group = ctl->private;
1995         struct btrfs_fs_info *fs_info = block_group->fs_info;
1996         bool forced = false;
1997
1998 #ifdef CONFIG_BTRFS_DEBUG
1999         if (btrfs_should_fragment_free_space(block_group))
2000                 forced = true;
2001 #endif
2002
2003         /*
2004          * If we are below the extents threshold then we can add this as an
2005          * extent, and don't have to deal with the bitmap
2006          */
2007         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2008                 /*
2009                  * If this block group has some small extents we don't want to
2010                  * use up all of our free slots in the cache with them, we want
2011                  * to reserve them for larger extents, however if we have plenty
2012                  * of cache left then go ahead and add them, no sense in adding
2013                  * the overhead of a bitmap if we don't have to.
2014                  */
2015                 if (info->bytes <= fs_info->sectorsize * 4) {
2016                         if (ctl->free_extents * 2 <= ctl->extents_thresh)
2017                                 return false;
2018                 } else {
2019                         return false;
2020                 }
2021         }
2022
2023         /*
2024          * The original block groups from mkfs can be really small, like 8
2025          * megabytes, so don't bother with a bitmap for those entries.  However
2026          * some block groups can be smaller than what a bitmap would cover but
2027          * are still large enough that they could overflow the 32k memory limit,
2028          * so allow those block groups to still be allowed to have a bitmap
2029          * entry.
2030          */
2031         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
2032                 return false;
2033
2034         return true;
2035 }
2036
2037 static const struct btrfs_free_space_op free_space_op = {
2038         .recalc_thresholds      = recalculate_thresholds,
2039         .use_bitmap             = use_bitmap,
2040 };
2041
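/*
 * Try to add the free space described by @info to one or more bitmaps.
 * Returns 1 if all of the space went into bitmaps (any leftover @info struct
 * is freed here), 0 if the space should be linked as a regular extent entry
 * instead, or a negative errno if a new bitmap could not be allocated.  Note
 * that ctl->tree_lock is dropped and retaken around the GFP_NOFS allocations.
 */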
2042 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2043                               struct btrfs_free_space *info)
2044 {
2045         struct btrfs_free_space *bitmap_info;
2046         struct btrfs_block_group_cache *block_group = NULL;
2047         int added = 0;
2048         u64 bytes, offset, bytes_added;
2049         int ret;
2050
2051         bytes = info->bytes;
2052         offset = info->offset;
2053
2054         if (!ctl->op->use_bitmap(ctl, info))
2055                 return 0;
2056
2057         if (ctl->op == &free_space_op)
2058                 block_group = ctl->private;
2059 again:
2060         /*
2061          * Since we link bitmaps right into the cluster we need to see if we
2062          * have a cluster here, and if so and it has our bitmap we need to add
2063          * the free space to that bitmap.
2064          */
2065         if (block_group && !list_empty(&block_group->cluster_list)) {
2066                 struct btrfs_free_cluster *cluster;
2067                 struct rb_node *node;
2068                 struct btrfs_free_space *entry;
2069
2070                 cluster = list_entry(block_group->cluster_list.next,
2071                                      struct btrfs_free_cluster,
2072                                      block_group_list);
2073                 spin_lock(&cluster->lock);
2074                 node = rb_first(&cluster->root);
2075                 if (!node) {
2076                         spin_unlock(&cluster->lock);
2077                         goto no_cluster_bitmap;
2078                 }
2079
2080                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2081                 if (!entry->bitmap) {
2082                         spin_unlock(&cluster->lock);
2083                         goto no_cluster_bitmap;
2084                 }
2085
2086                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2087                         bytes_added = add_bytes_to_bitmap(ctl, entry,
2088                                                           offset, bytes);
2089                         bytes -= bytes_added;
2090                         offset += bytes_added;
2091                 }
2092                 spin_unlock(&cluster->lock);
2093                 if (!bytes) {
2094                         ret = 1;
2095                         goto out;
2096                 }
2097         }
2098
2099 no_cluster_bitmap:
2100         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2101                                          1, 0);
2102         if (!bitmap_info) {
2103                 ASSERT(added == 0);
2104                 goto new_bitmap;
2105         }
2106
2107         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
2108         bytes -= bytes_added;
2109         offset += bytes_added;
2110         added = 0;
2111
2112         if (!bytes) {
2113                 ret = 1;
2114                 goto out;
2115         } else
2116                 goto again;
2117
2118 new_bitmap:
2119         if (info && info->bitmap) {
2120                 add_new_bitmap(ctl, info, offset);
2121                 added = 1;
2122                 info = NULL;
2123                 goto again;
2124         } else {
2125                 spin_unlock(&ctl->tree_lock);
2126
2127                 /* no pre-allocated info, allocate a new one */
2128                 if (!info) {
2129                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2130                                                  GFP_NOFS);
2131                         if (!info) {
2132                                 spin_lock(&ctl->tree_lock);
2133                                 ret = -ENOMEM;
2134                                 goto out;
2135                         }
2136                 }
2137
2138                 /* allocate the bitmap */
2139                 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2140                                                  GFP_NOFS);
2141                 spin_lock(&ctl->tree_lock);
2142                 if (!info->bitmap) {
2143                         ret = -ENOMEM;
2144                         goto out;
2145                 }
2146                 goto again;
2147         }
2148
2149 out:
2150         if (info) {
2151                 if (info->bitmap)
2152                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
2153                                         info->bitmap);
2154                 kmem_cache_free(btrfs_free_space_cachep, info);
2155         }
2156
2157         return ret;
2158 }
2159
2160 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2161                           struct btrfs_free_space *info, bool update_stat)
2162 {
2163         struct btrfs_free_space *left_info;
2164         struct btrfs_free_space *right_info;
2165         bool merged = false;
2166         u64 offset = info->offset;
2167         u64 bytes = info->bytes;
2168
2169         /*
2170          * First we want to see if there is free space adjacent to the range we
2171          * are adding; if there is, remove that entry and extend this one to
2172          * cover the entire range.
2173          */
2174         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2175         if (right_info && rb_prev(&right_info->offset_index))
2176                 left_info = rb_entry(rb_prev(&right_info->offset_index),
2177                                      struct btrfs_free_space, offset_index);
2178         else
2179                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2180
2181         if (right_info && !right_info->bitmap) {
2182                 if (update_stat)
2183                         unlink_free_space(ctl, right_info);
2184                 else
2185                         __unlink_free_space(ctl, right_info);
2186                 info->bytes += right_info->bytes;
2187                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2188                 merged = true;
2189         }
2190
2191         if (left_info && !left_info->bitmap &&
2192             left_info->offset + left_info->bytes == offset) {
2193                 if (update_stat)
2194                         unlink_free_space(ctl, left_info);
2195                 else
2196                         __unlink_free_space(ctl, left_info);
2197                 info->offset = left_info->offset;
2198                 info->bytes += left_info->bytes;
2199                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2200                 merged = true;
2201         }
2202
2203         return merged;
2204 }
2205
2206 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2207                                      struct btrfs_free_space *info,
2208                                      bool update_stat)
2209 {
2210         struct btrfs_free_space *bitmap;
2211         unsigned long i;
2212         unsigned long j;
2213         const u64 end = info->offset + info->bytes;
2214         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2215         u64 bytes;
2216
2217         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2218         if (!bitmap)
2219                 return false;
2220
2221         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2222         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2223         if (j == i)
2224                 return false;
2225         bytes = (j - i) * ctl->unit;
2226         info->bytes += bytes;
2227
2228         if (update_stat)
2229                 bitmap_clear_bits(ctl, bitmap, end, bytes);
2230         else
2231                 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2232
2233         if (!bitmap->bytes)
2234                 free_bitmap(ctl, bitmap);
2235
2236         return true;
2237 }
2238
2239 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2240                                        struct btrfs_free_space *info,
2241                                        bool update_stat)
2242 {
2243         struct btrfs_free_space *bitmap;
2244         u64 bitmap_offset;
2245         unsigned long i;
2246         unsigned long j;
2247         unsigned long prev_j;
2248         u64 bytes;
2249
2250         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2251         /* If we're on a boundary, try the previous logical bitmap. */
2252         if (bitmap_offset == info->offset) {
2253                 if (info->offset == 0)
2254                         return false;
2255                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2256         }
2257
2258         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2259         if (!bitmap)
2260                 return false;
2261
2262         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2263         j = 0;
2264         prev_j = (unsigned long)-1;
2265         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2266                 if (j > i)
2267                         break;
2268                 prev_j = j;
2269         }
2270         if (prev_j == i)
2271                 return false;
2272
2273         if (prev_j == (unsigned long)-1)
2274                 bytes = (i + 1) * ctl->unit;
2275         else
2276                 bytes = (i - prev_j) * ctl->unit;
2277
2278         info->offset -= bytes;
2279         info->bytes += bytes;
2280
2281         if (update_stat)
2282                 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2283         else
2284                 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2285
2286         if (!bitmap->bytes)
2287                 free_bitmap(ctl, bitmap);
2288
2289         return true;
2290 }
2291
2292 /*
2293  * We prefer always to allocate from extent entries, both for clustered and
2294  * non-clustered allocation requests. So when attempting to add a new extent
2295  * entry, try to see if there's adjacent free space in bitmap entries, and if
2296  * there is, migrate that space from the bitmaps to the extent.
2297  * Like this we get better chances of satisfying space allocation requests
2298  * because we attempt to satisfy them based on a single cache entry, and never
2299  * on 2 or more entries - even if the entries represent a contiguous free space
2300  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2301  * ends).
2302  */
2303 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2304                               struct btrfs_free_space *info,
2305                               bool update_stat)
2306 {
2307         /*
2308          * Only work with disconnected entries, as we can change their offset,
2309          * and they must be extent entries.
2310          */
2311         ASSERT(!info->bitmap);
2312         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2313
2314         if (ctl->total_bitmaps > 0) {
2315                 bool stole_end;
2316                 bool stole_front = false;
2317
2318                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2319                 if (ctl->total_bitmaps > 0)
2320                         stole_front = steal_from_bitmap_to_front(ctl, info,
2321                                                                  update_stat);
2322
2323                 if (stole_end || stole_front)
2324                         try_merge_free_space(ctl, info, update_stat);
2325         }
2326 }
2327
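/*
 * Add [offset, offset + bytes) back to the free space cache: first try to
 * merge it with adjacent extent entries, then see whether it should be
 * absorbed into a bitmap, and otherwise steal any bordering bitmap space and
 * link it as an extent entry.
 */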
2328 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2329                            struct btrfs_free_space_ctl *ctl,
2330                            u64 offset, u64 bytes)
2331 {
2332         struct btrfs_free_space *info;
2333         int ret = 0;
2334
2335         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2336         if (!info)
2337                 return -ENOMEM;
2338
2339         info->offset = offset;
2340         info->bytes = bytes;
2341         RB_CLEAR_NODE(&info->offset_index);
2342
2343         spin_lock(&ctl->tree_lock);
2344
2345         if (try_merge_free_space(ctl, info, true))
2346                 goto link;
2347
2348         /*
2349          * There was no extent directly to the left or right of this new
2350          * extent, so we know we're going to have to allocate a new entry;
2351          * before we do that, see if we need to drop this into a bitmap.
2352          */
2353         ret = insert_into_bitmap(ctl, info);
2354         if (ret < 0) {
2355                 goto out;
2356         } else if (ret) {
2357                 ret = 0;
2358                 goto out;
2359         }
2360 link:
2361         /*
2362          * Only steal free space from adjacent bitmaps if we're sure we're not
2363          * going to add the new free space to existing bitmap entries - because
2364          * that would mean unnecessary work that would be reverted. Therefore
2365          * attempt to steal space from bitmaps if we're adding an extent entry.
2366          */
2367         steal_from_bitmap(ctl, info, true);
2368
2369         ret = link_free_space(ctl, info);
2370         if (ret)
2371                 kmem_cache_free(btrfs_free_space_cachep, info);
2372 out:
2373         spin_unlock(&ctl->tree_lock);
2374
2375         if (ret) {
2376                 btrfs_crit(fs_info, "unable to add free space :%d", ret);
2377                 ASSERT(ret != -EEXIST);
2378         }
2379
2380         return ret;
2381 }
2382
2383 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2384                          u64 bytenr, u64 size)
2385 {
2386         return __btrfs_add_free_space(block_group->fs_info,
2387                                       block_group->free_space_ctl,
2388                                       bytenr, size);
2389 }
2390
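/*
 * Remove [offset, offset + bytes) from the free space cache.  The range may
 * start in the middle of an extent entry, in which case the entry is trimmed
 * to the space in front of the removal and anything past the removed range is
 * re-added with btrfs_add_free_space().  The again/re_search loop handles
 * removals that span a mix of extent and bitmap entries.
 */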
2391 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2392                             u64 offset, u64 bytes)
2393 {
2394         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2395         struct btrfs_free_space *info;
2396         int ret;
2397         bool re_search = false;
2398
2399         spin_lock(&ctl->tree_lock);
2400
2401 again:
2402         ret = 0;
2403         if (!bytes)
2404                 goto out_lock;
2405
2406         info = tree_search_offset(ctl, offset, 0, 0);
2407         if (!info) {
2408                 /*
2409                  * oops didn't find an extent that matched the space we wanted
2410                  * to remove, look for a bitmap instead
2411                  */
2412                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2413                                           1, 0);
2414                 if (!info) {
2415                         /*
2416                          * If we found a partial bit of our free space in a
2417                          * bitmap but then couldn't find the other part this may
2418                          * be a problem, so WARN about it.
2419                          */
2420                         WARN_ON(re_search);
2421                         goto out_lock;
2422                 }
2423         }
2424
2425         re_search = false;
2426         if (!info->bitmap) {
2427                 unlink_free_space(ctl, info);
2428                 if (offset == info->offset) {
2429                         u64 to_free = min(bytes, info->bytes);
2430
2431                         info->bytes -= to_free;
2432                         info->offset += to_free;
2433                         if (info->bytes) {
2434                                 ret = link_free_space(ctl, info);
2435                                 WARN_ON(ret);
2436                         } else {
2437                                 kmem_cache_free(btrfs_free_space_cachep, info);
2438                         }
2439
2440                         offset += to_free;
2441                         bytes -= to_free;
2442                         goto again;
2443                 } else {
2444                         u64 old_end = info->bytes + info->offset;
2445
2446                         info->bytes = offset - info->offset;
2447                         ret = link_free_space(ctl, info);
2448                         WARN_ON(ret);
2449                         if (ret)
2450                                 goto out_lock;
2451
2452                         /* Not enough bytes in this entry to satisfy us */
2453                         if (old_end < offset + bytes) {
2454                                 bytes -= old_end - offset;
2455                                 offset = old_end;
2456                                 goto again;
2457                         } else if (old_end == offset + bytes) {
2458                                 /* all done */
2459                                 goto out_lock;
2460                         }
2461                         spin_unlock(&ctl->tree_lock);
2462
2463                         ret = btrfs_add_free_space(block_group, offset + bytes,
2464                                                    old_end - (offset + bytes));
2465                         WARN_ON(ret);
2466                         goto out;
2467                 }
2468         }
2469
2470         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2471         if (ret == -EAGAIN) {
2472                 re_search = true;
2473                 goto again;
2474         }
2475 out_lock:
2476         spin_unlock(&ctl->tree_lock);
2477 out:
2478         return ret;
2479 }
2480
2481 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2482                            u64 bytes)
2483 {
2484         struct btrfs_fs_info *fs_info = block_group->fs_info;
2485         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2486         struct btrfs_free_space *info;
2487         struct rb_node *n;
2488         int count = 0;
2489
2490         spin_lock(&ctl->tree_lock);
2491         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2492                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2493                 if (info->bytes >= bytes && !block_group->ro)
2494                         count++;
2495                 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2496                            info->offset, info->bytes,
2497                        (info->bitmap) ? "yes" : "no");
2498         }
2499         spin_unlock(&ctl->tree_lock);
2500         btrfs_info(fs_info, "block group has cluster?: %s",
2501                list_empty(&block_group->cluster_list) ? "no" : "yes");
2502         btrfs_info(fs_info,
2503                    "%d blocks of free space at or bigger than bytes is", count);
2504 }
2505
2506 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2507 {
2508         struct btrfs_fs_info *fs_info = block_group->fs_info;
2509         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2510
2511         spin_lock_init(&ctl->tree_lock);
2512         ctl->unit = fs_info->sectorsize;
2513         ctl->start = block_group->key.objectid;
2514         ctl->private = block_group;
2515         ctl->op = &free_space_op;
2516         INIT_LIST_HEAD(&ctl->trimming_ranges);
2517         mutex_init(&ctl->cache_writeout_mutex);
2518
2519         /*
2520          * We only want to have 32KiB of RAM per block group for keeping
2521          * track of free space, and if we pass 1/2 of that we want to
2522          * start converting things over to using bitmaps.
2523          */
2524         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2525 }
2526
2527 /*
2528  * for a given cluster, put all of its extents back into the free
2529  * space cache.  If the block group passed doesn't match the block group
2530  * pointed to by the cluster, someone else raced in and freed the
2531  * cluster already.  In that case, we just return without changing anything
2532  */
2533 static int
2534 __btrfs_return_cluster_to_free_space(
2535                              struct btrfs_block_group_cache *block_group,
2536                              struct btrfs_free_cluster *cluster)
2537 {
2538         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2539         struct btrfs_free_space *entry;
2540         struct rb_node *node;
2541
2542         spin_lock(&cluster->lock);
2543         if (cluster->block_group != block_group)
2544                 goto out;
2545
2546         cluster->block_group = NULL;
2547         cluster->window_start = 0;
2548         list_del_init(&cluster->block_group_list);
2549
2550         node = rb_first(&cluster->root);
2551         while (node) {
2552                 bool bitmap;
2553
2554                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2555                 node = rb_next(&entry->offset_index);
2556                 rb_erase(&entry->offset_index, &cluster->root);
2557                 RB_CLEAR_NODE(&entry->offset_index);
2558
2559                 bitmap = (entry->bitmap != NULL);
2560                 if (!bitmap) {
2561                         try_merge_free_space(ctl, entry, false);
2562                         steal_from_bitmap(ctl, entry, false);
2563                 }
2564                 tree_insert_offset(&ctl->free_space_offset,
2565                                    entry->offset, &entry->offset_index, bitmap);
2566         }
2567         cluster->root = RB_ROOT;
2568
2569 out:
2570         spin_unlock(&cluster->lock);
2571         btrfs_put_block_group(block_group);
2572         return 0;
2573 }
2574
2575 static void __btrfs_remove_free_space_cache_locked(
2576                                 struct btrfs_free_space_ctl *ctl)
2577 {
2578         struct btrfs_free_space *info;
2579         struct rb_node *node;
2580
2581         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2582                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2583                 if (!info->bitmap) {
2584                         unlink_free_space(ctl, info);
2585                         kmem_cache_free(btrfs_free_space_cachep, info);
2586                 } else {
2587                         free_bitmap(ctl, info);
2588                 }
2589
2590                 cond_resched_lock(&ctl->tree_lock);
2591         }
2592 }
2593
2594 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2595 {
2596         spin_lock(&ctl->tree_lock);
2597         __btrfs_remove_free_space_cache_locked(ctl);
2598         spin_unlock(&ctl->tree_lock);
2599 }
2600
2601 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2602 {
2603         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2604         struct btrfs_free_cluster *cluster;
2605         struct list_head *head;
2606
2607         spin_lock(&ctl->tree_lock);
2608         while ((head = block_group->cluster_list.next) !=
2609                &block_group->cluster_list) {
2610                 cluster = list_entry(head, struct btrfs_free_cluster,
2611                                      block_group_list);
2612
2613                 WARN_ON(cluster->block_group != block_group);
2614                 __btrfs_return_cluster_to_free_space(block_group, cluster);
2615
2616                 cond_resched_lock(&ctl->tree_lock);
2617         }
2618         __btrfs_remove_free_space_cache_locked(ctl);
2619         spin_unlock(&ctl->tree_lock);
2620
2621 }
2622
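/*
 * Allocate @bytes (plus @empty_size for the search) from this block group's
 * free space cache, aligned to full_stripe_len.  Returns the logical start of
 * the allocation, or 0 if nothing large enough was found.  If alignment
 * forces us to skip bytes at the front of an extent entry, that gap is handed
 * back to the cache once the tree lock has been dropped.
 */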
2623 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2624                                u64 offset, u64 bytes, u64 empty_size,
2625                                u64 *max_extent_size)
2626 {
2627         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2628         struct btrfs_free_space *entry = NULL;
2629         u64 bytes_search = bytes + empty_size;
2630         u64 ret = 0;
2631         u64 align_gap = 0;
2632         u64 align_gap_len = 0;
2633
2634         spin_lock(&ctl->tree_lock);
2635         entry = find_free_space(ctl, &offset, &bytes_search,
2636                                 block_group->full_stripe_len, max_extent_size);
2637         if (!entry)
2638                 goto out;
2639
2640         ret = offset;
2641         if (entry->bitmap) {
2642                 bitmap_clear_bits(ctl, entry, offset, bytes);
2643                 if (!entry->bytes)
2644                         free_bitmap(ctl, entry);
2645         } else {
2646                 unlink_free_space(ctl, entry);
2647                 align_gap_len = offset - entry->offset;
2648                 align_gap = entry->offset;
2649
2650                 entry->offset = offset + bytes;
2651                 WARN_ON(entry->bytes < bytes + align_gap_len);
2652
2653                 entry->bytes -= bytes + align_gap_len;
2654                 if (!entry->bytes)
2655                         kmem_cache_free(btrfs_free_space_cachep, entry);
2656                 else
2657                         link_free_space(ctl, entry);
2658         }
2659 out:
2660         spin_unlock(&ctl->tree_lock);
2661
2662         if (align_gap_len)
2663                 __btrfs_add_free_space(block_group->fs_info, ctl,
2664                                        align_gap, align_gap_len);
2665         return ret;
2666 }
2667
2668 /*
2669  * given a cluster, put all of its extents back into the free space
2670  * cache.  If a block group is passed, this function will only free
2671  * a cluster that belongs to the passed block group.
2672  *
2673  * Otherwise, it'll get a reference on the block group pointed to by the
2674  * cluster and remove the cluster from it.
2675  */
2676 int btrfs_return_cluster_to_free_space(
2677                                struct btrfs_block_group_cache *block_group,
2678                                struct btrfs_free_cluster *cluster)
2679 {
2680         struct btrfs_free_space_ctl *ctl;
2681         int ret;
2682
2683         /* first, get a safe pointer to the block group */
2684         spin_lock(&cluster->lock);
2685         if (!block_group) {
2686                 block_group = cluster->block_group;
2687                 if (!block_group) {
2688                         spin_unlock(&cluster->lock);
2689                         return 0;
2690                 }
2691         } else if (cluster->block_group != block_group) {
2692                 /* someone else has already freed it, don't redo their work */
2693                 spin_unlock(&cluster->lock);
2694                 return 0;
2695         }
2696         atomic_inc(&block_group->count);
2697         spin_unlock(&cluster->lock);
2698
2699         ctl = block_group->free_space_ctl;
2700
2701         /* now return any extents the cluster had on it */
2702         spin_lock(&ctl->tree_lock);
2703         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2704         spin_unlock(&ctl->tree_lock);
2705
2706         /* finally drop our ref */
2707         btrfs_put_block_group(block_group);
2708         return ret;
2709 }
2710
2711 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2712                                    struct btrfs_free_cluster *cluster,
2713                                    struct btrfs_free_space *entry,
2714                                    u64 bytes, u64 min_start,
2715                                    u64 *max_extent_size)
2716 {
2717         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2718         int err;
2719         u64 search_start = cluster->window_start;
2720         u64 search_bytes = bytes;
2721         u64 ret = 0;
2722
2723         search_start = min_start;
2724         search_bytes = bytes;
2725
2726         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2727         if (err) {
2728                 *max_extent_size = max(get_max_extent_size(entry),
2729                                        *max_extent_size);
2730                 return 0;
2731         }
2732
2733         ret = search_start;
2734         __bitmap_clear_bits(ctl, entry, ret, bytes);
2735
2736         return ret;
2737 }
2738
2739 /*
2740  * given a cluster, try to allocate 'bytes' from it, returns 0
2741  * if it couldn't find anything suitably large, or a logical disk offset
2742  * if things worked out
2743  */
2744 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2745                              struct btrfs_free_cluster *cluster, u64 bytes,
2746                              u64 min_start, u64 *max_extent_size)
2747 {
2748         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2749         struct btrfs_free_space *entry = NULL;
2750         struct rb_node *node;
2751         u64 ret = 0;
2752
2753         spin_lock(&cluster->lock);
2754         if (bytes > cluster->max_size)
2755                 goto out;
2756
2757         if (cluster->block_group != block_group)
2758                 goto out;
2759
2760         node = rb_first(&cluster->root);
2761         if (!node)
2762                 goto out;
2763
2764         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2765         while (1) {
2766                 if (entry->bytes < bytes)
2767                         *max_extent_size = max(get_max_extent_size(entry),
2768                                                *max_extent_size);
2769
2770                 if (entry->bytes < bytes ||
2771                     (!entry->bitmap && entry->offset < min_start)) {
2772                         node = rb_next(&entry->offset_index);
2773                         if (!node)
2774                                 break;
2775                         entry = rb_entry(node, struct btrfs_free_space,
2776                                          offset_index);
2777                         continue;
2778                 }
2779
2780                 if (entry->bitmap) {
2781                         ret = btrfs_alloc_from_bitmap(block_group,
2782                                                       cluster, entry, bytes,
2783                                                       cluster->window_start,
2784                                                       max_extent_size);
2785                         if (ret == 0) {
2786                                 node = rb_next(&entry->offset_index);
2787                                 if (!node)
2788                                         break;
2789                                 entry = rb_entry(node, struct btrfs_free_space,
2790                                                  offset_index);
2791                                 continue;
2792                         }
2793                         cluster->window_start += bytes;
2794                 } else {
2795                         ret = entry->offset;
2796
2797                         entry->offset += bytes;
2798                         entry->bytes -= bytes;
2799                 }
2800
2801                 if (entry->bytes == 0)
2802                         rb_erase(&entry->offset_index, &cluster->root);
2803                 break;
2804         }
2805 out:
2806         spin_unlock(&cluster->lock);
2807
2808         if (!ret)
2809                 return 0;
2810
2811         spin_lock(&ctl->tree_lock);
2812
2813         ctl->free_space -= bytes;
2814         if (entry->bytes == 0) {
2815                 ctl->free_extents--;
2816                 if (entry->bitmap) {
2817                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
2818                                         entry->bitmap);
2819                         ctl->total_bitmaps--;
2820                         ctl->op->recalc_thresholds(ctl);
2821                 }
2822                 kmem_cache_free(btrfs_free_space_cachep, entry);
2823         }
2824
2825         spin_unlock(&ctl->tree_lock);
2826
2827         return ret;
2828 }
2829
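/*
 * Try to build a cluster out of a single bitmap entry: scan for runs of free
 * bits of at least min_bits, accumulating until the total covers @bytes and
 * the largest run satisfies @cont1_bytes.  On success the bitmap entry is
 * moved from the free space tree into the cluster's rbtree and
 * cluster->window_start points at the first bit found; returns -ENOSPC if no
 * suitable window exists.
 */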
2830 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2831                                 struct btrfs_free_space *entry,
2832                                 struct btrfs_free_cluster *cluster,
2833                                 u64 offset, u64 bytes,
2834                                 u64 cont1_bytes, u64 min_bytes)
2835 {
2836         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2837         unsigned long next_zero;
2838         unsigned long i;
2839         unsigned long want_bits;
2840         unsigned long min_bits;
2841         unsigned long found_bits;
2842         unsigned long max_bits = 0;
2843         unsigned long start = 0;
2844         unsigned long total_found = 0;
2845         int ret;
2846
2847         i = offset_to_bit(entry->offset, ctl->unit,
2848                           max_t(u64, offset, entry->offset));
2849         want_bits = bytes_to_bits(bytes, ctl->unit);
2850         min_bits = bytes_to_bits(min_bytes, ctl->unit);
2851
2852         /*
2853          * Don't bother looking for a cluster in this bitmap if it's heavily
2854          * fragmented.
2855          */
2856         if (entry->max_extent_size &&
2857             entry->max_extent_size < cont1_bytes)
2858                 return -ENOSPC;
2859 again:
2860         found_bits = 0;
2861         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2862                 next_zero = find_next_zero_bit(entry->bitmap,
2863                                                BITS_PER_BITMAP, i);
2864                 if (next_zero - i >= min_bits) {
2865                         found_bits = next_zero - i;
2866                         if (found_bits > max_bits)
2867                                 max_bits = found_bits;
2868                         break;
2869                 }
2870                 if (next_zero - i > max_bits)
2871                         max_bits = next_zero - i;
2872                 i = next_zero;
2873         }
2874
2875         if (!found_bits) {
2876                 entry->max_extent_size = (u64)max_bits * ctl->unit;
2877                 return -ENOSPC;
2878         }
2879
2880         if (!total_found) {
2881                 start = i;
2882                 cluster->max_size = 0;
2883         }
2884
2885         total_found += found_bits;
2886
2887         if (cluster->max_size < found_bits * ctl->unit)
2888                 cluster->max_size = found_bits * ctl->unit;
2889
2890         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2891                 i = next_zero + 1;
2892                 goto again;
2893         }
2894
2895         cluster->window_start = start * ctl->unit + entry->offset;
2896         rb_erase(&entry->offset_index, &ctl->free_space_offset);
2897         ret = tree_insert_offset(&cluster->root, entry->offset,
2898                                  &entry->offset_index, 1);
2899         ASSERT(!ret); /* -EEXIST; Logic error */
2900
2901         trace_btrfs_setup_cluster(block_group, cluster,
2902                                   total_found * ctl->unit, 1);
2903         return 0;
2904 }
2905
2906 /*
2907  * This searches the block group for just extents to fill the cluster with.
2908  * Try to find a cluster with at least bytes total bytes, at least one
2909  * extent of cont1_bytes, and other extents of at least min_bytes.
2910  */
2911 static noinline int
2912 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2913                         struct btrfs_free_cluster *cluster,
2914                         struct list_head *bitmaps, u64 offset, u64 bytes,
2915                         u64 cont1_bytes, u64 min_bytes)
2916 {
2917         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2918         struct btrfs_free_space *first = NULL;
2919         struct btrfs_free_space *entry = NULL;
2920         struct btrfs_free_space *last;
2921         struct rb_node *node;
2922         u64 window_free;
2923         u64 max_extent;
2924         u64 total_size = 0;
2925
2926         entry = tree_search_offset(ctl, offset, 0, 1);
2927         if (!entry)
2928                 return -ENOSPC;
2929
2930         /*
2931          * We don't want bitmaps, so just move along until we find a normal
2932          * extent entry.
2933          */
2934         while (entry->bitmap || entry->bytes < min_bytes) {
2935                 if (entry->bitmap && list_empty(&entry->list))
2936                         list_add_tail(&entry->list, bitmaps);
2937                 node = rb_next(&entry->offset_index);
2938                 if (!node)
2939                         return -ENOSPC;
2940                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2941         }
2942
2943         window_free = entry->bytes;
2944         max_extent = entry->bytes;
2945         first = entry;
2946         last = entry;
2947
2948         for (node = rb_next(&entry->offset_index); node;
2949              node = rb_next(&entry->offset_index)) {
2950                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2951
2952                 if (entry->bitmap) {
2953                         if (list_empty(&entry->list))
2954                                 list_add_tail(&entry->list, bitmaps);
2955                         continue;
2956                 }
2957
2958                 if (entry->bytes < min_bytes)
2959                         continue;
2960
2961                 last = entry;
2962                 window_free += entry->bytes;
2963                 if (entry->bytes > max_extent)
2964                         max_extent = entry->bytes;
2965         }
2966
2967         if (window_free < bytes || max_extent < cont1_bytes)
2968                 return -ENOSPC;
2969
2970         cluster->window_start = first->offset;
2971
2972         node = &first->offset_index;
2973
2974         /*
2975          * Now we've found our entries, pull them out of the free space
2976          * cache and put them into the cluster rbtree.
2977          */
2978         do {
2979                 int ret;
2980
2981                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2982                 node = rb_next(&entry->offset_index);
2983                 if (entry->bitmap || entry->bytes < min_bytes)
2984                         continue;
2985
2986                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2987                 ret = tree_insert_offset(&cluster->root, entry->offset,
2988                                          &entry->offset_index, 0);
2989                 total_size += entry->bytes;
2990                 ASSERT(!ret); /* -EEXIST; Logic error */
2991         } while (node && entry != last);
2992
2993         cluster->max_size = max_extent;
2994         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2995         return 0;
2996 }
2997
2998 /*
2999  * This specifically looks for bitmaps that may work in the cluster, we assume
3000  * that we have already failed to find extents that will work.
3001  */
3002 static noinline int
3003 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
3004                      struct btrfs_free_cluster *cluster,
3005                      struct list_head *bitmaps, u64 offset, u64 bytes,
3006                      u64 cont1_bytes, u64 min_bytes)
3007 {
3008         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3009         struct btrfs_free_space *entry = NULL;
3010         int ret = -ENOSPC;
3011         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3012
3013         if (ctl->total_bitmaps == 0)
3014                 return -ENOSPC;
3015
3016         /*
3017          * The bitmap that covers offset won't be in the list unless offset
3018          * is just its start offset.
3019          */
3020         if (!list_empty(bitmaps))
3021                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3022
3023         if (!entry || entry->offset != bitmap_offset) {
3024                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3025                 if (entry && list_empty(&entry->list))
3026                         list_add(&entry->list, bitmaps);
3027         }
3028
3029         list_for_each_entry(entry, bitmaps, list) {
3030                 if (entry->bytes < bytes)
3031                         continue;
3032                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3033                                            bytes, cont1_bytes, min_bytes);
3034                 if (!ret)
3035                         return 0;
3036         }
3037
3038         /*
3039          * The bitmaps list has all the bitmaps that record free space
3040          * starting after offset, so no more search is required.
3041          */
3042         return -ENOSPC;
3043 }
3044
3045 /*
3046  * Here we try to find a cluster of blocks in a block group.  The goal
3047  * is to find at least bytes + empty_size worth of free space.
3048  * We might not find it all in one contiguous area.
3049  *
3050  * Returns zero and sets up the cluster if things worked out, otherwise
3051  * it returns -ENOSPC.
3052  */
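/*
 * A rough sketch of how a caller is expected to drive the cluster API (the
 * real user is the extent allocator; the variable names below and the
 * btrfs_alloc_from_cluster() argument order are illustrative assumptions,
 * not copied from the actual call site):
 *
 *	spin_lock(&cluster->refill_lock);
 *	ret = btrfs_find_space_cluster(block_group, cluster, search_start,
 *				       num_bytes, empty_size);
 *	if (!ret)
 *		offset = btrfs_alloc_from_cluster(block_group, cluster,
 *						  num_bytes, search_start,
 *						  &max_extent_size);
 *	spin_unlock(&cluster->refill_lock);
 */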
3053 int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
3054                              struct btrfs_free_cluster *cluster,
3055                              u64 offset, u64 bytes, u64 empty_size)
3056 {
3057         struct btrfs_fs_info *fs_info = block_group->fs_info;
3058         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3059         struct btrfs_free_space *entry, *tmp;
3060         LIST_HEAD(bitmaps);
3061         u64 min_bytes;
3062         u64 cont1_bytes;
3063         int ret;
3064
3065         /*
3066          * Choose the minimum extent size we'll require for this
3067          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3068          * For metadata, allow allocations with smaller extents.  For
3069          * data, keep it dense.
3070          */
3071         if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3072                 cont1_bytes = min_bytes = bytes + empty_size;
3073         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3074                 cont1_bytes = bytes;
3075                 min_bytes = fs_info->sectorsize;
3076         } else {
3077                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3078                 min_bytes = fs_info->sectorsize;
3079         }
3080
3081         spin_lock(&ctl->tree_lock);
3082
3083         /*
3084          * If we know we don't have enough space to make a cluster, don't even
3085          * bother doing all the work to try and find one.
3086          */
3087         if (ctl->free_space < bytes) {
3088                 spin_unlock(&ctl->tree_lock);
3089                 return -ENOSPC;
3090         }
3091
3092         spin_lock(&cluster->lock);
3093
3094         /* someone already found a cluster, hooray */
3095         if (cluster->block_group) {
3096                 ret = 0;
3097                 goto out;
3098         }
3099
3100         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3101                                  min_bytes);
3102
3103         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3104                                       bytes + empty_size,
3105                                       cont1_bytes, min_bytes);
3106         if (ret)
3107                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3108                                            offset, bytes + empty_size,
3109                                            cont1_bytes, min_bytes);
3110
3111         /* Clear our temporary list */
3112         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3113                 list_del_init(&entry->list);
3114
3115         if (!ret) {
3116                 atomic_inc(&block_group->count);
3117                 list_add_tail(&cluster->block_group_list,
3118                               &block_group->cluster_list);
3119                 cluster->block_group = block_group;
3120         } else {
3121                 trace_btrfs_failed_cluster_setup(block_group);
3122         }
3123 out:
3124         spin_unlock(&cluster->lock);
3125         spin_unlock(&ctl->tree_lock);
3126
3127         return ret;
3128 }
3129
3130 /*
3131  * simple code to zero out a cluster
3132  */
3133 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3134 {
3135         spin_lock_init(&cluster->lock);
3136         spin_lock_init(&cluster->refill_lock);
3137         cluster->root = RB_ROOT;
3138         cluster->max_size = 0;
3139         cluster->fragmented = false;
3140         INIT_LIST_HEAD(&cluster->block_group_list);
3141         cluster->block_group = NULL;
3142 }
3143
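/*
 * Discard [start, start + bytes) and then give the reserved range back to
 * the free space cache.  While the discard runs the reserved range is
 * accounted as reserved in the block group and its space_info (unless the
 * block group is read-only) so the allocator won't hand it out, and the
 * caller's trim_entry is dropped from ctl->trimming_ranges once the space
 * has been re-added.
 */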
3144 static int do_trimming(struct btrfs_block_group_cache *block_group,
3145                        u64 *total_trimmed, u64 start, u64 bytes,
3146                        u64 reserved_start, u64 reserved_bytes,
3147                        struct btrfs_trim_range *trim_entry)
3148 {
3149         struct btrfs_space_info *space_info = block_group->space_info;
3150         struct btrfs_fs_info *fs_info = block_group->fs_info;
3151         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3152         int ret;
3153         int update = 0;
3154         u64 trimmed = 0;
3155
3156         spin_lock(&space_info->lock);
3157         spin_lock(&block_group->lock);
3158         if (!block_group->ro) {
3159                 block_group->reserved += reserved_bytes;
3160                 space_info->bytes_reserved += reserved_bytes;
3161                 update = 1;
3162         }
3163         spin_unlock(&block_group->lock);
3164         spin_unlock(&space_info->lock);
3165
3166         ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3167         if (!ret)
3168                 *total_trimmed += trimmed;
3169
3170         mutex_lock(&ctl->cache_writeout_mutex);
3171         btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
3172         list_del(&trim_entry->list);
3173         mutex_unlock(&ctl->cache_writeout_mutex);
3174
3175         if (update) {
3176                 spin_lock(&space_info->lock);
3177                 spin_lock(&block_group->lock);
3178                 if (block_group->ro)
3179                         space_info->bytes_readonly += reserved_bytes;
3180                 block_group->reserved -= reserved_bytes;
3181                 space_info->bytes_reserved -= reserved_bytes;
3182                 spin_unlock(&block_group->lock);
3183                 spin_unlock(&space_info->lock);
3184         }
3185
3186         return ret;
3187 }
3188
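/*
 * Trim the plain extent entries that fall inside [start, end).  Each entry
 * is unlinked from the free space cache, tracked on ctl->trimming_ranges
 * (under cache_writeout_mutex) while the discard runs, and re-added by
 * do_trimming().  Bitmap entries are skipped here and left to trim_bitmaps().
 */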
3189 static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3190                           u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3191 {
3192         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3193         struct btrfs_free_space *entry;
3194         struct rb_node *node;
3195         int ret = 0;
3196         u64 extent_start;
3197         u64 extent_bytes;
3198         u64 bytes;
3199
3200         while (start < end) {
3201                 struct btrfs_trim_range trim_entry;
3202
3203                 mutex_lock(&ctl->cache_writeout_mutex);
3204                 spin_lock(&ctl->tree_lock);
3205
3206                 if (ctl->free_space < minlen) {
3207                         spin_unlock(&ctl->tree_lock);
3208                         mutex_unlock(&ctl->cache_writeout_mutex);
3209                         break;
3210                 }
3211
3212                 entry = tree_search_offset(ctl, start, 0, 1);
3213                 if (!entry) {
3214                         spin_unlock(&ctl->tree_lock);
3215                         mutex_unlock(&ctl->cache_writeout_mutex);
3216                         break;
3217                 }
3218
3219                 /* skip bitmaps */
3220                 while (entry->bitmap) {
3221                         node = rb_next(&entry->offset_index);
3222                         if (!node) {
3223                                 spin_unlock(&ctl->tree_lock);
3224                                 mutex_unlock(&ctl->cache_writeout_mutex);
3225                                 goto out;
3226                         }
3227                         entry = rb_entry(node, struct btrfs_free_space,
3228                                          offset_index);
3229                 }
3230
3231                 if (entry->offset >= end) {
3232                         spin_unlock(&ctl->tree_lock);
3233                         mutex_unlock(&ctl->cache_writeout_mutex);
3234                         break;
3235                 }
3236
3237                 extent_start = entry->offset;
3238                 extent_bytes = entry->bytes;
3239                 start = max(start, extent_start);
3240                 bytes = min(extent_start + extent_bytes, end) - start;
3241                 if (bytes < minlen) {
3242                         spin_unlock(&ctl->tree_lock);
3243                         mutex_unlock(&ctl->cache_writeout_mutex);
3244                         goto next;
3245                 }
3246
3247                 unlink_free_space(ctl, entry);
3248                 kmem_cache_free(btrfs_free_space_cachep, entry);
3249
3250                 spin_unlock(&ctl->tree_lock);
3251                 trim_entry.start = extent_start;
3252                 trim_entry.bytes = extent_bytes;
3253                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3254                 mutex_unlock(&ctl->cache_writeout_mutex);
3255
3256                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3257                                   extent_start, extent_bytes, &trim_entry);
3258                 if (ret)
3259                         break;
3260 next:
3261                 start += bytes;
3262
3263                 if (fatal_signal_pending(current)) {
3264                         ret = -ERESTARTSYS;
3265                         break;
3266                 }
3267
3268                 cond_resched();
3269         }
3270 out:
3271         return ret;
3272 }
3273
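/*
 * Trim free space recorded in bitmap entries inside [start, end).  Runs of
 * at least minlen are located with search_bitmap(), cleared from the bitmap
 * and handed to do_trimming() one at a time.
 */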
3274 static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3275                         u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3276 {
3277         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3278         struct btrfs_free_space *entry;
3279         int ret = 0;
3280         int ret2;
3281         u64 bytes;
3282         u64 offset = offset_to_bitmap(ctl, start);
3283
3284         while (offset < end) {
3285                 bool next_bitmap = false;
3286                 struct btrfs_trim_range trim_entry;
3287
3288                 mutex_lock(&ctl->cache_writeout_mutex);
3289                 spin_lock(&ctl->tree_lock);
3290
3291                 if (ctl->free_space < minlen) {
3292                         spin_unlock(&ctl->tree_lock);
3293                         mutex_unlock(&ctl->cache_writeout_mutex);
3294                         break;
3295                 }
3296
3297                 entry = tree_search_offset(ctl, offset, 1, 0);
3298                 if (!entry) {
3299                         spin_unlock(&ctl->tree_lock);
3300                         mutex_unlock(&ctl->cache_writeout_mutex);
3301                         next_bitmap = true;
3302                         goto next;
3303                 }
3304
3305                 bytes = minlen;
3306                 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3307                 if (ret2 || start >= end) {
3308                         spin_unlock(&ctl->tree_lock);
3309                         mutex_unlock(&ctl->cache_writeout_mutex);
3310                         next_bitmap = true;
3311                         goto next;
3312                 }
3313
3314                 bytes = min(bytes, end - start);
3315                 if (bytes < minlen) {
3316                         spin_unlock(&ctl->tree_lock);
3317                         mutex_unlock(&ctl->cache_writeout_mutex);
3318                         goto next;
3319                 }
3320
3321                 bitmap_clear_bits(ctl, entry, start, bytes);
3322                 if (entry->bytes == 0)
3323                         free_bitmap(ctl, entry);
3324
3325                 spin_unlock(&ctl->tree_lock);
3326                 trim_entry.start = start;
3327                 trim_entry.bytes = bytes;
3328                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3329                 mutex_unlock(&ctl->cache_writeout_mutex);
3330
3331                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3332                                   start, bytes, &trim_entry);
3333                 if (ret)
3334                         break;
3335 next:
3336                 if (next_bitmap) {
3337                         offset += BITS_PER_BITMAP * ctl->unit;
3338                 } else {
3339                         start += bytes;
3340                         if (start >= offset + BITS_PER_BITMAP * ctl->unit)
3341                                 offset += BITS_PER_BITMAP * ctl->unit;
3342                 }
3343
3344                 if (fatal_signal_pending(current)) {
3345                         ret = -ERESTARTSYS;
3346                         break;
3347                 }
3348
3349                 cond_resched();
3350         }
3351
3352         return ret;
3353 }
3354
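/*
 * Pin @cache so that, even if the block group is removed, its chunk mapping
 * and free space entries stay around until the matching
 * btrfs_put_block_group_trimming() call.
 */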
3355 void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
3356 {
3357         atomic_inc(&cache->trimming);
3358 }
3359
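/*
 * Drop a trimming reference.  If the block group was removed while trims
 * were running, the last dropper tears down the chunk mapping and frees the
 * free space entries that the trimming tasks left behind.
 */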
3360 void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
3361 {
3362         struct btrfs_fs_info *fs_info = block_group->fs_info;
3363         struct extent_map_tree *em_tree;
3364         struct extent_map *em;
3365         bool cleanup;
3366
3367         spin_lock(&block_group->lock);
3368         cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3369                    block_group->removed);
3370         spin_unlock(&block_group->lock);
3371
3372         if (cleanup) {
3373                 mutex_lock(&fs_info->chunk_mutex);
3374                 em_tree = &fs_info->mapping_tree;
3375                 write_lock(&em_tree->lock);
3376                 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3377                                            1);
3378                 BUG_ON(!em); /* logic error, can't happen */
3379                 remove_extent_mapping(em_tree, em);
3380                 write_unlock(&em_tree->lock);
3381                 mutex_unlock(&fs_info->chunk_mutex);
3382
3383                 /* once for us and once for the tree */
3384                 free_extent_map(em);
3385                 free_extent_map(em);
3386
3387                 /*
3388                  * We've left one free space entry and other tasks trimming
3389                  * this block group have left one entry each.  Free them.
3390                  */
3391                 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3392         }
3393 }
3394
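/*
 * Trim [start, end) of a block group: extent entries first, then bitmaps.
 * A trimming reference is held across both passes so a concurrent removal
 * of the block group can't free its mapping underneath us.
 */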
3395 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3396                            u64 *trimmed, u64 start, u64 end, u64 minlen)
3397 {
3398         int ret;
3399
3400         *trimmed = 0;
3401
3402         spin_lock(&block_group->lock);
3403         if (block_group->removed) {
3404                 spin_unlock(&block_group->lock);
3405                 return 0;
3406         }
3407         btrfs_get_block_group_trimming(block_group);
3408         spin_unlock(&block_group->lock);
3409
3410         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3411         if (ret)
3412                 goto out;
3413
3414         ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3415 out:
3416         btrfs_put_block_group_trimming(block_group);
3417         return ret;
3418 }
3419
3420 /*
3421  * Find the left-most item in the cache tree, and then return the
3422  * smallest inode number in the item.
3423  *
3424  * Note: the returned inode number may not be the smallest one in
3425  * the tree, if the left-most item is a bitmap.
3426  */
3427 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3428 {
3429         struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3430         struct btrfs_free_space *entry = NULL;
3431         u64 ino = 0;
3432
3433         spin_lock(&ctl->tree_lock);
3434
3435         if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3436                 goto out;
3437
3438         entry = rb_entry(rb_first(&ctl->free_space_offset),
3439                          struct btrfs_free_space, offset_index);
3440
3441         if (!entry->bitmap) {
3442                 ino = entry->offset;
3443
3444                 unlink_free_space(ctl, entry);
3445                 entry->offset++;
3446                 entry->bytes--;
3447                 if (!entry->bytes)
3448                         kmem_cache_free(btrfs_free_space_cachep, entry);
3449                 else
3450                         link_free_space(ctl, entry);
3451         } else {
3452                 u64 offset = 0;
3453                 u64 count = 1;
3454                 int ret;
3455
3456                 ret = search_bitmap(ctl, entry, &offset, &count, true);
3457                 /* Logic error; should be empty if it can't find anything */
3458                 ASSERT(!ret);
3459
3460                 ino = offset;
3461                 bitmap_clear_bits(ctl, entry, offset, 1);
3462                 if (entry->bytes == 0)
3463                         free_bitmap(ctl, entry);
3464         }
3465 out:
3466         spin_unlock(&ctl->tree_lock);
3467
3468         return ino;
3469 }
3470
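/*
 * Return the inode backing the free ino cache of @root.  The first on-disk
 * lookup is cached in root->ino_cache_inode (unless the filesystem is
 * closing) so later callers only need an igrab().
 */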
3471 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3472                                     struct btrfs_path *path)
3473 {
3474         struct inode *inode = NULL;
3475
3476         spin_lock(&root->ino_cache_lock);
3477         if (root->ino_cache_inode)
3478                 inode = igrab(root->ino_cache_inode);
3479         spin_unlock(&root->ino_cache_lock);
3480         if (inode)
3481                 return inode;
3482
3483         inode = __lookup_free_space_inode(root, path, 0);
3484         if (IS_ERR(inode))
3485                 return inode;
3486
3487         spin_lock(&root->ino_cache_lock);
3488         if (!btrfs_fs_closing(root->fs_info))
3489                 root->ino_cache_inode = igrab(inode);
3490         spin_unlock(&root->ino_cache_lock);
3491
3492         return inode;
3493 }
3494
3495 int create_free_ino_inode(struct btrfs_root *root,
3496                           struct btrfs_trans_handle *trans,
3497                           struct btrfs_path *path)
3498 {
3499         return __create_free_space_inode(root, trans, path,
3500                                          BTRFS_FREE_INO_OBJECTID, 0);
3501 }
3502
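/*
 * Load the free ino cache of @root from disk, if the INODE_MAP_CACHE mount
 * option is enabled.  A missing cache, or one whose generation doesn't match
 * the root's, is silently ignored; an actual read failure is logged.
 */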
3503 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3504 {
3505         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3506         struct btrfs_path *path;
3507         struct inode *inode;
3508         int ret = 0;
3509         u64 root_gen = btrfs_root_generation(&root->root_item);
3510
3511         if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3512                 return 0;
3513
3514         /*
3515          * If we're unmounting then just return, since this does a search on the
3516          * normal root and not the commit root and we could deadlock.
3517          */
3518         if (btrfs_fs_closing(fs_info))
3519                 return 0;
3520
3521         path = btrfs_alloc_path();
3522         if (!path)
3523                 return 0;
3524
3525         inode = lookup_free_ino_inode(root, path);
3526         if (IS_ERR(inode))
3527                 goto out;
3528
3529         if (root_gen != BTRFS_I(inode)->generation)
3530                 goto out_put;
3531
3532         ret = __load_free_space_cache(root, inode, ctl, path, 0);
3533
3534         if (ret < 0)
3535                 btrfs_err(fs_info,
3536                         "failed to load free ino cache for root %llu",
3537                         root->root_key.objectid);
3538 out_put:
3539         iput(inode);
3540 out:
3541         btrfs_free_path(path);
3542         return ret;
3543 }
3544
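/*
 * Write the in-memory free ino cache of @root to its cache inode.  The
 * delalloc metadata reservation is only released here if the pages never
 * reached writeback; otherwise btrfs_finish_ordered_io() releases it when
 * the ordered extent completes.
 */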
3545 int btrfs_write_out_ino_cache(struct btrfs_root *root,
3546                               struct btrfs_trans_handle *trans,
3547                               struct btrfs_path *path,
3548                               struct inode *inode)
3549 {
3550         struct btrfs_fs_info *fs_info = root->fs_info;
3551         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3552         int ret;
3553         struct btrfs_io_ctl io_ctl;
3554         bool release_metadata = true;
3555
3556         if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3557                 return 0;
3558
3559         memset(&io_ctl, 0, sizeof(io_ctl));
3560         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
3561         if (!ret) {
3562                 /*
3563                  * At this point writepages() didn't error out, so our metadata
3564                  * reservation is released when the writeback finishes, at
3565                  * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
3566                  * with or without an error.
3567                  */
3568                 release_metadata = false;
3569                 ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
3570         }
3571
3572         if (ret) {
3573                 if (release_metadata)
3574                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
3575                                         inode->i_size, true);
3576 #ifdef DEBUG
3577                 btrfs_err(fs_info,
3578                           "failed to write free ino cache for root %llu",
3579                           root->root_key.objectid);
3580 #endif
3581         }
3582
3583         return ret;
3584 }
3585
3586 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3587 /*
3588  * Use this if you need to make a bitmap or extent entry specifically; it
3589  * doesn't do any of the merging that add_free_space does.  This acts a lot
3590  * like how the free space cache loading works, so you can get really weird
3591  * configurations.
3592  */
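/*
 * A hypothetical sketch of how a sanity test might combine this with
 * test_check_exists() (offsets and sizes here are made up, the real
 * self-tests live elsewhere):
 *
 *	ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
 *	if (!ret)
 *		ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, true);
 *	if (!ret && !test_check_exists(cache, SZ_4M, SZ_1M))
 *		ret = -EINVAL;
 */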
3593 int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
3594                               u64 offset, u64 bytes, bool bitmap)
3595 {
3596         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3597         struct btrfs_free_space *info = NULL, *bitmap_info;
3598         void *map = NULL;
3599         u64 bytes_added;
3600         int ret;
3601
3602 again:
3603         if (!info) {
3604                 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3605                 if (!info)
3606                         return -ENOMEM;
3607         }
3608
3609         if (!bitmap) {
3610                 spin_lock(&ctl->tree_lock);
3611                 info->offset = offset;
3612                 info->bytes = bytes;
3613                 info->max_extent_size = 0;
3614                 ret = link_free_space(ctl, info);
3615                 spin_unlock(&ctl->tree_lock);
3616                 if (ret)
3617                         kmem_cache_free(btrfs_free_space_cachep, info);
3618                 return ret;
3619         }
3620
3621         if (!map) {
3622                 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
3623                 if (!map) {
3624                         kmem_cache_free(btrfs_free_space_cachep, info);
3625                         return -ENOMEM;
3626                 }
3627         }
3628
3629         spin_lock(&ctl->tree_lock);
3630         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3631                                          1, 0);
3632         if (!bitmap_info) {
3633                 info->bitmap = map;
3634                 map = NULL;
3635                 add_new_bitmap(ctl, info, offset);
3636                 bitmap_info = info;
3637                 info = NULL;
3638         }
3639
3640         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3641
3642         bytes -= bytes_added;
3643         offset += bytes_added;
3644         spin_unlock(&ctl->tree_lock);
3645
3646         if (bytes)
3647                 goto again;
3648
3649         if (info)
3650                 kmem_cache_free(btrfs_free_space_cachep, info);
3651         if (map)
3652                 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
3653         return 0;
3654 }
3655
3656 /*
3657  * Checks to see if the given range is in the free space cache.  This is really
3658  * just used to check the absence of space, so if there is free space in the
3659  * range at all we will return 1.
3660  */
3661 int test_check_exists(struct btrfs_block_group_cache *cache,
3662                       u64 offset, u64 bytes)
3663 {
3664         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3665         struct btrfs_free_space *info;
3666         int ret = 0;
3667
3668         spin_lock(&ctl->tree_lock);
3669         info = tree_search_offset(ctl, offset, 0, 0);
3670         if (!info) {
3671                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3672                                           1, 0);
3673                 if (!info)
3674                         goto out;
3675         }
3676
3677 have_info:
3678         if (info->bitmap) {
3679                 u64 bit_off, bit_bytes;
3680                 struct rb_node *n;
3681                 struct btrfs_free_space *tmp;
3682
3683                 bit_off = offset;
3684                 bit_bytes = ctl->unit;
3685                 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3686                 if (!ret) {
3687                         if (bit_off == offset) {
3688                                 ret = 1;
3689                                 goto out;
3690                         } else if (bit_off > offset &&
3691                                    offset + bytes > bit_off) {
3692                                 ret = 1;
3693                                 goto out;
3694                         }
3695                 }
3696
3697                 n = rb_prev(&info->offset_index);
3698                 while (n) {
3699                         tmp = rb_entry(n, struct btrfs_free_space,
3700                                        offset_index);
3701                         if (tmp->offset + tmp->bytes < offset)
3702                                 break;
3703                         if (offset + bytes < tmp->offset) {
3704                                 n = rb_prev(&tmp->offset_index);
3705                                 continue;
3706                         }
3707                         info = tmp;
3708                         goto have_info;
3709                 }
3710
3711                 n = rb_next(&info->offset_index);
3712                 while (n) {
3713                         tmp = rb_entry(n, struct btrfs_free_space,
3714                                        offset_index);
3715                         if (offset + bytes < tmp->offset)
3716                                 break;
3717                         if (tmp->offset + tmp->bytes < offset) {
3718                                 n = rb_next(&tmp->offset_index);
3719                                 continue;
3720                         }
3721                         info = tmp;
3722                         goto have_info;
3723                 }
3724
3725                 ret = 0;
3726                 goto out;
3727         }
3728
3729         if (info->offset == offset) {
3730                 ret = 1;
3731                 goto out;
3732         }
3733
3734         if (offset > info->offset && offset < info->offset + info->bytes)
3735                 ret = 1;
3736 out:
3737         spin_unlock(&ctl->tree_lock);
3738         return ret;
3739 }
3740 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */