Btrfs: Change btree locking to use explicit blocking points
[linux-block.git] / fs / btrfs / inode.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/smp_lock.h>
30 #include <linux/backing-dev.h>
31 #include <linux/mpage.h>
32 #include <linux/swap.h>
33 #include <linux/writeback.h>
34 #include <linux/statfs.h>
35 #include <linux/compat.h>
36 #include <linux/bit_spinlock.h>
37 #include <linux/xattr.h>
38 #include <linux/posix_acl.h>
39 #include <linux/falloc.h>
40 #include "compat.h"
41 #include "ctree.h"
42 #include "disk-io.h"
43 #include "transaction.h"
44 #include "btrfs_inode.h"
45 #include "ioctl.h"
46 #include "print-tree.h"
47 #include "volumes.h"
48 #include "ordered-data.h"
49 #include "xattr.h"
50 #include "tree-log.h"
51 #include "ref-cache.h"
52 #include "compression.h"
53 #include "locking.h"
54
55 struct btrfs_iget_args { /* NOTE(review): looks like the argument bundle for an iget-style inode lookup; its users are outside this chunk -- confirm */
56         u64 ino; /* NOTE(review): presumably the inode number being looked up -- confirm */
57         struct btrfs_root *root; /* NOTE(review): presumably the root that ino belongs to -- confirm */
58 };
59
60 static struct inode_operations btrfs_dir_inode_operations; /* forward declarations of the operation tables; their initializers are not visible in this chunk */
61 static struct inode_operations btrfs_symlink_inode_operations;
62 static struct inode_operations btrfs_dir_ro_inode_operations;
63 static struct inode_operations btrfs_special_inode_operations;
64 static struct inode_operations btrfs_file_inode_operations;
65 static struct address_space_operations btrfs_aops;
66 static struct address_space_operations btrfs_symlink_aops;
67 static struct file_operations btrfs_dir_file_operations;
68 static struct extent_io_ops btrfs_extent_io_ops;
69
70 static struct kmem_cache *btrfs_inode_cachep; /* slab cache pointer private to this file; NOTE(review): creation and teardown are not visible in this chunk */
71 struct kmem_cache *btrfs_trans_handle_cachep; /* this pointer and the three below are non-static, so other files can reference them */
72 struct kmem_cache *btrfs_transaction_cachep;
73 struct kmem_cache *btrfs_bit_radix_cachep;
74 struct kmem_cache *btrfs_path_cachep;
75
76 #define S_SHIFT 12 /* right shift applied to the S_IF* file type constants to form the table index below */
77 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { /* indexed by an S_IF* file type shifted right by S_SHIFT; each listed slot holds the matching BTRFS_FT_* constant, unlisted slots are zero */
78         [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
79         [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
80         [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
81         [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
82         [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
83         [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
84         [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
85 };
86
87 static void btrfs_truncate(struct inode *inode); /* forward declaration; the definition is not visible in this chunk */
88 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); /* forward declaration; the definition is not visible in this chunk */
89 static noinline int cow_file_range(struct inode *inode, /* declared early because submit_compressed_extents calls it before its definition */
90                                    struct page *locked_page,
91                                    u64 start, u64 end, int *page_started,
92                                    unsigned long *nr_written, int unlock);
93
/*
 * run the two security setup steps for an inode: first btrfs_init_acl,
 * then btrfs_xattr_security_init.  The second step is skipped when the
 * first one reports an error, and that error is what gets returned.
 */
static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
{
        int ret = btrfs_init_acl(inode, dir);

        if (ret)
                return ret;

        return btrfs_xattr_security_init(inode, dir);
}
103
104 /*
105  * a very lame attempt at stopping writes when the FS is 85% full.  There
106  * are countless ways this is incorrect, but it is better than nothing.
107  */
108 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
109                            int for_del)
110 {
111         u64 total;
112         u64 used;
113         u64 thresh;
114         int ret = 0;
115
116         spin_lock(&root->fs_info->delalloc_lock);
117         total = btrfs_super_total_bytes(&root->fs_info->super_copy);
118         used = btrfs_super_bytes_used(&root->fs_info->super_copy);
119         if (for_del)
120                 thresh = total * 90;
121         else
122                 thresh = total * 85;
123
124         do_div(thresh, 100);
125
126         if (used + root->fs_info->delalloc_bytes + num_required > thresh)
127                 ret = -ENOSPC;
128         spin_unlock(&root->fs_info->delalloc_lock);
129         return ret;
130 }
131
132 /*
133  * this does all the hard work for inserting an inline extent into
134  * the btree.  The caller should have done a btrfs_drop_extents so that
135  * no overlapping inline items exist in the btree
136  */
137 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
138                                 struct btrfs_root *root, struct inode *inode,
139                                 u64 start, size_t size, size_t compressed_size,
140                                 struct page **compressed_pages)
141 {
142         struct btrfs_key key;
143         struct btrfs_path *path;
144         struct extent_buffer *leaf;
145         struct page *page = NULL;
146         char *kaddr;
147         unsigned long ptr;
148         struct btrfs_file_extent_item *ei;
149         int err = 0;
150         int ret;
151         size_t cur_size = size;
152         size_t datasize;
153         unsigned long offset;
154         int use_compress = 0;
155
156         if (compressed_size && compressed_pages) {
157                 use_compress = 1;
158                 cur_size = compressed_size;
159         }
160
161         path = btrfs_alloc_path();
162         if (!path)
163                 return -ENOMEM;
164
165         btrfs_set_trans_block_group(trans, inode);
166
167         key.objectid = inode->i_ino;
168         key.offset = start;
169         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
170         datasize = btrfs_file_extent_calc_inline_size(cur_size);
171
172         inode_add_bytes(inode, size);
173         ret = btrfs_insert_empty_item(trans, root, path, &key,
174                                       datasize);
175         BUG_ON(ret);
176         if (ret) {
177                 err = ret;
178                 goto fail;
179         }
180         leaf = path->nodes[0];
181         ei = btrfs_item_ptr(leaf, path->slots[0],
182                             struct btrfs_file_extent_item);
183         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
184         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
185         btrfs_set_file_extent_encryption(leaf, ei, 0);
186         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
187         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
188         ptr = btrfs_file_extent_inline_start(ei);
189
190         if (use_compress) {
191                 struct page *cpage;
192                 int i = 0;
193                 while (compressed_size > 0) {
194                         cpage = compressed_pages[i];
195                         cur_size = min_t(unsigned long, compressed_size,
196                                        PAGE_CACHE_SIZE);
197
198                         kaddr = kmap(cpage);
199                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
200                         kunmap(cpage);
201
202                         i++;
203                         ptr += cur_size;
204                         compressed_size -= cur_size;
205                 }
206                 btrfs_set_file_extent_compression(leaf, ei,
207                                                   BTRFS_COMPRESS_ZLIB);
208         } else {
209                 page = find_get_page(inode->i_mapping,
210                                      start >> PAGE_CACHE_SHIFT);
211                 btrfs_set_file_extent_compression(leaf, ei, 0);
212                 kaddr = kmap_atomic(page, KM_USER0);
213                 offset = start & (PAGE_CACHE_SIZE - 1);
214                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
215                 kunmap_atomic(kaddr, KM_USER0);
216                 page_cache_release(page);
217         }
218         btrfs_mark_buffer_dirty(leaf);
219         btrfs_free_path(path);
220
221         BTRFS_I(inode)->disk_i_size = inode->i_size;
222         btrfs_update_inode(trans, root, inode);
223         return 0;
224 fail:
225         btrfs_free_path(path);
226         return err;
227 }
228
229
230 /*
231  * conditionally insert an inline extent into the file.  This
232  * does the checks required to make sure the data is small enough
233  * to fit as an inline extent.
234  */
235 static int cow_file_range_inline(struct btrfs_trans_handle *trans,
236                                  struct btrfs_root *root,
237                                  struct inode *inode, u64 start, u64 end,
238                                  size_t compressed_size,
239                                  struct page **compressed_pages)
240 {
241         u64 isize = i_size_read(inode);
242         u64 actual_end = min(end + 1, isize);
243         u64 inline_len = actual_end - start;
244         u64 aligned_end = (end + root->sectorsize - 1) &
245                         ~((u64)root->sectorsize - 1);
246         u64 hint_byte;
247         u64 data_len = inline_len;
248         int ret;
249
250         if (compressed_size)
251                 data_len = compressed_size;
252
253         if (start > 0 ||
254             actual_end >= PAGE_CACHE_SIZE ||
255             data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
256             (!compressed_size &&
257             (actual_end & (root->sectorsize - 1)) == 0) ||
258             end + 1 < isize ||
259             data_len > root->fs_info->max_inline) {
260                 return 1;
261         }
262
263         ret = btrfs_drop_extents(trans, root, inode, start,
264                                  aligned_end, start, &hint_byte);
265         BUG_ON(ret);
266
267         if (isize > actual_end)
268                 inline_len = min_t(u64, isize, actual_end);
269         ret = insert_inline_extent(trans, root, inode, start,
270                                    inline_len, compressed_size,
271                                    compressed_pages);
272         BUG_ON(ret);
273         btrfs_drop_extent_cache(inode, start, aligned_end, 0);
274         return 0;
275 }
276
277 struct async_extent { /* one queued range: filled in by add_async_extent, consumed and freed by submit_compressed_extents */
278         u64 start; /* file offset where the range begins */
279         u64 ram_size; /* length of the range in the file; start + ram_size - 1 is used as its last byte */
280         u64 compressed_size; /* size handed to btrfs_reserve_extent for compressed data; 0 when queued for uncompressed IO */
281         struct page **pages; /* pages holding the compressed data; NULL means the range falls back to uncompressed IO */
282         unsigned long nr_pages; /* number of entries in pages */
283         struct list_head list; /* links this entry onto async_cow->extents */
284 };
285
286 struct async_cow { /* carries the list of async_extents between compress_file_range and submit_compressed_extents */
287         struct inode *inode;
288         struct btrfs_root *root;
289         struct page *locked_page; /* passed on to cow_file_range by submit_compressed_extents */
290         u64 start; /* NOTE(review): start/end presumably bound the range being processed; not used in the visible code -- confirm */
291         u64 end;
292         struct list_head extents; /* list of struct async_extent, appended to by add_async_extent */
293         struct btrfs_work work; /* NOTE(review): presumably the work queue item that runs the two phases -- confirm */
294 };
295
296 static noinline int add_async_extent(struct async_cow *cow,
297                                      u64 start, u64 ram_size,
298                                      u64 compressed_size,
299                                      struct page **pages,
300                                      unsigned long nr_pages)
301 {
302         struct async_extent *async_extent;
303
304         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
305         async_extent->start = start;
306         async_extent->ram_size = ram_size;
307         async_extent->compressed_size = compressed_size;
308         async_extent->pages = pages;
309         async_extent->nr_pages = nr_pages;
310         list_add_tail(&async_extent->list, &cow->extents);
311         return 0;
312 }
313
314 /*
315  * we create compressed extents in two phases.  The first
316  * phase compresses a range of pages that have already been
317  * locked (both pages and state bits are locked).
318  *
319  * This is done inside an ordered work queue, and the compression
320  * is spread across many cpus.  The actual IO submission is step
321  * two, and the ordered work queue takes care of making sure that
322  * happens in the same order things were put onto the queue by
323  * writepages and friends.
324  *
325  * If this code finds it can't get good compression, it puts an
326  * entry onto the work queue to write the uncompressed bytes.  This
327  * makes sure that both compressed inodes and uncompressed inodes
328  * are written in the same order that pdflush sent them down.
329  */
330 static noinline int compress_file_range(struct inode *inode,
331                                         struct page *locked_page,
332                                         u64 start, u64 end,
333                                         struct async_cow *async_cow,
334                                         int *num_added)
335 {
336         struct btrfs_root *root = BTRFS_I(inode)->root;
337         struct btrfs_trans_handle *trans;
338         u64 num_bytes;
339         u64 orig_start;
340         u64 disk_num_bytes;
341         u64 blocksize = root->sectorsize;
342         u64 actual_end;
343         u64 isize = i_size_read(inode);
344         int ret = 0;
345         struct page **pages = NULL;
346         unsigned long nr_pages;
347         unsigned long nr_pages_ret = 0;
348         unsigned long total_compressed = 0;
349         unsigned long total_in = 0;
350         unsigned long max_compressed = 128 * 1024;
351         unsigned long max_uncompressed = 128 * 1024;
352         int i;
353         int will_compress;
354
355         orig_start = start;
356
357         actual_end = min_t(u64, isize, end + 1);
358 again:
359         will_compress = 0;
360         nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
361         nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
362
363         total_compressed = actual_end - start;
364
365         /* we want to make sure that amount of ram required to uncompress
366          * an extent is reasonable, so we limit the total size in ram
367          * of a compressed extent to 128k.  This is a crucial number
368          * because it also controls how easily we can spread reads across
369          * cpus for decompression.
370          *
371          * We also want to make sure the amount of IO required to do
372          * a random read is reasonably small, so we limit the size of
373          * a compressed extent to 128k.
374          */
375         total_compressed = min(total_compressed, max_uncompressed);
376         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
377         num_bytes = max(blocksize,  num_bytes);
378         disk_num_bytes = num_bytes;
379         total_in = 0;
380         ret = 0;
381
382         /*
383          * we do compression for mount -o compress and when the
384          * inode has not been flagged as nocompress.  This flag can
385          * change at any time if we discover bad compression ratios.
386          */
387         if (!btrfs_test_flag(inode, NOCOMPRESS) &&
388             btrfs_test_opt(root, COMPRESS)) {
389                 WARN_ON(pages);
390                 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
391
392                 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
393                                                 total_compressed, pages,
394                                                 nr_pages, &nr_pages_ret,
395                                                 &total_in,
396                                                 &total_compressed,
397                                                 max_compressed);
398
399                 if (!ret) {
400                         unsigned long offset = total_compressed &
401                                 (PAGE_CACHE_SIZE - 1);
402                         struct page *page = pages[nr_pages_ret - 1];
403                         char *kaddr;
404
405                         /* zero the tail end of the last page, we might be
406                          * sending it down to disk
407                          */
408                         if (offset) {
409                                 kaddr = kmap_atomic(page, KM_USER0);
410                                 memset(kaddr + offset, 0,
411                                        PAGE_CACHE_SIZE - offset);
412                                 kunmap_atomic(kaddr, KM_USER0);
413                         }
414                         will_compress = 1;
415                 }
416         }
417         if (start == 0) {
418                 trans = btrfs_join_transaction(root, 1);
419                 BUG_ON(!trans);
420                 btrfs_set_trans_block_group(trans, inode);
421
422                 /* lets try to make an inline extent */
423                 if (ret || total_in < (actual_end - start)) {
424                         /* we didn't compress the entire range, try
425                          * to make an uncompressed inline extent.
426                          */
427                         ret = cow_file_range_inline(trans, root, inode,
428                                                     start, end, 0, NULL);
429                 } else {
430                         /* try making a compressed inline extent */
431                         ret = cow_file_range_inline(trans, root, inode,
432                                                     start, end,
433                                                     total_compressed, pages);
434                 }
435                 btrfs_end_transaction(trans, root);
436                 if (ret == 0) {
437                         /*
438                          * inline extent creation worked, we don't need
439                          * to create any more async work items.  Unlock
440                          * and free up our temp pages.
441                          */
442                         extent_clear_unlock_delalloc(inode,
443                                                      &BTRFS_I(inode)->io_tree,
444                                                      start, end, NULL, 1, 0,
445                                                      0, 1, 1, 1);
446                         ret = 0;
447                         goto free_pages_out;
448                 }
449         }
450
451         if (will_compress) {
452                 /*
453                  * we aren't doing an inline extent round the compressed size
454                  * up to a block size boundary so the allocator does sane
455                  * things
456                  */
457                 total_compressed = (total_compressed + blocksize - 1) &
458                         ~(blocksize - 1);
459
460                 /*
461                  * one last check to make sure the compression is really a
462                  * win, compare the page count read with the blocks on disk
463                  */
464                 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
465                         ~(PAGE_CACHE_SIZE - 1);
466                 if (total_compressed >= total_in) {
467                         will_compress = 0;
468                 } else {
469                         disk_num_bytes = total_compressed;
470                         num_bytes = total_in;
471                 }
472         }
473         if (!will_compress && pages) {
474                 /*
475                  * the compression code ran but failed to make things smaller,
476                  * free any pages it allocated and our page pointer array
477                  */
478                 for (i = 0; i < nr_pages_ret; i++) {
479                         WARN_ON(pages[i]->mapping);
480                         page_cache_release(pages[i]);
481                 }
482                 kfree(pages);
483                 pages = NULL;
484                 total_compressed = 0;
485                 nr_pages_ret = 0;
486
487                 /* flag the file so we don't compress in the future */
488                 btrfs_set_flag(inode, NOCOMPRESS);
489         }
490         if (will_compress) {
491                 *num_added += 1;
492
493                 /* the async work queues will take care of doing actual
494                  * allocation on disk for these compressed pages,
495                  * and will submit them to the elevator.
496                  */
497                 add_async_extent(async_cow, start, num_bytes,
498                                  total_compressed, pages, nr_pages_ret);
499
500                 if (start + num_bytes < end && start + num_bytes < actual_end) {
501                         start += num_bytes;
502                         pages = NULL;
503                         cond_resched();
504                         goto again;
505                 }
506         } else {
507                 /*
508                  * No compression, but we still need to write the pages in
509                  * the file we've been given so far.  redirty the locked
510                  * page if it corresponds to our extent and set things up
511                  * for the async work queue to run cow_file_range to do
512                  * the normal delalloc dance
513                  */
514                 if (page_offset(locked_page) >= start &&
515                     page_offset(locked_page) <= end) {
516                         __set_page_dirty_nobuffers(locked_page);
517                         /* unlocked later on in the async handlers */
518                 }
519                 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
520                 *num_added += 1;
521         }
522
523 out:
524         return 0;
525
526 free_pages_out:
527         for (i = 0; i < nr_pages_ret; i++) {
528                 WARN_ON(pages[i]->mapping);
529                 page_cache_release(pages[i]);
530         }
531         kfree(pages);
532
533         goto out;
534 }
535
536 /*
537  * phase two of compressed writeback.  This is the ordered portion
538  * of the code, which only gets called in the order the work was
539  * queued.  We walk all the async extents created by compress_file_range
540  * and send them down to the disk.
541  */
542 static noinline int submit_compressed_extents(struct inode *inode,
543                                               struct async_cow *async_cow)
544 {
545         struct async_extent *async_extent;
546         u64 alloc_hint = 0;
547         struct btrfs_trans_handle *trans;
548         struct btrfs_key ins;
549         struct extent_map *em;
550         struct btrfs_root *root = BTRFS_I(inode)->root;
551         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
552         struct extent_io_tree *io_tree;
553         int ret;
554
555         if (list_empty(&async_cow->extents))
556                 return 0;
557
558         trans = btrfs_join_transaction(root, 1);
559
560         while (!list_empty(&async_cow->extents)) {
561                 async_extent = list_entry(async_cow->extents.next,
562                                           struct async_extent, list);
563                 list_del(&async_extent->list);
564
565                 io_tree = &BTRFS_I(inode)->io_tree;
566
567                 /* did the compression code fall back to uncompressed IO? */
568                 if (!async_extent->pages) {
569                         int page_started = 0;
570                         unsigned long nr_written = 0;
571
572                         lock_extent(io_tree, async_extent->start,
573                                     async_extent->start +
574                                     async_extent->ram_size - 1, GFP_NOFS);
575
576                         /* allocate blocks */
577                         cow_file_range(inode, async_cow->locked_page,
578                                        async_extent->start,
579                                        async_extent->start +
580                                        async_extent->ram_size - 1,
581                                        &page_started, &nr_written, 0);
582
583                         /*
584                          * if page_started, cow_file_range inserted an
585                          * inline extent and took care of all the unlocking
586                          * and IO for us.  Otherwise, we need to submit
587                          * all those pages down to the drive.
588                          */
589                         if (!page_started)
590                                 extent_write_locked_range(io_tree,
591                                                   inode, async_extent->start,
592                                                   async_extent->start +
593                                                   async_extent->ram_size - 1,
594                                                   btrfs_get_extent,
595                                                   WB_SYNC_ALL);
596                         kfree(async_extent);
597                         cond_resched();
598                         continue;
599                 }
600
601                 lock_extent(io_tree, async_extent->start,
602                             async_extent->start + async_extent->ram_size - 1,
603                             GFP_NOFS);
604                 /*
605                  * here we're doing allocation and writeback of the
606                  * compressed pages
607                  */
608                 btrfs_drop_extent_cache(inode, async_extent->start,
609                                         async_extent->start +
610                                         async_extent->ram_size - 1, 0);
611
612                 ret = btrfs_reserve_extent(trans, root,
613                                            async_extent->compressed_size,
614                                            async_extent->compressed_size,
615                                            0, alloc_hint,
616                                            (u64)-1, &ins, 1);
617                 BUG_ON(ret);
618                 em = alloc_extent_map(GFP_NOFS);
619                 em->start = async_extent->start;
620                 em->len = async_extent->ram_size;
621                 em->orig_start = em->start;
622
623                 em->block_start = ins.objectid;
624                 em->block_len = ins.offset;
625                 em->bdev = root->fs_info->fs_devices->latest_bdev;
626                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
627                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
628
629                 while (1) {
630                         spin_lock(&em_tree->lock);
631                         ret = add_extent_mapping(em_tree, em);
632                         spin_unlock(&em_tree->lock);
633                         if (ret != -EEXIST) {
634                                 free_extent_map(em);
635                                 break;
636                         }
637                         btrfs_drop_extent_cache(inode, async_extent->start,
638                                                 async_extent->start +
639                                                 async_extent->ram_size - 1, 0);
640                 }
641
642                 ret = btrfs_add_ordered_extent(inode, async_extent->start,
643                                                ins.objectid,
644                                                async_extent->ram_size,
645                                                ins.offset,
646                                                BTRFS_ORDERED_COMPRESSED);
647                 BUG_ON(ret);
648
649                 btrfs_end_transaction(trans, root);
650
651                 /*
652                  * clear dirty, set writeback and unlock the pages.
653                  */
654                 extent_clear_unlock_delalloc(inode,
655                                              &BTRFS_I(inode)->io_tree,
656                                              async_extent->start,
657                                              async_extent->start +
658                                              async_extent->ram_size - 1,
659                                              NULL, 1, 1, 0, 1, 1, 0);
660
661                 ret = btrfs_submit_compressed_write(inode,
662                                     async_extent->start,
663                                     async_extent->ram_size,
664                                     ins.objectid,
665                                     ins.offset, async_extent->pages,
666                                     async_extent->nr_pages);
667
668                 BUG_ON(ret);
669                 trans = btrfs_join_transaction(root, 1);
670                 alloc_hint = ins.objectid + ins.offset;
671                 kfree(async_extent);
672                 cond_resched();
673         }
674
675         btrfs_end_transaction(trans, root);
676         return 0;
677 }
678
679 /*
680  * when extent_io.c finds a delayed allocation range in the file,
681  * the call backs end up in this code.  The basic idea is to
682  * allocate extents on disk for the range, and create ordered data structs
683  * in ram to track those extents.
684  *
685  * locked_page is the page that writepage had locked already.  We use
686  * it to make sure we don't do extra locks or unlocks.
687  *
688  * *page_started is set to one if we unlock locked_page and do everything
689  * required to start IO on it.  It may be clean and already done with
690  * IO when we return.
691  */
692 static noinline int cow_file_range(struct inode *inode,
693                                    struct page *locked_page,
694                                    u64 start, u64 end, int *page_started,
695                                    unsigned long *nr_written,
696                                    int unlock)
697 {
698         struct btrfs_root *root = BTRFS_I(inode)->root;
699         struct btrfs_trans_handle *trans;
700         u64 alloc_hint = 0;
701         u64 num_bytes;
702         unsigned long ram_size;
703         u64 disk_num_bytes;
704         u64 cur_alloc_size;
705         u64 blocksize = root->sectorsize;
706         u64 actual_end;
707         u64 isize = i_size_read(inode);
708         struct btrfs_key ins;
709         struct extent_map *em;
710         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
711         int ret = 0;
712
713         trans = btrfs_join_transaction(root, 1);
714         BUG_ON(!trans);
715         btrfs_set_trans_block_group(trans, inode);
716
717         actual_end = min_t(u64, isize, end + 1);
718
719         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
720         num_bytes = max(blocksize,  num_bytes);
721         disk_num_bytes = num_bytes;
722         ret = 0;
723
724         if (start == 0) {
725                 /* lets try to make an inline extent */
726                 ret = cow_file_range_inline(trans, root, inode,
727                                             start, end, 0, NULL);
728                 if (ret == 0) {
729                         extent_clear_unlock_delalloc(inode,
730                                                      &BTRFS_I(inode)->io_tree,
731                                                      start, end, NULL, 1, 1,
732                                                      1, 1, 1, 1);
733                         *nr_written = *nr_written +
734                              (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
735                         *page_started = 1;
736                         ret = 0;
737                         goto out;
738                 }
739         }
740
741         BUG_ON(disk_num_bytes >
742                btrfs_super_total_bytes(&root->fs_info->super_copy));
743
744         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
745
746         while (disk_num_bytes > 0) {
747                 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
748                 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
749                                            root->sectorsize, 0, alloc_hint,
750                                            (u64)-1, &ins, 1);
751                 BUG_ON(ret);
752
753                 em = alloc_extent_map(GFP_NOFS);
754                 em->start = start;
755                 em->orig_start = em->start;
756
757                 ram_size = ins.offset;
758                 em->len = ins.offset;
759
760                 em->block_start = ins.objectid;
761                 em->block_len = ins.offset;
762                 em->bdev = root->fs_info->fs_devices->latest_bdev;
763                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
764
765                 while (1) {
766                         spin_lock(&em_tree->lock);
767                         ret = add_extent_mapping(em_tree, em);
768                         spin_unlock(&em_tree->lock);
769                         if (ret != -EEXIST) {
770                                 free_extent_map(em);
771                                 break;
772                         }
773                         btrfs_drop_extent_cache(inode, start,
774                                                 start + ram_size - 1, 0);
775                 }
776
777                 cur_alloc_size = ins.offset;
778                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
779                                                ram_size, cur_alloc_size, 0);
780                 BUG_ON(ret);
781
782                 if (root->root_key.objectid ==
783                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
784                         ret = btrfs_reloc_clone_csums(inode, start,
785                                                       cur_alloc_size);
786                         BUG_ON(ret);
787                 }
788
789                 if (disk_num_bytes < cur_alloc_size)
790                         break;
791
792                 /* we're not doing compressed IO, don't unlock the first
793                  * page (which the caller expects to stay locked), don't
794                  * clear any dirty bits and don't set any writeback bits
795                  */
796                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
797                                              start, start + ram_size - 1,
798                                              locked_page, unlock, 1,
799                                              1, 0, 0, 0);
800                 disk_num_bytes -= cur_alloc_size;
801                 num_bytes -= cur_alloc_size;
802                 alloc_hint = ins.objectid + ins.offset;
803                 start += cur_alloc_size;
804         }
805 out:
806         ret = 0;
807         btrfs_end_transaction(trans, root);
808
809         return ret;
810 }
811
812 /*
813  * work queue call back to started compression on a file and pages
814  */
815 static noinline void async_cow_start(struct btrfs_work *work)
816 {
817         struct async_cow *async_cow;
818         int num_added = 0;
819         async_cow = container_of(work, struct async_cow, work);
820
821         compress_file_range(async_cow->inode, async_cow->locked_page,
822                             async_cow->start, async_cow->end, async_cow,
823                             &num_added);
824         if (num_added == 0)
825                 async_cow->inode = NULL;
826 }
827
828 /*
829  * work queue call back to submit previously compressed pages
830  */
831 static noinline void async_cow_submit(struct btrfs_work *work)
832 {
833         struct async_cow *async_cow;
834         struct btrfs_root *root;
835         unsigned long nr_pages;
836
837         async_cow = container_of(work, struct async_cow, work);
838
839         root = async_cow->root;
840         nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
841                 PAGE_CACHE_SHIFT;
842
843         atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
844
845         if (atomic_read(&root->fs_info->async_delalloc_pages) <
846             5 * 1042 * 1024 &&
847             waitqueue_active(&root->fs_info->async_submit_wait))
848                 wake_up(&root->fs_info->async_submit_wait);
849
850         if (async_cow->inode)
851                 submit_compressed_extents(async_cow->inode, async_cow);
852 }
853
854 static noinline void async_cow_free(struct btrfs_work *work)
855 {
856         struct async_cow *async_cow;
857         async_cow = container_of(work, struct async_cow, work);
858         kfree(async_cow);
859 }
860
861 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
862                                 u64 start, u64 end, int *page_started,
863                                 unsigned long *nr_written)
864 {
865         struct async_cow *async_cow;
866         struct btrfs_root *root = BTRFS_I(inode)->root;
867         unsigned long nr_pages;
868         u64 cur_end;
869         int limit = 10 * 1024 * 1042;
870
871         if (!btrfs_test_opt(root, COMPRESS)) {
872                 return cow_file_range(inode, locked_page, start, end,
873                                       page_started, nr_written, 1);
874         }
875
876         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
877                          EXTENT_DELALLOC, 1, 0, GFP_NOFS);
878         while (start < end) {
879                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
880                 async_cow->inode = inode;
881                 async_cow->root = root;
882                 async_cow->locked_page = locked_page;
883                 async_cow->start = start;
884
885                 if (btrfs_test_flag(inode, NOCOMPRESS))
886                         cur_end = end;
887                 else
888                         cur_end = min(end, start + 512 * 1024 - 1);
889
890                 async_cow->end = cur_end;
891                 INIT_LIST_HEAD(&async_cow->extents);
892
893                 async_cow->work.func = async_cow_start;
894                 async_cow->work.ordered_func = async_cow_submit;
895                 async_cow->work.ordered_free = async_cow_free;
896                 async_cow->work.flags = 0;
897
898                 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
899                         PAGE_CACHE_SHIFT;
900                 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
901
902                 btrfs_queue_worker(&root->fs_info->delalloc_workers,
903                                    &async_cow->work);
904
905                 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
906                         wait_event(root->fs_info->async_submit_wait,
907                            (atomic_read(&root->fs_info->async_delalloc_pages) <
908                             limit));
909                 }
910
911                 while (atomic_read(&root->fs_info->async_submit_draining) &&
912                       atomic_read(&root->fs_info->async_delalloc_pages)) {
913                         wait_event(root->fs_info->async_submit_wait,
914                           (atomic_read(&root->fs_info->async_delalloc_pages) ==
915                            0));
916                 }
917
918                 *nr_written += nr_pages;
919                 start = cur_end + 1;
920         }
921         *page_started = 1;
922         return 0;
923 }
924
925 static noinline int csum_exist_in_range(struct btrfs_root *root,
926                                         u64 bytenr, u64 num_bytes)
927 {
928         int ret;
929         struct btrfs_ordered_sum *sums;
930         LIST_HEAD(list);
931
932         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
933                                        bytenr + num_bytes - 1, &list);
934         if (ret == 0 && list_empty(&list))
935                 return 0;
936
937         while (!list_empty(&list)) {
938                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
939                 list_del(&sums->list);
940                 kfree(sums);
941         }
942         return 1;
943 }
944
945 /*
946  * when nowcow writeback call back.  This checks for snapshots or COW copies
947  * of the extents that exist in the file, and COWs the file as required.
948  *
949  * If no cow copies or snapshots exist, we write directly to the existing
950  * blocks on disk
951  */
952 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
953                               u64 start, u64 end, int *page_started, int force,
954                               unsigned long *nr_written)
955 {
956         struct btrfs_root *root = BTRFS_I(inode)->root;
957         struct btrfs_trans_handle *trans;
958         struct extent_buffer *leaf;
959         struct btrfs_path *path;
960         struct btrfs_file_extent_item *fi;
961         struct btrfs_key found_key;
962         u64 cow_start;
963         u64 cur_offset;
964         u64 extent_end;
965         u64 disk_bytenr;
966         u64 num_bytes;
967         int extent_type;
968         int ret;
969         int type;
970         int nocow;
971         int check_prev = 1;
972
973         path = btrfs_alloc_path();
974         BUG_ON(!path);
975         trans = btrfs_join_transaction(root, 1);
976         BUG_ON(!trans);
977
978         cow_start = (u64)-1;
979         cur_offset = start;
980         while (1) {
981                 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
982                                                cur_offset, 0);
983                 BUG_ON(ret < 0);
984                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
985                         leaf = path->nodes[0];
986                         btrfs_item_key_to_cpu(leaf, &found_key,
987                                               path->slots[0] - 1);
988                         if (found_key.objectid == inode->i_ino &&
989                             found_key.type == BTRFS_EXTENT_DATA_KEY)
990                                 path->slots[0]--;
991                 }
992                 check_prev = 0;
993 next_slot:
994                 leaf = path->nodes[0];
995                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
996                         ret = btrfs_next_leaf(root, path);
997                         if (ret < 0)
998                                 BUG_ON(1);
999                         if (ret > 0)
1000                                 break;
1001                         leaf = path->nodes[0];
1002                 }
1003
1004                 nocow = 0;
1005                 disk_bytenr = 0;
1006                 num_bytes = 0;
1007                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1008
1009                 if (found_key.objectid > inode->i_ino ||
1010                     found_key.type > BTRFS_EXTENT_DATA_KEY ||
1011                     found_key.offset > end)
1012                         break;
1013
1014                 if (found_key.offset > cur_offset) {
1015                         extent_end = found_key.offset;
1016                         goto out_check;
1017                 }
1018
1019                 fi = btrfs_item_ptr(leaf, path->slots[0],
1020                                     struct btrfs_file_extent_item);
1021                 extent_type = btrfs_file_extent_type(leaf, fi);
1022
1023                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1024                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1025                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1026                         extent_end = found_key.offset +
1027                                 btrfs_file_extent_num_bytes(leaf, fi);
1028                         if (extent_end <= start) {
1029                                 path->slots[0]++;
1030                                 goto next_slot;
1031                         }
1032                         if (disk_bytenr == 0)
1033                                 goto out_check;
1034                         if (btrfs_file_extent_compression(leaf, fi) ||
1035                             btrfs_file_extent_encryption(leaf, fi) ||
1036                             btrfs_file_extent_other_encoding(leaf, fi))
1037                                 goto out_check;
1038                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1039                                 goto out_check;
1040                         if (btrfs_extent_readonly(root, disk_bytenr))
1041                                 goto out_check;
1042                         if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1043                                                   disk_bytenr))
1044                                 goto out_check;
1045                         disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1046                         disk_bytenr += cur_offset - found_key.offset;
1047                         num_bytes = min(end + 1, extent_end) - cur_offset;
1048                         /*
1049                          * force cow if csum exists in the range.
1050                          * this ensure that csum for a given extent are
1051                          * either valid or do not exist.
1052                          */
1053                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1054                                 goto out_check;
1055                         nocow = 1;
1056                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1057                         extent_end = found_key.offset +
1058                                 btrfs_file_extent_inline_len(leaf, fi);
1059                         extent_end = ALIGN(extent_end, root->sectorsize);
1060                 } else {
1061                         BUG_ON(1);
1062                 }
1063 out_check:
1064                 if (extent_end <= start) {
1065                         path->slots[0]++;
1066                         goto next_slot;
1067                 }
1068                 if (!nocow) {
1069                         if (cow_start == (u64)-1)
1070                                 cow_start = cur_offset;
1071                         cur_offset = extent_end;
1072                         if (cur_offset > end)
1073                                 break;
1074                         path->slots[0]++;
1075                         goto next_slot;
1076                 }
1077
1078                 btrfs_release_path(root, path);
1079                 if (cow_start != (u64)-1) {
1080                         ret = cow_file_range(inode, locked_page, cow_start,
1081                                         found_key.offset - 1, page_started,
1082                                         nr_written, 1);
1083                         BUG_ON(ret);
1084                         cow_start = (u64)-1;
1085                 }
1086
1087                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1088                         struct extent_map *em;
1089                         struct extent_map_tree *em_tree;
1090                         em_tree = &BTRFS_I(inode)->extent_tree;
1091                         em = alloc_extent_map(GFP_NOFS);
1092                         em->start = cur_offset;
1093                         em->orig_start = em->start;
1094                         em->len = num_bytes;
1095                         em->block_len = num_bytes;
1096                         em->block_start = disk_bytenr;
1097                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1098                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1099                         while (1) {
1100                                 spin_lock(&em_tree->lock);
1101                                 ret = add_extent_mapping(em_tree, em);
1102                                 spin_unlock(&em_tree->lock);
1103                                 if (ret != -EEXIST) {
1104                                         free_extent_map(em);
1105                                         break;
1106                                 }
1107                                 btrfs_drop_extent_cache(inode, em->start,
1108                                                 em->start + em->len - 1, 0);
1109                         }
1110                         type = BTRFS_ORDERED_PREALLOC;
1111                 } else {
1112                         type = BTRFS_ORDERED_NOCOW;
1113                 }
1114
1115                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1116                                                num_bytes, num_bytes, type);
1117                 BUG_ON(ret);
1118
1119                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1120                                         cur_offset, cur_offset + num_bytes - 1,
1121                                         locked_page, 1, 1, 1, 0, 0, 0);
1122                 cur_offset = extent_end;
1123                 if (cur_offset > end)
1124                         break;
1125         }
1126         btrfs_release_path(root, path);
1127
1128         if (cur_offset <= end && cow_start == (u64)-1)
1129                 cow_start = cur_offset;
1130         if (cow_start != (u64)-1) {
1131                 ret = cow_file_range(inode, locked_page, cow_start, end,
1132                                      page_started, nr_written, 1);
1133                 BUG_ON(ret);
1134         }
1135
1136         ret = btrfs_end_transaction(trans, root);
1137         BUG_ON(ret);
1138         btrfs_free_path(path);
1139         return 0;
1140 }
1141
1142 /*
1143  * extent_io.c call back to do delayed allocation processing
1144  */
1145 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1146                               u64 start, u64 end, int *page_started,
1147                               unsigned long *nr_written)
1148 {
1149         int ret;
1150
1151         if (btrfs_test_flag(inode, NODATACOW))
1152                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1153                                          page_started, 1, nr_written);
1154         else if (btrfs_test_flag(inode, PREALLOC))
1155                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1156                                          page_started, 0, nr_written);
1157         else
1158                 ret = cow_file_range_async(inode, locked_page, start, end,
1159                                            page_started, nr_written);
1160
1161         return ret;
1162 }
1163
1164 /*
1165  * extent_io.c set_bit_hook, used to track delayed allocation
1166  * bytes in this file, and to maintain the list of inodes that
1167  * have pending delalloc work to be done.
1168  */
1169 static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1170                        unsigned long old, unsigned long bits)
1171 {
1172         /*
1173          * set_bit and clear bit hooks normally require _irqsave/restore
1174          * but in this case, we are only testeing for the DELALLOC
1175          * bit, which is only set or cleared with irqs on
1176          */
1177         if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1178                 struct btrfs_root *root = BTRFS_I(inode)->root;
1179                 spin_lock(&root->fs_info->delalloc_lock);
1180                 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1181                 root->fs_info->delalloc_bytes += end - start + 1;
1182                 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1183                         list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1184                                       &root->fs_info->delalloc_inodes);
1185                 }
1186                 spin_unlock(&root->fs_info->delalloc_lock);
1187         }
1188         return 0;
1189 }
1190
1191 /*
1192  * extent_io.c clear_bit_hook, see set_bit_hook for why
1193  */
1194 static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1195                          unsigned long old, unsigned long bits)
1196 {
1197         /*
1198          * set_bit and clear bit hooks normally require _irqsave/restore
1199          * but in this case, we are only testeing for the DELALLOC
1200          * bit, which is only set or cleared with irqs on
1201          */
1202         if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1203                 struct btrfs_root *root = BTRFS_I(inode)->root;
1204
1205                 spin_lock(&root->fs_info->delalloc_lock);
1206                 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1207                         printk(KERN_INFO "btrfs warning: delalloc account "
1208                                "%llu %llu\n",
1209                                (unsigned long long)end - start + 1,
1210                                (unsigned long long)
1211                                root->fs_info->delalloc_bytes);
1212                         root->fs_info->delalloc_bytes = 0;
1213                         BTRFS_I(inode)->delalloc_bytes = 0;
1214                 } else {
1215                         root->fs_info->delalloc_bytes -= end - start + 1;
1216                         BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1217                 }
1218                 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1219                     !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1220                         list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1221                 }
1222                 spin_unlock(&root->fs_info->delalloc_lock);
1223         }
1224         return 0;
1225 }
1226
1227 /*
1228  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1229  * we don't create bios that span stripes or chunks
1230  */
1231 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1232                          size_t size, struct bio *bio,
1233                          unsigned long bio_flags)
1234 {
1235         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1236         struct btrfs_mapping_tree *map_tree;
1237         u64 logical = (u64)bio->bi_sector << 9;
1238         u64 length = 0;
1239         u64 map_length;
1240         int ret;
1241
1242         if (bio_flags & EXTENT_BIO_COMPRESSED)
1243                 return 0;
1244
1245         length = bio->bi_size;
1246         map_tree = &root->fs_info->mapping_tree;
1247         map_length = length;
1248         ret = btrfs_map_block(map_tree, READ, logical,
1249                               &map_length, NULL, 0);
1250
1251         if (map_length < length + size)
1252                 return 1;
1253         return 0;
1254 }
1255
1256 /*
1257  * in order to insert checksums into the metadata in large chunks,
1258  * we wait until bio submission time.   All the pages in the bio are
1259  * checksummed and sums are attached onto the ordered extent record.
1260  *
1261  * At IO completion time the cums attached on the ordered extent record
1262  * are inserted into the btree
1263  */
1264 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1265                                     struct bio *bio, int mirror_num,
1266                                     unsigned long bio_flags)
1267 {
1268         struct btrfs_root *root = BTRFS_I(inode)->root;
1269         int ret = 0;
1270
1271         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1272         BUG_ON(ret);
1273         return 0;
1274 }
1275
1276 /*
1277  * in order to insert checksums into the metadata in large chunks,
1278  * we wait until bio submission time.   All the pages in the bio are
1279  * checksummed and sums are attached onto the ordered extent record.
1280  *
1281  * At IO completion time the cums attached on the ordered extent record
1282  * are inserted into the btree
1283  */
1284 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1285                           int mirror_num, unsigned long bio_flags)
1286 {
1287         struct btrfs_root *root = BTRFS_I(inode)->root;
1288         return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1289 }
1290
1291 /*
1292  * extent_io.c submission hook. This does the right thing for csum calculation
1293  * on write, or reading the csums from the tree before a read
1294  */
1295 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1296                           int mirror_num, unsigned long bio_flags)
1297 {
1298         struct btrfs_root *root = BTRFS_I(inode)->root;
1299         int ret = 0;
1300         int skip_sum;
1301
1302         skip_sum = btrfs_test_flag(inode, NODATASUM);
1303
1304         ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1305         BUG_ON(ret);
1306
1307         if (!(rw & (1 << BIO_RW))) {
1308                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1309                         return btrfs_submit_compressed_read(inode, bio,
1310                                                     mirror_num, bio_flags);
1311                 } else if (!skip_sum)
1312                         btrfs_lookup_bio_sums(root, inode, bio, NULL);
1313                 goto mapit;
1314         } else if (!skip_sum) {
1315                 /* csum items have already been cloned */
1316                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1317                         goto mapit;
1318                 /* we're doing a write, do the async checksumming */
1319                 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1320                                    inode, rw, bio, mirror_num,
1321                                    bio_flags, __btrfs_submit_bio_start,
1322                                    __btrfs_submit_bio_done);
1323         }
1324
1325 mapit:
1326         return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1327 }
1328
1329 /*
1330  * given a list of ordered sums record them in the inode.  This happens
1331  * at IO completion time based on sums calculated at bio submission time.
1332  */
1333 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1334                              struct inode *inode, u64 file_offset,
1335                              struct list_head *list)
1336 {
1337         struct btrfs_ordered_sum *sum;
1338
1339         btrfs_set_trans_block_group(trans, inode);
1340
1341         list_for_each_entry(sum, list, list) {
1342                 btrfs_csum_file_blocks(trans,
1343                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1344         }
1345         return 0;
1346 }
1347
1348 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1349 {
1350         if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1351                 WARN_ON(1);
1352         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1353                                    GFP_NOFS);
1354 }
1355
1356 /* see btrfs_writepage_start_hook for details on why this is required */
1357 struct btrfs_writepage_fixup {
1358         struct page *page;      /* page the worker fixes up; btrfs_writepage_start_hook takes a reference on it before queueing */
1359         struct btrfs_work work; /* work item queued on fs_info->fixup_workers; func is btrfs_writepage_fixup_worker */
1360 };
1361
1362 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1363 {
1364         struct btrfs_writepage_fixup *fixup;
1365         struct btrfs_ordered_extent *ordered;
1366         struct page *page;
1367         struct inode *inode;
1368         u64 page_start;
1369         u64 page_end;
1370
1371         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1372         page = fixup->page;
1373 again:
1374         lock_page(page);
1375         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1376                 ClearPageChecked(page);
1377                 goto out_page;
1378         }
1379
1380         inode = page->mapping->host;
1381         page_start = page_offset(page);
1382         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1383
1384         lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1385
1386         /* already ordered? We're done */
1387         if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1388                              EXTENT_ORDERED, 0)) {
1389                 goto out;
1390         }
1391
1392         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1393         if (ordered) {
1394                 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1395                               page_end, GFP_NOFS);
1396                 unlock_page(page);
1397                 btrfs_start_ordered_extent(inode, ordered, 1);
1398                 goto again;
1399         }
1400
1401         btrfs_set_extent_delalloc(inode, page_start, page_end);
1402         ClearPageChecked(page);
1403 out:
1404         unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1405 out_page:
1406         unlock_page(page);
1407         page_cache_release(page);
1408 }
1409
1410 /*
1411  * There are a few paths in the higher layers of the kernel that directly
1412  * set the page dirty bit without asking the filesystem if it is a
1413  * good idea.  This causes problems because we want to make sure COW
1414  * properly happens and the data=ordered rules are followed.
1415  *
1416  * In our case any range that doesn't have the ORDERED bit set
1417  * hasn't been properly setup for IO.  We kick off an async process
1418  * to fix it up.  The async helper will wait for ordered extents, set
1419  * the delalloc bit and make it safe to write the page.
1420  */
1421 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1422 {
1423         struct inode *inode = page->mapping->host;
1424         struct btrfs_writepage_fixup *fixup;
1425         struct btrfs_root *root = BTRFS_I(inode)->root;
1426         int ret;
1427
1428         ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1429                              EXTENT_ORDERED, 0);
1430         if (ret)
1431                 return 0;
1432
1433         if (PageChecked(page))
1434                 return -EAGAIN;
1435
1436         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1437         if (!fixup)
1438                 return -EAGAIN;
1439
1440         SetPageChecked(page);
1441         page_cache_get(page);
1442         fixup->work.func = btrfs_writepage_fixup_worker;
1443         fixup->page = page;
1444         btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1445         return -EAGAIN;
1446 }
1447
1448 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1449                                        struct inode *inode, u64 file_pos,
1450                                        u64 disk_bytenr, u64 disk_num_bytes,
1451                                        u64 num_bytes, u64 ram_bytes,
1452                                        u8 compression, u8 encryption,
1453                                        u16 other_encoding, int extent_type)
1454 {
1455         struct btrfs_root *root = BTRFS_I(inode)->root;
1456         struct btrfs_file_extent_item *fi;
1457         struct btrfs_path *path;
1458         struct extent_buffer *leaf;
1459         struct btrfs_key ins;
1460         u64 hint;
1461         int ret;
1462
1463         path = btrfs_alloc_path();
1464         BUG_ON(!path);
1465
1466         ret = btrfs_drop_extents(trans, root, inode, file_pos,
1467                                  file_pos + num_bytes, file_pos, &hint);
1468         BUG_ON(ret);
1469
1470         ins.objectid = inode->i_ino;
1471         ins.offset = file_pos;
1472         ins.type = BTRFS_EXTENT_DATA_KEY;
1473         ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1474         BUG_ON(ret);
1475         leaf = path->nodes[0];
1476         fi = btrfs_item_ptr(leaf, path->slots[0],
1477                             struct btrfs_file_extent_item);
1478         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1479         btrfs_set_file_extent_type(leaf, fi, extent_type);
1480         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1481         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1482         btrfs_set_file_extent_offset(leaf, fi, 0);
1483         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1484         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1485         btrfs_set_file_extent_compression(leaf, fi, compression);
1486         btrfs_set_file_extent_encryption(leaf, fi, encryption);
1487         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1488         btrfs_mark_buffer_dirty(leaf);
1489
1490         inode_add_bytes(inode, num_bytes);
1491         btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1492
1493         ins.objectid = disk_bytenr;
1494         ins.offset = disk_num_bytes;
1495         ins.type = BTRFS_EXTENT_ITEM_KEY;
1496         ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1497                                           root->root_key.objectid,
1498                                           trans->transid, inode->i_ino, &ins);
1499         BUG_ON(ret);
1500
1501         btrfs_free_path(path);
1502         return 0;
1503 }
1504
1505 /* as ordered data IO finishes, this gets called so we can finish
1506  * an ordered extent if the range of bytes in the file it covers are
1507  * fully written.
1508  */
1509 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1510 {
1511         struct btrfs_root *root = BTRFS_I(inode)->root;
1512         struct btrfs_trans_handle *trans;
1513         struct btrfs_ordered_extent *ordered_extent;
1514         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1515         int compressed = 0;
1516         int ret;
1517
1518         ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1519         if (!ret)
1520                 return 0;
1521
1522         trans = btrfs_join_transaction(root, 1);
1523
1524         ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1525         BUG_ON(!ordered_extent);
1526         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1527                 goto nocow;
1528
1529         lock_extent(io_tree, ordered_extent->file_offset,
1530                     ordered_extent->file_offset + ordered_extent->len - 1,
1531                     GFP_NOFS);
1532
1533         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1534                 compressed = 1;
1535         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1536                 BUG_ON(compressed);
1537                 ret = btrfs_mark_extent_written(trans, root, inode,
1538                                                 ordered_extent->file_offset,
1539                                                 ordered_extent->file_offset +
1540                                                 ordered_extent->len);
1541                 BUG_ON(ret);
1542         } else {
1543                 ret = insert_reserved_file_extent(trans, inode,
1544                                                 ordered_extent->file_offset,
1545                                                 ordered_extent->start,
1546                                                 ordered_extent->disk_len,
1547                                                 ordered_extent->len,
1548                                                 ordered_extent->len,
1549                                                 compressed, 0, 0,
1550                                                 BTRFS_FILE_EXTENT_REG);
1551                 BUG_ON(ret);
1552         }
1553         unlock_extent(io_tree, ordered_extent->file_offset,
1554                     ordered_extent->file_offset + ordered_extent->len - 1,
1555                     GFP_NOFS);
1556 nocow:
1557         add_pending_csums(trans, inode, ordered_extent->file_offset,
1558                           &ordered_extent->list);
1559
1560         mutex_lock(&BTRFS_I(inode)->extent_mutex);
1561         btrfs_ordered_update_i_size(inode, ordered_extent);
1562         btrfs_update_inode(trans, root, inode);
1563         btrfs_remove_ordered_extent(inode, ordered_extent);
1564         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1565
1566         /* once for us */
1567         btrfs_put_ordered_extent(ordered_extent);
1568         /* once for the tree */
1569         btrfs_put_ordered_extent(ordered_extent);
1570
1571         btrfs_end_transaction(trans, root);
1572         return 0;
1573 }
1574
1575 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1576                                 struct extent_state *state, int uptodate)
1577 {
1578         return btrfs_finish_ordered_io(page->mapping->host, start, end);
1579 }
1580
/*
 * When IO fails, either with EIO or because csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 *
 * A pointer to the record is stored as the private value of the range in
 * the inode's io_failure_tree.
 */
struct io_failure_record {
        /* NOTE(review): not assigned by btrfs_io_failed_hook - confirm use */
        struct page *page;
        /* first byte of the failed range, as handed to the failure hook */
        u64 start;
        /* length of the failed range in bytes (end - start + 1) */
        u64 len;
        /* address the retry bio is aimed at (bi_sector = logical >> 9) */
        u64 logical;
        /* 0, or EXTENT_BIO_COMPRESSED when the extent map is compressed */
        unsigned long bio_flags;
        /* mirror number used for the most recent retry; starts at 0 */
        int last_mirror;
};
1597
1598 static int btrfs_io_failed_hook(struct bio *failed_bio,
1599                          struct page *page, u64 start, u64 end,
1600                          struct extent_state *state)
1601 {
1602         struct io_failure_record *failrec = NULL;
1603         u64 private;
1604         struct extent_map *em;
1605         struct inode *inode = page->mapping->host;
1606         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1607         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1608         struct bio *bio;
1609         int num_copies;
1610         int ret;
1611         int rw;
1612         u64 logical;
1613
1614         ret = get_state_private(failure_tree, start, &private);
1615         if (ret) {
1616                 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1617                 if (!failrec)
1618                         return -ENOMEM;
1619                 failrec->start = start;
1620                 failrec->len = end - start + 1;
1621                 failrec->last_mirror = 0;
1622                 failrec->bio_flags = 0;
1623
1624                 spin_lock(&em_tree->lock);
1625                 em = lookup_extent_mapping(em_tree, start, failrec->len);
1626                 if (em->start > start || em->start + em->len < start) {
1627                         free_extent_map(em);
1628                         em = NULL;
1629                 }
1630                 spin_unlock(&em_tree->lock);
1631
1632                 if (!em || IS_ERR(em)) {
1633                         kfree(failrec);
1634                         return -EIO;
1635                 }
1636                 logical = start - em->start;
1637                 logical = em->block_start + logical;
1638                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1639                         logical = em->block_start;
1640                         failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1641                 }
1642                 failrec->logical = logical;
1643                 free_extent_map(em);
1644                 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1645                                 EXTENT_DIRTY, GFP_NOFS);
1646                 set_state_private(failure_tree, start,
1647                                  (u64)(unsigned long)failrec);
1648         } else {
1649                 failrec = (struct io_failure_record *)(unsigned long)private;
1650         }
1651         num_copies = btrfs_num_copies(
1652                               &BTRFS_I(inode)->root->fs_info->mapping_tree,
1653                               failrec->logical, failrec->len);
1654         failrec->last_mirror++;
1655         if (!state) {
1656                 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1657                 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1658                                                     failrec->start,
1659                                                     EXTENT_LOCKED);
1660                 if (state && state->start != failrec->start)
1661                         state = NULL;
1662                 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1663         }
1664         if (!state || failrec->last_mirror > num_copies) {
1665                 set_state_private(failure_tree, failrec->start, 0);
1666                 clear_extent_bits(failure_tree, failrec->start,
1667                                   failrec->start + failrec->len - 1,
1668                                   EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1669                 kfree(failrec);
1670                 return -EIO;
1671         }
1672         bio = bio_alloc(GFP_NOFS, 1);
1673         bio->bi_private = state;
1674         bio->bi_end_io = failed_bio->bi_end_io;
1675         bio->bi_sector = failrec->logical >> 9;
1676         bio->bi_bdev = failed_bio->bi_bdev;
1677         bio->bi_size = 0;
1678
1679         bio_add_page(bio, page, failrec->len, start - page_offset(page));
1680         if (failed_bio->bi_rw & (1 << BIO_RW))
1681                 rw = WRITE;
1682         else
1683                 rw = READ;
1684
1685         BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1686                                                       failrec->last_mirror,
1687                                                       failrec->bio_flags);
1688         return 0;
1689 }
1690
1691 /*
1692  * each time an IO finishes, we do a fast check in the IO failure tree
1693  * to see if we need to process or clean up an io_failure_record
1694  */
1695 static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1696 {
1697         u64 private;
1698         u64 private_failure;
1699         struct io_failure_record *failure;
1700         int ret;
1701
1702         private = 0;
1703         if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1704                              (u64)-1, 1, EXTENT_DIRTY)) {
1705                 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1706                                         start, &private_failure);
1707                 if (ret == 0) {
1708                         failure = (struct io_failure_record *)(unsigned long)
1709                                    private_failure;
1710                         set_state_private(&BTRFS_I(inode)->io_failure_tree,
1711                                           failure->start, 0);
1712                         clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1713                                           failure->start,
1714                                           failure->start + failure->len - 1,
1715                                           EXTENT_DIRTY | EXTENT_LOCKED,
1716                                           GFP_NOFS);
1717                         kfree(failure);
1718                 }
1719         }
1720         return 0;
1721 }
1722
1723 /*
1724  * when reads are done, we need to check csums to verify the data is correct
1725  * if there's a match, we allow the bio to finish.  If not, we go through
1726  * the io_failure_record routines to find good copies
1727  */
1728 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1729                                struct extent_state *state)
1730 {
1731         size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1732         struct inode *inode = page->mapping->host;
1733         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1734         char *kaddr;
1735         u64 private = ~(u32)0;
1736         int ret;
1737         struct btrfs_root *root = BTRFS_I(inode)->root;
1738         u32 csum = ~(u32)0;
1739
1740         if (PageChecked(page)) {
1741                 ClearPageChecked(page);
1742                 goto good;
1743         }
1744         if (btrfs_test_flag(inode, NODATASUM))
1745                 return 0;
1746
1747         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1748             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1749                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1750                                   GFP_NOFS);
1751                 return 0;
1752         }
1753
1754         if (state && state->start == start) {
1755                 private = state->private;
1756                 ret = 0;
1757         } else {
1758                 ret = get_state_private(io_tree, start, &private);
1759         }
1760         kaddr = kmap_atomic(page, KM_USER0);
1761         if (ret)
1762                 goto zeroit;
1763
1764         csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
1765         btrfs_csum_final(csum, (char *)&csum);
1766         if (csum != private)
1767                 goto zeroit;
1768
1769         kunmap_atomic(kaddr, KM_USER0);
1770 good:
1771         /* if the io failure tree for this inode is non-empty,
1772          * check to see if we've recovered from a failed IO
1773          */
1774         btrfs_clean_io_failures(inode, start);
1775         return 0;
1776
1777 zeroit:
1778         printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1779                "private %llu\n", page->mapping->host->i_ino,
1780                (unsigned long long)start, csum,
1781                (unsigned long long)private);
1782         memset(kaddr + offset, 1, end - start + 1);
1783         flush_dcache_page(page);
1784         kunmap_atomic(kaddr, KM_USER0);
1785         if (private == 0)
1786                 return 0;
1787         return -EIO;
1788 }
1789
1790 /*
1791  * This creates an orphan entry for the given inode in case something goes
1792  * wrong in the middle of an unlink/truncate.
1793  */
1794 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1795 {
1796         struct btrfs_root *root = BTRFS_I(inode)->root;
1797         int ret = 0;
1798
1799         spin_lock(&root->list_lock);
1800
1801         /* already on the orphan list, we're good */
1802         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1803                 spin_unlock(&root->list_lock);
1804                 return 0;
1805         }
1806
1807         list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1808
1809         spin_unlock(&root->list_lock);
1810
1811         /*
1812          * insert an orphan item to track this unlinked/truncated file
1813          */
1814         ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1815
1816         return ret;
1817 }
1818
1819 /*
1820  * We have done the truncate/delete so we can go ahead and remove the orphan
1821  * item for this particular inode.
1822  */
1823 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1824 {
1825         struct btrfs_root *root = BTRFS_I(inode)->root;
1826         int ret = 0;
1827
1828         spin_lock(&root->list_lock);
1829
1830         if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1831                 spin_unlock(&root->list_lock);
1832                 return 0;
1833         }
1834
1835         list_del_init(&BTRFS_I(inode)->i_orphan);
1836         if (!trans) {
1837                 spin_unlock(&root->list_lock);
1838                 return 0;
1839         }
1840
1841         spin_unlock(&root->list_lock);
1842
1843         ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1844
1845         return ret;
1846 }
1847
1848 /*
1849  * this cleans up any orphans that may be left on the list from the last use
1850  * of this root.
1851  */
1852 void btrfs_orphan_cleanup(struct btrfs_root *root)
1853 {
1854         struct btrfs_path *path;
1855         struct extent_buffer *leaf;
1856         struct btrfs_item *item;
1857         struct btrfs_key key, found_key;
1858         struct btrfs_trans_handle *trans;
1859         struct inode *inode;
1860         int ret = 0, nr_unlink = 0, nr_truncate = 0;
1861
1862         path = btrfs_alloc_path();
1863         if (!path)
1864                 return;
1865         path->reada = -1;
1866
1867         key.objectid = BTRFS_ORPHAN_OBJECTID;
1868         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1869         key.offset = (u64)-1;
1870
1871
1872         while (1) {
1873                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1874                 if (ret < 0) {
1875                         printk(KERN_ERR "Error searching slot for orphan: %d"
1876                                "\n", ret);
1877                         break;
1878                 }
1879
1880                 /*
1881                  * if ret == 0 means we found what we were searching for, which
1882                  * is weird, but possible, so only screw with path if we didnt
1883                  * find the key and see if we have stuff that matches
1884                  */
1885                 if (ret > 0) {
1886                         if (path->slots[0] == 0)
1887                                 break;
1888                         path->slots[0]--;
1889                 }
1890
1891                 /* pull out the item */
1892                 leaf = path->nodes[0];
1893                 item = btrfs_item_nr(leaf, path->slots[0]);
1894                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1895
1896                 /* make sure the item matches what we want */
1897                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1898                         break;
1899                 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1900                         break;
1901
1902                 /* release the path since we're done with it */
1903                 btrfs_release_path(root, path);
1904
1905                 /*
1906                  * this is where we are basically btrfs_lookup, without the
1907                  * crossing root thing.  we store the inode number in the
1908                  * offset of the orphan item.
1909                  */
1910                 inode = btrfs_iget_locked(root->fs_info->sb,
1911                                           found_key.offset, root);
1912                 if (!inode)
1913                         break;
1914
1915                 if (inode->i_state & I_NEW) {
1916                         BTRFS_I(inode)->root = root;
1917
1918                         /* have to set the location manually */
1919                         BTRFS_I(inode)->location.objectid = inode->i_ino;
1920                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1921                         BTRFS_I(inode)->location.offset = 0;
1922
1923                         btrfs_read_locked_inode(inode);
1924                         unlock_new_inode(inode);
1925                 }
1926
1927                 /*
1928                  * add this inode to the orphan list so btrfs_orphan_del does
1929                  * the proper thing when we hit it
1930                  */
1931                 spin_lock(&root->list_lock);
1932                 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1933                 spin_unlock(&root->list_lock);
1934
1935                 /*
1936                  * if this is a bad inode, means we actually succeeded in
1937                  * removing the inode, but not the orphan record, which means
1938                  * we need to manually delete the orphan since iput will just
1939                  * do a destroy_inode
1940                  */
1941                 if (is_bad_inode(inode)) {
1942                         trans = btrfs_start_transaction(root, 1);
1943                         btrfs_orphan_del(trans, inode);
1944                         btrfs_end_transaction(trans, root);
1945                         iput(inode);
1946                         continue;
1947                 }
1948
1949                 /* if we have links, this was a truncate, lets do that */
1950                 if (inode->i_nlink) {
1951                         nr_truncate++;
1952                         btrfs_truncate(inode);
1953                 } else {
1954                         nr_unlink++;
1955                 }
1956
1957                 /* this will do delete_inode and everything for us */
1958                 iput(inode);
1959         }
1960
1961         if (nr_unlink)
1962                 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1963         if (nr_truncate)
1964                 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1965
1966         btrfs_free_path(path);
1967 }
1968
1969 /*
1970  * read an inode from the btree into the in-memory inode
1971  */
1972 void btrfs_read_locked_inode(struct inode *inode)
1973 {
1974         struct btrfs_path *path;
1975         struct extent_buffer *leaf;
1976         struct btrfs_inode_item *inode_item;
1977         struct btrfs_timespec *tspec;
1978         struct btrfs_root *root = BTRFS_I(inode)->root;
1979         struct btrfs_key location;
1980         u64 alloc_group_block;
1981         u32 rdev;
1982         int ret;
1983
1984         path = btrfs_alloc_path();
1985         BUG_ON(!path);
1986         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1987
1988         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1989         if (ret)
1990                 goto make_bad;
1991
1992         leaf = path->nodes[0];
1993         inode_item = btrfs_item_ptr(leaf, path->slots[0],
1994                                     struct btrfs_inode_item);
1995
1996         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1997         inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1998         inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1999         inode->i_gid = btrfs_inode_gid(leaf, inode_item);
2000         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
2001
2002         tspec = btrfs_inode_atime(inode_item);
2003         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2004         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2005
2006         tspec = btrfs_inode_mtime(inode_item);
2007         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2008         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2009
2010         tspec = btrfs_inode_ctime(inode_item);
2011         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2012         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2013
2014         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2015         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2016         BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2017         inode->i_generation = BTRFS_I(inode)->generation;
2018         inode->i_rdev = 0;
2019         rdev = btrfs_inode_rdev(leaf, inode_item);
2020
2021         BTRFS_I(inode)->index_cnt = (u64)-1;
2022         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2023
2024         alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2025
2026         BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2027                                                 alloc_group_block, 0);
2028         btrfs_free_path(path);
2029         inode_item = NULL;
2030
2031         switch (inode->i_mode & S_IFMT) {
2032         case S_IFREG:
2033                 inode->i_mapping->a_ops = &btrfs_aops;
2034                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2035                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2036                 inode->i_fop = &btrfs_file_operations;
2037                 inode->i_op = &btrfs_file_inode_operations;
2038                 break;
2039         case S_IFDIR:
2040                 inode->i_fop = &btrfs_dir_file_operations;
2041                 if (root == root->fs_info->tree_root)
2042                         inode->i_op = &btrfs_dir_ro_inode_operations;
2043                 else
2044                         inode->i_op = &btrfs_dir_inode_operations;
2045                 break;
2046         case S_IFLNK:
2047                 inode->i_op = &btrfs_symlink_inode_operations;
2048                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2049                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2050                 break;
2051         default:
2052                 inode->i_op = &btrfs_special_inode_operations;
2053                 init_special_inode(inode, inode->i_mode, rdev);
2054                 break;
2055         }
2056         return;
2057
2058 make_bad:
2059         btrfs_free_path(path);
2060         make_bad_inode(inode);
2061 }
2062
2063 /*
2064  * given a leaf and an inode, copy the inode fields into the leaf
2065  */
2066 static void fill_inode_item(struct btrfs_trans_handle *trans,
2067                             struct extent_buffer *leaf,
2068                             struct btrfs_inode_item *item,
2069                             struct inode *inode)
2070 {
2071         btrfs_set_inode_uid(leaf, item, inode->i_uid);
2072         btrfs_set_inode_gid(leaf, item, inode->i_gid);
2073         btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2074         btrfs_set_inode_mode(leaf, item, inode->i_mode);
2075         btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2076
2077         btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2078                                inode->i_atime.tv_sec);
2079         btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2080                                 inode->i_atime.tv_nsec);
2081
2082         btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2083                                inode->i_mtime.tv_sec);
2084         btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2085                                 inode->i_mtime.tv_nsec);
2086
2087         btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2088                                inode->i_ctime.tv_sec);
2089         btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2090                                 inode->i_ctime.tv_nsec);
2091
2092         btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2093         btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2094         btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2095         btrfs_set_inode_transid(leaf, item, trans->transid);
2096         btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2097         btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2098         btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2099 }
2100
2101 /*
2102  * copy everything in the in-memory inode into the btree.
2103  */
2104 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2105                                 struct btrfs_root *root, struct inode *inode)
2106 {
2107         struct btrfs_inode_item *inode_item;
2108         struct btrfs_path *path;
2109         struct extent_buffer *leaf;
2110         int ret;
2111
2112         path = btrfs_alloc_path();
2113         BUG_ON(!path);
2114         ret = btrfs_lookup_inode(trans, root, path,
2115                                  &BTRFS_I(inode)->location, 1);
2116         if (ret) {
2117                 if (ret > 0)
2118                         ret = -ENOENT;
2119                 goto failed;
2120         }
2121
2122         btrfs_unlock_up_safe(path, 1);
2123         leaf = path->nodes[0];
2124         inode_item = btrfs_item_ptr(leaf, path->slots[0],
2125                                   struct btrfs_inode_item);
2126
2127         fill_inode_item(trans, leaf, inode_item, inode);
2128         btrfs_mark_buffer_dirty(leaf);
2129         btrfs_set_inode_last_trans(trans, inode);
2130         ret = 0;
2131 failed:
2132         btrfs_free_path(path);
2133         return ret;
2134 }
2135
2136
2137 /*
2138  * unlink helper that gets used here in inode.c and in the tree logging
2139  * recovery code.  It remove a link in a directory with a given name, and
2140  * also drops the back refs in the inode to the directory
2141  */
2142 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2143                        struct btrfs_root *root,
2144                        struct inode *dir, struct inode *inode,
2145                        const char *name, int name_len)
2146 {
2147         struct btrfs_path *path;
2148         int ret = 0;
2149         struct extent_buffer *leaf;
2150         struct btrfs_dir_item *di;
2151         struct btrfs_key key;
2152         u64 index;
2153
2154         path = btrfs_alloc_path();
2155         if (!path) {
2156                 ret = -ENOMEM;
2157                 goto err;
2158         }
2159
2160         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2161                                     name, name_len, -1);
2162         if (IS_ERR(di)) {
2163                 ret = PTR_ERR(di);
2164                 goto err;
2165         }
2166         if (!di) {
2167                 ret = -ENOENT;
2168                 goto err;
2169         }
2170         leaf = path->nodes[0];
2171         btrfs_dir_item_key_to_cpu(leaf, di, &key);
2172         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2173         if (ret)
2174                 goto err;
2175         btrfs_release_path(root, path);
2176
2177         ret = btrfs_del_inode_ref(trans, root, name, name_len,
2178                                   inode->i_ino,
2179                                   dir->i_ino, &index);
2180         if (ret) {
2181                 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2182                        "inode %lu parent %lu\n", name_len, name,
2183                        inode->i_ino, dir->i_ino);
2184                 goto err;
2185         }
2186
2187         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2188                                          index, name, name_len, -1);
2189         if (IS_ERR(di)) {
2190                 ret = PTR_ERR(di);
2191                 goto err;
2192         }
2193         if (!di) {
2194                 ret = -ENOENT;
2195                 goto err;
2196         }
2197         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2198         btrfs_release_path(root, path);
2199
2200         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2201                                          inode, dir->i_ino);
2202         BUG_ON(ret != 0 && ret != -ENOENT);
2203         if (ret != -ENOENT)
2204                 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2205
2206         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2207                                            dir, index);
2208         BUG_ON(ret);
2209 err:
2210         btrfs_free_path(path);
2211         if (ret)
2212                 goto out;
2213
2214         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2215         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2216         btrfs_update_inode(trans, root, dir);
2217         btrfs_drop_nlink(inode);
2218         ret = btrfs_update_inode(trans, root, inode);
2219         dir->i_sb->s_dirt = 1;
2220 out:
2221         return ret;
2222 }
2223
2224 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2225 {
2226         struct btrfs_root *root;
2227         struct btrfs_trans_handle *trans;
2228         struct inode *inode = dentry->d_inode;
2229         int ret;
2230         unsigned long nr = 0;
2231
2232         root = BTRFS_I(dir)->root;
2233
2234         ret = btrfs_check_free_space(root, 1, 1);
2235         if (ret)
2236                 goto fail;
2237
2238         trans = btrfs_start_transaction(root, 1);
2239
2240         btrfs_set_trans_block_group(trans, dir);
2241         ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2242                                  dentry->d_name.name, dentry->d_name.len);
2243
2244         if (inode->i_nlink == 0)
2245                 ret = btrfs_orphan_add(trans, inode);
2246
2247         nr = trans->blocks_used;
2248
2249         btrfs_end_transaction_throttle(trans, root);
2250 fail:
2251         btrfs_btree_balance_dirty(root, nr);
2252         return ret;
2253 }
2254
2255 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2256 {
2257         struct inode *inode = dentry->d_inode;
2258         int err = 0;
2259         int ret;
2260         struct btrfs_root *root = BTRFS_I(dir)->root;
2261         struct btrfs_trans_handle *trans;
2262         unsigned long nr = 0;
2263
2264         /*
2265          * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2266          * the root of a subvolume or snapshot
2267          */
2268         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2269             inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2270                 return -ENOTEMPTY;
2271         }
2272
2273         ret = btrfs_check_free_space(root, 1, 1);
2274         if (ret)
2275                 goto fail;
2276
2277         trans = btrfs_start_transaction(root, 1);
2278         btrfs_set_trans_block_group(trans, dir);
2279
2280         err = btrfs_orphan_add(trans, inode);
2281         if (err)
2282                 goto fail_trans;
2283
2284         /* now the directory is empty */
2285         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2286                                  dentry->d_name.name, dentry->d_name.len);
2287         if (!err)
2288                 btrfs_i_size_write(inode, 0);
2289
2290 fail_trans:
2291         nr = trans->blocks_used;
2292         ret = btrfs_end_transaction_throttle(trans, root);
2293 fail:
2294         btrfs_btree_balance_dirty(root, nr);
2295
2296         if (ret && !err)
2297                 err = ret;
2298         return err;
2299 }
2300
#if 0
/*
 * NOTE: this whole function is compiled out by the surrounding #if 0.
 *
 * when truncating bytes in a file, it is possible to avoid reading
 * the leaves that contain only checksum items.  This can be the
 * majority of the IO required to delete a large file, but it must
 * be done carefully.
 *
 * The keys in the level just above the leaves are checked to make sure
 * the lowest key in a given leaf is a csum key, and starts at an offset
 * after the new  size.
 *
 * Then the key for the next leaf is checked to make sure it also has
 * a checksum item for the same file.  If it does, we know our target leaf
 * contains only checksum items, and it can be safely freed without reading
 * it.
 *
 * This is just an optimization targeted at large files.  It may do
 * nothing.  It will return 0 unless things went badly.
 *
 * The path passed in is always released before returning, with
 * lowest_level and keep_locks reset to 0.
 */
static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct inode *inode, u64 new_size)
{
	struct btrfs_key key;
	int ret;
	int nritems;
	struct btrfs_key found_key;
	struct btrfs_key other_key;
	struct btrfs_leaf_ref *ref;
	u64 leaf_gen;
	u64 leaf_start;

	/* stop the search one level above the leaves */
	path->lowest_level = 1;
	key.objectid = inode->i_ino;
	key.type = BTRFS_CSUM_ITEM_KEY;
	key.offset = new_size;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	/* no level 1 node in the path, nothing we can do here */
	if (path->nodes[1] == NULL) {
		ret = 0;
		goto out;
	}
	ret = 0;
	btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
	nritems = btrfs_header_nritems(path->nodes[1]);

	if (!nritems)
		goto out;

	if (path->slots[1] >= nritems)
		goto next_node;

	/* did we find a key greater than anything we want to delete? */
	if (found_key.objectid > inode->i_ino ||
	   (found_key.objectid == inode->i_ino && found_key.type > key.type))
		goto out;

	/* we check the next key in the node to make sure the leaf contains
	 * only checksum items.  This comparison doesn't work if our
	 * leaf is the last one in the node
	 */
	if (path->slots[1] + 1 >= nritems) {
next_node:
		/* search forward from the last key in the node, this
		 * will bring us into the next node in the tree
		 */
		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);

		/* unlikely, but we inc below, so check to be safe */
		if (found_key.offset == (u64)-1)
			goto out;

		/* search_forward needs a path with locks held, do the
		 * search again for the original key.  It is possible
		 * this will race with a balance and return a path that
		 * we could modify, but this drop is just an optimization
		 * and is allowed to miss some leaves.
		 */
		btrfs_release_path(root, path);
		found_key.offset++;

		/* setup a max key for search_forward */
		other_key.offset = (u64)-1;
		other_key.type = key.type;
		other_key.objectid = key.objectid;

		path->keep_locks = 1;
		ret = btrfs_search_forward(root, &found_key, &other_key,
					   path, 0, 0);
		path->keep_locks = 0;
		/* nothing more for this inode's csum items, we're done */
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			goto out;
		}

		/* restart the level 1 search from the key we just found */
		key.offset = found_key.offset;
		btrfs_release_path(root, path);
		cond_resched();
		goto again;
	}

	/* we know there's one more slot after us in the tree,
	 * read that key so we can verify it is also a checksum item
	 */
	btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);

	/* this leaf starts before our inode, it can't be dropped whole */
	if (found_key.objectid < inode->i_ino)
		goto next_key;

	/* the leaf has to start with one of our csum items past new_size */
	if (found_key.type != key.type || found_key.offset < new_size)
		goto next_key;

	/*
	 * if the key for the next leaf isn't a csum key from this objectid,
	 * we can't be sure there aren't good items inside this leaf.
	 * Bail out
	 */
	if (other_key.objectid != inode->i_ino || other_key.type != key.type)
		goto out;

	leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
	leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
	/*
	 * it is safe to delete this leaf, it contains only
	 * csum items from this inode at an offset >= new_size
	 */
	ret = btrfs_del_leaf(trans, root, path, leaf_start);
	BUG_ON(ret);

	/*
	 * for ref counted roots, record an empty leaf ref for leaves
	 * from an older generation than this transaction
	 */
	if (root->ref_cows && leaf_gen < trans->transid) {
		ref = btrfs_alloc_leaf_ref(root, 0);
		if (ref) {
			ref->root_gen = root->root_key.offset;
			ref->bytenr = leaf_start;
			ref->owner = 0;
			ref->generation = leaf_gen;
			ref->nritems = 0;

			ret = btrfs_add_leaf_ref(root, ref, 0);
			WARN_ON(ret);
			btrfs_free_leaf_ref(root, ref);
		} else {
			WARN_ON(1);
		}
	}
next_key:
	btrfs_release_path(root, path);

	/* keep going as long as the next leaf also starts with our csums */
	if (other_key.objectid == inode->i_ino &&
	    other_key.type == key.type && other_key.offset > key.offset) {
		key.offset = other_key.offset;
		cond_resched();
		goto again;
	}
	ret = 0;
out:
	/* fixup any changes we've made to the path */
	path->lowest_level = 0;
	path->keep_locks = 0;
	btrfs_release_path(root, path);
	return ret;
}

#endif
2470
2471 /*
2472  * this can truncate away extent items, csum items and directory items.
2473  * It starts at a high offset and removes keys until it can't find
2474  * any higher than new_size
2475  *
2476  * csum items that cross the new i_size are truncated to the new size
2477  * as well.
2478  *
2479  * min_type is the minimum key type to truncate down to.  If set to 0, this
2480  * will kill all the items on this inode, including the INODE_ITEM_KEY.
2481  */
2482 noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2483                                         struct btrfs_root *root,
2484                                         struct inode *inode,
2485                                         u64 new_size, u32 min_type)
2486 {
2487         int ret;
2488         struct btrfs_path *path;
2489         struct btrfs_key key;
2490         struct btrfs_key found_key;
2491         u32 found_type;
2492         struct extent_buffer *leaf;
2493         struct btrfs_file_extent_item *fi;
2494         u64 extent_start = 0;
2495         u64 extent_num_bytes = 0;
2496         u64 item_end = 0;
2497         u64 root_gen = 0;
2498         u64 root_owner = 0;
2499         int found_extent;
2500         int del_item;
2501         int pending_del_nr = 0;
2502         int pending_del_slot = 0;
2503         int extent_type = -1;
2504         int encoding;
2505         u64 mask = root->sectorsize - 1;
2506
2507         if (root->ref_cows)
2508                 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2509         path = btrfs_alloc_path();
2510         path->reada = -1;
2511         BUG_ON(!path);
2512
2513         /* FIXME, add redo link to tree so we don't leak on crash */
2514         key.objectid = inode->i_ino;
2515         key.offset = (u64)-1;
2516         key.type = (u8)-1;
2517
2518         btrfs_init_path(path);
2519
2520 search_again:
2521         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2522         if (ret < 0)
2523                 goto error;
2524
2525         if (ret > 0) {
2526                 /* there are no items in the tree for us to truncate, we're
2527                  * done
2528                  */
2529                 if (path->slots[0] == 0) {
2530                         ret = 0;
2531                         goto error;
2532                 }
2533                 path->slots[0]--;
2534         }
2535
2536         while (1) {
2537                 fi = NULL;
2538                 leaf = path->nodes[0];
2539                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2540                 found_type = btrfs_key_type(&found_key);
2541                 encoding = 0;
2542
2543                 if (found_key.objectid != inode->i_ino)
2544                         break;
2545
2546                 if (found_type < min_type)
2547                         break;
2548
2549                 item_end = found_key.offset;
2550                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2551                         fi = btrfs_item_ptr(leaf, path->slots[0],
2552                                             struct btrfs_file_extent_item);
2553                         extent_type = btrfs_file_extent_type(leaf, fi);
2554                         encoding = btrfs_file_extent_compression(leaf, fi);
2555                         encoding |= btrfs_file_extent_encryption(leaf, fi);
2556                         encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2557
2558                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2559                                 item_end +=
2560                                     btrfs_file_extent_num_bytes(leaf, fi);
2561                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2562                                 item_end += btrfs_file_extent_inline_len(leaf,
2563                                                                          fi);
2564                         }
2565                         item_end--;
2566                 }
2567                 if (item_end < new_size) {
2568                         if (found_type == BTRFS_DIR_ITEM_KEY)
2569                                 found_type = BTRFS_INODE_ITEM_KEY;
2570                         else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2571                                 found_type = BTRFS_EXTENT_DATA_KEY;
2572                         else if (found_type == BTRFS_EXTENT_DATA_KEY)
2573                                 found_type = BTRFS_XATTR_ITEM_KEY;
2574                         else if (found_type == BTRFS_XATTR_ITEM_KEY)
2575                                 found_type = BTRFS_INODE_REF_KEY;
2576                         else if (found_type)
2577                                 found_type--;
2578                         else
2579                                 break;
2580                         btrfs_set_key_type(&key, found_type);
2581                         goto next;
2582                 }
2583                 if (found_key.offset >= new_size)
2584                         del_item = 1;
2585                 else
2586                         del_item = 0;
2587                 found_extent = 0;
2588
2589                 /* FIXME, shrink the extent if the ref count is only 1 */
2590                 if (found_type != BTRFS_EXTENT_DATA_KEY)
2591                         goto delete;
2592
2593                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2594                         u64 num_dec;
2595                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2596                         if (!del_item && !encoding) {
2597                                 u64 orig_num_bytes =
2598                                         btrfs_file_extent_num_bytes(leaf, fi);
2599                                 extent_num_bytes = new_size -
2600                                         found_key.offset + root->sectorsize - 1;
2601                                 extent_num_bytes = extent_num_bytes &
2602                                         ~((u64)root->sectorsize - 1);
2603                                 btrfs_set_file_extent_num_bytes(leaf, fi,
2604                                                          extent_num_bytes);
2605                                 num_dec = (orig_num_bytes -
2606                                            extent_num_bytes);
2607                                 if (root->ref_cows && extent_start != 0)
2608                                         inode_sub_bytes(inode, num_dec);
2609                                 btrfs_mark_buffer_dirty(leaf);
2610                         } else {
2611                                 extent_num_bytes =
2612                                         btrfs_file_extent_disk_num_bytes(leaf,
2613                                                                          fi);
2614                                 /* FIXME blocksize != 4096 */
2615                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2616                                 if (extent_start != 0) {
2617                                         found_extent = 1;
2618                                         if (root->ref_cows)
2619                                                 inode_sub_bytes(inode, num_dec);
2620                                 }
2621                                 root_gen = btrfs_header_generation(leaf);
2622                                 root_owner = btrfs_header_owner(leaf);
2623                         }
2624                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2625                         /*
2626                          * we can't truncate inline items that have had
2627                          * special encodings
2628                          */
2629                         if (!del_item &&
2630                             btrfs_file_extent_compression(leaf, fi) == 0 &&
2631                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
2632                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2633                                 u32 size = new_size - found_key.offset;
2634
2635                                 if (root->ref_cows) {
2636                                         inode_sub_bytes(inode, item_end + 1 -
2637                                                         new_size);
2638                                 }
2639                                 size =
2640                                     btrfs_file_extent_calc_inline_size(size);
2641                                 ret = btrfs_truncate_item(trans, root, path,
2642                                                           size, 1);
2643                                 BUG_ON(ret);
2644                         } else if (root->ref_cows) {
2645                                 inode_sub_bytes(inode, item_end + 1 -
2646                                                 found_key.offset);
2647                         }
2648                 }
2649 delete:
2650                 if (del_item) {
2651                         if (!pending_del_nr) {
2652                                 /* no pending yet, add ourselves */
2653                                 pending_del_slot = path->slots[0];
2654                                 pending_del_nr = 1;
2655                         } else if (pending_del_nr &&
2656                                    path->slots[0] + 1 == pending_del_slot) {
2657                                 /* hop on the pending chunk */
2658                                 pending_del_nr++;
2659                                 pending_del_slot = path->slots[0];
2660                         } else {
2661                                 BUG();
2662                         }
2663                 } else {
2664                         break;
2665                 }
2666                 if (found_extent) {
2667                         ret = btrfs_free_extent(trans, root, extent_start,
2668                                                 extent_num_bytes,
2669                                                 leaf->start, root_owner,
2670                                                 root_gen, inode->i_ino, 0);
2671                         BUG_ON(ret);
2672                 }
2673 next:
2674                 if (path->slots[0] == 0) {
2675                         if (pending_del_nr)
2676                                 goto del_pending;
2677                         btrfs_release_path(root, path);
2678                         goto search_again;
2679                 }
2680
2681                 path->slots[0]--;
2682                 if (pending_del_nr &&
2683                     path->slots[0] + 1 != pending_del_slot) {
2684                         struct btrfs_key debug;
2685 del_pending:
2686                         btrfs_item_key_to_cpu(path->nodes[0], &debug,
2687                                               pending_del_slot);
2688                         ret = btrfs_del_items(trans, root, path,
2689                                               pending_del_slot,
2690                                               pending_del_nr);
2691                         BUG_ON(ret);
2692                         pending_del_nr = 0;
2693                         btrfs_release_path(root, path);
2694                         goto search_again;
2695                 }
2696         }
2697         ret = 0;
2698 error:
2699         if (pending_del_nr) {
2700                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2701                                       pending_del_nr);
2702         }
2703         btrfs_free_path(path);
2704         inode->i_sb->s_dirt = 1;
2705         return ret;
2706 }
2707
2708 /*
2709  * taken from block_truncate_page, but does cow as it zeros out
2710  * any bytes left in the last page in the file.
2711  */
2712 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2713 {
2714         struct inode *inode = mapping->host;
2715         struct btrfs_root *root = BTRFS_I(inode)->root;
2716         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2717         struct btrfs_ordered_extent *ordered;
2718         char *kaddr;
2719         u32 blocksize = root->sectorsize;
2720         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2721         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2722         struct page *page;
2723         int ret = 0;
2724         u64 page_start;
2725         u64 page_end;
2726
2727         if ((offset & (blocksize - 1)) == 0)
2728                 goto out;
2729
2730         ret = -ENOMEM;
2731 again:
2732         page = grab_cache_page(mapping, index);
2733         if (!page)
2734                 goto out;
2735
2736         page_start = page_offset(page);
2737         page_end = page_start + PAGE_CACHE_SIZE - 1;
2738
2739         if (!PageUptodate(page)) {
2740                 ret = btrfs_readpage(NULL, page);
2741                 lock_page(page);
2742                 if (page->mapping != mapping) {
2743                         unlock_page(page);
2744                         page_cache_release(page);
2745                         goto again;
2746                 }
2747                 if (!PageUptodate(page)) {
2748                         ret = -EIO;
2749                         goto out_unlock;
2750                 }
2751         }
2752         wait_on_page_writeback(page);
2753
2754         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2755         set_page_extent_mapped(page);
2756
2757         ordered = btrfs_lookup_ordered_extent(inode, page_start);
2758         if (ordered) {
2759                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2760                 unlock_page(page);
2761                 page_cache_release(page);
2762                 btrfs_start_ordered_extent(inode, ordered, 1);
2763                 btrfs_put_ordered_extent(ordered);
2764                 goto again;
2765         }
2766
2767         btrfs_set_extent_delalloc(inode, page_start, page_end);
2768         ret = 0;
2769         if (offset != PAGE_CACHE_SIZE) {
2770                 kaddr = kmap(page);
2771                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2772                 flush_dcache_page(page);
2773                 kunmap(page);
2774         }
2775         ClearPageChecked(page);
2776         set_page_dirty(page);
2777         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2778
2779 out_unlock:
2780         unlock_page(page);
2781         page_cache_release(page);
2782 out:
2783         return ret;
2784 }
2785
2786 int btrfs_cont_expand(struct inode *inode, loff_t size)
2787 {
2788         struct btrfs_trans_handle *trans;
2789         struct btrfs_root *root = BTRFS_I(inode)->root;
2790         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2791         struct extent_map *em;
2792         u64 mask = root->sectorsize - 1;
2793         u64 hole_start = (inode->i_size + mask) & ~mask;
2794         u64 block_end = (size + mask) & ~mask;
2795         u64 last_byte;
2796         u64 cur_offset;
2797         u64 hole_size;
2798         int err;
2799
2800         if (size <= hole_start)
2801                 return 0;
2802
2803         err = btrfs_check_free_space(root, 1, 0);
2804         if (err)
2805                 return err;
2806
2807         btrfs_truncate_page(inode->i_mapping, inode->i_size);
2808
2809         while (1) {
2810                 struct btrfs_ordered_extent *ordered;
2811                 btrfs_wait_ordered_range(inode, hole_start,
2812                                          block_end - hole_start);
2813                 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2814                 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2815                 if (!ordered)
2816                         break;
2817                 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2818                 btrfs_put_ordered_extent(ordered);
2819         }
2820
2821         trans = btrfs_start_transaction(root, 1);
2822         btrfs_set_trans_block_group(trans, inode);
2823
2824         cur_offset = hole_start;
2825         while (1) {
2826                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2827                                 block_end - cur_offset, 0);
2828                 BUG_ON(IS_ERR(em) || !em);
2829                 last_byte = min(extent_map_end(em), block_end);
2830                 last_byte = (last_byte + mask) & ~mask;
2831                 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2832                         u64 hint_byte = 0;
2833                         hole_size = last_byte - cur_offset;
2834                         err = btrfs_drop_extents(trans, root, inode,
2835                                                  cur_offset,
2836                                                  cur_offset + hole_size,
2837                                                  cur_offset, &hint_byte);
2838                         if (err)
2839                                 break;
2840                         err = btrfs_insert_file_extent(trans, root,
2841                                         inode->i_ino, cur_offset, 0,
2842                                         0, hole_size, 0, hole_size,
2843                                         0, 0, 0);
2844                         btrfs_drop_extent_cache(inode, hole_start,
2845                                         last_byte - 1, 0);
2846                 }
2847                 free_extent_map(em);
2848                 cur_offset = last_byte;
2849                 if (err || cur_offset >= block_end)
2850                         break;
2851         }
2852
2853         btrfs_end_transaction(trans, root);
2854         unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2855         return err;
2856 }
2857
2858 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2859 {
2860         struct inode *inode = dentry->d_inode;
2861         int err;
2862
2863         err = inode_change_ok(inode, attr);
2864         if (err)
2865                 return err;
2866
2867         if (S_ISREG(inode->i_mode) &&
2868             attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2869                 err = btrfs_cont_expand(inode, attr->ia_size);
2870                 if (err)
2871                         return err;
2872         }
2873
2874         err = inode_setattr(inode, attr);
2875
2876         if (!err && ((attr->ia_valid & ATTR_MODE)))
2877                 err = btrfs_acl_chmod(inode);
2878         return err;
2879 }
2880
2881 void btrfs_delete_inode(struct inode *inode)
2882 {
2883         struct btrfs_trans_handle *trans;
2884         struct btrfs_root *root = BTRFS_I(inode)->root;
2885         unsigned long nr;
2886         int ret;
2887
2888         truncate_inode_pages(&inode->i_data, 0);
2889         if (is_bad_inode(inode)) {
2890                 btrfs_orphan_del(NULL, inode);
2891                 goto no_delete;
2892         }
2893         btrfs_wait_ordered_range(inode, 0, (u64)-1);
2894
2895         btrfs_i_size_write(inode, 0);
2896         trans = btrfs_join_transaction(root, 1);
2897
2898         btrfs_set_trans_block_group(trans, inode);
2899         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2900         if (ret) {
2901                 btrfs_orphan_del(NULL, inode);
2902                 goto no_delete_lock;
2903         }
2904
2905         btrfs_orphan_del(trans, inode);
2906
2907         nr = trans->blocks_used;
2908         clear_inode(inode);
2909
2910         btrfs_end_transaction(trans, root);
2911         btrfs_btree_balance_dirty(root, nr);
2912         return;
2913
2914 no_delete_lock:
2915         nr = trans->blocks_used;
2916         btrfs_end_transaction(trans, root);
2917         btrfs_btree_balance_dirty(root, nr);
2918 no_delete:
2919         clear_inode(inode);
2920 }
2921
2922 /*
2923  * this returns the key found in the dir entry in the location pointer.
2924  * If no dir entries were found, location->objectid is 0.
2925  */
2926 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2927                                struct btrfs_key *location)
2928 {
2929         const char *name = dentry->d_name.name;
2930         int namelen = dentry->d_name.len;
2931         struct btrfs_dir_item *di;
2932         struct btrfs_path *path;
2933         struct btrfs_root *root = BTRFS_I(dir)->root;
2934         int ret = 0;
2935
2936         path = btrfs_alloc_path();
2937         BUG_ON(!path);
2938
2939         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2940                                     namelen, 0);
2941         if (IS_ERR(di))
2942                 ret = PTR_ERR(di);
2943
2944         if (!di || IS_ERR(di))
2945                 goto out_err;
2946
2947         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2948 out:
2949         btrfs_free_path(path);
2950         return ret;
2951 out_err:
2952         location->objectid = 0;
2953         goto out;
2954 }
2955
2956 /*
2957  * when we hit a tree root in a directory, the btrfs part of the inode
2958  * needs to be changed to reflect the root directory of the tree root.  This
2959  * is kind of like crossing a mount point.
2960  */
2961 static int fixup_tree_root_location(struct btrfs_root *root,
2962                              struct btrfs_key *location,
2963                              struct btrfs_root **sub_root,
2964                              struct dentry *dentry)
2965 {
2966         struct btrfs_root_item *ri;
2967
2968         if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2969                 return 0;
2970         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2971                 return 0;
2972
2973         *sub_root = btrfs_read_fs_root(root->fs_info, location,
2974                                         dentry->d_name.name,
2975                                         dentry->d_name.len);
2976         if (IS_ERR(*sub_root))
2977                 return PTR_ERR(*sub_root);
2978
2979         ri = &(*sub_root)->root_item;
2980         location->objectid = btrfs_root_dirid(ri);
2981         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2982         location->offset = 0;
2983
2984         return 0;
2985 }
2986
2987 static noinline void init_btrfs_i(struct inode *inode)
2988 {
2989         struct btrfs_inode *bi = BTRFS_I(inode);
2990
2991         bi->i_acl = NULL;
2992         bi->i_default_acl = NULL;
2993
2994         bi->generation = 0;
2995         bi->sequence = 0;
2996         bi->last_trans = 0;
2997         bi->logged_trans = 0;
2998         bi->delalloc_bytes = 0;
2999         bi->disk_i_size = 0;
3000         bi->flags = 0;
3001         bi->index_cnt = (u64)-1;
3002         bi->log_dirty_trans = 0;
3003         extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3004         extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3005                              inode->i_mapping, GFP_NOFS);
3006         extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3007                              inode->i_mapping, GFP_NOFS);
3008         INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3009         btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3010         mutex_init(&BTRFS_I(inode)->extent_mutex);
3011         mutex_init(&BTRFS_I(inode)->log_mutex);
3012 }
3013
3014 static int btrfs_init_locked_inode(struct inode *inode, void *p)
3015 {
3016         struct btrfs_iget_args *args = p;
3017         inode->i_ino = args->ino;
3018         init_btrfs_i(inode);
3019         BTRFS_I(inode)->root = args->root;
3020         return 0;
3021 }
3022
3023 static int btrfs_find_actor(struct inode *inode, void *opaque)
3024 {
3025         struct btrfs_iget_args *args = opaque;
3026         return args->ino == inode->i_ino &&
3027                 args->root == BTRFS_I(inode)->root;
3028 }
3029
3030 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3031                             struct btrfs_root *root, int wait)
3032 {
3033         struct inode *inode;
3034         struct btrfs_iget_args args;
3035         args.ino = objectid;
3036         args.root = root;
3037
3038         if (wait) {
3039                 inode = ilookup5(s, objectid, btrfs_find_actor,
3040                                  (void *)&args);
3041         } else {
3042                 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3043                                         (void *)&args);
3044         }
3045         return inode;
3046 }
3047
3048 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3049                                 struct btrfs_root *root)
3050 {
3051         struct inode *inode;
3052         struct btrfs_iget_args args;
3053         args.ino = objectid;
3054         args.root = root;
3055
3056         inode = iget5_locked(s, objectid, btrfs_find_actor,
3057                              btrfs_init_locked_inode,
3058                              (void *)&args);
3059         return inode;
3060 }
3061
3062 /* Get an inode object given its location and corresponding root.
3063  * Returns in *is_new if the inode was read from disk
3064  */
3065 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3066                          struct btrfs_root *root, int *is_new)
3067 {
3068         struct inode *inode;
3069
3070         inode = btrfs_iget_locked(s, location->objectid, root);
3071         if (!inode)
3072                 return ERR_PTR(-EACCES);
3073
3074         if (inode->i_state & I_NEW) {
3075                 BTRFS_I(inode)->root = root;
3076                 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3077                 btrfs_read_locked_inode(inode);
3078                 unlock_new_inode(inode);
3079                 if (is_new)
3080                         *is_new = 1;
3081         } else {
3082                 if (is_new)
3083                         *is_new = 0;
3084         }
3085
3086         return inode;
3087 }
3088
3089 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3090 {
3091         struct inode *inode;
3092         struct btrfs_inode *bi = BTRFS_I(dir);
3093         struct btrfs_root *root = bi->root;
3094         struct btrfs_root *sub_root = root;
3095         struct btrfs_key location;
3096         int ret, new;
3097
3098         if (dentry->d_name.len > BTRFS_NAME_LEN)
3099                 return ERR_PTR(-ENAMETOOLONG);
3100
3101         ret = btrfs_inode_by_name(dir, dentry, &location);
3102
3103         if (ret < 0)
3104                 return ERR_PTR(ret);
3105
3106         inode = NULL;
3107         if (location.objectid) {
3108                 ret = fixup_tree_root_location(root, &location, &sub_root,
3109                                                 dentry);
3110                 if (ret < 0)
3111                         return ERR_PTR(ret);
3112                 if (ret > 0)
3113                         return ERR_PTR(-ENOENT);
3114                 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3115                 if (IS_ERR(inode))
3116                         return ERR_CAST(inode);
3117         }
3118         return inode;
3119 }
3120
3121 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3122                                    struct nameidata *nd)
3123 {
3124         struct inode *inode;
3125
3126         if (dentry->d_name.len > BTRFS_NAME_LEN)
3127                 return ERR_PTR(-ENAMETOOLONG);
3128
3129         inode = btrfs_lookup_dentry(dir, dentry);
3130         if (IS_ERR(inode))
3131                 return ERR_CAST(inode);
3132
3133         return d_splice_alias(inode, dentry);
3134 }
3135
3136 static unsigned char btrfs_filetype_table[] = {
3137         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3138 };
3139
3140 static int btrfs_real_readdir(struct file *filp, void *dirent,
3141                               filldir_t filldir)
3142 {
3143         struct inode *inode = filp->f_dentry->d_inode;
3144         struct btrfs_root *root = BTRFS_I(inode)->root;
3145         struct btrfs_item *item;
3146         struct btrfs_dir_item *di;
3147         struct btrfs_key key;
3148         struct btrfs_key found_key;
3149         struct btrfs_path *path;
3150         int ret;
3151         u32 nritems;
3152         struct extent_buffer *leaf;
3153         int slot;
3154         int advance;
3155         unsigned char d_type;
3156         int over = 0;
3157         u32 di_cur;
3158         u32 di_total;
3159         u32 di_len;
3160         int key_type = BTRFS_DIR_INDEX_KEY;
3161         char tmp_name[32];
3162         char *name_ptr;
3163         int name_len;
3164
3165         /* FIXME, use a real flag for deciding about the key type */
3166         if (root->fs_info->tree_root == root)
3167                 key_type = BTRFS_DIR_ITEM_KEY;
3168
3169         /* special case for "." */
3170         if (filp->f_pos == 0) {
3171                 over = filldir(dirent, ".", 1,
3172                                1, inode->i_ino,
3173                                DT_DIR);
3174                 if (over)
3175                         return 0;
3176                 filp->f_pos = 1;
3177         }
3178         /* special case for .., just use the back ref */
3179         if (filp->f_pos == 1) {
3180                 u64 pino = parent_ino(filp->f_path.dentry);
3181                 over = filldir(dirent, "..", 2,
3182                                2, pino, DT_DIR);
3183                 if (over)
3184                         return 0;
3185                 filp->f_pos = 2;
3186         }
3187         path = btrfs_alloc_path();
3188         path->reada = 2;
3189
3190         btrfs_set_key_type(&key, key_type);
3191         key.offset = filp->f_pos;
3192         key.objectid = inode->i_ino;
3193
3194         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3195         if (ret < 0)
3196                 goto err;
3197         advance = 0;
3198
3199         while (1) {
3200                 leaf = path->nodes[0];
3201                 nritems = btrfs_header_nritems(leaf);
3202                 slot = path->slots[0];
3203                 if (advance || slot >= nritems) {
3204                         if (slot >= nritems - 1) {
3205                                 ret = btrfs_next_leaf(root, path);
3206                                 if (ret)
3207                                         break;
3208                                 leaf = path->nodes[0];
3209                                 nritems = btrfs_header_nritems(leaf);
3210                                 slot = path->slots[0];
3211                         } else {
3212                                 slot++;
3213                                 path->slots[0]++;
3214                         }
3215                 }
3216
3217                 advance = 1;
3218                 item = btrfs_item_nr(leaf, slot);
3219                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3220
3221                 if (found_key.objectid != key.objectid)
3222                         break;
3223                 if (btrfs_key_type(&found_key) != key_type)
3224                         break;
3225                 if (found_key.offset < filp->f_pos)
3226                         continue;
3227
3228                 filp->f_pos = found_key.offset;
3229
3230                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3231                 di_cur = 0;
3232                 di_total = btrfs_item_size(leaf, item);
3233
3234                 while (di_cur < di_total) {
3235                         struct btrfs_key location;
3236
3237                         name_len = btrfs_dir_name_len(leaf, di);
3238                         if (name_len <= sizeof(tmp_name)) {
3239                                 name_ptr = tmp_name;
3240                         } else {
3241                                 name_ptr = kmalloc(name_len, GFP_NOFS);
3242                                 if (!name_ptr) {
3243                                         ret = -ENOMEM;
3244                                         goto err;
3245                                 }
3246                         }
3247                         read_extent_buffer(leaf, name_ptr,
3248                                            (unsigned long)(di + 1), name_len);
3249
3250                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3251                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
3252
3253                         /* is this a reference to our own snapshot? If so
3254                          * skip it
3255                          */
3256                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
3257                             location.objectid == root->root_key.objectid) {
3258                                 over = 0;
3259                                 goto skip;
3260                         }
3261                         over = filldir(dirent, name_ptr, name_len,
3262                                        found_key.offset, location.objectid,
3263                                        d_type);
3264
3265 skip:
3266                         if (name_ptr != tmp_name)
3267                                 kfree(name_ptr);
3268
3269                         if (over)
3270                                 goto nopos;
3271                         di_len = btrfs_dir_name_len(leaf, di) +
3272                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
3273                         di_cur += di_len;
3274                         di = (struct btrfs_dir_item *)((char *)di + di_len);
3275                 }
3276         }
3277
3278         /* Reached end of directory/root. Bump pos past the last item. */
3279         if (key_type == BTRFS_DIR_INDEX_KEY)
3280                 filp->f_pos = INT_LIMIT(off_t);
3281         else
3282                 filp->f_pos++;
3283 nopos:
3284         ret = 0;
3285 err:
3286         btrfs_free_path(path);
3287         return ret;
3288 }
3289
3290 int btrfs_write_inode(struct inode *inode, int wait)
3291 {
3292         struct btrfs_root *root = BTRFS_I(inode)->root;
3293         struct btrfs_trans_handle *trans;
3294         int ret = 0;
3295
3296         if (root->fs_info->btree_inode == inode)
3297                 return 0;
3298
3299         if (wait) {
3300                 trans = btrfs_join_transaction(root, 1);
3301                 btrfs_set_trans_block_group(trans, inode);
3302                 ret = btrfs_commit_transaction(trans, root);
3303         }
3304         return ret;
3305 }
3306
3307 /*
3308  * This is somewhat expensive, updating the tree every time the
3309  * inode changes.  But, it is most likely to find the inode in cache.
3310  * FIXME, needs more benchmarking...there are no reasons other than performance
3311  * to keep or drop this code.
3312  */
3313 void btrfs_dirty_inode(struct inode *inode)
3314 {
3315         struct btrfs_root *root = BTRFS_I(inode)->root;
3316         struct btrfs_trans_handle *trans;
3317
3318         trans = btrfs_join_transaction(root, 1);
3319         btrfs_set_trans_block_group(trans, inode);
3320         btrfs_update_inode(trans, root, inode);
3321         btrfs_end_transaction(trans, root);
3322 }
3323
3324 /*
3325  * find the highest existing sequence number in a directory
3326  * and then set the in-memory index_cnt variable to reflect
3327  * free sequence numbers
3328  */
3329 static int btrfs_set_inode_index_count(struct inode *inode)
3330 {
3331         struct btrfs_root *root = BTRFS_I(inode)->root;
3332         struct btrfs_key key, found_key;
3333         struct btrfs_path *path;
3334         struct extent_buffer *leaf;
3335         int ret;
3336
3337         key.objectid = inode->i_ino;
3338         btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3339         key.offset = (u64)-1;
3340
3341         path = btrfs_alloc_path();
3342         if (!path)
3343                 return -ENOMEM;
3344
3345         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3346         if (ret < 0)
3347                 goto out;
3348         /* FIXME: we should be able to handle this */
3349         if (ret == 0)
3350                 goto out;
3351         ret = 0;
3352
3353         /*
3354          * MAGIC NUMBER EXPLANATION:
3355          * since we search a directory based on f_pos we have to start at 2
3356          * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3357          * else has to start at 2
3358          */
3359         if (path->slots[0] == 0) {
3360                 BTRFS_I(inode)->index_cnt = 2;
3361                 goto out;
3362         }
3363
3364         path->slots[0]--;
3365
3366         leaf = path->nodes[0];
3367         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3368
3369         if (found_key.objectid != inode->i_ino ||
3370             btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3371                 BTRFS_I(inode)->index_cnt = 2;
3372                 goto out;
3373         }
3374
3375         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3376 out:
3377         btrfs_free_path(path);
3378         return ret;
3379 }
3380
3381 /*
3382  * helper to find a free sequence number in a given directory.  This current
3383  * code is very simple, later versions will do smarter things in the btree
3384  */
3385 int btrfs_set_inode_index(struct inode *dir, u64 *index)
3386 {
3387         int ret = 0;
3388
3389         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3390                 ret = btrfs_set_inode_index_count(dir);
3391                 if (ret)
3392                         return ret;
3393         }
3394
3395         *index = BTRFS_I(dir)->index_cnt;
3396         BTRFS_I(dir)->index_cnt++;
3397
3398         return ret;
3399 }
3400
3401 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3402                                      struct btrfs_root *root,
3403                                      struct inode *dir,
3404                                      const char *name, int name_len,
3405                                      u64 ref_objectid, u64 objectid,
3406                                      u64 alloc_hint, int mode, u64 *index)
3407 {
3408         struct inode *inode;
3409         struct btrfs_inode_item *inode_item;
3410         struct btrfs_key *location;
3411         struct btrfs_path *path;
3412         struct btrfs_inode_ref *ref;
3413         struct btrfs_key key[2];
3414         u32 sizes[2];
3415         unsigned long ptr;
3416         int ret;
3417         int owner;
3418
3419         path = btrfs_alloc_path();
3420         BUG_ON(!path);
3421
3422         inode = new_inode(root->fs_info->sb);
3423         if (!inode)
3424                 return ERR_PTR(-ENOMEM);
3425
3426         if (dir) {
3427                 ret = btrfs_set_inode_index(dir, index);
3428                 if (ret)
3429                         return ERR_PTR(ret);
3430         }
3431         /*
3432          * index_cnt is ignored for everything but a dir,
3433          * btrfs_get_inode_index_count has an explanation for the magic
3434          * number
3435          */
3436         init_btrfs_i(inode);
3437         BTRFS_I(inode)->index_cnt = 2;
3438         BTRFS_I(inode)->root = root;
3439         BTRFS_I(inode)->generation = trans->transid;
3440
3441         if (mode & S_IFDIR)
3442                 owner = 0;
3443         else
3444                 owner = 1;
3445         BTRFS_I(inode)->block_group =
3446                         btrfs_find_block_group(root, 0, alloc_hint, owner);
3447         if ((mode & S_IFREG)) {
3448                 if (btrfs_test_opt(root, NODATASUM))
3449                         btrfs_set_flag(inode, NODATASUM);
3450                 if (btrfs_test_opt(root, NODATACOW))
3451                         btrfs_set_flag(inode, NODATACOW);
3452         }
3453
3454         key[0].objectid = objectid;
3455         btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3456         key[0].offset = 0;
3457
3458         key[1].objectid = objectid;
3459         btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3460         key[1].offset = ref_objectid;
3461
3462         sizes[0] = sizeof(struct btrfs_inode_item);
3463         sizes[1] = name_len + sizeof(*ref);
3464
3465         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3466         if (ret != 0)
3467                 goto fail;
3468
3469         if (objectid > root->highest_inode)
3470                 root->highest_inode = objectid;
3471
3472         inode->i_uid = current_fsuid();
3473         inode->i_gid = current_fsgid();
3474         inode->i_mode = mode;
3475         inode->i_ino = objectid;
3476         inode_set_bytes(inode, 0);
3477         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3478         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3479                                   struct btrfs_inode_item);
3480         fill_inode_item(trans, path->nodes[0], inode_item, inode);
3481
3482         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3483                              struct btrfs_inode_ref);
3484         btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3485         btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3486         ptr = (unsigned long)(ref + 1);
3487         write_extent_buffer(path->nodes[0], name, ptr, name_len);
3488
3489         btrfs_mark_buffer_dirty(path->nodes[0]);
3490         btrfs_free_path(path);
3491
3492         location = &BTRFS_I(inode)->location;
3493         location->objectid = objectid;
3494         location->offset = 0;
3495         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3496
3497         insert_inode_hash(inode);
3498         return inode;
3499 fail:
3500         if (dir)
3501                 BTRFS_I(dir)->index_cnt--;
3502         btrfs_free_path(path);
3503         return ERR_PTR(ret);
3504 }
3505
3506 static inline u8 btrfs_inode_type(struct inode *inode)
3507 {
3508         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3509 }
3510
3511 /*
3512  * utility function to add 'inode' into 'parent_inode' with
3513  * a give name and a given sequence number.
3514  * if 'add_backref' is true, also insert a backref from the
3515  * inode to the parent directory.
3516  */
3517 int btrfs_add_link(struct btrfs_trans_handle *trans,
3518                    struct inode *parent_inode, struct inode *inode,
3519                    const char *name, int name_len, int add_backref, u64 index)
3520 {
3521         int ret;
3522         struct btrfs_key key;
3523         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3524
3525         key.objectid = inode->i_ino;
3526         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3527         key.offset = 0;
3528
3529         ret = btrfs_insert_dir_item(trans, root, name, name_len,
3530                                     parent_inode->i_ino,
3531                                     &key, btrfs_inode_type(inode),
3532                                     index);
3533         if (ret == 0) {
3534                 if (add_backref) {
3535                         ret = btrfs_insert_inode_ref(trans, root,
3536                                                      name, name_len,
3537                                                      inode->i_ino,
3538                                                      parent_inode->i_ino,
3539                                                      index);
3540                 }
3541                 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3542                                    name_len * 2);
3543                 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3544                 ret = btrfs_update_inode(trans, root, parent_inode);
3545         }
3546         return ret;
3547 }
3548
3549 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3550                             struct dentry *dentry, struct inode *inode,
3551                             int backref, u64 index)
3552 {
3553         int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3554                                  inode, dentry->d_name.name,
3555                                  dentry->d_name.len, backref, index);
3556         if (!err) {
3557                 d_instantiate(dentry, inode);
3558                 return 0;
3559         }
3560         if (err > 0)
3561                 err = -EEXIST;
3562         return err;
3563 }
3564
3565 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3566                         int mode, dev_t rdev)
3567 {
3568         struct btrfs_trans_handle *trans;
3569         struct btrfs_root *root = BTRFS_I(dir)->root;
3570         struct inode *inode = NULL;
3571         int err;
3572         int drop_inode = 0;
3573         u64 objectid;
3574         unsigned long nr = 0;
3575         u64 index = 0;
3576
3577         if (!new_valid_dev(rdev))
3578                 return -EINVAL;
3579
3580         err = btrfs_check_free_space(root, 1, 0);
3581         if (err)
3582                 goto fail;
3583
3584         trans = btrfs_start_transaction(root, 1);
3585         btrfs_set_trans_block_group(trans, dir);
3586
3587         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3588         if (err) {
3589                 err = -ENOSPC;
3590                 goto out_unlock;
3591         }
3592
3593         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3594                                 dentry->d_name.len,
3595                                 dentry->d_parent->d_inode->i_ino, objectid,
3596                                 BTRFS_I(dir)->block_group, mode, &index);
3597         err = PTR_ERR(inode);
3598         if (IS_ERR(inode))
3599                 goto out_unlock;
3600
3601         err = btrfs_init_inode_security(inode, dir);
3602         if (err) {
3603                 drop_inode = 1;
3604                 goto out_unlock;
3605         }
3606
3607         btrfs_set_trans_block_group(trans, inode);
3608         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3609         if (err)
3610                 drop_inode = 1;
3611         else {
3612                 inode->i_op = &btrfs_special_inode_operations;
3613                 init_special_inode(inode, inode->i_mode, rdev);
3614                 btrfs_update_inode(trans, root, inode);
3615         }
3616         dir->i_sb->s_dirt = 1;
3617         btrfs_update_inode_block_group(trans, inode);
3618         btrfs_update_inode_block_group(trans, dir);
3619 out_unlock:
3620         nr = trans->blocks_used;
3621         btrfs_end_transaction_throttle(trans, root);
3622 fail:
3623         if (drop_inode) {
3624                 inode_dec_link_count(inode);
3625                 iput(inode);
3626         }
3627         btrfs_btree_balance_dirty(root, nr);
3628         return err;
3629 }
3630
3631 static int btrfs_create(struct inode *dir, struct dentry *dentry,
3632                         int mode, struct nameidata *nd)
3633 {
3634         struct btrfs_trans_handle *trans;
3635         struct btrfs_root *root = BTRFS_I(dir)->root;
3636         struct inode *inode = NULL;
3637         int err;
3638         int drop_inode = 0;
3639         unsigned long nr = 0;
3640         u64 objectid;
3641         u64 index = 0;
3642
3643         err = btrfs_check_free_space(root, 1, 0);
3644         if (err)
3645                 goto fail;
3646         trans = btrfs_start_transaction(root, 1);
3647         btrfs_set_trans_block_group(trans, dir);
3648
3649         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3650         if (err) {
3651                 err = -ENOSPC;
3652                 goto out_unlock;
3653         }
3654
3655         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3656                                 dentry->d_name.len,
3657                                 dentry->d_parent->d_inode->i_ino,
3658                                 objectid, BTRFS_I(dir)->block_group, mode,
3659                                 &index);
3660         err = PTR_ERR(inode);
3661         if (IS_ERR(inode))
3662                 goto out_unlock;
3663
3664         err = btrfs_init_inode_security(inode, dir);
3665         if (err) {
3666                 drop_inode = 1;
3667                 goto out_unlock;
3668         }
3669
3670         btrfs_set_trans_block_group(trans, inode);
3671         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3672         if (err)
3673                 drop_inode = 1;
3674         else {
3675                 inode->i_mapping->a_ops = &btrfs_aops;
3676                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3677                 inode->i_fop = &btrfs_file_operations;
3678                 inode->i_op = &btrfs_file_inode_operations;
3679                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3680         }
3681         dir->i_sb->s_dirt = 1;
3682         btrfs_update_inode_block_group(trans, inode);
3683         btrfs_update_inode_block_group(trans, dir);
3684 out_unlock:
3685         nr = trans->blocks_used;
3686         btrfs_end_transaction_throttle(trans, root);
3687 fail:
3688         if (drop_inode) {
3689                 inode_dec_link_count(inode);
3690                 iput(inode);
3691         }
3692         btrfs_btree_balance_dirty(root, nr);
3693         return err;
3694 }
3695
3696 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3697                       struct dentry *dentry)
3698 {
3699         struct btrfs_trans_handle *trans;
3700         struct btrfs_root *root = BTRFS_I(dir)->root;
3701         struct inode *inode = old_dentry->d_inode;
3702         u64 index;
3703         unsigned long nr = 0;
3704         int err;
3705         int drop_inode = 0;
3706
3707         if (inode->i_nlink == 0)
3708                 return -ENOENT;
3709
3710         btrfs_inc_nlink(inode);
3711         err = btrfs_check_free_space(root, 1, 0);
3712         if (err)
3713                 goto fail;
3714         err = btrfs_set_inode_index(dir, &index);
3715         if (err)
3716                 goto fail;
3717
3718         trans = btrfs_start_transaction(root, 1);
3719
3720         btrfs_set_trans_block_group(trans, dir);
3721         atomic_inc(&inode->i_count);
3722
3723         err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3724
3725         if (err)
3726                 drop_inode = 1;
3727
3728         dir->i_sb->s_dirt = 1;
3729         btrfs_update_inode_block_group(trans, dir);
3730         err = btrfs_update_inode(trans, root, inode);
3731
3732         if (err)
3733                 drop_inode = 1;
3734
3735         nr = trans->blocks_used;
3736         btrfs_end_transaction_throttle(trans, root);
3737 fail:
3738         if (drop_inode) {
3739                 inode_dec_link_count(inode);
3740                 iput(inode);
3741         }
3742         btrfs_btree_balance_dirty(root, nr);
3743         return err;
3744 }
3745
3746 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3747 {
3748         struct inode *inode = NULL;
3749         struct btrfs_trans_handle *trans;
3750         struct btrfs_root *root = BTRFS_I(dir)->root;
3751         int err = 0;
3752         int drop_on_err = 0;
3753         u64 objectid = 0;
3754         u64 index = 0;
3755         unsigned long nr = 1;
3756
3757         err = btrfs_check_free_space(root, 1, 0);
3758         if (err)
3759                 goto out_unlock;
3760
3761         trans = btrfs_start_transaction(root, 1);
3762         btrfs_set_trans_block_group(trans, dir);
3763
3764         if (IS_ERR(trans)) {
3765                 err = PTR_ERR(trans);
3766                 goto out_unlock;
3767         }
3768
3769         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3770         if (err) {
3771                 err = -ENOSPC;
3772                 goto out_unlock;
3773         }
3774
3775         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3776                                 dentry->d_name.len,
3777                                 dentry->d_parent->d_inode->i_ino, objectid,
3778                                 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3779                                 &index);
3780         if (IS_ERR(inode)) {
3781                 err = PTR_ERR(inode);
3782                 goto out_fail;
3783         }
3784
3785         drop_on_err = 1;
3786
3787         err = btrfs_init_inode_security(inode, dir);
3788         if (err)
3789                 goto out_fail;
3790
3791         inode->i_op = &btrfs_dir_inode_operations;
3792         inode->i_fop = &btrfs_dir_file_operations;
3793         btrfs_set_trans_block_group(trans, inode);
3794
3795         btrfs_i_size_write(inode, 0);
3796         err = btrfs_update_inode(trans, root, inode);
3797         if (err)
3798                 goto out_fail;
3799
3800         err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3801                                  inode, dentry->d_name.name,
3802                                  dentry->d_name.len, 0, index);
3803         if (err)
3804                 goto out_fail;
3805
3806         d_instantiate(dentry, inode);
3807         drop_on_err = 0;
3808         dir->i_sb->s_dirt = 1;
3809         btrfs_update_inode_block_group(trans, inode);
3810         btrfs_update_inode_block_group(trans, dir);
3811
3812 out_fail:
3813         nr = trans->blocks_used;
3814         btrfs_end_transaction_throttle(trans, root);
3815
3816 out_unlock:
3817         if (drop_on_err)
3818                 iput(inode);
3819         btrfs_btree_balance_dirty(root, nr);
3820         return err;
3821 }
3822
3823 /* helper for btfs_get_extent.  Given an existing extent in the tree,
3824  * and an extent that you want to insert, deal with overlap and insert
3825  * the new extent into the tree.
3826  */
3827 static int merge_extent_mapping(struct extent_map_tree *em_tree,
3828                                 struct extent_map *existing,
3829                                 struct extent_map *em,
3830                                 u64 map_start, u64 map_len)
3831 {
3832         u64 start_diff;
3833
3834         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3835         start_diff = map_start - em->start;
3836         em->start = map_start;
3837         em->len = map_len;
3838         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3839             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3840                 em->block_start += start_diff;
3841                 em->block_len -= start_diff;
3842         }
3843         return add_extent_mapping(em_tree, em);
3844 }
3845
3846 static noinline int uncompress_inline(struct btrfs_path *path,
3847                                       struct inode *inode, struct page *page,
3848                                       size_t pg_offset, u64 extent_offset,
3849                                       struct btrfs_file_extent_item *item)
3850 {
3851         int ret;
3852         struct extent_buffer *leaf = path->nodes[0];
3853         char *tmp;
3854         size_t max_size;
3855         unsigned long inline_size;
3856         unsigned long ptr;
3857
3858         WARN_ON(pg_offset != 0);
3859         max_size = btrfs_file_extent_ram_bytes(leaf, item);
3860         inline_size = btrfs_file_extent_inline_item_len(leaf,
3861                                         btrfs_item_nr(leaf, path->slots[0]));
3862         tmp = kmalloc(inline_size, GFP_NOFS);
3863         ptr = btrfs_file_extent_inline_start(item);
3864
3865         read_extent_buffer(leaf, tmp, ptr, inline_size);
3866
3867         max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3868         ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3869                                     inline_size, max_size);
3870         if (ret) {
3871                 char *kaddr = kmap_atomic(page, KM_USER0);
3872                 unsigned long copy_size = min_t(u64,
3873                                   PAGE_CACHE_SIZE - pg_offset,
3874                                   max_size - extent_offset);
3875                 memset(kaddr + pg_offset, 0, copy_size);
3876                 kunmap_atomic(kaddr, KM_USER0);
3877         }
3878         kfree(tmp);
3879         return 0;
3880 }
3881
3882 /*
3883  * a bit scary, this does extent mapping from logical file offset to the disk.
3884  * the ugly parts come from merging extents from the disk with the in-ram
3885  * representation.  This gets more complex because of the data=ordered code,
3886  * where the in-ram extents might be locked pending data=ordered completion.
3887  *
3888  * This also copies inline extents directly into the page.
3889  */
3890
3891 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3892                                     size_t pg_offset, u64 start, u64 len,
3893                                     int create)
3894 {
3895         int ret;
3896         int err = 0;
3897         u64 bytenr;
3898         u64 extent_start = 0;
3899         u64 extent_end = 0;
3900         u64 objectid = inode->i_ino;
3901         u32 found_type;
3902         struct btrfs_path *path = NULL;
3903         struct btrfs_root *root = BTRFS_I(inode)->root;
3904         struct btrfs_file_extent_item *item;
3905         struct extent_buffer *leaf;
3906         struct btrfs_key found_key;
3907         struct extent_map *em = NULL;
3908         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3909         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3910         struct btrfs_trans_handle *trans = NULL;
3911         int compressed;
3912
3913 again:
3914         spin_lock(&em_tree->lock);
3915         em = lookup_extent_mapping(em_tree, start, len);
3916         if (em)
3917                 em->bdev = root->fs_info->fs_devices->latest_bdev;
3918         spin_unlock(&em_tree->lock);
3919
3920         if (em) {
3921                 if (em->start > start || em->start + em->len <= start)
3922                         free_extent_map(em);
3923                 else if (em->block_start == EXTENT_MAP_INLINE && page)
3924                         free_extent_map(em);
3925                 else
3926                         goto out;
3927         }
3928         em = alloc_extent_map(GFP_NOFS);
3929         if (!em) {
3930                 err = -ENOMEM;
3931                 goto out;
3932         }
3933         em->bdev = root->fs_info->fs_devices->latest_bdev;
3934         em->start = EXTENT_MAP_HOLE;
3935         em->orig_start = EXTENT_MAP_HOLE;
3936         em->len = (u64)-1;
3937         em->block_len = (u64)-1;
3938
3939         if (!path) {
3940                 path = btrfs_alloc_path();
3941                 BUG_ON(!path);
3942         }
3943
3944         ret = btrfs_lookup_file_extent(trans, root, path,
3945                                        objectid, start, trans != NULL);
3946         if (ret < 0) {
3947                 err = ret;
3948                 goto out;
3949         }
3950
3951         if (ret != 0) {
3952                 if (path->slots[0] == 0)
3953                         goto not_found;
3954                 path->slots[0]--;
3955         }
3956
3957         leaf = path->nodes[0];
3958         item = btrfs_item_ptr(leaf, path->slots[0],
3959                               struct btrfs_file_extent_item);
3960         /* are we inside the extent that was found? */
3961         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3962         found_type = btrfs_key_type(&found_key);
3963         if (found_key.objectid != objectid ||
3964             found_type != BTRFS_EXTENT_DATA_KEY) {
3965                 goto not_found;
3966         }
3967
3968         found_type = btrfs_file_extent_type(leaf, item);
3969         extent_start = found_key.offset;
3970         compressed = btrfs_file_extent_compression(leaf, item);
3971         if (found_type == BTRFS_FILE_EXTENT_REG ||
3972             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3973                 extent_end = extent_start +
3974                        btrfs_file_extent_num_bytes(leaf, item);
3975         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3976                 size_t size;
3977                 size = btrfs_file_extent_inline_len(leaf, item);
3978                 extent_end = (extent_start + size + root->sectorsize - 1) &
3979                         ~((u64)root->sectorsize - 1);
3980         }
3981
3982         if (start >= extent_end) {
3983                 path->slots[0]++;
3984                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3985                         ret = btrfs_next_leaf(root, path);
3986                         if (ret < 0) {
3987                                 err = ret;
3988                                 goto out;
3989                         }
3990                         if (ret > 0)
3991                                 goto not_found;
3992                         leaf = path->nodes[0];
3993                 }
3994                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3995                 if (found_key.objectid != objectid ||
3996                     found_key.type != BTRFS_EXTENT_DATA_KEY)
3997                         goto not_found;
3998                 if (start + len <= found_key.offset)
3999                         goto not_found;
4000                 em->start = start;
4001                 em->len = found_key.offset - start;
4002                 goto not_found_em;
4003         }
4004
4005         if (found_type == BTRFS_FILE_EXTENT_REG ||
4006             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
4007                 em->start = extent_start;
4008                 em->len = extent_end - extent_start;
4009                 em->orig_start = extent_start -
4010                                  btrfs_file_extent_offset(leaf, item);
4011                 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4012                 if (bytenr == 0) {
4013                         em->block_start = EXTENT_MAP_HOLE;
4014                         goto insert;
4015                 }
4016                 if (compressed) {
4017                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4018                         em->block_start = bytenr;
4019                         em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4020                                                                          item);
4021                 } else {
4022                         bytenr += btrfs_file_extent_offset(leaf, item);
4023                         em->block_start = bytenr;
4024                         em->block_len = em->len;
4025                         if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4026                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4027                 }
4028                 goto insert;
4029         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4030                 unsigned long ptr;
4031                 char *map;
4032                 size_t size;
4033                 size_t extent_offset;
4034                 size_t copy_size;
4035
4036                 em->block_start = EXTENT_MAP_INLINE;
4037                 if (!page || create) {
4038                         em->start = extent_start;
4039                         em->len = extent_end - extent_start;
4040                         goto out;
4041                 }
4042
4043                 size = btrfs_file_extent_inline_len(leaf, item);
4044                 extent_offset = page_offset(page) + pg_offset - extent_start;
4045                 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4046                                 size - extent_offset);
4047                 em->start = extent_start + extent_offset;
4048                 em->len = (copy_size + root->sectorsize - 1) &
4049                         ~((u64)root->sectorsize - 1);
4050                 em->orig_start = EXTENT_MAP_INLINE;
4051                 if (compressed)
4052                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4053                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4054                 if (create == 0 && !PageUptodate(page)) {
4055                         if (btrfs_file_extent_compression(leaf, item) ==
4056                             BTRFS_COMPRESS_ZLIB) {
4057                                 ret = uncompress_inline(path, inode, page,
4058                                                         pg_offset,
4059                                                         extent_offset, item);
4060                                 BUG_ON(ret);
4061                         } else {
4062                                 map = kmap(page);
4063                                 read_extent_buffer(leaf, map + pg_offset, ptr,
4064                                                    copy_size);
4065                                 kunmap(page);
4066                         }
4067                         flush_dcache_page(page);
4068                 } else if (create && PageUptodate(page)) {
4069                         if (!trans) {
4070                                 kunmap(page);
4071                                 free_extent_map(em);
4072                                 em = NULL;
4073                                 btrfs_release_path(root, path);
4074                                 trans = btrfs_join_transaction(root, 1);
4075                                 goto again;
4076                         }
4077                         map = kmap(page);
4078                         write_extent_buffer(leaf, map + pg_offset, ptr,
4079                                             copy_size);
4080                         kunmap(page);
4081                         btrfs_mark_buffer_dirty(leaf);
4082                 }
4083                 set_extent_uptodate(io_tree, em->start,
4084                                     extent_map_end(em) - 1, GFP_NOFS);
4085                 goto insert;
4086         } else {
4087                 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4088                 WARN_ON(1);
4089         }
4090 not_found:
4091         em->start = start;
4092         em->len = len;
4093 not_found_em:
4094         em->block_start = EXTENT_MAP_HOLE;
4095         set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4096 insert:
4097         btrfs_release_path(root, path);
4098         if (em->start > start || extent_map_end(em) <= start) {
4099                 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4100                        "[%llu %llu]\n", (unsigned long long)em->start,
4101                        (unsigned long long)em->len,
4102                        (unsigned long long)start,
4103                        (unsigned long long)len);
4104                 err = -EIO;
4105                 goto out;
4106         }
4107
4108         err = 0;
4109         spin_lock(&em_tree->lock);
4110         ret = add_extent_mapping(em_tree, em);
4111         /* it is possible that someone inserted the extent into the tree
4112          * while we had the lock dropped.  It is also possible that
4113          * an overlapping map exists in the tree
4114          */
4115         if (ret == -EEXIST) {
4116                 struct extent_map *existing;
4117
4118                 ret = 0;
4119
4120                 existing = lookup_extent_mapping(em_tree, start, len);
4121                 if (existing && (existing->start > start ||
4122                     existing->start + existing->len <= start)) {
4123                         free_extent_map(existing);
4124                         existing = NULL;
4125                 }
4126                 if (!existing) {
4127                         existing = lookup_extent_mapping(em_tree, em->start,
4128                                                          em->len);
4129                         if (existing) {
4130                                 err = merge_extent_mapping(em_tree, existing,
4131                                                            em, start,
4132                                                            root->sectorsize);
4133                                 free_extent_map(existing);
4134                                 if (err) {
4135                                         free_extent_map(em);
4136                                         em = NULL;
4137                                 }
4138                         } else {
4139                                 err = -EIO;
4140                                 free_extent_map(em);
4141                                 em = NULL;
4142                         }
4143                 } else {
4144                         free_extent_map(em);
4145                         em = existing;
4146                         err = 0;
4147                 }
4148         }
4149         spin_unlock(&em_tree->lock);
4150 out:
4151         if (path)
4152                 btrfs_free_path(path);
4153         if (trans) {
4154                 ret = btrfs_end_transaction(trans, root);
4155                 if (!err)
4156                         err = ret;
4157         }
4158         if (err) {
4159                 free_extent_map(em);
4160                 WARN_ON(1);
4161                 return ERR_PTR(err);
4162         }
4163         return em;
4164 }
4165
4166 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4167                         const struct iovec *iov, loff_t offset,
4168                         unsigned long nr_segs)
4169 {
4170         return -EINVAL;
4171 }
4172
4173 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4174                 __u64 start, __u64 len)
4175 {
4176         return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4177 }
4178
4179 int btrfs_readpage(struct file *file, struct page *page)
4180 {
4181         struct extent_io_tree *tree;
4182         tree = &BTRFS_I(page->mapping->host)->io_tree;
4183         return extent_read_full_page(tree, page, btrfs_get_extent);
4184 }
4185
4186 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4187 {
4188         struct extent_io_tree *tree;
4189
4190
4191         if (current->flags & PF_MEMALLOC) {
4192                 redirty_page_for_writepage(wbc, page);
4193                 unlock_page(page);
4194                 return 0;
4195         }
4196         tree = &BTRFS_I(page->mapping->host)->io_tree;
4197         return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4198 }
4199
4200 int btrfs_writepages(struct address_space *mapping,
4201                      struct writeback_control *wbc)
4202 {
4203         struct extent_io_tree *tree;
4204
4205         tree = &BTRFS_I(mapping->host)->io_tree;
4206         return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4207 }
4208
4209 static int
4210 btrfs_readpages(struct file *file, struct address_space *mapping,
4211                 struct list_head *pages, unsigned nr_pages)
4212 {
4213         struct extent_io_tree *tree;
4214         tree = &BTRFS_I(mapping->host)->io_tree;
4215         return extent_readpages(tree, mapping, pages, nr_pages,
4216                                 btrfs_get_extent);
4217 }
4218 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4219 {
4220         struct extent_io_tree *tree;
4221         struct extent_map_tree *map;
4222         int ret;
4223
4224         tree = &BTRFS_I(page->mapping->host)->io_tree;
4225         map = &BTRFS_I(page->mapping->host)->extent_tree;
4226         ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4227         if (ret == 1) {
4228                 ClearPagePrivate(page);
4229                 set_page_private(page, 0);
4230                 page_cache_release(page);
4231         }
4232         return ret;
4233 }
4234
4235 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4236 {
4237         if (PageWriteback(page) || PageDirty(page))
4238                 return 0;
4239         return __btrfs_releasepage(page, gfp_flags);
4240 }
4241
4242 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4243 {
4244         struct extent_io_tree *tree;
4245         struct btrfs_ordered_extent *ordered;
4246         u64 page_start = page_offset(page);
4247         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4248
4249         wait_on_page_writeback(page);
4250         tree = &BTRFS_I(page->mapping->host)->io_tree;
4251         if (offset) {
4252                 btrfs_releasepage(page, GFP_NOFS);
4253                 return;
4254         }
4255
4256         lock_extent(tree, page_start, page_end, GFP_NOFS);
4257         ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4258                                            page_offset(page));
4259         if (ordered) {
4260                 /*
4261                  * IO on this page will never be started, so we need
4262                  * to account for any ordered extents now
4263                  */
4264                 clear_extent_bit(tree, page_start, page_end,
4265                                  EXTENT_DIRTY | EXTENT_DELALLOC |
4266                                  EXTENT_LOCKED, 1, 0, GFP_NOFS);
4267                 btrfs_finish_ordered_io(page->mapping->host,
4268                                         page_start, page_end);
4269                 btrfs_put_ordered_extent(ordered);
4270                 lock_extent(tree, page_start, page_end, GFP_NOFS);
4271         }
4272         clear_extent_bit(tree, page_start, page_end,
4273                  EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4274                  EXTENT_ORDERED,
4275                  1, 1, GFP_NOFS);
4276         __btrfs_releasepage(page, GFP_NOFS);
4277
4278         ClearPageChecked(page);
4279         if (PagePrivate(page)) {
4280                 ClearPagePrivate(page);
4281                 set_page_private(page, 0);
4282                 page_cache_release(page);
4283         }
4284 }
4285
4286 /*
4287  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4288  * called from a page fault handler when a page is first dirtied. Hence we must
4289  * be careful to check for EOF conditions here. We set the page up correctly
4290  * for a written page which means we get ENOSPC checking when writing into
4291  * holes and correct delalloc and unwritten extent mapping on filesystems that
4292  * support these features.
4293  *
4294  * We are not allowed to take the i_mutex here so we have to play games to
4295  * protect against truncate races as the page could now be beyond EOF.  Because
4296  * vmtruncate() writes the inode size before removing pages, once we have the
4297  * page lock we can determine safely if the page is beyond EOF. If it is not
4298  * beyond EOF, then the page is guaranteed safe against truncation until we
4299  * unlock the page.
4300  */
4301 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4302 {
4303         struct inode *inode = fdentry(vma->vm_file)->d_inode;
4304         struct btrfs_root *root = BTRFS_I(inode)->root;
4305         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4306         struct btrfs_ordered_extent *ordered;
4307         char *kaddr;
4308         unsigned long zero_start;
4309         loff_t size;
4310         int ret;
4311         u64 page_start;
4312         u64 page_end;
4313
4314         ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4315         if (ret)
4316                 goto out;
4317
4318         ret = -EINVAL;
4319 again:
4320         lock_page(page);
4321         size = i_size_read(inode);
4322         page_start = page_offset(page);
4323         page_end = page_start + PAGE_CACHE_SIZE - 1;
4324
4325         if ((page->mapping != inode->i_mapping) ||
4326             (page_start >= size)) {
4327                 /* page got truncated out from underneath us */
4328                 goto out_unlock;
4329         }
4330         wait_on_page_writeback(page);
4331
4332         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4333         set_page_extent_mapped(page);
4334
4335         /*
4336          * we can't set the delalloc bits if there are pending ordered
4337          * extents.  Drop our locks and wait for them to finish
4338          */
4339         ordered = btrfs_lookup_ordered_extent(inode, page_start);
4340         if (ordered) {
4341                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4342                 unlock_page(page);
4343                 btrfs_start_ordered_extent(inode, ordered, 1);
4344                 btrfs_put_ordered_extent(ordered);
4345                 goto again;
4346         }
4347
4348         btrfs_set_extent_delalloc(inode, page_start, page_end);
4349         ret = 0;
4350
4351         /* page is wholly or partially inside EOF */
4352         if (page_start + PAGE_CACHE_SIZE > size)
4353                 zero_start = size & ~PAGE_CACHE_MASK;
4354         else
4355                 zero_start = PAGE_CACHE_SIZE;
4356
4357         if (zero_start != PAGE_CACHE_SIZE) {
4358                 kaddr = kmap(page);
4359                 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4360                 flush_dcache_page(page);
4361                 kunmap(page);
4362         }
4363         ClearPageChecked(page);
4364         set_page_dirty(page);
4365         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4366
4367 out_unlock:
4368         unlock_page(page);
4369 out:
4370         return ret;
4371 }
4372
4373 static void btrfs_truncate(struct inode *inode)
4374 {
4375         struct btrfs_root *root = BTRFS_I(inode)->root;
4376         int ret;
4377         struct btrfs_trans_handle *trans;
4378         unsigned long nr;
4379         u64 mask = root->sectorsize - 1;
4380
4381         if (!S_ISREG(inode->i_mode))
4382                 return;
4383         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4384                 return;
4385
4386         btrfs_truncate_page(inode->i_mapping, inode->i_size);
4387         btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4388
4389         trans = btrfs_start_transaction(root, 1);
4390         btrfs_set_trans_block_group(trans, inode);
4391         btrfs_i_size_write(inode, inode->i_size);
4392
4393         ret = btrfs_orphan_add(trans, inode);
4394         if (ret)
4395                 goto out;
4396         /* FIXME, add redo link to tree so we don't leak on crash */
4397         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4398                                       BTRFS_EXTENT_DATA_KEY);
4399         btrfs_update_inode(trans, root, inode);
4400
4401         ret = btrfs_orphan_del(trans, inode);
4402         BUG_ON(ret);
4403
4404 out:
4405         nr = trans->blocks_used;
4406         ret = btrfs_end_transaction_throttle(trans, root);
4407         BUG_ON(ret);
4408         btrfs_btree_balance_dirty(root, nr);
4409 }
4410
4411 /*
4412  * create a new subvolume directory/inode (helper for the ioctl).
4413  */
4414 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4415                              struct btrfs_root *new_root, struct dentry *dentry,
4416                              u64 new_dirid, u64 alloc_hint)
4417 {
4418         struct inode *inode;
4419         int error;
4420         u64 index = 0;
4421
4422         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4423                                 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4424         if (IS_ERR(inode))
4425                 return PTR_ERR(inode);
4426         inode->i_op = &btrfs_dir_inode_operations;
4427         inode->i_fop = &btrfs_dir_file_operations;
4428
4429         inode->i_nlink = 1;
4430         btrfs_i_size_write(inode, 0);
4431
4432         error = btrfs_update_inode(trans, new_root, inode);
4433         if (error)
4434                 return error;
4435
4436         d_instantiate(dentry, inode);
4437         return 0;
4438 }
4439
4440 /* helper function for file defrag and space balancing.  This
4441  * forces readahead on a given range of bytes in an inode
4442  */
4443 unsigned long btrfs_force_ra(struct address_space *mapping,
4444                               struct file_ra_state *ra, struct file *file,
4445                               pgoff_t offset, pgoff_t last_index)
4446 {
4447         pgoff_t req_size = last_index - offset + 1;
4448
4449         page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4450         return offset + req_size;
4451 }
4452
4453 struct inode *btrfs_alloc_inode(struct super_block *sb)
4454 {
4455         struct btrfs_inode *ei;
4456
4457         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4458         if (!ei)
4459                 return NULL;
4460         ei->last_trans = 0;
4461         ei->logged_trans = 0;
4462         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4463         ei->i_acl = BTRFS_ACL_NOT_CACHED;
4464         ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4465         INIT_LIST_HEAD(&ei->i_orphan);
4466         return &ei->vfs_inode;
4467 }
4468
4469 void btrfs_destroy_inode(struct inode *inode)
4470 {
4471         struct btrfs_ordered_extent *ordered;
4472         WARN_ON(!list_empty(&inode->i_dentry));
4473         WARN_ON(inode->i_data.nrpages);
4474
4475         if (BTRFS_I(inode)->i_acl &&
4476             BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4477                 posix_acl_release(BTRFS_I(inode)->i_acl);
4478         if (BTRFS_I(inode)->i_default_acl &&
4479             BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4480                 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4481
4482         spin_lock(&BTRFS_I(inode)->root->list_lock);
4483         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4484                 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4485                        " list\n", inode->i_ino);
4486                 dump_stack();
4487         }
4488         spin_unlock(&BTRFS_I(inode)->root->list_lock);
4489
4490         while (1) {
4491                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4492                 if (!ordered)
4493                         break;
4494                 else {
4495                         printk(KERN_ERR "btrfs found ordered "
4496                                "extent %llu %llu on inode cleanup\n",
4497                                (unsigned long long)ordered->file_offset,
4498                                (unsigned long long)ordered->len);
4499                         btrfs_remove_ordered_extent(inode, ordered);
4500                         btrfs_put_ordered_extent(ordered);
4501                         btrfs_put_ordered_extent(ordered);
4502                 }
4503         }
4504         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4505         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4506 }
4507
4508 static void init_once(void *foo)
4509 {
4510         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4511
4512         inode_init_once(&ei->vfs_inode);
4513 }
4514
4515 void btrfs_destroy_cachep(void)
4516 {
4517         if (btrfs_inode_cachep)
4518                 kmem_cache_destroy(btrfs_inode_cachep);
4519         if (btrfs_trans_handle_cachep)
4520                 kmem_cache_destroy(btrfs_trans_handle_cachep);
4521         if (btrfs_transaction_cachep)
4522                 kmem_cache_destroy(btrfs_transaction_cachep);
4523         if (btrfs_bit_radix_cachep)
4524                 kmem_cache_destroy(btrfs_bit_radix_cachep);
4525         if (btrfs_path_cachep)
4526                 kmem_cache_destroy(btrfs_path_cachep);
4527 }
4528
4529 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4530                                        unsigned long extra_flags,
4531                                        void (*ctor)(void *))
4532 {
4533         return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4534                                  SLAB_MEM_SPREAD | extra_flags), ctor);
4535 }
4536
4537 int btrfs_init_cachep(void)
4538 {
4539         btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4540                                           sizeof(struct btrfs_inode),
4541                                           0, init_once);
4542         if (!btrfs_inode_cachep)
4543                 goto fail;
4544         btrfs_trans_handle_cachep =
4545                         btrfs_cache_create("btrfs_trans_handle_cache",
4546                                            sizeof(struct btrfs_trans_handle),
4547                                            0, NULL);
4548         if (!btrfs_trans_handle_cachep)
4549                 goto fail;
4550         btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4551                                              sizeof(struct btrfs_transaction),
4552                                              0, NULL);
4553         if (!btrfs_transaction_cachep)
4554                 goto fail;
4555         btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4556                                          sizeof(struct btrfs_path),
4557                                          0, NULL);
4558         if (!btrfs_path_cachep)
4559                 goto fail;
4560         btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4561                                               SLAB_DESTROY_BY_RCU, NULL);
4562         if (!btrfs_bit_radix_cachep)
4563                 goto fail;
4564         return 0;
4565 fail:
4566         btrfs_destroy_cachep();
4567         return -ENOMEM;
4568 }
4569
4570 static int btrfs_getattr(struct vfsmount *mnt,
4571                          struct dentry *dentry, struct kstat *stat)
4572 {
4573         struct inode *inode = dentry->d_inode;
4574         generic_fillattr(inode, stat);
4575         stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4576         stat->blksize = PAGE_CACHE_SIZE;
4577         stat->blocks = (inode_get_bytes(inode) +
4578                         BTRFS_I(inode)->delalloc_bytes) >> 9;
4579         return 0;
4580 }
4581
4582 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4583                            struct inode *new_dir, struct dentry *new_dentry)
4584 {
4585         struct btrfs_trans_handle *trans;
4586         struct btrfs_root *root = BTRFS_I(old_dir)->root;
4587         struct inode *new_inode = new_dentry->d_inode;
4588         struct inode *old_inode = old_dentry->d_inode;
4589         struct timespec ctime = CURRENT_TIME;
4590         u64 index = 0;
4591         int ret;
4592
4593         /* we're not allowed to rename between subvolumes */
4594         if (BTRFS_I(old_inode)->root->root_key.objectid !=
4595             BTRFS_I(new_dir)->root->root_key.objectid)
4596                 return -EXDEV;
4597
4598         if (S_ISDIR(old_inode->i_mode) && new_inode &&
4599             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4600                 return -ENOTEMPTY;
4601         }
4602
4603         /* to rename a snapshot or subvolume, we need to juggle the
4604          * backrefs.  This isn't coded yet
4605          */
4606         if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4607                 return -EXDEV;
4608
4609         ret = btrfs_check_free_space(root, 1, 0);
4610         if (ret)
4611                 goto out_unlock;
4612
4613         trans = btrfs_start_transaction(root, 1);
4614
4615         btrfs_set_trans_block_group(trans, new_dir);
4616
4617         btrfs_inc_nlink(old_dentry->d_inode);
4618         old_dir->i_ctime = old_dir->i_mtime = ctime;
4619         new_dir->i_ctime = new_dir->i_mtime = ctime;
4620         old_inode->i_ctime = ctime;
4621
4622         ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4623                                  old_dentry->d_name.name,
4624                                  old_dentry->d_name.len);
4625         if (ret)
4626                 goto out_fail;
4627
4628         if (new_inode) {
4629                 new_inode->i_ctime = CURRENT_TIME;
4630                 ret = btrfs_unlink_inode(trans, root, new_dir,
4631                                          new_dentry->d_inode,
4632                                          new_dentry->d_name.name,
4633                                          new_dentry->d_name.len);
4634                 if (ret)
4635                         goto out_fail;
4636                 if (new_inode->i_nlink == 0) {
4637                         ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4638                         if (ret)
4639                                 goto out_fail;
4640                 }
4641
4642         }
4643         ret = btrfs_set_inode_index(new_dir, &index);
4644         if (ret)
4645                 goto out_fail;
4646
4647         ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4648                              old_inode, new_dentry->d_name.name,
4649                              new_dentry->d_name.len, 1, index);
4650         if (ret)
4651                 goto out_fail;
4652
4653 out_fail:
4654         btrfs_end_transaction_throttle(trans, root);
4655 out_unlock:
4656         return ret;
4657 }
4658
4659 /*
4660  * some fairly slow code that needs optimization. This walks the list
4661  * of all the inodes with pending delalloc and forces them to disk.
4662  */
4663 int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4664 {
4665         struct list_head *head = &root->fs_info->delalloc_inodes;
4666         struct btrfs_inode *binode;
4667         struct inode *inode;
4668
4669         if (root->fs_info->sb->s_flags & MS_RDONLY)
4670                 return -EROFS;
4671
4672         spin_lock(&root->fs_info->delalloc_lock);
4673         while (!list_empty(head)) {
4674                 binode = list_entry(head->next, struct btrfs_inode,
4675                                     delalloc_inodes);
4676                 inode = igrab(&binode->vfs_inode);
4677                 if (!inode)
4678                         list_del_init(&binode->delalloc_inodes);
4679                 spin_unlock(&root->fs_info->delalloc_lock);
4680                 if (inode) {
4681                         filemap_flush(inode->i_mapping);
4682                         iput(inode);
4683                 }
4684                 cond_resched();
4685                 spin_lock(&root->fs_info->delalloc_lock);
4686         }
4687         spin_unlock(&root->fs_info->delalloc_lock);
4688
4689         /* the filemap_flush will queue IO into the worker threads, but
4690          * we have to make sure the IO is actually started and that
4691          * ordered extents get created before we return
4692          */
4693         atomic_inc(&root->fs_info->async_submit_draining);
4694         while (atomic_read(&root->fs_info->nr_async_submits) ||
4695               atomic_read(&root->fs_info->async_delalloc_pages)) {
4696                 wait_event(root->fs_info->async_submit_wait,
4697                    (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4698                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4699         }
4700         atomic_dec(&root->fs_info->async_submit_draining);
4701         return 0;
4702 }
4703
4704 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4705                          const char *symname)
4706 {
4707         struct btrfs_trans_handle *trans;
4708         struct btrfs_root *root = BTRFS_I(dir)->root;
4709         struct btrfs_path *path;
4710         struct btrfs_key key;
4711         struct inode *inode = NULL;
4712         int err;
4713         int drop_inode = 0;
4714         u64 objectid;
4715         u64 index = 0 ;
4716         int name_len;
4717         int datasize;
4718         unsigned long ptr;
4719         struct btrfs_file_extent_item *ei;
4720         struct extent_buffer *leaf;
4721         unsigned long nr = 0;
4722
4723         name_len = strlen(symname) + 1;
4724         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4725                 return -ENAMETOOLONG;
4726
4727         err = btrfs_check_free_space(root, 1, 0);
4728         if (err)
4729                 goto out_fail;
4730
4731         trans = btrfs_start_transaction(root, 1);
4732         btrfs_set_trans_block_group(trans, dir);
4733
4734         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4735         if (err) {
4736                 err = -ENOSPC;
4737                 goto out_unlock;
4738         }
4739
4740         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4741                                 dentry->d_name.len,
4742                                 dentry->d_parent->d_inode->i_ino, objectid,
4743                                 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4744                                 &index);
4745         err = PTR_ERR(inode);
4746         if (IS_ERR(inode))
4747                 goto out_unlock;
4748
4749         err = btrfs_init_inode_security(inode, dir);
4750         if (err) {
4751                 drop_inode = 1;
4752                 goto out_unlock;
4753         }
4754
4755         btrfs_set_trans_block_group(trans, inode);
4756         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4757         if (err)
4758                 drop_inode = 1;
4759         else {
4760                 inode->i_mapping->a_ops = &btrfs_aops;
4761                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4762                 inode->i_fop = &btrfs_file_operations;
4763                 inode->i_op = &btrfs_file_inode_operations;
4764                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4765         }
4766         dir->i_sb->s_dirt = 1;
4767         btrfs_update_inode_block_group(trans, inode);
4768         btrfs_update_inode_block_group(trans, dir);
4769         if (drop_inode)
4770                 goto out_unlock;
4771
4772         path = btrfs_alloc_path();
4773         BUG_ON(!path);
4774         key.objectid = inode->i_ino;
4775         key.offset = 0;
4776         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4777         datasize = btrfs_file_extent_calc_inline_size(name_len);
4778         err = btrfs_insert_empty_item(trans, root, path, &key,
4779                                       datasize);
4780         if (err) {
4781                 drop_inode = 1;
4782                 goto out_unlock;
4783         }
4784         leaf = path->nodes[0];
4785         ei = btrfs_item_ptr(leaf, path->slots[0],
4786                             struct btrfs_file_extent_item);
4787         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4788         btrfs_set_file_extent_type(leaf, ei,
4789                                    BTRFS_FILE_EXTENT_INLINE);
4790         btrfs_set_file_extent_encryption(leaf, ei, 0);
4791         btrfs_set_file_extent_compression(leaf, ei, 0);
4792         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4793         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4794
4795         ptr = btrfs_file_extent_inline_start(ei);
4796         write_extent_buffer(leaf, symname, ptr, name_len);
4797         btrfs_mark_buffer_dirty(leaf);
4798         btrfs_free_path(path);
4799
4800         inode->i_op = &btrfs_symlink_inode_operations;
4801         inode->i_mapping->a_ops = &btrfs_symlink_aops;
4802         inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4803         inode_set_bytes(inode, name_len);
4804         btrfs_i_size_write(inode, name_len - 1);
4805         err = btrfs_update_inode(trans, root, inode);
4806         if (err)
4807                 drop_inode = 1;
4808
4809 out_unlock:
4810         nr = trans->blocks_used;
4811         btrfs_end_transaction_throttle(trans, root);
4812 out_fail:
4813         if (drop_inode) {
4814                 inode_dec_link_count(inode);
4815                 iput(inode);
4816         }
4817         btrfs_btree_balance_dirty(root, nr);
4818         return err;
4819 }
4820
4821 static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4822                                u64 alloc_hint, int mode)
4823 {
4824         struct btrfs_trans_handle *trans;
4825         struct btrfs_root *root = BTRFS_I(inode)->root;
4826         struct btrfs_key ins;
4827         u64 alloc_size;
4828         u64 cur_offset = start;
4829         u64 num_bytes = end - start;
4830         int ret = 0;
4831
4832         trans = btrfs_join_transaction(root, 1);
4833         BUG_ON(!trans);
4834         btrfs_set_trans_block_group(trans, inode);
4835
4836         while (num_bytes > 0) {
4837                 alloc_size = min(num_bytes, root->fs_info->max_extent);
4838                 ret = btrfs_reserve_extent(trans, root, alloc_size,
4839                                            root->sectorsize, 0, alloc_hint,
4840                                            (u64)-1, &ins, 1);
4841                 if (ret) {
4842                         WARN_ON(1);
4843                         goto out;
4844                 }
4845                 ret = insert_reserved_file_extent(trans, inode,
4846                                                   cur_offset, ins.objectid,
4847                                                   ins.offset, ins.offset,
4848                                                   ins.offset, 0, 0, 0,
4849                                                   BTRFS_FILE_EXTENT_PREALLOC);
4850                 BUG_ON(ret);
4851                 num_bytes -= ins.offset;
4852                 cur_offset += ins.offset;
4853                 alloc_hint = ins.objectid + ins.offset;
4854         }
4855 out:
4856         if (cur_offset > start) {
4857                 inode->i_ctime = CURRENT_TIME;
4858                 btrfs_set_flag(inode, PREALLOC);
4859                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4860                     cur_offset > i_size_read(inode))
4861                         btrfs_i_size_write(inode, cur_offset);
4862                 ret = btrfs_update_inode(trans, root, inode);
4863                 BUG_ON(ret);
4864         }
4865
4866         btrfs_end_transaction(trans, root);
4867         return ret;
4868 }
4869
4870 static long btrfs_fallocate(struct inode *inode, int mode,
4871                             loff_t offset, loff_t len)
4872 {
4873         u64 cur_offset;
4874         u64 last_byte;
4875         u64 alloc_start;
4876         u64 alloc_end;
4877         u64 alloc_hint = 0;
4878         u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4879         struct extent_map *em;
4880         int ret;
4881
4882         alloc_start = offset & ~mask;
4883         alloc_end =  (offset + len + mask) & ~mask;
4884
4885         mutex_lock(&inode->i_mutex);
4886         if (alloc_start > inode->i_size) {
4887                 ret = btrfs_cont_expand(inode, alloc_start);
4888                 if (ret)
4889                         goto out;
4890         }
4891
4892         while (1) {
4893                 struct btrfs_ordered_extent *ordered;
4894                 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4895                             alloc_end - 1, GFP_NOFS);
4896                 ordered = btrfs_lookup_first_ordered_extent(inode,
4897                                                             alloc_end - 1);
4898                 if (ordered &&
4899                     ordered->file_offset + ordered->len > alloc_start &&
4900                     ordered->file_offset < alloc_end) {
4901                         btrfs_put_ordered_extent(ordered);
4902                         unlock_extent(&BTRFS_I(inode)->io_tree,
4903                                       alloc_start, alloc_end - 1, GFP_NOFS);
4904                         btrfs_wait_ordered_range(inode, alloc_start,
4905                                                  alloc_end - alloc_start);
4906                 } else {
4907                         if (ordered)
4908                                 btrfs_put_ordered_extent(ordered);
4909                         break;
4910                 }
4911         }
4912
4913         cur_offset = alloc_start;
4914         while (1) {
4915                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4916                                       alloc_end - cur_offset, 0);
4917                 BUG_ON(IS_ERR(em) || !em);
4918                 last_byte = min(extent_map_end(em), alloc_end);
4919                 last_byte = (last_byte + mask) & ~mask;
4920                 if (em->block_start == EXTENT_MAP_HOLE) {
4921                         ret = prealloc_file_range(inode, cur_offset,
4922                                         last_byte, alloc_hint, mode);
4923                         if (ret < 0) {
4924                                 free_extent_map(em);
4925                                 break;
4926                         }
4927                 }
4928                 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4929                         alloc_hint = em->block_start;
4930                 free_extent_map(em);
4931
4932                 cur_offset = last_byte;
4933                 if (cur_offset >= alloc_end) {
4934                         ret = 0;
4935                         break;
4936                 }
4937         }
4938         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4939                       GFP_NOFS);
4940 out:
4941         mutex_unlock(&inode->i_mutex);
4942         return ret;
4943 }
4944
/*
 * set_page_dirty callback used by btrfs_aops: hands the page straight
 * to __set_page_dirty_nobuffers() and returns its result.
 */
static int btrfs_set_page_dirty(struct page *page)
{
        return __set_page_dirty_nobuffers(page);
}
4949
4950 static int btrfs_permission(struct inode *inode, int mask)
4951 {
4952         if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4953                 return -EACCES;
4954         return generic_permission(inode, mask, btrfs_check_acl);
4955 }
4956
4957 static struct inode_operations btrfs_dir_inode_operations = {
4958         .getattr        = btrfs_getattr,
4959         .lookup         = btrfs_lookup,
4960         .create         = btrfs_create,
4961         .unlink         = btrfs_unlink,
4962         .link           = btrfs_link,
4963         .mkdir          = btrfs_mkdir,
4964         .rmdir          = btrfs_rmdir,
4965         .rename         = btrfs_rename,
4966         .symlink        = btrfs_symlink,
4967         .setattr        = btrfs_setattr,
4968         .mknod          = btrfs_mknod,
4969         .setxattr       = btrfs_setxattr,
4970         .getxattr       = btrfs_getxattr,
4971         .listxattr      = btrfs_listxattr,
4972         .removexattr    = btrfs_removexattr,
4973         .permission     = btrfs_permission,
4974 };
4975 static struct inode_operations btrfs_dir_ro_inode_operations = {
4976         .lookup         = btrfs_lookup,
4977         .permission     = btrfs_permission,
4978 };
4979 static struct file_operations btrfs_dir_file_operations = {
4980         .llseek         = generic_file_llseek,
4981         .read           = generic_read_dir,
4982         .readdir        = btrfs_real_readdir,
4983         .unlocked_ioctl = btrfs_ioctl,
4984 #ifdef CONFIG_COMPAT
4985         .compat_ioctl   = btrfs_ioctl,
4986 #endif
4987         .release        = btrfs_release_file,
4988         .fsync          = btrfs_sync_file,
4989 };
4990
4991 static struct extent_io_ops btrfs_extent_io_ops = {
4992         .fill_delalloc = run_delalloc_range,
4993         .submit_bio_hook = btrfs_submit_bio_hook,
4994         .merge_bio_hook = btrfs_merge_bio_hook,
4995         .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4996         .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4997         .writepage_start_hook = btrfs_writepage_start_hook,
4998         .readpage_io_failed_hook = btrfs_io_failed_hook,
4999         .set_bit_hook = btrfs_set_bit_hook,
5000         .clear_bit_hook = btrfs_clear_bit_hook,
5001 };
5002
5003 /*
5004  * btrfs doesn't support the bmap operation because swapfiles
5005  * use bmap to make a mapping of extents in the file.  They assume
5006  * these extents won't change over the life of the file and they
5007  * use the bmap result to do IO directly to the drive.
5008  *
5009  * the btrfs bmap call would return logical addresses that aren't
5010  * suitable for IO and they also will change frequently as COW
5011  * operations happen.  So, swapfile + btrfs == corruption.
5012  *
5013  * For now we're avoiding this by dropping bmap.
5014  */
5015 static struct address_space_operations btrfs_aops = {
5016         .readpage       = btrfs_readpage,
5017         .writepage      = btrfs_writepage,
5018         .writepages     = btrfs_writepages,
5019         .readpages      = btrfs_readpages,
5020         .sync_page      = block_sync_page,
5021         .direct_IO      = btrfs_direct_IO,
5022         .invalidatepage = btrfs_invalidatepage,
5023         .releasepage    = btrfs_releasepage,
5024         .set_page_dirty = btrfs_set_page_dirty,
5025 };
5026
5027 static struct address_space_operations btrfs_symlink_aops = {
5028         .readpage       = btrfs_readpage,
5029         .writepage      = btrfs_writepage,
5030         .invalidatepage = btrfs_invalidatepage,
5031         .releasepage    = btrfs_releasepage,
5032 };
5033
5034 static struct inode_operations btrfs_file_inode_operations = {
5035         .truncate       = btrfs_truncate,
5036         .getattr        = btrfs_getattr,
5037         .setattr        = btrfs_setattr,
5038         .setxattr       = btrfs_setxattr,
5039         .getxattr       = btrfs_getxattr,
5040         .listxattr      = btrfs_listxattr,
5041         .removexattr    = btrfs_removexattr,
5042         .permission     = btrfs_permission,
5043         .fallocate      = btrfs_fallocate,
5044         .fiemap         = btrfs_fiemap,
5045 };
5046 static struct inode_operations btrfs_special_inode_operations = {
5047         .getattr        = btrfs_getattr,
5048         .setattr        = btrfs_setattr,
5049         .permission     = btrfs_permission,
5050         .setxattr       = btrfs_setxattr,
5051         .getxattr       = btrfs_getxattr,
5052         .listxattr      = btrfs_listxattr,
5053         .removexattr    = btrfs_removexattr,
5054 };
5055 static struct inode_operations btrfs_symlink_inode_operations = {
5056         .readlink       = generic_readlink,
5057         .follow_link    = page_follow_link_light,
5058         .put_link       = page_put_link,
5059         .permission     = btrfs_permission,
5060         .setxattr       = btrfs_setxattr,
5061         .getxattr       = btrfs_getxattr,
5062         .listxattr      = btrfs_listxattr,
5063         .removexattr    = btrfs_removexattr,
5064 };