fs/btrfs/extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20 #include "volumes.h"
21 #include "check-integrity.h"
22
23 static struct kmem_cache *extent_state_cache;
24 static struct kmem_cache *extent_buffer_cache;
25
26 static LIST_HEAD(buffers);
27 static LIST_HEAD(states);
28
29 #define LEAK_DEBUG 0
30 #if LEAK_DEBUG
31 static DEFINE_SPINLOCK(leak_lock);
32 #endif
33
34 #define BUFFER_LRU_MAX 64
35
36 struct tree_entry {
37         u64 start;
38         u64 end;
39         struct rb_node rb_node;
40 };
41
42 struct extent_page_data {
43         struct bio *bio;
44         struct extent_io_tree *tree;
45         get_extent_t *get_extent;
46
47         /* tells writepage not to lock the state bits for this range;
48          * it still does the unlocking
49          */
50         unsigned int extent_locked:1;
51
52         /* tells the submit_bio code to use a WRITE_SYNC */
53         unsigned int sync_io:1;
54 };
55
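/*
 * helper to get at the btrfs_fs_info behind an io tree, via the inode that
 * owns tree->mapping.
 */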
56 static inline struct btrfs_fs_info *
57 tree_fs_info(struct extent_io_tree *tree)
58 {
59         return btrfs_sb(tree->mapping->host->i_sb);
60 }
61
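/*
 * create the slab caches used for extent_state and extent_buffer objects.
 * Returns -ENOMEM if either cache cannot be created.
 */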
62 int __init extent_io_init(void)
63 {
64         extent_state_cache = kmem_cache_create("extent_state",
65                         sizeof(struct extent_state), 0,
66                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
67         if (!extent_state_cache)
68                 return -ENOMEM;
69
70         extent_buffer_cache = kmem_cache_create("extent_buffers",
71                         sizeof(struct extent_buffer), 0,
72                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
73         if (!extent_buffer_cache)
74                 goto free_state_cache;
75         return 0;
76
77 free_state_cache:
78         kmem_cache_destroy(extent_state_cache);
79         return -ENOMEM;
80 }
81
82 void extent_io_exit(void)
83 {
84         struct extent_state *state;
85         struct extent_buffer *eb;
86
87         while (!list_empty(&states)) {
88                 state = list_entry(states.next, struct extent_state, leak_list);
89                 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
90                        "state %lu in tree %p refs %d\n",
91                        (unsigned long long)state->start,
92                        (unsigned long long)state->end,
93                        state->state, state->tree, atomic_read(&state->refs));
94                 list_del(&state->leak_list);
95                 kmem_cache_free(extent_state_cache, state);
96
97         }
98
99         while (!list_empty(&buffers)) {
100                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
101                 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
102                        "refs %d\n", (unsigned long long)eb->start,
103                        eb->len, atomic_read(&eb->refs));
104                 list_del(&eb->leak_list);
105                 kmem_cache_free(extent_buffer_cache, eb);
106         }
107         if (extent_state_cache)
108                 kmem_cache_destroy(extent_state_cache);
109         if (extent_buffer_cache)
110                 kmem_cache_destroy(extent_buffer_cache);
111 }
112
113 void extent_io_tree_init(struct extent_io_tree *tree,
114                          struct address_space *mapping)
115 {
116         tree->state = RB_ROOT;
117         INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
118         tree->ops = NULL;
119         tree->dirty_bytes = 0;
120         spin_lock_init(&tree->lock);
121         spin_lock_init(&tree->buffer_lock);
122         tree->mapping = mapping;
123 }
124
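/*
 * allocate and initialize a new extent_state.  No range is set here; the
 * caller fills in start/end.  Returns NULL if the allocation fails.
 */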
125 static struct extent_state *alloc_extent_state(gfp_t mask)
126 {
127         struct extent_state *state;
128 #if LEAK_DEBUG
129         unsigned long flags;
130 #endif
131
132         state = kmem_cache_alloc(extent_state_cache, mask);
133         if (!state)
134                 return state;
135         state->state = 0;
136         state->private = 0;
137         state->tree = NULL;
138 #if LEAK_DEBUG
139         spin_lock_irqsave(&leak_lock, flags);
140         list_add(&state->leak_list, &states);
141         spin_unlock_irqrestore(&leak_lock, flags);
142 #endif
143         atomic_set(&state->refs, 1);
144         init_waitqueue_head(&state->wq);
145         trace_alloc_extent_state(state, mask, _RET_IP_);
146         return state;
147 }
148
149 void free_extent_state(struct extent_state *state)
150 {
151         if (!state)
152                 return;
153         if (atomic_dec_and_test(&state->refs)) {
154 #if LEAK_DEBUG
155                 unsigned long flags;
156 #endif
157                 WARN_ON(state->tree);
158 #if LEAK_DEBUG
159                 spin_lock_irqsave(&leak_lock, flags);
160                 list_del(&state->leak_list);
161                 spin_unlock_irqrestore(&leak_lock, flags);
162 #endif
163                 trace_free_extent_state(state, _RET_IP_);
164                 kmem_cache_free(extent_state_cache, state);
165         }
166 }
167
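/*
 * insert 'node' into the rb tree, keyed by the [start, end] range of its
 * tree_entry.  If an existing entry already covers 'offset', that node is
 * returned and nothing is inserted.
 */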
168 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
169                                    struct rb_node *node)
170 {
171         struct rb_node **p = &root->rb_node;
172         struct rb_node *parent = NULL;
173         struct tree_entry *entry;
174
175         while (*p) {
176                 parent = *p;
177                 entry = rb_entry(parent, struct tree_entry, rb_node);
178
179                 if (offset < entry->start)
180                         p = &(*p)->rb_left;
181                 else if (offset > entry->end)
182                         p = &(*p)->rb_right;
183                 else
184                         return parent;
185         }
186
187         entry = rb_entry(node, struct tree_entry, rb_node);
188         rb_link_node(node, parent, p);
189         rb_insert_color(node, root);
190         return NULL;
191 }
192
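/*
 * search the tree for an entry containing 'offset'.  If there is no exact
 * match, NULL is returned and, when requested, *prev_ret is set to the first
 * entry that ends at or after 'offset' and *next_ret to the last entry that
 * starts at or before it.
 */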
193 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
194                                      struct rb_node **prev_ret,
195                                      struct rb_node **next_ret)
196 {
197         struct rb_root *root = &tree->state;
198         struct rb_node *n = root->rb_node;
199         struct rb_node *prev = NULL;
200         struct rb_node *orig_prev = NULL;
201         struct tree_entry *entry;
202         struct tree_entry *prev_entry = NULL;
203
204         while (n) {
205                 entry = rb_entry(n, struct tree_entry, rb_node);
206                 prev = n;
207                 prev_entry = entry;
208
209                 if (offset < entry->start)
210                         n = n->rb_left;
211                 else if (offset > entry->end)
212                         n = n->rb_right;
213                 else
214                         return n;
215         }
216
217         if (prev_ret) {
218                 orig_prev = prev;
219                 while (prev && offset > prev_entry->end) {
220                         prev = rb_next(prev);
221                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222                 }
223                 *prev_ret = prev;
224                 prev = orig_prev;
225         }
226
227         if (next_ret) {
228                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
229                 while (prev && offset < prev_entry->start) {
230                         prev = rb_prev(prev);
231                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
232                 }
233                 *next_ret = prev;
234         }
235         return NULL;
236 }
237
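/*
 * return the tree entry that contains 'offset', or failing that the first
 * entry that ends at or after 'offset'.  Returns NULL if 'offset' is past
 * the end of the last entry.
 */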
238 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
239                                           u64 offset)
240 {
241         struct rb_node *prev = NULL;
242         struct rb_node *ret;
243
244         ret = __etree_search(tree, offset, &prev, NULL);
245         if (!ret)
246                 return prev;
247         return ret;
248 }
249
250 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
251                      struct extent_state *other)
252 {
253         if (tree->ops && tree->ops->merge_extent_hook)
254                 tree->ops->merge_extent_hook(tree->mapping->host, new,
255                                              other);
256 }
257
258 /*
259  * utility function to look for merge candidates inside a given range.
260  * Any extents with matching state are merged together into a single
261  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
262  * are not merged because the end_io handlers need to be able to do
263  * operations on them without sleeping (or doing allocations/splits).
264  *
265  * This should be called with the tree lock held.
266  */
267 static void merge_state(struct extent_io_tree *tree,
268                         struct extent_state *state)
269 {
270         struct extent_state *other;
271         struct rb_node *other_node;
272
273         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
274                 return;
275
276         other_node = rb_prev(&state->rb_node);
277         if (other_node) {
278                 other = rb_entry(other_node, struct extent_state, rb_node);
279                 if (other->end == state->start - 1 &&
280                     other->state == state->state) {
281                         merge_cb(tree, state, other);
282                         state->start = other->start;
283                         other->tree = NULL;
284                         rb_erase(&other->rb_node, &tree->state);
285                         free_extent_state(other);
286                 }
287         }
288         other_node = rb_next(&state->rb_node);
289         if (other_node) {
290                 other = rb_entry(other_node, struct extent_state, rb_node);
291                 if (other->start == state->end + 1 &&
292                     other->state == state->state) {
293                         merge_cb(tree, state, other);
294                         state->end = other->end;
295                         other->tree = NULL;
296                         rb_erase(&other->rb_node, &tree->state);
297                         free_extent_state(other);
298                 }
299         }
300 }
301
302 static void set_state_cb(struct extent_io_tree *tree,
303                          struct extent_state *state, int *bits)
304 {
305         if (tree->ops && tree->ops->set_bit_hook)
306                 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
307 }
308
309 static void clear_state_cb(struct extent_io_tree *tree,
310                            struct extent_state *state, int *bits)
311 {
312         if (tree->ops && tree->ops->clear_bit_hook)
313                 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
314 }
315
316 static void set_state_bits(struct extent_io_tree *tree,
317                            struct extent_state *state, int *bits);
318
319 /*
320  * insert an extent_state struct into the tree.  'bits' are set on the
321  * struct before it is inserted.
322  *
323  * This may return -EEXIST if the extent is already there, in which case the
324  * state struct is freed.
325  *
326  * The tree lock is not taken internally.  This is a utility function and
327  * probably isn't what you want to call (see set/clear_extent_bit).
328  */
329 static int insert_state(struct extent_io_tree *tree,
330                         struct extent_state *state, u64 start, u64 end,
331                         int *bits)
332 {
333         struct rb_node *node;
334
335         if (end < start) {
336                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
337                        (unsigned long long)end,
338                        (unsigned long long)start);
339                 WARN_ON(1);
340         }
341         state->start = start;
342         state->end = end;
343
344         set_state_bits(tree, state, bits);
345
346         node = tree_insert(&tree->state, end, &state->rb_node);
347         if (node) {
348                 struct extent_state *found;
349                 found = rb_entry(node, struct extent_state, rb_node);
350                 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
351                        "%llu %llu\n", (unsigned long long)found->start,
352                        (unsigned long long)found->end,
353                        (unsigned long long)start, (unsigned long long)end);
354                 return -EEXIST;
355         }
356         state->tree = tree;
357         merge_state(tree, state);
358         return 0;
359 }
360
361 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
362                      u64 split)
363 {
364         if (tree->ops && tree->ops->split_extent_hook)
365                 tree->ops->split_extent_hook(tree->mapping->host, orig, split);
366 }
367
368 /*
369  * split a given extent state struct in two, inserting the preallocated
370  * struct 'prealloc' as the newly created second half.  'split' indicates an
371  * offset inside 'orig' where it should be split.
372  *
373  * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
374  * calling, there are two extent state structs in the tree:
376  * prealloc: [orig->start, split - 1]
377  * orig: [ split, orig->end ]
378  *
379  * The tree locks are not taken by this function. They need to be held
380  * by the caller.
381  */
382 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
383                        struct extent_state *prealloc, u64 split)
384 {
385         struct rb_node *node;
386
387         split_cb(tree, orig, split);
388
389         prealloc->start = orig->start;
390         prealloc->end = split - 1;
391         prealloc->state = orig->state;
392         orig->start = split;
393
394         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
395         if (node) {
396                 free_extent_state(prealloc);
397                 return -EEXIST;
398         }
399         prealloc->tree = tree;
400         return 0;
401 }
402
403 /*
404  * utility function to clear some bits in an extent state struct.
405  * it will optionally wake up anyone waiting on this state (wake == 1), or
406  * forcibly remove the state from the tree (delete == 1).
407  *
408  * If no bits are set on the state struct after clearing things, the
409  * struct is freed and removed from the tree
410  */
411 static int clear_state_bit(struct extent_io_tree *tree,
412                             struct extent_state *state,
413                             int *bits, int wake)
414 {
415         int bits_to_clear = *bits & ~EXTENT_CTLBITS;
416         int ret = state->state & bits_to_clear;
417
418         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
419                 u64 range = state->end - state->start + 1;
420                 WARN_ON(range > tree->dirty_bytes);
421                 tree->dirty_bytes -= range;
422         }
423         clear_state_cb(tree, state, bits);
424         state->state &= ~bits_to_clear;
425         if (wake)
426                 wake_up(&state->wq);
427         if (state->state == 0) {
428                 if (state->tree) {
429                         rb_erase(&state->rb_node, &tree->state);
430                         state->tree = NULL;
431                         free_extent_state(state);
432                 } else {
433                         WARN_ON(1);
434                 }
435         } else {
436                 merge_state(tree, state);
437         }
438         return ret;
439 }
440
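/*
 * make sure we have a preallocated extent_state, falling back to a
 * GFP_ATOMIC allocation if the caller could not preallocate one.
 */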
441 static struct extent_state *
442 alloc_extent_state_atomic(struct extent_state *prealloc)
443 {
444         if (!prealloc)
445                 prealloc = alloc_extent_state(GFP_ATOMIC);
446
447         return prealloc;
448 }
449
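/*
 * an insert or split that can only fail if the tree changed underneath us
 * has failed, so the tree lock must have been violated; panic.
 */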
450 void extent_io_tree_panic(struct extent_io_tree *tree, int err)
451 {
452         btrfs_panic(tree_fs_info(tree), err, "Locking error: "
453                     "Extent tree was modified by another "
454                     "thread while locked.");
455 }
456
457 /*
458  * clear some bits on a range in the tree.  This may require splitting
459  * or inserting elements in the tree, so the gfp mask is used to
460  * indicate which allocations or sleeping are allowed.
461  *
462  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
463  * the given range from the tree regardless of state (i.e. for truncate).
464  *
465  * the range [start, end] is inclusive.
466  *
467  * This takes the tree lock, and returns 0 on success and < 0 on error.
468  */
469 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
470                      int bits, int wake, int delete,
471                      struct extent_state **cached_state,
472                      gfp_t mask)
473 {
474         struct extent_state *state;
475         struct extent_state *cached;
476         struct extent_state *prealloc = NULL;
477         struct rb_node *next_node;
478         struct rb_node *node;
479         u64 last_end;
480         int err;
481         int clear = 0;
482
483         if (delete)
484                 bits |= ~EXTENT_CTLBITS;
485         bits |= EXTENT_FIRST_DELALLOC;
486
487         if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
488                 clear = 1;
489 again:
490         if (!prealloc && (mask & __GFP_WAIT)) {
491                 prealloc = alloc_extent_state(mask);
492                 if (!prealloc)
493                         return -ENOMEM;
494         }
495
496         spin_lock(&tree->lock);
497         if (cached_state) {
498                 cached = *cached_state;
499
500                 if (clear) {
501                         *cached_state = NULL;
502                         cached_state = NULL;
503                 }
504
505                 if (cached && cached->tree && cached->start <= start &&
506                     cached->end > start) {
507                         if (clear)
508                                 atomic_dec(&cached->refs);
509                         state = cached;
510                         goto hit_next;
511                 }
512                 if (clear)
513                         free_extent_state(cached);
514         }
515         /*
516          * this search will find the extents that end after
517          * our range starts
518          */
519         node = tree_search(tree, start);
520         if (!node)
521                 goto out;
522         state = rb_entry(node, struct extent_state, rb_node);
523 hit_next:
524         if (state->start > end)
525                 goto out;
526         WARN_ON(state->end < start);
527         last_end = state->end;
528
529         if (state->end < end && !need_resched())
530                 next_node = rb_next(&state->rb_node);
531         else
532                 next_node = NULL;
533
534         /* the state doesn't have the wanted bits, go ahead */
535         if (!(state->state & bits))
536                 goto next;
537
538         /*
539          *     | ---- desired range ---- |
540          *  | state | or
541          *  | ------------- state -------------- |
542          *
543          * We need to split the extent we found, and may flip
544          * bits on second half.
545          *
546          * If the extent we found extends past our range, we
547          * just split and search again.  It'll get split again
548          * the next time though.
549          *
550          * If the extent we found is inside our range, we clear
551          * the desired bit on it.
552          */
553
554         if (state->start < start) {
555                 prealloc = alloc_extent_state_atomic(prealloc);
556                 BUG_ON(!prealloc);
557                 err = split_state(tree, state, prealloc, start);
558                 if (err)
559                         extent_io_tree_panic(tree, err);
560
561                 prealloc = NULL;
562                 if (err)
563                         goto out;
564                 if (state->end <= end) {
565                         clear_state_bit(tree, state, &bits, wake);
566                         if (last_end == (u64)-1)
567                                 goto out;
568                         start = last_end + 1;
569                 }
570                 goto search_again;
571         }
572         /*
573          * | ---- desired range ---- |
574          *                        | state |
575          * We need to split the extent, and clear the bit
576          * on the first half
577          */
578         if (state->start <= end && state->end > end) {
579                 prealloc = alloc_extent_state_atomic(prealloc);
580                 BUG_ON(!prealloc);
581                 err = split_state(tree, state, prealloc, end + 1);
582                 if (err)
583                         extent_io_tree_panic(tree, err);
584
585                 if (wake)
586                         wake_up(&state->wq);
587
588                 clear_state_bit(tree, prealloc, &bits, wake);
589
590                 prealloc = NULL;
591                 goto out;
592         }
593
594         clear_state_bit(tree, state, &bits, wake);
595 next:
596         if (last_end == (u64)-1)
597                 goto out;
598         start = last_end + 1;
599         if (start <= end && next_node) {
600                 state = rb_entry(next_node, struct extent_state,
601                                  rb_node);
602                 goto hit_next;
603         }
604         goto search_again;
605
606 out:
607         spin_unlock(&tree->lock);
608         if (prealloc)
609                 free_extent_state(prealloc);
610
611         return 0;
612
613 search_again:
614         if (start > end)
615                 goto out;
616         spin_unlock(&tree->lock);
617         if (mask & __GFP_WAIT)
618                 cond_resched();
619         goto again;
620 }
621
622 static void wait_on_state(struct extent_io_tree *tree,
623                           struct extent_state *state)
624                 __releases(tree->lock)
625                 __acquires(tree->lock)
626 {
627         DEFINE_WAIT(wait);
628         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
629         spin_unlock(&tree->lock);
630         schedule();
631         spin_lock(&tree->lock);
632         finish_wait(&state->wq, &wait);
633 }
634
635 /*
636  * waits for one or more bits to clear on a range in the state tree.
637  * The range [start, end] is inclusive.
638  * The tree lock is taken by this function
639  */
640 void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
641 {
642         struct extent_state *state;
643         struct rb_node *node;
644
645         spin_lock(&tree->lock);
646 again:
647         while (1) {
648                 /*
649                  * this search will find all the extents that end after
650                  * our range starts
651                  */
652                 node = tree_search(tree, start);
653                 if (!node)
654                         break;
655
656                 state = rb_entry(node, struct extent_state, rb_node);
657
658                 if (state->start > end)
659                         goto out;
660
661                 if (state->state & bits) {
662                         start = state->start;
663                         atomic_inc(&state->refs);
664                         wait_on_state(tree, state);
665                         free_extent_state(state);
666                         goto again;
667                 }
668                 start = state->end + 1;
669
670                 if (start > end)
671                         break;
672
673                 cond_resched_lock(&tree->lock);
674         }
675 out:
676         spin_unlock(&tree->lock);
677 }
678
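/*
 * set bits on an extent_state, filtering out the EXTENT_CTLBITS control
 * bits, updating the tree's dirty_bytes accounting and calling the
 * set_bit hook.
 */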
679 static void set_state_bits(struct extent_io_tree *tree,
680                            struct extent_state *state,
681                            int *bits)
682 {
683         int bits_to_set = *bits & ~EXTENT_CTLBITS;
684
685         set_state_cb(tree, state, bits);
686         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
687                 u64 range = state->end - state->start + 1;
688                 tree->dirty_bytes += range;
689         }
690         state->state |= bits_to_set;
691 }
692
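/*
 * stash a reference to 'state' in *cached_ptr, if the caller asked for one
 * and the state carries lock/writeback/boundary bits, so later operations
 * on the same range can skip the tree search.
 */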
693 static void cache_state(struct extent_state *state,
694                         struct extent_state **cached_ptr)
695 {
696         if (cached_ptr && !(*cached_ptr)) {
697                 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
698                         *cached_ptr = state;
699                         atomic_inc(&state->refs);
700                 }
701         }
702 }
703
704 static void uncache_state(struct extent_state **cached_ptr)
705 {
706         if (cached_ptr && (*cached_ptr)) {
707                 struct extent_state *state = *cached_ptr;
708                 *cached_ptr = NULL;
709                 free_extent_state(state);
710         }
711 }
712
713 /*
714  * set some bits on a range in the tree.  This may require allocations or
715  * sleeping, so the gfp mask is used to indicate what is allowed.
716  *
717  * If any of the exclusive bits are set, this will fail with -EEXIST if some
718  * part of the range already has the desired bits set.  The start of the
719  * existing range is returned in failed_start in this case.
720  *
721  * [start, end] is inclusive.  This takes the tree lock.
722  */
723
724 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
725                    int bits, int exclusive_bits, u64 *failed_start,
726                    struct extent_state **cached_state, gfp_t mask)
727 {
728         struct extent_state *state;
729         struct extent_state *prealloc = NULL;
730         struct rb_node *node;
731         int err = 0;
732         u64 last_start;
733         u64 last_end;
734
735         bits |= EXTENT_FIRST_DELALLOC;
736 again:
737         if (!prealloc && (mask & __GFP_WAIT)) {
738                 prealloc = alloc_extent_state(mask);
739                 BUG_ON(!prealloc);
740         }
741
742         spin_lock(&tree->lock);
743         if (cached_state && *cached_state) {
744                 state = *cached_state;
745                 if (state->start <= start && state->end > start &&
746                     state->tree) {
747                         node = &state->rb_node;
748                         goto hit_next;
749                 }
750         }
751         /*
752          * this search will find all the extents that end after
753          * our range starts.
754          */
755         node = tree_search(tree, start);
756         if (!node) {
757                 prealloc = alloc_extent_state_atomic(prealloc);
758                 BUG_ON(!prealloc);
759                 err = insert_state(tree, prealloc, start, end, &bits);
760                 if (err)
761                         extent_io_tree_panic(tree, err);
762
763                 prealloc = NULL;
764                 goto out;
765         }
766         state = rb_entry(node, struct extent_state, rb_node);
767 hit_next:
768         last_start = state->start;
769         last_end = state->end;
770
771         /*
772          * | ---- desired range ---- |
773          * | state |
774          *
775          * Just lock what we found and keep going
776          */
777         if (state->start == start && state->end <= end) {
778                 struct rb_node *next_node;
779                 if (state->state & exclusive_bits) {
780                         *failed_start = state->start;
781                         err = -EEXIST;
782                         goto out;
783                 }
784
785                 set_state_bits(tree, state, &bits);
786
787                 cache_state(state, cached_state);
788                 merge_state(tree, state);
789                 if (last_end == (u64)-1)
790                         goto out;
791
792                 start = last_end + 1;
793                 next_node = rb_next(&state->rb_node);
794                 if (next_node && start < end && prealloc && !need_resched()) {
795                         state = rb_entry(next_node, struct extent_state,
796                                          rb_node);
797                         if (state->start == start)
798                                 goto hit_next;
799                 }
800                 goto search_again;
801         }
802
803         /*
804          *     | ---- desired range ---- |
805          * | state |
806          *   or
807          * | ------------- state -------------- |
808          *
809          * We need to split the extent we found, and may flip bits on
810          * second half.
811          *
812          * If the extent we found extends past our
813          * range, we just split and search again.  It'll get split
814          * again the next time though.
815          *
816          * If the extent we found is inside our range, we set the
817          * desired bit on it.
818          */
819         if (state->start < start) {
820                 if (state->state & exclusive_bits) {
821                         *failed_start = start;
822                         err = -EEXIST;
823                         goto out;
824                 }
825
826                 prealloc = alloc_extent_state_atomic(prealloc);
827                 BUG_ON(!prealloc);
828                 err = split_state(tree, state, prealloc, start);
829                 if (err)
830                         extent_io_tree_panic(tree, err);
831
832                 prealloc = NULL;
833                 if (err)
834                         goto out;
835                 if (state->end <= end) {
836                         set_state_bits(tree, state, &bits);
837                         cache_state(state, cached_state);
838                         merge_state(tree, state);
839                         if (last_end == (u64)-1)
840                                 goto out;
841                         start = last_end + 1;
842                 }
843                 goto search_again;
844         }
845         /*
846          * | ---- desired range ---- |
847          *     | state | or               | state |
848          *
849          * There's a hole, we need to insert something in it and
850          * ignore the extent we found.
851          */
852         if (state->start > start) {
853                 u64 this_end;
854                 if (end < last_start)
855                         this_end = end;
856                 else
857                         this_end = last_start - 1;
858
859                 prealloc = alloc_extent_state_atomic(prealloc);
860                 BUG_ON(!prealloc);
861
862                 /*
863                  * Avoid freeing 'prealloc' if it can be merged with
864                  * the next extent.
865                  */
866                 err = insert_state(tree, prealloc, start, this_end,
867                                    &bits);
868                 if (err)
869                         extent_io_tree_panic(tree, err);
870
871                 cache_state(prealloc, cached_state);
872                 prealloc = NULL;
873                 start = this_end + 1;
874                 goto search_again;
875         }
876         /*
877          * | ---- desired range ---- |
878          *                        | state |
879          * We need to split the extent, and set the bit
880          * on the first half
881          */
882         if (state->start <= end && state->end > end) {
883                 if (state->state & exclusive_bits) {
884                         *failed_start = start;
885                         err = -EEXIST;
886                         goto out;
887                 }
888
889                 prealloc = alloc_extent_state_atomic(prealloc);
890                 BUG_ON(!prealloc);
891                 err = split_state(tree, state, prealloc, end + 1);
892                 if (err)
893                         extent_io_tree_panic(tree, err);
894
895                 set_state_bits(tree, prealloc, &bits);
896                 cache_state(prealloc, cached_state);
897                 merge_state(tree, prealloc);
898                 prealloc = NULL;
899                 goto out;
900         }
901
902         goto search_again;
903
904 out:
905         spin_unlock(&tree->lock);
906         if (prealloc)
907                 free_extent_state(prealloc);
908
909         return err;
910
911 search_again:
912         if (start > end)
913                 goto out;
914         spin_unlock(&tree->lock);
915         if (mask & __GFP_WAIT)
916                 cond_resched();
917         goto again;
918 }
919
920 /**
921  * convert_extent_bit - convert all bits in a given range from one bit to another
922  * @tree:       the io tree to search
923  * @start:      the start offset in bytes
924  * @end:        the end offset in bytes (inclusive)
925  * @bits:       the bits to set in this range
926  * @clear_bits: the bits to clear in this range
927  * @mask:       the allocation mask
928  *
929  * This will go through and set bits for the given range.  If any states exist
930  * already in this range they are set with the given bit and cleared of the
931  * clear_bits.  This is only meant to be used by things that are mergeable, i.e.
932  * converting from, say, DELALLOC to DIRTY.  This is not meant to be used with
933  * boundary bits like LOCK.
934  */
935 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
936                        int bits, int clear_bits, gfp_t mask)
937 {
938         struct extent_state *state;
939         struct extent_state *prealloc = NULL;
940         struct rb_node *node;
941         int err = 0;
942         u64 last_start;
943         u64 last_end;
944
945 again:
946         if (!prealloc && (mask & __GFP_WAIT)) {
947                 prealloc = alloc_extent_state(mask);
948                 if (!prealloc)
949                         return -ENOMEM;
950         }
951
952         spin_lock(&tree->lock);
953         /*
954          * this search will find all the extents that end after
955          * our range starts.
956          */
957         node = tree_search(tree, start);
958         if (!node) {
959                 prealloc = alloc_extent_state_atomic(prealloc);
960                 if (!prealloc) {
961                         err = -ENOMEM;
962                         goto out;
963                 }
964                 err = insert_state(tree, prealloc, start, end, &bits);
965                 prealloc = NULL;
966                 if (err)
967                         extent_io_tree_panic(tree, err);
968                 goto out;
969         }
970         state = rb_entry(node, struct extent_state, rb_node);
971 hit_next:
972         last_start = state->start;
973         last_end = state->end;
974
975         /*
976          * | ---- desired range ---- |
977          * | state |
978          *
979          * Just lock what we found and keep going
980          */
981         if (state->start == start && state->end <= end) {
982                 struct rb_node *next_node;
983
984                 set_state_bits(tree, state, &bits);
985                 clear_state_bit(tree, state, &clear_bits, 0);
986                 if (last_end == (u64)-1)
987                         goto out;
988
989                 start = last_end + 1;
990                 next_node = rb_next(&state->rb_node);
991                 if (next_node && start < end && prealloc && !need_resched()) {
992                         state = rb_entry(next_node, struct extent_state,
993                                          rb_node);
994                         if (state->start == start)
995                                 goto hit_next;
996                 }
997                 goto search_again;
998         }
999
1000         /*
1001          *     | ---- desired range ---- |
1002          * | state |
1003          *   or
1004          * | ------------- state -------------- |
1005          *
1006          * We need to split the extent we found, and may flip bits on
1007          * second half.
1008          *
1009          * If the extent we found extends past our
1010          * range, we just split and search again.  It'll get split
1011          * again the next time though.
1012          *
1013          * If the extent we found is inside our range, we set the
1014          * desired bit on it.
1015          */
1016         if (state->start < start) {
1017                 prealloc = alloc_extent_state_atomic(prealloc);
1018                 if (!prealloc) {
1019                         err = -ENOMEM;
1020                         goto out;
1021                 }
1022                 err = split_state(tree, state, prealloc, start);
1023                 if (err)
1024                         extent_io_tree_panic(tree, err);
1025                 prealloc = NULL;
1026                 if (err)
1027                         goto out;
1028                 if (state->end <= end) {
1029                         set_state_bits(tree, state, &bits);
1030                         clear_state_bit(tree, state, &clear_bits, 0);
1031                         if (last_end == (u64)-1)
1032                                 goto out;
1033                         start = last_end + 1;
1034                 }
1035                 goto search_again;
1036         }
1037         /*
1038          * | ---- desired range ---- |
1039          *     | state | or               | state |
1040          *
1041          * There's a hole, we need to insert something in it and
1042          * ignore the extent we found.
1043          */
1044         if (state->start > start) {
1045                 u64 this_end;
1046                 if (end < last_start)
1047                         this_end = end;
1048                 else
1049                         this_end = last_start - 1;
1050
1051                 prealloc = alloc_extent_state_atomic(prealloc);
1052                 if (!prealloc) {
1053                         err = -ENOMEM;
1054                         goto out;
1055                 }
1056
1057                 /*
1058                  * Avoid freeing 'prealloc' if it can be merged with
1059                  * the next extent.
1060                  */
1061                 err = insert_state(tree, prealloc, start, this_end,
1062                                    &bits);
1063                 if (err)
1064                         extent_io_tree_panic(tree, err);
1065                 prealloc = NULL;
1066                 start = this_end + 1;
1067                 goto search_again;
1068         }
1069         /*
1070          * | ---- desired range ---- |
1071          *                        | state |
1072          * We need to split the extent, and set the bit
1073          * on the first half
1074          */
1075         if (state->start <= end && state->end > end) {
1076                 prealloc = alloc_extent_state_atomic(prealloc);
1077                 if (!prealloc) {
1078                         err = -ENOMEM;
1079                         goto out;
1080                 }
1081
1082                 err = split_state(tree, state, prealloc, end + 1);
1083                 if (err)
1084                         extent_io_tree_panic(tree, err);
1085
1086                 set_state_bits(tree, prealloc, &bits);
1087                 clear_state_bit(tree, prealloc, &clear_bits, 0);
1088                 prealloc = NULL;
1089                 goto out;
1090         }
1091
1092         goto search_again;
1093
1094 out:
1095         spin_unlock(&tree->lock);
1096         if (prealloc)
1097                 free_extent_state(prealloc);
1098
1099         return err;
1100
1101 search_again:
1102         if (start > end)
1103                 goto out;
1104         spin_unlock(&tree->lock);
1105         if (mask & __GFP_WAIT)
1106                 cond_resched();
1107         goto again;
1108 }
1109
1110 /* wrappers around set/clear extent bit */
1111 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1112                      gfp_t mask)
1113 {
1114         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
1115                               NULL, mask);
1116 }
1117
1118 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1119                     int bits, gfp_t mask)
1120 {
1121         return set_extent_bit(tree, start, end, bits, 0, NULL,
1122                               NULL, mask);
1123 }
1124
1125 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1126                       int bits, gfp_t mask)
1127 {
1128         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1129 }
1130
1131 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1132                         struct extent_state **cached_state, gfp_t mask)
1133 {
1134         return set_extent_bit(tree, start, end,
1135                               EXTENT_DELALLOC | EXTENT_UPTODATE,
1136                               0, NULL, cached_state, mask);
1137 }
1138
1139 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1140                        gfp_t mask)
1141 {
1142         return clear_extent_bit(tree, start, end,
1143                                 EXTENT_DIRTY | EXTENT_DELALLOC |
1144                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1145 }
1146
1147 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1148                      gfp_t mask)
1149 {
1150         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
1151                               NULL, mask);
1152 }
1153
1154 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1155                         struct extent_state **cached_state, gfp_t mask)
1156 {
1157         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1158                               NULL, cached_state, mask);
1159 }
1160
1161 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
1162                                  u64 end, struct extent_state **cached_state,
1163                                  gfp_t mask)
1164 {
1165         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1166                                 cached_state, mask);
1167 }
1168
1169 /*
1170  * either insert or lock a state struct covering [start, end].  If part of the
1171  * range is already locked, wait for it to be unlocked and retry.
1172  */
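/*
 * Illustrative only (not lifted from a real caller): a typical pattern is to
 * lock a byte range, keep the cached state, and unlock through the cache:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(tree, start, end, 0, &cached);
 *	... operate on [start, end] ...
 *	unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
 */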
1173 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1174                      int bits, struct extent_state **cached_state)
1175 {
1176         int err;
1177         u64 failed_start;
1178         while (1) {
1179                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1180                                      EXTENT_LOCKED, &failed_start,
1181                                      cached_state, GFP_NOFS);
1182                 if (err == -EEXIST) {
1183                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1184                         start = failed_start;
1185                 } else
1186                         break;
1187                 WARN_ON(start > end);
1188         }
1189         return err;
1190 }
1191
1192 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1193 {
1194         return lock_extent_bits(tree, start, end, 0, NULL);
1195 }
1196
1197 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1198 {
1199         int err;
1200         u64 failed_start;
1201
1202         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1203                              &failed_start, NULL, GFP_NOFS);
1204         if (err == -EEXIST) {
1205                 if (failed_start > start)
1206                         clear_extent_bit(tree, start, failed_start - 1,
1207                                          EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1208                 return 0;
1209         }
1210         return 1;
1211 }
1212
1213 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1214                          struct extent_state **cached, gfp_t mask)
1215 {
1216         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1217                                 mask);
1218 }
1219
1220 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1221 {
1222         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1223                                 GFP_NOFS);
1224 }
1225
1226 /*
1227  * helper function to set both pages and extents in the tree writeback
1228  */
1229 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1230 {
1231         unsigned long index = start >> PAGE_CACHE_SHIFT;
1232         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1233         struct page *page;
1234
1235         while (index <= end_index) {
1236                 page = find_get_page(tree->mapping, index);
1237                 BUG_ON(!page);
1238                 set_page_writeback(page);
1239                 page_cache_release(page);
1240                 index++;
1241         }
1242         return 0;
1243 }
1244
1245 /* find the first state struct with 'bits' set after 'start', and
1246  * return it.  tree->lock must be held.  NULL will be returned if
1247  * nothing was found after 'start'.
1248  */
1249 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1250                                                  u64 start, int bits)
1251 {
1252         struct rb_node *node;
1253         struct extent_state *state;
1254
1255         /*
1256          * this search will find all the extents that end after
1257          * our range starts.
1258          */
1259         node = tree_search(tree, start);
1260         if (!node)
1261                 goto out;
1262
1263         while (1) {
1264                 state = rb_entry(node, struct extent_state, rb_node);
1265                 if (state->end >= start && (state->state & bits))
1266                         return state;
1267
1268                 node = rb_next(node);
1269                 if (!node)
1270                         break;
1271         }
1272 out:
1273         return NULL;
1274 }
1275
1276 /*
1277  * find the first offset in the io tree with 'bits' set. zero is
1278  * returned if we find something, and *start_ret and *end_ret are
1279  * set to reflect the state struct that was found.
1280  *
1281  * If nothing was found, 1 is returned, < 0 on error
1282  */
1283 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1284                           u64 *start_ret, u64 *end_ret, int bits)
1285 {
1286         struct extent_state *state;
1287         int ret = 1;
1288
1289         spin_lock(&tree->lock);
1290         state = find_first_extent_bit_state(tree, start, bits);
1291         if (state) {
1292                 *start_ret = state->start;
1293                 *end_ret = state->end;
1294                 ret = 0;
1295         }
1296         spin_unlock(&tree->lock);
1297         return ret;
1298 }
1299
1300 /*
1301  * find a contiguous range of bytes in the file marked as delalloc, not
1302  * more than 'max_bytes'.  start and end are used to return the range.
1303  *
1304  * 1 is returned if we find something, 0 if nothing was in the tree.
1305  */
1306 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1307                                         u64 *start, u64 *end, u64 max_bytes,
1308                                         struct extent_state **cached_state)
1309 {
1310         struct rb_node *node;
1311         struct extent_state *state;
1312         u64 cur_start = *start;
1313         u64 found = 0;
1314         u64 total_bytes = 0;
1315
1316         spin_lock(&tree->lock);
1317
1318         /*
1319          * this search will find all the extents that end after
1320          * our range starts.
1321          */
1322         node = tree_search(tree, cur_start);
1323         if (!node) {
1324                 if (!found)
1325                         *end = (u64)-1;
1326                 goto out;
1327         }
1328
1329         while (1) {
1330                 state = rb_entry(node, struct extent_state, rb_node);
1331                 if (found && (state->start != cur_start ||
1332                               (state->state & EXTENT_BOUNDARY))) {
1333                         goto out;
1334                 }
1335                 if (!(state->state & EXTENT_DELALLOC)) {
1336                         if (!found)
1337                                 *end = state->end;
1338                         goto out;
1339                 }
1340                 if (!found) {
1341                         *start = state->start;
1342                         *cached_state = state;
1343                         atomic_inc(&state->refs);
1344                 }
1345                 found++;
1346                 *end = state->end;
1347                 cur_start = state->end + 1;
1348                 node = rb_next(node);
1349                 if (!node)
1350                         break;
1351                 total_bytes += state->end - state->start + 1;
1352                 if (total_bytes >= max_bytes)
1353                         break;
1354         }
1355 out:
1356         spin_unlock(&tree->lock);
1357         return found;
1358 }
1359
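/*
 * unlock every page in [start, end] except locked_page, dropping the page
 * references taken by find_get_pages_contig as we go.
 */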
1360 static noinline void __unlock_for_delalloc(struct inode *inode,
1361                                            struct page *locked_page,
1362                                            u64 start, u64 end)
1363 {
1364         int ret;
1365         struct page *pages[16];
1366         unsigned long index = start >> PAGE_CACHE_SHIFT;
1367         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1368         unsigned long nr_pages = end_index - index + 1;
1369         int i;
1370
1371         if (index == locked_page->index && end_index == index)
1372                 return;
1373
1374         while (nr_pages > 0) {
1375                 ret = find_get_pages_contig(inode->i_mapping, index,
1376                                      min_t(unsigned long, nr_pages,
1377                                      ARRAY_SIZE(pages)), pages);
1378                 for (i = 0; i < ret; i++) {
1379                         if (pages[i] != locked_page)
1380                                 unlock_page(pages[i]);
1381                         page_cache_release(pages[i]);
1382                 }
1383                 nr_pages -= ret;
1384                 index += ret;
1385                 cond_resched();
1386         }
1387 }
1388
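/*
 * lock every page in [delalloc_start, delalloc_end] except locked_page,
 * which the caller has already locked.  Returns -EAGAIN (after unlocking
 * anything locked here) if a page has gone away or is no longer dirty,
 * so the caller can retry with a smaller range.
 */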
1389 static noinline int lock_delalloc_pages(struct inode *inode,
1390                                         struct page *locked_page,
1391                                         u64 delalloc_start,
1392                                         u64 delalloc_end)
1393 {
1394         unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1395         unsigned long start_index = index;
1396         unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1397         unsigned long pages_locked = 0;
1398         struct page *pages[16];
1399         unsigned long nrpages;
1400         int ret;
1401         int i;
1402
1403         /* the caller is responsible for locking the start index */
1404         if (index == locked_page->index && index == end_index)
1405                 return 0;
1406
1407         /* skip the page at the start index */
1408         nrpages = end_index - index + 1;
1409         while (nrpages > 0) {
1410                 ret = find_get_pages_contig(inode->i_mapping, index,
1411                                      min_t(unsigned long,
1412                                      nrpages, ARRAY_SIZE(pages)), pages);
1413                 if (ret == 0) {
1414                         ret = -EAGAIN;
1415                         goto done;
1416                 }
1417                 /* now we have an array of pages, lock them all */
1418                 for (i = 0; i < ret; i++) {
1419                         /*
1420                          * the caller is taking responsibility for
1421                          * locked_page
1422                          */
1423                         if (pages[i] != locked_page) {
1424                                 lock_page(pages[i]);
1425                                 if (!PageDirty(pages[i]) ||
1426                                     pages[i]->mapping != inode->i_mapping) {
1427                                         ret = -EAGAIN;
1428                                         unlock_page(pages[i]);
1429                                         page_cache_release(pages[i]);
1430                                         goto done;
1431                                 }
1432                         }
1433                         page_cache_release(pages[i]);
1434                         pages_locked++;
1435                 }
1436                 nrpages -= ret;
1437                 index += ret;
1438                 cond_resched();
1439         }
1440         ret = 0;
1441 done:
1442         if (ret && pages_locked) {
1443                 __unlock_for_delalloc(inode, locked_page,
1444                               delalloc_start,
1445                               ((u64)(start_index + pages_locked - 1)) <<
1446                               PAGE_CACHE_SHIFT);
1447         }
1448         return ret;
1449 }
1450
1451 /*
1452  * find a contiguous range of bytes in the file marked as delalloc, not
1453  * more than 'max_bytes'.  start and end are used to return the range.
1454  *
1455  * 1 is returned if we find something, 0 if nothing was in the tree.
1456  */
1457 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1458                                              struct extent_io_tree *tree,
1459                                              struct page *locked_page,
1460                                              u64 *start, u64 *end,
1461                                              u64 max_bytes)
1462 {
1463         u64 delalloc_start;
1464         u64 delalloc_end;
1465         u64 found;
1466         struct extent_state *cached_state = NULL;
1467         int ret;
1468         int loops = 0;
1469
1470 again:
1471         /* step one, find a bunch of delalloc bytes starting at start */
1472         delalloc_start = *start;
1473         delalloc_end = 0;
1474         found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1475                                     max_bytes, &cached_state);
1476         if (!found || delalloc_end <= *start) {
1477                 *start = delalloc_start;
1478                 *end = delalloc_end;
1479                 free_extent_state(cached_state);
1480                 return found;
1481         }
1482
1483         /*
1484          * start comes from the offset of locked_page.  We have to lock
1485          * pages in order, so we can't process delalloc bytes before
1486          * locked_page
1487          */
1488         if (delalloc_start < *start)
1489                 delalloc_start = *start;
1490
1491         /*
1492          * make sure to limit the number of pages we try to lock down
1493          * if we're looping.
1494          */
1495         if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1496                 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1497
1498         /* step two, lock all the pages after the page that has start */
1499         ret = lock_delalloc_pages(inode, locked_page,
1500                                   delalloc_start, delalloc_end);
1501         if (ret == -EAGAIN) {
1502                 /* some of the pages are gone, let's avoid looping by
1503                  * shortening the size of the delalloc range we're searching.
1504                  */
1505                 free_extent_state(cached_state);
1506                 if (!loops) {
1507                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1508                         max_bytes = PAGE_CACHE_SIZE - offset;
1509                         loops = 1;
1510                         goto again;
1511                 } else {
1512                         found = 0;
1513                         goto out_failed;
1514                 }
1515         }
1516         BUG_ON(ret);
1517
1518         /* step three, lock the state bits for the whole range */
1519         lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1520
1521         /* then test to make sure it is all still delalloc */
1522         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1523                              EXTENT_DELALLOC, 1, cached_state);
1524         if (!ret) {
1525                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1526                                      &cached_state, GFP_NOFS);
1527                 __unlock_for_delalloc(inode, locked_page,
1528                               delalloc_start, delalloc_end);
1529                 cond_resched();
1530                 goto again;
1531         }
1532         free_extent_state(cached_state);
1533         *start = delalloc_start;
1534         *end = delalloc_end;
1535 out_failed:
1536         return found;
1537 }
1538
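/*
 * Usage sketch (illustrative only, not built): callers typically sit in
 * writepage context and walk a page-sized window until the whole range
 * has been covered, much like __extent_writepage() further down in this
 * file:
 *
 *      delalloc_start = start;
 *      delalloc_end = 0;
 *      while (delalloc_end < page_end) {
 *              nr_delalloc = find_lock_delalloc_range(inode, tree, page,
 *                                                     &delalloc_start,
 *                                                     &delalloc_end,
 *                                                     128 * 1024 * 1024);
 *              if (nr_delalloc == 0) {
 *                      delalloc_start = delalloc_end + 1;
 *                      continue;
 *              }
 *              (fill the delalloc range, then advance)
 *              delalloc_start = delalloc_end + 1;
 *      }
 */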
1539 int extent_clear_unlock_delalloc(struct inode *inode,
1540                                 struct extent_io_tree *tree,
1541                                 u64 start, u64 end, struct page *locked_page,
1542                                 unsigned long op)
1543 {
1544         int ret;
1545         struct page *pages[16];
1546         unsigned long index = start >> PAGE_CACHE_SHIFT;
1547         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1548         unsigned long nr_pages = end_index - index + 1;
1549         int i;
1550         int clear_bits = 0;
1551
1552         if (op & EXTENT_CLEAR_UNLOCK)
1553                 clear_bits |= EXTENT_LOCKED;
1554         if (op & EXTENT_CLEAR_DIRTY)
1555                 clear_bits |= EXTENT_DIRTY;
1556
1557         if (op & EXTENT_CLEAR_DELALLOC)
1558                 clear_bits |= EXTENT_DELALLOC;
1559
1560         clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1561         if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1562                     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1563                     EXTENT_SET_PRIVATE2)))
1564                 return 0;
1565
1566         while (nr_pages > 0) {
1567                 ret = find_get_pages_contig(inode->i_mapping, index,
1568                                      min_t(unsigned long,
1569                                      nr_pages, ARRAY_SIZE(pages)), pages);
1570                 for (i = 0; i < ret; i++) {
1571
1572                         if (op & EXTENT_SET_PRIVATE2)
1573                                 SetPagePrivate2(pages[i]);
1574
1575                         if (pages[i] == locked_page) {
1576                                 page_cache_release(pages[i]);
1577                                 continue;
1578                         }
1579                         if (op & EXTENT_CLEAR_DIRTY)
1580                                 clear_page_dirty_for_io(pages[i]);
1581                         if (op & EXTENT_SET_WRITEBACK)
1582                                 set_page_writeback(pages[i]);
1583                         if (op & EXTENT_END_WRITEBACK)
1584                                 end_page_writeback(pages[i]);
1585                         if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1586                                 unlock_page(pages[i]);
1587                         page_cache_release(pages[i]);
1588                 }
1589                 nr_pages -= ret;
1590                 index += ret;
1591                 cond_resched();
1592         }
1593         return 0;
1594 }
1595
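/*
 * Example (illustrative only): a caller that hands a delalloc range over
 * to writeback might pass
 *
 *      op = EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
 *           EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK;
 *
 * which clears EXTENT_LOCKED, EXTENT_DELALLOC and EXTENT_DIRTY in the io
 * tree and then runs clear_page_dirty_for_io()/set_page_writeback() on
 * every page in the range except locked_page.
 */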
1596 /*
1597  * count the number of bytes in the tree that have a given bit(s)
1598  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1599  * cached.  The total number found is returned.
1600  */
1601 u64 count_range_bits(struct extent_io_tree *tree,
1602                      u64 *start, u64 search_end, u64 max_bytes,
1603                      unsigned long bits, int contig)
1604 {
1605         struct rb_node *node;
1606         struct extent_state *state;
1607         u64 cur_start = *start;
1608         u64 total_bytes = 0;
1609         u64 last = 0;
1610         int found = 0;
1611
1612         if (search_end <= cur_start) {
1613                 WARN_ON(1);
1614                 return 0;
1615         }
1616
1617         spin_lock(&tree->lock);
1618         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1619                 total_bytes = tree->dirty_bytes;
1620                 goto out;
1621         }
1622         /*
1623          * this search will find all the extents that end after
1624          * our range starts.
1625          */
1626         node = tree_search(tree, cur_start);
1627         if (!node)
1628                 goto out;
1629
1630         while (1) {
1631                 state = rb_entry(node, struct extent_state, rb_node);
1632                 if (state->start > search_end)
1633                         break;
1634                 if (contig && found && state->start > last + 1)
1635                         break;
1636                 if (state->end >= cur_start && (state->state & bits) == bits) {
1637                         total_bytes += min(search_end, state->end) + 1 -
1638                                        max(cur_start, state->start);
1639                         if (total_bytes >= max_bytes)
1640                                 break;
1641                         if (!found) {
1642                                 *start = max(cur_start, state->start);
1643                                 found = 1;
1644                         }
1645                         last = state->end;
1646                 } else if (contig && found) {
1647                         break;
1648                 }
1649                 node = rb_next(node);
1650                 if (!node)
1651                         break;
1652         }
1653 out:
1654         spin_unlock(&tree->lock);
1655         return total_bytes;
1656 }
1657
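/*
 * Example (sketch): clean_io_failure() below uses this to ask whether the
 * io_failure_tree holds any EXTENT_DIRTY byte at all, starting the search
 * at offset 0 and capping the count at a single byte:
 *
 *      private = 0;
 *      ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
 *                             (u64)-1, 1, EXTENT_DIRTY, 0);
 *
 * A return of 0 means there is nothing to clean up for this inode.
 */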
1658 /*
1659  * set the private field for a given byte offset in the tree.  If there isn't
1660  * an extent_state there already, this does nothing.
1661  */
1662 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1663 {
1664         struct rb_node *node;
1665         struct extent_state *state;
1666         int ret = 0;
1667
1668         spin_lock(&tree->lock);
1669         /*
1670          * this search will find all the extents that end after
1671          * our range starts.
1672          */
1673         node = tree_search(tree, start);
1674         if (!node) {
1675                 ret = -ENOENT;
1676                 goto out;
1677         }
1678         state = rb_entry(node, struct extent_state, rb_node);
1679         if (state->start != start) {
1680                 ret = -ENOENT;
1681                 goto out;
1682         }
1683         state->private = private;
1684 out:
1685         spin_unlock(&tree->lock);
1686         return ret;
1687 }
1688
1689 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1690 {
1691         struct rb_node *node;
1692         struct extent_state *state;
1693         int ret = 0;
1694
1695         spin_lock(&tree->lock);
1696         /*
1697          * this search will find all the extents that end after
1698          * our range starts.
1699          */
1700         node = tree_search(tree, start);
1701         if (!node) {
1702                 ret = -ENOENT;
1703                 goto out;
1704         }
1705         state = rb_entry(node, struct extent_state, rb_node);
1706         if (state->start != start) {
1707                 ret = -ENOENT;
1708                 goto out;
1709         }
1710         *private = state->private;
1711 out:
1712         spin_unlock(&tree->lock);
1713         return ret;
1714 }
1715
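/*
 * The private field is wide enough to carry a kernel pointer, which is how
 * the IO failure handling below stashes its io_failure_record (sketch of
 * the round trip, see bio_readpage_error() and clean_io_failure()):
 *
 *      set_state_private(failure_tree, start, (u64)(unsigned long)failrec);
 *      ...
 *      ret = get_state_private(failure_tree, start, &private);
 *      failrec = (struct io_failure_record *)(unsigned long)private;
 */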
1716 /*
1717  * searches a range in the state tree for a given mask.
1718  * If 'filled' == 1, this returns 1 only if every extent in the tree
1719  * has the bits set.  Otherwise, 1 is returned if any bit in the
1720  * range is found set.
1721  */
1722 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1723                    int bits, int filled, struct extent_state *cached)
1724 {
1725         struct extent_state *state = NULL;
1726         struct rb_node *node;
1727         int bitset = 0;
1728
1729         spin_lock(&tree->lock);
1730         if (cached && cached->tree && cached->start <= start &&
1731             cached->end > start)
1732                 node = &cached->rb_node;
1733         else
1734                 node = tree_search(tree, start);
1735         while (node && start <= end) {
1736                 state = rb_entry(node, struct extent_state, rb_node);
1737
1738                 if (filled && state->start > start) {
1739                         bitset = 0;
1740                         break;
1741                 }
1742
1743                 if (state->start > end)
1744                         break;
1745
1746                 if (state->state & bits) {
1747                         bitset = 1;
1748                         if (!filled)
1749                                 break;
1750                 } else if (filled) {
1751                         bitset = 0;
1752                         break;
1753                 }
1754
1755                 if (state->end == (u64)-1)
1756                         break;
1757
1758                 start = state->end + 1;
1759                 if (start > end)
1760                         break;
1761                 node = rb_next(node);
1762                 if (!node) {
1763                         if (filled)
1764                                 bitset = 0;
1765                         break;
1766                 }
1767         }
1768         spin_unlock(&tree->lock);
1769         return bitset;
1770 }
1771
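/*
 * Example (sketch): the two page helpers below show both modes.  With
 * filled == 1 the whole page range must be covered by EXTENT_UPTODATE
 * states before the page may be marked uptodate; with filled == 0 a
 * single EXTENT_LOCKED state anywhere in the range keeps the page locked:
 *
 *      test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
 *      test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL);
 */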
1772 /*
1773  * helper function to set a given page up to date if all the
1774  * extents in the tree for that page are up to date
1775  */
1776 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1777 {
1778         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1779         u64 end = start + PAGE_CACHE_SIZE - 1;
1780         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1781                 SetPageUptodate(page);
1782 }
1783
1784 /*
1785  * helper function to unlock a page if all the extents in the tree
1786  * for that page are unlocked
1787  */
1788 static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1789 {
1790         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1791         u64 end = start + PAGE_CACHE_SIZE - 1;
1792         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1793                 unlock_page(page);
1794 }
1795
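/*
 * Minimal helper sketch (hypothetical, not used by the code above): the
 * byte range the page helpers operate on is derived purely from
 * page->index, e.g. with 4K pages index 3 covers bytes 12288..16383.
 */
static inline u64 page_range_start(struct page *page)
{
        /* first byte covered by this page in the file */
        return (u64)page->index << PAGE_CACHE_SHIFT;
}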
1796 /*
1797  * helper called for partial-page writes; writeback is no longer tracked
1798  * in the extent tree, so simply end page writeback here
1799  */
1800 static void check_page_writeback(struct extent_io_tree *tree,
1801                                  struct page *page)
1802 {
1803         end_page_writeback(page);
1804 }
1805
1806 /*
1807  * When IO fails, either with EIO or csum verification fails, we
1808  * try other mirrors that might have a good copy of the data.  This
1809  * io_failure_record is used to record state as we go through all the
1810  * mirrors.  If another mirror has good data, the page is set up to date
1811  * and things continue.  If a good mirror can't be found, the original
1812  * bio end_io callback is called to indicate things have failed.
1813  */
1814 struct io_failure_record {
1815         struct page *page;
1816         u64 start;
1817         u64 len;
1818         u64 logical;
1819         unsigned long bio_flags;
1820         int this_mirror;
1821         int failed_mirror;
1822         int in_validation;
1823 };
1824
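/*
 * Mirror numbers are 1-based.  When a retry is prepared, the next mirror
 * is chosen by skipping the one that already failed; e.g. with two copies
 * and failed_mirror == 1 (sketch of the logic in bio_readpage_error()
 * below, this_mirror starts at 0):
 *
 *      failrec->this_mirror++;                         this_mirror: 0 -> 1
 *      if (failrec->this_mirror == failed_mirror)
 *              failrec->this_mirror++;                 this_mirror: 1 -> 2
 */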
1825 static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1826                                 int did_repair)
1827 {
1828         int ret;
1829         int err = 0;
1830         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1831
1832         set_state_private(failure_tree, rec->start, 0);
1833         ret = clear_extent_bits(failure_tree, rec->start,
1834                                 rec->start + rec->len - 1,
1835                                 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1836         if (ret)
1837                 err = ret;
1838
1839         if (did_repair) {
1840                 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1841                                         rec->start + rec->len - 1,
1842                                         EXTENT_DAMAGED, GFP_NOFS);
1843                 if (ret && !err)
1844                         err = ret;
1845         }
1846
1847         kfree(rec);
1848         return err;
1849 }
1850
1851 static void repair_io_failure_callback(struct bio *bio, int err)
1852 {
1853         complete(bio->bi_private);
1854 }
1855
1856 /*
1857  * this bypasses the standard btrfs submit functions deliberately, as
1858  * the standard behavior is to write all copies in a raid setup. here we only
1859  * want to write the one bad copy. so we do the mapping for ourselves and issue
1860  * submit_bio directly.
1861  * to avoid any synchronization issues, wait for the data after writing, which
1862  * actually prevents the read that triggered the error from finishing.
1863  * currently, there can be no more than two copies of every data bit. thus,
1864  * exactly one rewrite is required.
1865  */
1866 int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1867                         u64 length, u64 logical, struct page *page,
1868                         int mirror_num)
1869 {
1870         struct bio *bio;
1871         struct btrfs_device *dev;
1872         DECLARE_COMPLETION_ONSTACK(compl);
1873         u64 map_length = 0;
1874         u64 sector;
1875         struct btrfs_bio *bbio = NULL;
1876         int ret;
1877
1878         BUG_ON(!mirror_num);
1879
1880         bio = bio_alloc(GFP_NOFS, 1);
1881         if (!bio)
1882                 return -EIO;
1883         bio->bi_private = &compl;
1884         bio->bi_end_io = repair_io_failure_callback;
1885         bio->bi_size = 0;
1886         map_length = length;
1887
1888         ret = btrfs_map_block(map_tree, WRITE, logical,
1889                               &map_length, &bbio, mirror_num);
1890         if (ret) {
1891                 bio_put(bio);
1892                 return -EIO;
1893         }
1894         BUG_ON(mirror_num != bbio->mirror_num);
1895         sector = bbio->stripes[mirror_num-1].physical >> 9;
1896         bio->bi_sector = sector;
1897         dev = bbio->stripes[mirror_num-1].dev;
1898         kfree(bbio);
1899         if (!dev || !dev->bdev || !dev->writeable) {
1900                 bio_put(bio);
1901                 return -EIO;
1902         }
1903         bio->bi_bdev = dev->bdev;
1904         bio_add_page(bio, page, length, start - page_offset(page));
1905         btrfsic_submit_bio(WRITE_SYNC, bio);
1906         wait_for_completion(&compl);
1907
1908         if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1909                 /* try to remap that extent elsewhere? */
1910                 bio_put(bio);
1911                 return -EIO;
1912         }
1913
1914         printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1915                         "sector %llu)\n", page->mapping->host->i_ino, start,
1916                         dev->name, sector);
1917
1918         bio_put(bio);
1919         return 0;
1920 }
1921
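/*
 * Note on the sector math above: ">> 9" converts a byte offset on the
 * device into a 512-byte sector number for bio->bi_sector, e.g. a stripe
 * physical offset of 1MiB (1048576 bytes) becomes sector 2048:
 *
 *      sector = bbio->stripes[mirror_num - 1].physical >> 9;
 */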
1922 /*
1923  * each time an IO finishes, we do a fast check in the IO failure tree
1924  * to see if we need to process or clean up an io_failure_record
1925  */
1926 static int clean_io_failure(u64 start, struct page *page)
1927 {
1928         u64 private;
1929         u64 private_failure;
1930         struct io_failure_record *failrec;
1931         struct btrfs_mapping_tree *map_tree;
1932         struct extent_state *state;
1933         int num_copies;
1934         int did_repair = 0;
1935         int ret;
1936         struct inode *inode = page->mapping->host;
1937
1938         private = 0;
1939         ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1940                                 (u64)-1, 1, EXTENT_DIRTY, 0);
1941         if (!ret)
1942                 return 0;
1943
1944         ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1945                                 &private_failure);
1946         if (ret)
1947                 return 0;
1948
1949         failrec = (struct io_failure_record *)(unsigned long) private_failure;
1950         BUG_ON(!failrec->this_mirror);
1951
1952         if (failrec->in_validation) {
1953                 /* there was no real error, just free the record */
1954                 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1955                          failrec->start);
1956                 did_repair = 1;
1957                 goto out;
1958         }
1959
1960         spin_lock(&BTRFS_I(inode)->io_tree.lock);
1961         state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1962                                             failrec->start,
1963                                             EXTENT_LOCKED);
1964         spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1965
1966         if (state && state->start == failrec->start) {
1967                 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1968                 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1969                                                 failrec->len);
1970                 if (num_copies > 1)  {
1971                         ret = repair_io_failure(map_tree, start, failrec->len,
1972                                                 failrec->logical, page,
1973                                                 failrec->failed_mirror);
1974                         did_repair = !ret;
1975                 }
1976         }
1977
1978 out:
1979         if (!ret)
1980                 ret = free_io_failure(inode, failrec, did_repair);
1981
1982         return ret;
1983 }
1984
1985 /*
1986  * this is a generic handler for readpage errors (default
1987  * readpage_io_failed_hook). if other copies exist, read those and write back
1988  * good data to the failed position. it does not attempt to remap the failed
1989  * extent elsewhere, hoping the device will be smart enough to do this as
1990  * needed.
1991  */
1992
1993 static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1994                                 u64 start, u64 end, int failed_mirror,
1995                                 struct extent_state *state)
1996 {
1997         struct io_failure_record *failrec = NULL;
1998         u64 private;
1999         struct extent_map *em;
2000         struct inode *inode = page->mapping->host;
2001         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2002         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2003         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2004         struct bio *bio;
2005         int num_copies;
2006         int ret;
2007         int read_mode;
2008         u64 logical;
2009
2010         BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2011
2012         ret = get_state_private(failure_tree, start, &private);
2013         if (ret) {
2014                 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2015                 if (!failrec)
2016                         return -ENOMEM;
2017                 failrec->start = start;
2018                 failrec->len = end - start + 1;
2019                 failrec->this_mirror = 0;
2020                 failrec->bio_flags = 0;
2021                 failrec->in_validation = 0;
2022
2023                 read_lock(&em_tree->lock);
2024                 em = lookup_extent_mapping(em_tree, start, failrec->len);
2025                 if (!em) {
2026                         read_unlock(&em_tree->lock);
2027                         kfree(failrec);
2028                         return -EIO;
2029                 }
2030
2031                 if (em->start > start || em->start + em->len < start) {
2032                         free_extent_map(em);
2033                         em = NULL;
2034                 }
2035                 read_unlock(&em_tree->lock);
2036
2037                 if (!em || IS_ERR(em)) {
2038                         kfree(failrec);
2039                         return -EIO;
2040                 }
2041                 logical = start - em->start;
2042                 logical = em->block_start + logical;
2043                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2044                         logical = em->block_start;
2045                         failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2046                         extent_set_compress_type(&failrec->bio_flags,
2047                                                  em->compress_type);
2048                 }
2049                 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2050                          "len=%llu\n", logical, start, failrec->len);
2051                 failrec->logical = logical;
2052                 free_extent_map(em);
2053
2054                 /* set the bits in the private failure tree */
2055                 ret = set_extent_bits(failure_tree, start, end,
2056                                         EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2057                 if (ret >= 0)
2058                         ret = set_state_private(failure_tree, start,
2059                                                 (u64)(unsigned long)failrec);
2060                 /* set the bits in the inode's tree */
2061                 if (ret >= 0)
2062                         ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2063                                                 GFP_NOFS);
2064                 if (ret < 0) {
2065                         kfree(failrec);
2066                         return ret;
2067                 }
2068         } else {
2069                 failrec = (struct io_failure_record *)(unsigned long)private;
2070                 pr_debug("bio_readpage_error: (found) logical=%llu, "
2071                          "start=%llu, len=%llu, validation=%d\n",
2072                          failrec->logical, failrec->start, failrec->len,
2073                          failrec->in_validation);
2074                 /*
2075                  * when the data can exist on disk in more than two copies, add
2076                  * to failrec here (e.g. with a list for failed_mirror) to make
2077                  * clean_io_failure() clean all those errors at once.
2078                  */
2079         }
2080         num_copies = btrfs_num_copies(
2081                               &BTRFS_I(inode)->root->fs_info->mapping_tree,
2082                               failrec->logical, failrec->len);
2083         if (num_copies == 1) {
2084                 /*
2085                  * we only have a single copy of the data, so don't bother with
2086                  * all the retry and error correction code that follows. no
2087                  * matter what the error is, it is very likely to persist.
2088                  */
2089                 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2090                          "state=%p, num_copies=%d, next_mirror %d, "
2091                          "failed_mirror %d\n", state, num_copies,
2092                          failrec->this_mirror, failed_mirror);
2093                 free_io_failure(inode, failrec, 0);
2094                 return -EIO;
2095         }
2096
2097         if (!state) {
2098                 spin_lock(&tree->lock);
2099                 state = find_first_extent_bit_state(tree, failrec->start,
2100                                                     EXTENT_LOCKED);
2101                 if (state && state->start != failrec->start)
2102                         state = NULL;
2103                 spin_unlock(&tree->lock);
2104         }
2105
2106         /*
2107          * there are two goals here:
2108          *      a) deliver good data to the caller
2109          *      b) correct the bad sectors on disk
2110          */
2111         if (failed_bio->bi_vcnt > 1) {
2112                 /*
2113                  * to fulfill b), we need to know the exact failing sectors, as
2114                  * we don't want to rewrite any more than the failed ones. thus,
2115                  * we need separate read requests for the failed bio
2116                  *
2117                  * if the following BUG_ON triggers, our validation request got
2118                  * merged. we need separate requests for our algorithm to work.
2119                  */
2120                 BUG_ON(failrec->in_validation);
2121                 failrec->in_validation = 1;
2122                 failrec->this_mirror = failed_mirror;
2123                 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2124         } else {
2125                 /*
2126                  * we're ready to fulfill a) and b) at the same time. get a good copy
2127                  * of the failed sector and if we succeed, we have setup
2128                  * everything for repair_io_failure to do the rest for us.
2129                  */
2130                 if (failrec->in_validation) {
2131                         BUG_ON(failrec->this_mirror != failed_mirror);
2132                         failrec->in_validation = 0;
2133                         failrec->this_mirror = 0;
2134                 }
2135                 failrec->failed_mirror = failed_mirror;
2136                 failrec->this_mirror++;
2137                 if (failrec->this_mirror == failed_mirror)
2138                         failrec->this_mirror++;
2139                 read_mode = READ_SYNC;
2140         }
2141
2142         if (!state || failrec->this_mirror > num_copies) {
2143                 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2144                          "next_mirror %d, failed_mirror %d\n", state,
2145                          num_copies, failrec->this_mirror, failed_mirror);
2146                 free_io_failure(inode, failrec, 0);
2147                 return -EIO;
2148         }
2149
2150         bio = bio_alloc(GFP_NOFS, 1);
2151         bio->bi_private = state;
2152         bio->bi_end_io = failed_bio->bi_end_io;
2153         bio->bi_sector = failrec->logical >> 9;
2154         bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2155         bio->bi_size = 0;
2156
2157         bio_add_page(bio, page, failrec->len, start - page_offset(page));
2158
2159         pr_debug("bio_readpage_error: submitting new read[%#x] to "
2160                  "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2161                  failrec->this_mirror, num_copies, failrec->in_validation);
2162
2163         ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2164                                          failrec->this_mirror,
2165                                          failrec->bio_flags, 0);
2166         return ret;
2167 }
2168
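/*
 * Worked example for the logical address set up in bio_readpage_error()
 * (illustrative numbers): for an uncompressed extent mapping file offset
 * em->start = 64K to em->block_start = 1M, a failure at start = 68K gives
 * logical = 1M + (68K - 64K) = 1M + 4K.  For a compressed extent the whole
 * compressed blob has to be re-read, so logical is simply em->block_start
 * and the bio is flagged EXTENT_BIO_COMPRESSED.
 */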
2169 /* lots and lots of room for performance fixes in the end_bio funcs */
2170
2171 int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2172 {
2173         int uptodate = (err == 0);
2174         struct extent_io_tree *tree;
2175         int ret;
2176
2177         tree = &BTRFS_I(page->mapping->host)->io_tree;
2178
2179         if (tree->ops && tree->ops->writepage_end_io_hook) {
2180                 ret = tree->ops->writepage_end_io_hook(page, start,
2181                                                end, NULL, uptodate);
2182                 if (ret)
2183                         uptodate = 0;
2184         }
2185
2186         if (!uptodate && tree->ops &&
2187             tree->ops->writepage_io_failed_hook) {
2188                 ret = tree->ops->writepage_io_failed_hook(NULL, page,
2189                                                  start, end, NULL);
2190                 /* Writeback already completed */
2191                 if (ret == 0)
2192                         return 1;
2193                 BUG_ON(ret < 0);
2194         }
2195
2196         if (!uptodate) {
2197                 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2198                 ClearPageUptodate(page);
2199                 SetPageError(page);
2200         }
2201         return 0;
2202 }
2203
2204 /*
2205  * after a writepage IO is done, we need to:
2206  * clear the uptodate bits on error
2207  * clear the writeback bits in the extent tree for this IO
2208  * end_page_writeback if the page has no more pending IO
2209  *
2210  * Scheduling is not allowed, so the extent state tree is expected
2211  * to have one and only one object corresponding to this IO.
2212  */
2213 static void end_bio_extent_writepage(struct bio *bio, int err)
2214 {
2215         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2216         struct extent_io_tree *tree;
2217         u64 start;
2218         u64 end;
2219         int whole_page;
2220
2221         do {
2222                 struct page *page = bvec->bv_page;
2223                 tree = &BTRFS_I(page->mapping->host)->io_tree;
2224
2225                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2226                          bvec->bv_offset;
2227                 end = start + bvec->bv_len - 1;
2228
2229                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2230                         whole_page = 1;
2231                 else
2232                         whole_page = 0;
2233
2234                 if (--bvec >= bio->bi_io_vec)
2235                         prefetchw(&bvec->bv_page->flags);
2236
2237                 if (end_extent_writepage(page, err, start, end))
2238                         continue;
2239
2240                 if (whole_page)
2241                         end_page_writeback(page);
2242                 else
2243                         check_page_writeback(tree, page);
2244         } while (bvec >= bio->bi_io_vec);
2245
2246         bio_put(bio);
2247 }
2248
2249 /*
2250  * after a readpage IO is done, we need to:
2251  * clear the uptodate bits on error
2252  * set the uptodate bits if things worked
2253  * set the page up to date if all extents in the tree are uptodate
2254  * clear the lock bit in the extent tree
2255  * unlock the page if there are no other extents locked for it
2256  *
2257  * Scheduling is not allowed, so the extent state tree is expected
2258  * to have one and only one object corresponding to this IO.
2259  */
2260 static void end_bio_extent_readpage(struct bio *bio, int err)
2261 {
2262         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2263         struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2264         struct bio_vec *bvec = bio->bi_io_vec;
2265         struct extent_io_tree *tree;
2266         u64 start;
2267         u64 end;
2268         int whole_page;
2269         int ret;
2270
2271         if (err)
2272                 uptodate = 0;
2273
2274         do {
2275                 struct page *page = bvec->bv_page;
2276                 struct extent_state *cached = NULL;
2277                 struct extent_state *state;
2278
2279                 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2280                          "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2281                          (long int)bio->bi_bdev);
2282                 tree = &BTRFS_I(page->mapping->host)->io_tree;
2283
2284                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2285                         bvec->bv_offset;
2286                 end = start + bvec->bv_len - 1;
2287
2288                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2289                         whole_page = 1;
2290                 else
2291                         whole_page = 0;
2292
2293                 if (++bvec <= bvec_end)
2294                         prefetchw(&bvec->bv_page->flags);
2295
2296                 spin_lock(&tree->lock);
2297                 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2298                 if (state && state->start == start) {
2299                         /*
2300                          * take a reference on the state, unlock will drop
2301                          * the ref
2302                          */
2303                         cache_state(state, &cached);
2304                 }
2305                 spin_unlock(&tree->lock);
2306
2307                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2308                         ret = tree->ops->readpage_end_io_hook(page, start, end,
2309                                                               state);
2310                         if (ret)
2311                                 uptodate = 0;
2312                         else
2313                                 clean_io_failure(start, page);
2314                 }
2315                 if (!uptodate) {
2316                         int failed_mirror;
2317                         failed_mirror = (int)(unsigned long)bio->bi_bdev;
2318                         /*
2319                          * The generic bio_readpage_error handles errors the
2320                          * following way: If possible, new read requests are
2321                          * created and submitted and will end up in
2322                          * end_bio_extent_readpage as well (if we're lucky, not
2323                          * in the !uptodate case). In that case it returns 0 and
2324                          * we just go on with the next page in our bio. If it
2325                          * can't handle the error it will return -EIO and we
2326                          * remain responsible for that page.
2327                          */
2328                         ret = bio_readpage_error(bio, page, start, end,
2329                                                         failed_mirror, NULL);
2330                         if (ret == 0) {
2331 error_handled:
2332                                 uptodate =
2333                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
2334                                 if (err)
2335                                         uptodate = 0;
2336                                 uncache_state(&cached);
2337                                 continue;
2338                         }
2339                         if (tree->ops && tree->ops->readpage_io_failed_hook) {
2340                                 ret = tree->ops->readpage_io_failed_hook(
2341                                                         bio, page, start, end,
2342                                                         failed_mirror, state);
2343                                 if (ret == 0)
2344                                         goto error_handled;
2345                         }
2346                         BUG_ON(ret < 0);
2347                 }
2348
2349                 if (uptodate) {
2350                         set_extent_uptodate(tree, start, end, &cached,
2351                                             GFP_ATOMIC);
2352                 }
2353                 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2354
2355                 if (whole_page) {
2356                         if (uptodate) {
2357                                 SetPageUptodate(page);
2358                         } else {
2359                                 ClearPageUptodate(page);
2360                                 SetPageError(page);
2361                         }
2362                         unlock_page(page);
2363                 } else {
2364                         if (uptodate) {
2365                                 check_page_uptodate(tree, page);
2366                         } else {
2367                                 ClearPageUptodate(page);
2368                                 SetPageError(page);
2369                         }
2370                         check_page_locked(tree, page);
2371                 }
2372         } while (bvec <= bvec_end);
2373
2374         bio_put(bio);
2375 }
2376
2377 struct bio *
2378 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2379                 gfp_t gfp_flags)
2380 {
2381         struct bio *bio;
2382
2383         bio = bio_alloc(gfp_flags, nr_vecs);
2384
2385         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2386                 while (!bio && (nr_vecs /= 2))
2387                         bio = bio_alloc(gfp_flags, nr_vecs);
2388         }
2389
2390         if (bio) {
2391                 bio->bi_size = 0;
2392                 bio->bi_bdev = bdev;
2393                 bio->bi_sector = first_sector;
2394         }
2395         return bio;
2396 }
2397
2398 static int __must_check submit_one_bio(int rw, struct bio *bio,
2399                                        int mirror_num, unsigned long bio_flags)
2400 {
2401         int ret = 0;
2402         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2403         struct page *page = bvec->bv_page;
2404         struct extent_io_tree *tree = bio->bi_private;
2405         u64 start;
2406
2407         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2408
2409         bio->bi_private = NULL;
2410
2411         bio_get(bio);
2412
2413         if (tree->ops && tree->ops->submit_bio_hook)
2414                 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2415                                            mirror_num, bio_flags, start);
2416         else
2417                 btrfsic_submit_bio(rw, bio);
2418
2419         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2420                 ret = -EOPNOTSUPP;
2421         bio_put(bio);
2422         return ret;
2423 }
2424
2425 static int merge_bio(struct extent_io_tree *tree, struct page *page,
2426                      unsigned long offset, size_t size, struct bio *bio,
2427                      unsigned long bio_flags)
2428 {
2429         int ret = 0;
2430         if (tree->ops && tree->ops->merge_bio_hook)
2431                 ret = tree->ops->merge_bio_hook(page, offset, size, bio,
2432                                                 bio_flags);
2433         BUG_ON(ret < 0);
2434         return ret;
2435
2436 }
2437
2438 static int submit_extent_page(int rw, struct extent_io_tree *tree,
2439                               struct page *page, sector_t sector,
2440                               size_t size, unsigned long offset,
2441                               struct block_device *bdev,
2442                               struct bio **bio_ret,
2443                               unsigned long max_pages,
2444                               bio_end_io_t end_io_func,
2445                               int mirror_num,
2446                               unsigned long prev_bio_flags,
2447                               unsigned long bio_flags)
2448 {
2449         int ret = 0;
2450         struct bio *bio;
2451         int nr;
2452         int contig = 0;
2453         int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2454         int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2455         size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2456
2457         if (bio_ret && *bio_ret) {
2458                 bio = *bio_ret;
2459                 if (old_compressed)
2460                         contig = bio->bi_sector == sector;
2461                 else
2462                         contig = bio->bi_sector + (bio->bi_size >> 9) ==
2463                                 sector;
2464
2465                 if (prev_bio_flags != bio_flags || !contig ||
2466                     merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
2467                     bio_add_page(bio, page, page_size, offset) < page_size) {
2468                         ret = submit_one_bio(rw, bio, mirror_num,
2469                                              prev_bio_flags);
2470                         BUG_ON(ret < 0);
2471                         bio = NULL;
2472                 } else {
2473                         return 0;
2474                 }
2475         }
2476         if (this_compressed)
2477                 nr = BIO_MAX_PAGES;
2478         else
2479                 nr = bio_get_nr_vecs(bdev);
2480
2481         bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2482         if (!bio)
2483                 return -ENOMEM;
2484
2485         bio_add_page(bio, page, page_size, offset);
2486         bio->bi_end_io = end_io_func;
2487         bio->bi_private = tree;
2488
2489         if (bio_ret)
2490                 *bio_ret = bio;
2491         else {
2492                 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2493                 BUG_ON(ret < 0);
2494         }
2495
2496         return ret;
2497 }
2498
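/*
 * Sketch of the contiguity test used above (hypothetical helper, not
 * called anywhere): a page can only be appended to the current bio when
 * it starts at the sector right after the bytes already queued.
 */
static inline int bio_is_contig(struct bio *bio, sector_t sector)
{
        /* bi_size is in bytes, bi_sector in 512-byte units */
        return bio->bi_sector + (bio->bi_size >> 9) == sector;
}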
2499 void set_page_extent_mapped(struct page *page)
2500 {
2501         if (!PagePrivate(page)) {
2502                 SetPagePrivate(page);
2503                 page_cache_get(page);
2504                 set_page_private(page, EXTENT_PAGE_PRIVATE);
2505         }
2506 }
2507
2508 static void set_page_extent_head(struct page *page, unsigned long len)
2509 {
2510         WARN_ON(!PagePrivate(page));
2511         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
2512 }
2513
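/*
 * Note on the encoding above (illustrative): the buffer length is stored
 * shifted left by two bits so the low bits of page->private stay free for
 * the EXTENT_PAGE_PRIVATE* markers, e.g. a 16K extent buffer head page
 * ends up with
 *
 *      set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | (16384 << 2));
 */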
2514 /*
2515  * basic readpage implementation.  Locked extent state structs are inserted
2516  * into the tree and removed again when the IO is done (by the end_io
2517  * handlers)
2518  */
2519 static int __extent_read_full_page(struct extent_io_tree *tree,
2520                                    struct page *page,
2521                                    get_extent_t *get_extent,
2522                                    struct bio **bio, int mirror_num,
2523                                    unsigned long *bio_flags)
2524 {
2525         struct inode *inode = page->mapping->host;
2526         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2527         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2528         u64 end;
2529         u64 cur = start;
2530         u64 extent_offset;
2531         u64 last_byte = i_size_read(inode);
2532         u64 block_start;
2533         u64 cur_end;
2534         sector_t sector;
2535         struct extent_map *em;
2536         struct block_device *bdev;
2537         struct btrfs_ordered_extent *ordered;
2538         int ret;
2539         int nr = 0;
2540         size_t pg_offset = 0;
2541         size_t iosize;
2542         size_t disk_io_size;
2543         size_t blocksize = inode->i_sb->s_blocksize;
2544         unsigned long this_bio_flag = 0;
2545
2546         set_page_extent_mapped(page);
2547
2548         if (!PageUptodate(page)) {
2549                 if (cleancache_get_page(page) == 0) {
2550                         BUG_ON(blocksize != PAGE_SIZE);
2551                         goto out;
2552                 }
2553         }
2554
2555         end = page_end;
2556         while (1) {
2557                 lock_extent(tree, start, end);
2558                 ordered = btrfs_lookup_ordered_extent(inode, start);
2559                 if (!ordered)
2560                         break;
2561                 unlock_extent(tree, start, end);
2562                 btrfs_start_ordered_extent(inode, ordered, 1);
2563                 btrfs_put_ordered_extent(ordered);
2564         }
2565
2566         if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2567                 char *userpage;
2568                 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2569
2570                 if (zero_offset) {
2571                         iosize = PAGE_CACHE_SIZE - zero_offset;
2572                         userpage = kmap_atomic(page, KM_USER0);
2573                         memset(userpage + zero_offset, 0, iosize);
2574                         flush_dcache_page(page);
2575                         kunmap_atomic(userpage, KM_USER0);
2576                 }
2577         }
2578         while (cur <= end) {
2579                 if (cur >= last_byte) {
2580                         char *userpage;
2581                         struct extent_state *cached = NULL;
2582
2583                         iosize = PAGE_CACHE_SIZE - pg_offset;
2584                         userpage = kmap_atomic(page, KM_USER0);
2585                         memset(userpage + pg_offset, 0, iosize);
2586                         flush_dcache_page(page);
2587                         kunmap_atomic(userpage, KM_USER0);
2588                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2589                                             &cached, GFP_NOFS);
2590                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2591                                              &cached, GFP_NOFS);
2592                         break;
2593                 }
2594                 em = get_extent(inode, page, pg_offset, cur,
2595                                 end - cur + 1, 0);
2596                 if (IS_ERR_OR_NULL(em)) {
2597                         SetPageError(page);
2598                         unlock_extent(tree, cur, end);
2599                         break;
2600                 }
2601                 extent_offset = cur - em->start;
2602                 BUG_ON(extent_map_end(em) <= cur);
2603                 BUG_ON(end < cur);
2604
2605                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2606                         this_bio_flag = EXTENT_BIO_COMPRESSED;
2607                         extent_set_compress_type(&this_bio_flag,
2608                                                  em->compress_type);
2609                 }
2610
2611                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2612                 cur_end = min(extent_map_end(em) - 1, end);
2613                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2614                 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2615                         disk_io_size = em->block_len;
2616                         sector = em->block_start >> 9;
2617                 } else {
2618                         sector = (em->block_start + extent_offset) >> 9;
2619                         disk_io_size = iosize;
2620                 }
2621                 bdev = em->bdev;
2622                 block_start = em->block_start;
2623                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2624                         block_start = EXTENT_MAP_HOLE;
2625                 free_extent_map(em);
2626                 em = NULL;
2627
2628                 /* we've found a hole, just zero and go on */
2629                 if (block_start == EXTENT_MAP_HOLE) {
2630                         char *userpage;
2631                         struct extent_state *cached = NULL;
2632
2633                         userpage = kmap_atomic(page, KM_USER0);
2634                         memset(userpage + pg_offset, 0, iosize);
2635                         flush_dcache_page(page);
2636                         kunmap_atomic(userpage, KM_USER0);
2637
2638                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2639                                             &cached, GFP_NOFS);
2640                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2641                                              &cached, GFP_NOFS);
2642                         cur = cur + iosize;
2643                         pg_offset += iosize;
2644                         continue;
2645                 }
2646                 /* the get_extent function already copied into the page */
2647                 if (test_range_bit(tree, cur, cur_end,
2648                                    EXTENT_UPTODATE, 1, NULL)) {
2649                         check_page_uptodate(tree, page);
2650                         unlock_extent(tree, cur, cur + iosize - 1);
2651                         cur = cur + iosize;
2652                         pg_offset += iosize;
2653                         continue;
2654                 }
2655                 /* we have an inline extent but it didn't get marked
2656                  * uptodate.  Error out
2657                  */
2658                 if (block_start == EXTENT_MAP_INLINE) {
2659                         SetPageError(page);
2660                         unlock_extent(tree, cur, cur + iosize - 1);
2661                         cur = cur + iosize;
2662                         pg_offset += iosize;
2663                         continue;
2664                 }
2665
2666                 ret = 0;
2667                 if (tree->ops && tree->ops->readpage_io_hook) {
2668                         ret = tree->ops->readpage_io_hook(page, cur,
2669                                                           cur + iosize - 1);
2670                 }
2671                 if (!ret) {
2672                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2673                         pnr -= page->index;
2674                         ret = submit_extent_page(READ, tree, page,
2675                                          sector, disk_io_size, pg_offset,
2676                                          bdev, bio, pnr,
2677                                          end_bio_extent_readpage, mirror_num,
2678                                          *bio_flags,
2679                                          this_bio_flag);
2680                         nr++;
2681                         *bio_flags = this_bio_flag;
2682                 }
2683                 if (ret)
2684                         SetPageError(page);
2685                 cur = cur + iosize;
2686                 pg_offset += iosize;
2687         }
2688 out:
2689         if (!nr) {
2690                 if (!PageError(page))
2691                         SetPageUptodate(page);
2692                 unlock_page(page);
2693         }
2694         return 0;
2695 }
2696
2697 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2698                             get_extent_t *get_extent, int mirror_num)
2699 {
2700         struct bio *bio = NULL;
2701         unsigned long bio_flags = 0;
2702         int ret;
2703
2704         ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2705                                       &bio_flags);
2706         if (bio) {
2707                 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2708                 BUG_ON(ret < 0);
2709         }
2710         return ret;
2711 }
2712
2713 static noinline void update_nr_written(struct page *page,
2714                                       struct writeback_control *wbc,
2715                                       unsigned long nr_written)
2716 {
2717         wbc->nr_to_write -= nr_written;
2718         if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2719             wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2720                 page->mapping->writeback_index = page->index + nr_written;
2721 }
2722
2723 /*
2724  * the writepage semantics are similar to regular writepage.  extent
2725  * records are inserted to lock ranges in the tree, and as dirty areas
2726  * are found, they are marked writeback.  Then the lock bits are removed
2727  * and the end_io handler clears the writeback ranges
2728  */
2729 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2730                               void *data)
2731 {
2732         struct inode *inode = page->mapping->host;
2733         struct extent_page_data *epd = data;
2734         struct extent_io_tree *tree = epd->tree;
2735         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2736         u64 delalloc_start;
2737         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2738         u64 end;
2739         u64 cur = start;
2740         u64 extent_offset;
2741         u64 last_byte = i_size_read(inode);
2742         u64 block_start;
2743         u64 iosize;
2744         sector_t sector;
2745         struct extent_state *cached_state = NULL;
2746         struct extent_map *em;
2747         struct block_device *bdev;
2748         int ret;
2749         int nr = 0;
2750         size_t pg_offset = 0;
2751         size_t blocksize;
2752         loff_t i_size = i_size_read(inode);
2753         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2754         u64 nr_delalloc;
2755         u64 delalloc_end;
2756         int page_started;
2757         int compressed;
2758         int write_flags;
2759         unsigned long nr_written = 0;
2760         bool fill_delalloc = true;
2761
2762         if (wbc->sync_mode == WB_SYNC_ALL)
2763                 write_flags = WRITE_SYNC;
2764         else
2765                 write_flags = WRITE;
2766
2767         trace___extent_writepage(page, inode, wbc);
2768
2769         WARN_ON(!PageLocked(page));
2770
2771         ClearPageError(page);
2772
2773         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2774         if (page->index > end_index ||
2775            (page->index == end_index && !pg_offset)) {
2776                 page->mapping->a_ops->invalidatepage(page, 0);
2777                 unlock_page(page);
2778                 return 0;
2779         }
2780
2781         if (page->index == end_index) {
2782                 char *userpage;
2783
2784                 userpage = kmap_atomic(page, KM_USER0);
2785                 memset(userpage + pg_offset, 0,
2786                        PAGE_CACHE_SIZE - pg_offset);
2787                 kunmap_atomic(userpage, KM_USER0);
2788                 flush_dcache_page(page);
2789         }
2790         pg_offset = 0;
2791
2792         set_page_extent_mapped(page);
2793
2794         if (!tree->ops || !tree->ops->fill_delalloc)
2795                 fill_delalloc = false;
2796
2797         delalloc_start = start;
2798         delalloc_end = 0;
2799         page_started = 0;
2800         if (!epd->extent_locked && fill_delalloc) {
2801                 u64 delalloc_to_write = 0;
2802                 /*
2803                  * make sure the wbc mapping index is at least updated
2804                  * to this page.
2805                  */
2806                 update_nr_written(page, wbc, 0);
2807
2808                 while (delalloc_end < page_end) {
2809                         nr_delalloc = find_lock_delalloc_range(inode, tree,
2810                                                        page,
2811                                                        &delalloc_start,
2812                                                        &delalloc_end,
2813                                                        128 * 1024 * 1024);
2814                         if (nr_delalloc == 0) {
2815                                 delalloc_start = delalloc_end + 1;
2816                                 continue;
2817                         }
2818                         ret = tree->ops->fill_delalloc(inode, page,
2819                                                        delalloc_start,
2820                                                        delalloc_end,
2821                                                        &page_started,
2822                                                        &nr_written);
2823                         BUG_ON(ret);
2824                         /*
2825                          * delalloc_end is already one less than the total
2826                          * length, so we don't subtract one from
2827                          * PAGE_CACHE_SIZE
2828                          */
2829                         delalloc_to_write += (delalloc_end - delalloc_start +
2830                                               PAGE_CACHE_SIZE) >>
2831                                               PAGE_CACHE_SHIFT;
2832                         delalloc_start = delalloc_end + 1;
2833                 }
2834                 if (wbc->nr_to_write < delalloc_to_write) {
2835                         int thresh = 8192;
2836
2837                         if (delalloc_to_write < thresh * 2)
2838                                 thresh = delalloc_to_write;
2839                         wbc->nr_to_write = min_t(u64, delalloc_to_write,
2840                                                  thresh);
2841                 }
2842
2843                 /* did the fill delalloc function already unlock and start
2844                  * the IO?
2845                  */
2846                 if (page_started) {
2847                         ret = 0;
2848                         /*
2849                          * we've unlocked the page, so we can't update
2850                          * the mapping's writeback index, just update
2851                          * nr_to_write.
2852                          */
2853                         wbc->nr_to_write -= nr_written;
2854                         goto done_unlocked;
2855                 }
2856         }
2857         if (tree->ops && tree->ops->writepage_start_hook) {
2858                 ret = tree->ops->writepage_start_hook(page, start,
2859                                                       page_end);
2860                 if (ret) {
2861                         /* Fixup worker will requeue */
2862                         if (ret == -EBUSY)
2863                                 wbc->pages_skipped++;
2864                         else
2865                                 redirty_page_for_writepage(wbc, page);
2866                         update_nr_written(page, wbc, nr_written);
2867                         unlock_page(page);
2868                         ret = 0;
2869                         goto done_unlocked;
2870                 }
2871         }
2872
2873         /*
2874          * we don't want to touch the inode after unlocking the page,
2875          * so we update the mapping writeback index now
2876          */
2877         update_nr_written(page, wbc, nr_written + 1);
2878
2879         end = page_end;
2880         if (last_byte <= start) {
2881                 if (tree->ops && tree->ops->writepage_end_io_hook)
2882                         tree->ops->writepage_end_io_hook(page, start,
2883                                                          page_end, NULL, 1);
2884                 goto done;
2885         }
2886
2887         blocksize = inode->i_sb->s_blocksize;
2888
2889         while (cur <= end) {
2890                 if (cur >= last_byte) {
2891                         if (tree->ops && tree->ops->writepage_end_io_hook)
2892                                 tree->ops->writepage_end_io_hook(page, cur,
2893                                                          page_end, NULL, 1);
2894                         break;
2895                 }
2896                 em = epd->get_extent(inode, page, pg_offset, cur,
2897                                      end - cur + 1, 1);
2898                 if (IS_ERR_OR_NULL(em)) {
2899                         SetPageError(page);
2900                         break;
2901                 }
2902
2903                 extent_offset = cur - em->start;
2904                 BUG_ON(extent_map_end(em) <= cur);
2905                 BUG_ON(end < cur);
2906                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2907                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2908                 sector = (em->block_start + extent_offset) >> 9;
2909                 bdev = em->bdev;
2910                 block_start = em->block_start;
2911                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2912                 free_extent_map(em);
2913                 em = NULL;
2914
2915                 /*
2916                  * compressed and inline extents are written through other
2917                  * paths in the FS
2918                  */
2919                 if (compressed || block_start == EXTENT_MAP_HOLE ||
2920                     block_start == EXTENT_MAP_INLINE) {
2921                         /*
2922                          * end_io notification does not happen here for
2923                          * compressed extents
2924                          */
2925                         if (!compressed && tree->ops &&
2926                             tree->ops->writepage_end_io_hook)
2927                                 tree->ops->writepage_end_io_hook(page, cur,
2928                                                          cur + iosize - 1,
2929                                                          NULL, 1);
2930                         else if (compressed) {
2931                                 /* we don't want to end_page_writeback on
2932                                  * a compressed extent.  this happens
2933                                  * elsewhere
2934                                  */
2935                                 nr++;
2936                         }
2937
2938                         cur += iosize;
2939                         pg_offset += iosize;
2940                         continue;
2941                 }
2942                 /* leave this out until we have a page_mkwrite call */
2943                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2944                                    EXTENT_DIRTY, 0, NULL)) {
2945                         cur = cur + iosize;
2946                         pg_offset += iosize;
2947                         continue;
2948                 }
2949
2950                 if (tree->ops && tree->ops->writepage_io_hook) {
2951                         ret = tree->ops->writepage_io_hook(page, cur,
2952                                                 cur + iosize - 1);
2953                 } else {
2954                         ret = 0;
2955                 }
2956                 if (ret) {
2957                         SetPageError(page);
2958                 } else {
2959                         unsigned long max_nr = end_index + 1;
2960
2961                         set_range_writeback(tree, cur, cur + iosize - 1);
2962                         if (!PageWriteback(page)) {
2963                                 printk(KERN_ERR "btrfs warning page %lu not "
2964                                        "writeback, cur %llu end %llu\n",
2965                                        page->index, (unsigned long long)cur,
2966                                        (unsigned long long)end);
2967                         }
2968
2969                         ret = submit_extent_page(write_flags, tree, page,
2970                                                  sector, iosize, pg_offset,
2971                                                  bdev, &epd->bio, max_nr,
2972                                                  end_bio_extent_writepage,
2973                                                  0, 0, 0);
2974                         if (ret)
2975                                 SetPageError(page);
2976                 }
2977                 cur = cur + iosize;
2978                 pg_offset += iosize;
2979                 nr++;
2980         }
2981 done:
2982         if (nr == 0) {
2983                 /* make sure the mapping tag for page dirty gets cleared */
2984                 set_page_writeback(page);
2985                 end_page_writeback(page);
2986         }
2987         unlock_page(page);
2988
2989 done_unlocked:
2990
2991         /* drop our reference on any cached states */
2992         free_extent_state(cached_state);
2993         return 0;
2994 }
2995
2996 /**
2997  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2998  * @mapping: address space structure to write
2999  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3000  * @writepage: function called for each page
3001  * @data: data passed to writepage function
3002  *
3003  * If a page is already under I/O, write_cache_pages() skips it, even
3004  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3005  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3006  * and msync() need to guarantee that all the data which was dirty at the time
3007  * the call was made get new I/O started against them.  If wbc->sync_mode is
3008  * WB_SYNC_ALL then we were called for data integrity and we must wait for
3009  * existing IO to complete.
3010  */
3011 static int extent_write_cache_pages(struct extent_io_tree *tree,
3012                              struct address_space *mapping,
3013                              struct writeback_control *wbc,
3014                              writepage_t writepage, void *data,
3015                              void (*flush_fn)(void *))
3016 {
3017         int ret = 0;
3018         int done = 0;
3019         int nr_to_write_done = 0;
3020         struct pagevec pvec;
3021         int nr_pages;
3022         pgoff_t index;
3023         pgoff_t end;            /* Inclusive */
3024         int scanned = 0;
3025         int tag;
3026
3027         pagevec_init(&pvec, 0);
3028         if (wbc->range_cyclic) {
3029                 index = mapping->writeback_index; /* Start from prev offset */
3030                 end = -1;
3031         } else {
3032                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
3033                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
3034                 scanned = 1;
3035         }
3036         if (wbc->sync_mode == WB_SYNC_ALL)
3037                 tag = PAGECACHE_TAG_TOWRITE;
3038         else
3039                 tag = PAGECACHE_TAG_DIRTY;
3040 retry:
3041         if (wbc->sync_mode == WB_SYNC_ALL)
3042                 tag_pages_for_writeback(mapping, index, end);
3043         while (!done && !nr_to_write_done && (index <= end) &&
3044                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3045                         min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3046                 unsigned i;
3047
3048                 scanned = 1;
3049                 for (i = 0; i < nr_pages; i++) {
3050                         struct page *page = pvec.pages[i];
3051
3052                         /*
3053                          * At this point we hold neither mapping->tree_lock nor
3054                          * lock on the page itself: the page may be truncated or
3055                          * invalidated (changing page->mapping to NULL), or even
3056                          * swizzled back from swapper_space to tmpfs file
3057                          * mapping
3058                          */
3059                         if (tree->ops &&
3060                             tree->ops->write_cache_pages_lock_hook) {
3061                                 tree->ops->write_cache_pages_lock_hook(page,
3062                                                                data, flush_fn);
3063                         } else {
3064                                 if (!trylock_page(page)) {
3065                                         flush_fn(data);
3066                                         lock_page(page);
3067                                 }
3068                         }
3069
3070                         if (unlikely(page->mapping != mapping)) {
3071                                 unlock_page(page);
3072                                 continue;
3073                         }
3074
3075                         if (!wbc->range_cyclic && page->index > end) {
3076                                 done = 1;
3077                                 unlock_page(page);
3078                                 continue;
3079                         }
3080
3081                         if (wbc->sync_mode != WB_SYNC_NONE) {
3082                                 if (PageWriteback(page))
3083                                         flush_fn(data);
3084                                 wait_on_page_writeback(page);
3085                         }
3086
3087                         if (PageWriteback(page) ||
3088                             !clear_page_dirty_for_io(page)) {
3089                                 unlock_page(page);
3090                                 continue;
3091                         }
3092
3093                         ret = (*writepage)(page, wbc, data);
3094
3095                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3096                                 unlock_page(page);
3097                                 ret = 0;
3098                         }
3099                         if (ret)
3100                                 done = 1;
3101
3102                         /*
3103                          * the filesystem may choose to bump up nr_to_write.
3104                          * We have to make sure to honor the new nr_to_write
3105                          * at any time
3106                          */
3107                         nr_to_write_done = wbc->nr_to_write <= 0;
3108                 }
3109                 pagevec_release(&pvec);
3110                 cond_resched();
3111         }
3112         if (!scanned && !done) {
3113                 /*
3114                  * We hit the last page and there is more work to be done: wrap
3115                  * back to the start of the file
3116                  */
3117                 scanned = 1;
3118                 index = 0;
3119                 goto retry;
3120         }
3121         return ret;
3122 }
3123
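/*
 * Submit whatever bio the extent_page_data has accumulated so far, using
 * WRITE_SYNC when the caller asked for synchronous IO (epd->sync_io).
 */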
3124 static void flush_epd_write_bio(struct extent_page_data *epd)
3125 {
3126         if (epd->bio) {
3127                 int rw = WRITE;
3128                 int ret;
3129
3130                 if (epd->sync_io)
3131                         rw = WRITE_SYNC;
3132
3133                 ret = submit_one_bio(rw, epd->bio, 0, 0);
3134                 BUG_ON(ret < 0);
3135                 epd->bio = NULL;
3136         }
3137 }
3138
3139 static noinline void flush_write_bio(void *data)
3140 {
3141         struct extent_page_data *epd = data;
3142         flush_epd_write_bio(epd);
3143 }
3144
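/*
 * Write a single page through __extent_writepage() and then flush any bio
 * that was built up along the way.
 */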
3145 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3146                           get_extent_t *get_extent,
3147                           struct writeback_control *wbc)
3148 {
3149         int ret;
3150         struct extent_page_data epd = {
3151                 .bio = NULL,
3152                 .tree = tree,
3153                 .get_extent = get_extent,
3154                 .extent_locked = 0,
3155                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3156         };
3157
3158         ret = __extent_writepage(page, wbc, &epd);
3159
3160         flush_epd_write_bio(&epd);
3161         return ret;
3162 }
3163
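/*
 * Write out the already locked byte range [start, end], one page at a time.
 * extent_locked is set in the extent_page_data, and pages that are no longer
 * dirty are handed straight to the writepage_end_io_hook instead.
 */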
3164 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3165                               u64 start, u64 end, get_extent_t *get_extent,
3166                               int mode)
3167 {
3168         int ret = 0;
3169         struct address_space *mapping = inode->i_mapping;
3170         struct page *page;
3171         unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3172                 PAGE_CACHE_SHIFT;
3173
3174         struct extent_page_data epd = {
3175                 .bio = NULL,
3176                 .tree = tree,
3177                 .get_extent = get_extent,
3178                 .extent_locked = 1,
3179                 .sync_io = mode == WB_SYNC_ALL,
3180         };
3181         struct writeback_control wbc_writepages = {
3182                 .sync_mode      = mode,
3183                 .nr_to_write    = nr_pages * 2,
3184                 .range_start    = start,
3185                 .range_end      = end + 1,
3186         };
3187
3188         while (start <= end) {
3189                 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3190                 if (clear_page_dirty_for_io(page))
3191                         ret = __extent_writepage(page, &wbc_writepages, &epd);
3192                 else {
3193                         if (tree->ops && tree->ops->writepage_end_io_hook)
3194                                 tree->ops->writepage_end_io_hook(page, start,
3195                                                  start + PAGE_CACHE_SIZE - 1,
3196                                                  NULL, 1);
3197                         unlock_page(page);
3198                 }
3199                 page_cache_release(page);
3200                 start += PAGE_CACHE_SIZE;
3201         }
3202
3203         flush_epd_write_bio(&epd);
3204         return ret;
3205 }
3206
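/*
 * Writepages entry point: feed every dirty page in the mapping to
 * __extent_writepage() via extent_write_cache_pages(), then flush the
 * resulting bio.
 */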
3207 int extent_writepages(struct extent_io_tree *tree,
3208                       struct address_space *mapping,
3209                       get_extent_t *get_extent,
3210                       struct writeback_control *wbc)
3211 {
3212         int ret = 0;
3213         struct extent_page_data epd = {
3214                 .bio = NULL,
3215                 .tree = tree,
3216                 .get_extent = get_extent,
3217                 .extent_locked = 0,
3218                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3219         };
3220
3221         ret = extent_write_cache_pages(tree, mapping, wbc,
3222                                        __extent_writepage, &epd,
3223                                        flush_write_bio);
3224         flush_epd_write_bio(&epd);
3225         return ret;
3226 }
3227
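/*
 * Readpages entry point: add each page to the page cache and read it with
 * __extent_read_full_page(), collecting the reads into a single bio that is
 * submitted at the end.
 */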
3228 int extent_readpages(struct extent_io_tree *tree,
3229                      struct address_space *mapping,
3230                      struct list_head *pages, unsigned nr_pages,
3231                      get_extent_t get_extent)
3232 {
3233         struct bio *bio = NULL;
3234         unsigned page_idx;
3235         unsigned long bio_flags = 0;
3236
3237         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3238                 struct page *page = list_entry(pages->prev, struct page, lru);
3239
3240                 prefetchw(&page->flags);
3241                 list_del(&page->lru);
3242                 if (!add_to_page_cache_lru(page, mapping,
3243                                         page->index, GFP_NOFS)) {
3244                         __extent_read_full_page(tree, page, get_extent,
3245                                                 &bio, 0, &bio_flags);
3246                 }
3247                 page_cache_release(page);
3248         }
3249         BUG_ON(!list_empty(pages));
3250         if (bio) {
3251                 int ret = submit_one_bio(READ, bio, 0, bio_flags);
3252                 BUG_ON(ret < 0);
3253         }
3254         return 0;
3255 }
3256
3257 /*
3258  * basic invalidatepage code, this waits on any locked or writeback
3259  * ranges corresponding to the page, and then deletes any extent state
3260  * records from the tree
3261  */
3262 int extent_invalidatepage(struct extent_io_tree *tree,
3263                           struct page *page, unsigned long offset)
3264 {
3265         struct extent_state *cached_state = NULL;
3266         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3267         u64 end = start + PAGE_CACHE_SIZE - 1;
3268         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3269
3270         start += (offset + blocksize - 1) & ~(blocksize - 1);
3271         if (start > end)
3272                 return 0;
3273
3274         lock_extent_bits(tree, start, end, 0, &cached_state);
3275         wait_on_page_writeback(page);
3276         clear_extent_bit(tree, start, end,
3277                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3278                          EXTENT_DO_ACCOUNTING,
3279                          1, 1, &cached_state, GFP_NOFS);
3280         return 0;
3281 }
3282
3283 /*
3284  * a helper for releasepage, this tests for areas of the page that
3285  * are locked or under IO and drops the related state bits if it is safe
3286  * to drop the page.
3287  */
3288 int try_release_extent_state(struct extent_map_tree *map,
3289                              struct extent_io_tree *tree, struct page *page,
3290                              gfp_t mask)
3291 {
3292         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3293         u64 end = start + PAGE_CACHE_SIZE - 1;
3294         int ret = 1;
3295
3296         if (test_range_bit(tree, start, end,
3297                            EXTENT_IOBITS, 0, NULL))
3298                 ret = 0;
3299         else {
3300                 if ((mask & GFP_NOFS) == GFP_NOFS)
3301                         mask = GFP_NOFS;
3302                 /*
3303                  * at this point we can safely clear everything except the
3304                  * locked bit and the nodatasum bit
3305                  */
3306                 ret = clear_extent_bit(tree, start, end,
3307                                  ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3308                                  0, 0, NULL, mask);
3309
3310                 /* if clear_extent_bit failed for enomem reasons,
3311                  * we can't allow the release to continue.
3312                  */
3313                 if (ret < 0)
3314                         ret = 0;
3315                 else
3316                         ret = 1;
3317         }
3318         return ret;
3319 }
3320
3321 /*
3322  * a helper for releasepage.  As long as there are no locked extents
3323  * in the range corresponding to the page, both state records and extent
3324  * map records are removed
3325  */
3326 int try_release_extent_mapping(struct extent_map_tree *map,
3327                                struct extent_io_tree *tree, struct page *page,
3328                                gfp_t mask)
3329 {
3330         struct extent_map *em;
3331         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3332         u64 end = start + PAGE_CACHE_SIZE - 1;
3333
3334         if ((mask & __GFP_WAIT) &&
3335             page->mapping->host->i_size > 16 * 1024 * 1024) {
3336                 u64 len;
3337                 while (start <= end) {
3338                         len = end - start + 1;
3339                         write_lock(&map->lock);
3340                         em = lookup_extent_mapping(map, start, len);
3341                         if (!em) {
3342                                 write_unlock(&map->lock);
3343                                 break;
3344                         }
3345                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3346                             em->start != start) {
3347                                 write_unlock(&map->lock);
3348                                 free_extent_map(em);
3349                                 break;
3350                         }
3351                         if (!test_range_bit(tree, em->start,
3352                                             extent_map_end(em) - 1,
3353                                             EXTENT_LOCKED | EXTENT_WRITEBACK,
3354                                             0, NULL)) {
3355                                 remove_extent_mapping(map, em);
3356                                 /* once for the rb tree */
3357                                 free_extent_map(em);
3358                         }
3359                         start = extent_map_end(em);
3360                         write_unlock(&map->lock);
3361
3362                         /* once for us */
3363                         free_extent_map(em);
3364                 }
3365         }
3366         return try_release_extent_state(map, tree, page, mask);
3367 }
3368
3369 /*
3370  * helper function for fiemap, which doesn't want to see any holes.
3371  * This maps until we find something past 'last'
3372  */
3373 static struct extent_map *get_extent_skip_holes(struct inode *inode,
3374                                                 u64 offset,
3375                                                 u64 last,
3376                                                 get_extent_t *get_extent)
3377 {
3378         u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3379         struct extent_map *em;
3380         u64 len;
3381
3382         if (offset >= last)
3383                 return NULL;
3384
3385         while (1) {
3386                 len = last - offset;
3387                 if (len == 0)
3388                         break;
3389                 len = (len + sectorsize - 1) & ~(sectorsize - 1);
3390                 em = get_extent(inode, NULL, 0, offset, len, 0);
3391                 if (IS_ERR_OR_NULL(em))
3392                         return em;
3393
3394                 /* if this isn't a hole return it */
3395                 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3396                     em->block_start != EXTENT_MAP_HOLE) {
3397                         return em;
3398                 }
3399
3400                 /* this is a hole, advance to the next extent */
3401                 offset = extent_map_end(em);
3402                 free_extent_map(em);
3403                 if (offset >= last)
3404                         break;
3405         }
3406         return NULL;
3407 }
3408
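/*
 * fiemap implementation: find the start of the last file extent to bound the
 * walk, then step through the extent maps with get_extent_skip_holes() and
 * translate each one into fiemap flags for fiemap_fill_next_extent().
 */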
3409 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3410                 __u64 start, __u64 len, get_extent_t *get_extent)
3411 {
3412         int ret = 0;
3413         u64 off = start;
3414         u64 max = start + len;
3415         u32 flags = 0;
3416         u32 found_type;
3417         u64 last;
3418         u64 last_for_get_extent = 0;
3419         u64 disko = 0;
3420         u64 isize = i_size_read(inode);
3421         struct btrfs_key found_key;
3422         struct extent_map *em = NULL;
3423         struct extent_state *cached_state = NULL;
3424         struct btrfs_path *path;
3425         struct btrfs_file_extent_item *item;
3426         int end = 0;
3427         u64 em_start = 0;
3428         u64 em_len = 0;
3429         u64 em_end = 0;
3430         unsigned long emflags;
3431
3432         if (len == 0)
3433                 return -EINVAL;
3434
3435         path = btrfs_alloc_path();
3436         if (!path)
3437                 return -ENOMEM;
3438         path->leave_spinning = 1;
3439
3440         start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3441         len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3442
3443         /*
3444          * look up the last file extent.  We're not using i_size here
3445          * because there might be preallocation past i_size
3446          */
3447         ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3448                                        path, btrfs_ino(inode), -1, 0);
3449         if (ret < 0) {
3450                 btrfs_free_path(path);
3451                 return ret;
3452         }
3453         WARN_ON(!ret);
3454         path->slots[0]--;
3455         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3456                               struct btrfs_file_extent_item);
3457         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3458         found_type = btrfs_key_type(&found_key);
3459
3460         /* No extents, but there might be delalloc bits */
3461         if (found_key.objectid != btrfs_ino(inode) ||
3462             found_type != BTRFS_EXTENT_DATA_KEY) {
3463                 /* have to trust i_size as the end */
3464                 last = (u64)-1;
3465                 last_for_get_extent = isize;
3466         } else {
3467                 /*
3468                  * remember the start of the last extent.  There are a
3469                  * bunch of different factors that go into the length of the
3470                  * extent, so it's much less complex to remember where it started
3471                  */
3472                 last = found_key.offset;
3473                 last_for_get_extent = last + 1;
3474         }
3475         btrfs_free_path(path);
3476
3477         /*
3478          * we might have some extents allocated but more delalloc past those
3479          * extents.  so, we trust isize unless the start of the last extent is
3480          * beyond isize
3481          */
3482         if (last < isize) {
3483                 last = (u64)-1;
3484                 last_for_get_extent = isize;
3485         }
3486
3487         lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3488                          &cached_state);
3489
3490         em = get_extent_skip_holes(inode, start, last_for_get_extent,
3491                                    get_extent);
3492         if (!em)
3493                 goto out;
3494         if (IS_ERR(em)) {
3495                 ret = PTR_ERR(em);
3496                 goto out;
3497         }
3498
3499         while (!end) {
3500                 u64 offset_in_extent;
3501
3502                 /* break if the extent we found is outside the range */
3503                 if (em->start >= max || extent_map_end(em) < off)
3504                         break;
3505
3506                 /*
3507                  * get_extent may return an extent that starts before our
3508                  * requested range.  We have to make sure the ranges
3509                  * we return to fiemap always move forward and don't
3510                  * overlap, so adjust the offsets here
3511                  */
3512                 em_start = max(em->start, off);
3513
3514                 /*
3515                  * record the offset from the start of the extent
3516                  * for adjusting the disk offset below
3517                  */
3518                 offset_in_extent = em_start - em->start;
3519                 em_end = extent_map_end(em);
3520                 em_len = em_end - em_start;
3521                 emflags = em->flags;
3522                 disko = 0;
3523                 flags = 0;
3524
3525                 /*
3526                  * bump off for our next call to get_extent
3527                  */
3528                 off = extent_map_end(em);
3529                 if (off >= max)
3530                         end = 1;
3531
3532                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3533                         end = 1;
3534                         flags |= FIEMAP_EXTENT_LAST;
3535                 } else if (em->block_start == EXTENT_MAP_INLINE) {
3536                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
3537                                   FIEMAP_EXTENT_NOT_ALIGNED);
3538                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
3539                         flags |= (FIEMAP_EXTENT_DELALLOC |
3540                                   FIEMAP_EXTENT_UNKNOWN);
3541                 } else {
3542                         disko = em->block_start + offset_in_extent;
3543                 }
3544                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3545                         flags |= FIEMAP_EXTENT_ENCODED;
3546
3547                 free_extent_map(em);
3548                 em = NULL;
3549                 if ((em_start >= last) || em_len == (u64)-1 ||
3550                    (last == (u64)-1 && isize <= em_end)) {
3551                         flags |= FIEMAP_EXTENT_LAST;
3552                         end = 1;
3553                 }
3554
3555                 /* now scan forward to see if this is really the last extent. */
3556                 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3557                                            get_extent);
3558                 if (IS_ERR(em)) {
3559                         ret = PTR_ERR(em);
3560                         goto out;
3561                 }
3562                 if (!em) {
3563                         flags |= FIEMAP_EXTENT_LAST;
3564                         end = 1;
3565                 }
3566                 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3567                                               em_len, flags);
3568                 if (ret)
3569                         goto out_free;
3570         }
3571 out_free:
3572         free_extent_map(em);
3573 out:
3574         unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3575                              &cached_state, GFP_NOFS);
3576         return ret;
3577 }
3578
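/*
 * Return the page backing page index i of the extent buffer.  Page 0 is
 * cached in eb->first_page; the others are looked up in the mapping's
 * radix tree.
 */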
3579 inline struct page *extent_buffer_page(struct extent_buffer *eb,
3580                                               unsigned long i)
3581 {
3582         struct page *p;
3583         struct address_space *mapping;
3584
3585         if (i == 0)
3586                 return eb->first_page;
3587         i += eb->start >> PAGE_CACHE_SHIFT;
3588         mapping = eb->first_page->mapping;
3589         if (!mapping)
3590                 return NULL;
3591
3592         /*
3593          * extent_buffer_page is only called after pinning the page
3594          * by increasing the reference count.  So we know the page must
3595          * be in the radix tree.
3596          */
3597         rcu_read_lock();
3598         p = radix_tree_lookup(&mapping->page_tree, i);
3599         rcu_read_unlock();
3600
3601         return p;
3602 }
3603
3604 inline unsigned long num_extent_pages(u64 start, u64 len)
3605 {
3606         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3607                 (start >> PAGE_CACHE_SHIFT);
3608 }
3609
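/*
 * Allocate and initialize a bare extent_buffer (locks, wait queues, a single
 * reference).  The backing pages are attached later by the caller.
 */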
3610 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3611                                                    u64 start,
3612                                                    unsigned long len,
3613                                                    gfp_t mask)
3614 {
3615         struct extent_buffer *eb = NULL;
3616 #if LEAK_DEBUG
3617         unsigned long flags;
3618 #endif
3619
3620         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3621         if (eb == NULL)
3622                 return NULL;
3623         eb->start = start;
3624         eb->len = len;
3625         rwlock_init(&eb->lock);
3626         atomic_set(&eb->write_locks, 0);
3627         atomic_set(&eb->read_locks, 0);
3628         atomic_set(&eb->blocking_readers, 0);
3629         atomic_set(&eb->blocking_writers, 0);
3630         atomic_set(&eb->spinning_readers, 0);
3631         atomic_set(&eb->spinning_writers, 0);
3632         eb->lock_nested = 0;
3633         init_waitqueue_head(&eb->write_lock_wq);
3634         init_waitqueue_head(&eb->read_lock_wq);
3635
3636 #if LEAK_DEBUG
3637         spin_lock_irqsave(&leak_lock, flags);
3638         list_add(&eb->leak_list, &buffers);
3639         spin_unlock_irqrestore(&leak_lock, flags);
3640 #endif
3641         atomic_set(&eb->refs, 1);
3642
3643         return eb;
3644 }
3645
3646 static void __free_extent_buffer(struct extent_buffer *eb)
3647 {
3648 #if LEAK_DEBUG
3649         unsigned long flags;
3650         spin_lock_irqsave(&leak_lock, flags);
3651         list_del(&eb->leak_list);
3652         spin_unlock_irqrestore(&leak_lock, flags);
3653 #endif
3654         kmem_cache_free(extent_buffer_cache, eb);
3655 }
3656
3657 /*
3658  * Helper for releasing an extent buffer's pages, starting at start_idx.
3659  */
3660 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3661                                                 unsigned long start_idx)
3662 {
3663         unsigned long index;
3664         struct page *page;
3665
3666         if (!eb->first_page)
3667                 return;
3668
3669         index = num_extent_pages(eb->start, eb->len);
3670         if (start_idx >= index)
3671                 return;
3672
3673         do {
3674                 index--;
3675                 page = extent_buffer_page(eb, index);
3676                 if (page)
3677                         page_cache_release(page);
3678         } while (index != start_idx);
3679 }
3680
3681 /*
3682  * Helper for releasing the extent buffer.
3683  */
3684 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3685 {
3686         btrfs_release_extent_buffer_page(eb, 0);
3687         __free_extent_buffer(eb);
3688 }
3689
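/*
 * Find or create the extent buffer covering [start, start + len).  A buffer
 * already in the radix tree is returned with an extra reference; otherwise
 * the pages are pinned and the new buffer is inserted, falling back to the
 * existing buffer if the insert races and returns -EEXIST.
 */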
3690 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3691                                           u64 start, unsigned long len,
3692                                           struct page *page0)
3693 {
3694         unsigned long num_pages = num_extent_pages(start, len);
3695         unsigned long i;
3696         unsigned long index = start >> PAGE_CACHE_SHIFT;
3697         struct extent_buffer *eb;
3698         struct extent_buffer *exists = NULL;
3699         struct page *p;
3700         struct address_space *mapping = tree->mapping;
3701         int uptodate = 1;
3702         int ret;
3703
3704         rcu_read_lock();
3705         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3706         if (eb && atomic_inc_not_zero(&eb->refs)) {
3707                 rcu_read_unlock();
3708                 mark_page_accessed(eb->first_page);
3709                 return eb;
3710         }
3711         rcu_read_unlock();
3712
3713         eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3714         if (!eb)
3715                 return NULL;
3716
3717         if (page0) {
3718                 eb->first_page = page0;
3719                 i = 1;
3720                 index++;
3721                 page_cache_get(page0);
3722                 mark_page_accessed(page0);
3723                 set_page_extent_mapped(page0);
3724                 set_page_extent_head(page0, len);
3725                 uptodate = PageUptodate(page0);
3726         } else {
3727                 i = 0;
3728         }
3729         for (; i < num_pages; i++, index++) {
3730                 p = find_or_create_page(mapping, index, GFP_NOFS);
3731                 if (!p) {
3732                         WARN_ON(1);
3733                         goto free_eb;
3734                 }
3735                 set_page_extent_mapped(p);
3736                 mark_page_accessed(p);
3737                 if (i == 0) {
3738                         eb->first_page = p;
3739                         set_page_extent_head(p, len);
3740                 } else {
3741                         set_page_private(p, EXTENT_PAGE_PRIVATE);
3742                 }
3743                 if (!PageUptodate(p))
3744                         uptodate = 0;
3745
3746                 /*
3747                  * see below about how we avoid a nasty race with release page
3748                  * and why we unlock later
3749                  */
3750                 if (i != 0)
3751                         unlock_page(p);
3752         }
3753         if (uptodate)
3754                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3755
3756         ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3757         if (ret)
3758                 goto free_eb;
3759
3760         spin_lock(&tree->buffer_lock);
3761         ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3762         if (ret == -EEXIST) {
3763                 exists = radix_tree_lookup(&tree->buffer,
3764                                                 start >> PAGE_CACHE_SHIFT);
3765                 /* add one reference for the caller */
3766                 atomic_inc(&exists->refs);
3767                 spin_unlock(&tree->buffer_lock);
3768                 radix_tree_preload_end();
3769                 goto free_eb;
3770         }
3771         /* add one reference for the tree */
3772         atomic_inc(&eb->refs);
3773         spin_unlock(&tree->buffer_lock);
3774         radix_tree_preload_end();
3775
3776         /*
3777          * there is a race where release page may have
3778          * tried to find this extent buffer in the radix
3779          * but failed.  It will tell the VM it is safe to
3780          * reclaim the page, and it will clear the page private bit.
3781          * We must make sure to set the page private bit properly
3782          * after the extent buffer is in the radix tree so
3783          * it doesn't get lost
3784          */
3785         set_page_extent_mapped(eb->first_page);
3786         set_page_extent_head(eb->first_page, eb->len);
3787         if (!page0)
3788                 unlock_page(eb->first_page);
3789         return eb;
3790
3791 free_eb:
3792         if (eb->first_page && !page0)
3793                 unlock_page(eb->first_page);
3794
3795         if (!atomic_dec_and_test(&eb->refs))
3796                 return exists;
3797         btrfs_release_extent_buffer(eb);
3798         return exists;
3799 }
3800
3801 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3802                                          u64 start, unsigned long len)
3803 {
3804         struct extent_buffer *eb;
3805
3806         rcu_read_lock();
3807         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3808         if (eb && atomic_inc_not_zero(&eb->refs)) {
3809                 rcu_read_unlock();
3810                 mark_page_accessed(eb->first_page);
3811                 return eb;
3812         }
3813         rcu_read_unlock();
3814
3815         return NULL;
3816 }
3817
3818 void free_extent_buffer(struct extent_buffer *eb)
3819 {
3820         if (!eb)
3821                 return;
3822
3823         if (!atomic_dec_and_test(&eb->refs))
3824                 return;
3825
3826         WARN_ON(1);
3827 }
3828
3829 void clear_extent_buffer_dirty(struct extent_io_tree *tree,
3830                               struct extent_buffer *eb)
3831 {
3832         unsigned long i;
3833         unsigned long num_pages;
3834         struct page *page;
3835
3836         num_pages = num_extent_pages(eb->start, eb->len);
3837
3838         for (i = 0; i < num_pages; i++) {
3839                 page = extent_buffer_page(eb, i);
3840                 if (!PageDirty(page))
3841                         continue;
3842
3843                 lock_page(page);
3844                 WARN_ON(!PagePrivate(page));
3845
3846                 set_page_extent_mapped(page);
3847                 if (i == 0)
3848                         set_page_extent_head(page, eb->len);
3849
3850                 clear_page_dirty_for_io(page);
3851                 spin_lock_irq(&page->mapping->tree_lock);
3852                 if (!PageDirty(page)) {
3853                         radix_tree_tag_clear(&page->mapping->page_tree,
3854                                                 page_index(page),
3855                                                 PAGECACHE_TAG_DIRTY);
3856                 }
3857                 spin_unlock_irq(&page->mapping->tree_lock);
3858                 ClearPageError(page);
3859                 unlock_page(page);
3860         }
3861 }
3862
3863 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3864                              struct extent_buffer *eb)
3865 {
3866         unsigned long i;
3867         unsigned long num_pages;
3868         int was_dirty = 0;
3869
3870         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3871         num_pages = num_extent_pages(eb->start, eb->len);
3872         for (i = 0; i < num_pages; i++)
3873                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3874         return was_dirty;
3875 }
3876
3877 static int __eb_straddles_pages(u64 start, u64 len)
3878 {
3879         if (len < PAGE_CACHE_SIZE)
3880                 return 1;
3881         if (start & (PAGE_CACHE_SIZE - 1))
3882                 return 1;
3883         if ((start + len) & (PAGE_CACHE_SIZE - 1))
3884                 return 1;
3885         return 0;
3886 }
3887
3888 static int eb_straddles_pages(struct extent_buffer *eb)
3889 {
3890         return __eb_straddles_pages(eb->start, eb->len);
3891 }
3892
3893 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3894                                 struct extent_buffer *eb,
3895                                 struct extent_state **cached_state)
3896 {
3897         unsigned long i;
3898         struct page *page;
3899         unsigned long num_pages;
3900
3901         num_pages = num_extent_pages(eb->start, eb->len);
3902         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3903
3904         clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3905                               cached_state, GFP_NOFS);
3906
3907         for (i = 0; i < num_pages; i++) {
3908                 page = extent_buffer_page(eb, i);
3909                 if (page)
3910                         ClearPageUptodate(page);
3911         }
3912         return 0;
3913 }
3914
3915 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3916                                 struct extent_buffer *eb)
3917 {
3918         unsigned long i;
3919         struct page *page;
3920         unsigned long num_pages;
3921
3922         num_pages = num_extent_pages(eb->start, eb->len);
3923
3924         if (eb_straddles_pages(eb)) {
3925                 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3926                                     NULL, GFP_NOFS);
3927         }
3928         for (i = 0; i < num_pages; i++) {
3929                 page = extent_buffer_page(eb, i);
3930                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3931                     ((i == num_pages - 1) &&
3932                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3933                         check_page_uptodate(tree, page);
3934                         continue;
3935                 }
3936                 SetPageUptodate(page);
3937         }
3938         return 0;
3939 }
3940
3941 int extent_range_uptodate(struct extent_io_tree *tree,
3942                           u64 start, u64 end)
3943 {
3944         struct page *page;
3945         int ret;
3946         int pg_uptodate = 1;
3947         int uptodate;
3948         unsigned long index;
3949
3950         if (__eb_straddles_pages(start, end - start + 1)) {
3951                 ret = test_range_bit(tree, start, end,
3952                                      EXTENT_UPTODATE, 1, NULL);
3953                 if (ret)
3954                         return 1;
3955         }
3956         while (start <= end) {
3957                 index = start >> PAGE_CACHE_SHIFT;
3958                 page = find_get_page(tree->mapping, index);
3959                 if (!page)
3960                         return 1;
3961                 uptodate = PageUptodate(page);
3962                 page_cache_release(page);
3963                 if (!uptodate) {
3964                         pg_uptodate = 0;
3965                         break;
3966                 }
3967                 start += PAGE_CACHE_SIZE;
3968         }
3969         return pg_uptodate;
3970 }
3971
3972 int extent_buffer_uptodate(struct extent_io_tree *tree,
3973                            struct extent_buffer *eb,
3974                            struct extent_state *cached_state)
3975 {
3976         int ret = 0;
3977         unsigned long num_pages;
3978         unsigned long i;
3979         struct page *page;
3980         int pg_uptodate = 1;
3981
3982         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3983                 return 1;
3984
3985         if (eb_straddles_pages(eb)) {
3986                 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3987                                    EXTENT_UPTODATE, 1, cached_state);
3988                 if (ret)
3989                         return ret;
3990         }
3991
3992         num_pages = num_extent_pages(eb->start, eb->len);
3993         for (i = 0; i < num_pages; i++) {
3994                 page = extent_buffer_page(eb, i);
3995                 if (!PageUptodate(page)) {
3996                         pg_uptodate = 0;
3997                         break;
3998                 }
3999         }
4000         return pg_uptodate;
4001 }
4002
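/*
 * Read the buffer's pages that are not yet uptodate, batching them into a
 * single bio.  With WAIT_NONE the pages are only trylocked; with
 * WAIT_COMPLETE we wait for each page and return -EIO if any failed.
 */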
4003 int read_extent_buffer_pages(struct extent_io_tree *tree,
4004                              struct extent_buffer *eb, u64 start, int wait,
4005                              get_extent_t *get_extent, int mirror_num)
4006 {
4007         unsigned long i;
4008         unsigned long start_i;
4009         struct page *page;
4010         int err;
4011         int ret = 0;
4012         int locked_pages = 0;
4013         int all_uptodate = 1;
4014         int inc_all_pages = 0;
4015         unsigned long num_pages;
4016         struct bio *bio = NULL;
4017         unsigned long bio_flags = 0;
4018
4019         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4020                 return 0;
4021
4022         if (eb_straddles_pages(eb)) {
4023                 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
4024                                    EXTENT_UPTODATE, 1, NULL)) {
4025                         return 0;
4026                 }
4027         }
4028
4029         if (start) {
4030                 WARN_ON(start < eb->start);
4031                 start_i = (start >> PAGE_CACHE_SHIFT) -
4032                         (eb->start >> PAGE_CACHE_SHIFT);
4033         } else {
4034                 start_i = 0;
4035         }
4036
4037         num_pages = num_extent_pages(eb->start, eb->len);
4038         for (i = start_i; i < num_pages; i++) {
4039                 page = extent_buffer_page(eb, i);
4040                 if (wait == WAIT_NONE) {
4041                         if (!trylock_page(page))
4042                                 goto unlock_exit;
4043                 } else {
4044                         lock_page(page);
4045                 }
4046                 locked_pages++;
4047                 if (!PageUptodate(page))
4048                         all_uptodate = 0;
4049         }
4050         if (all_uptodate) {
4051                 if (start_i == 0)
4052                         set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4053                 goto unlock_exit;
4054         }
4055
4056         for (i = start_i; i < num_pages; i++) {
4057                 page = extent_buffer_page(eb, i);
4058
4059                 WARN_ON(!PagePrivate(page));
4060
4061                 set_page_extent_mapped(page);
4062                 if (i == 0)
4063                         set_page_extent_head(page, eb->len);
4064
4065                 if (inc_all_pages)
4066                         page_cache_get(page);
4067                 if (!PageUptodate(page)) {
4068                         if (start_i == 0)
4069                                 inc_all_pages = 1;
4070                         ClearPageError(page);
4071                         err = __extent_read_full_page(tree, page,
4072                                                       get_extent, &bio,
4073                                                       mirror_num, &bio_flags);
4074                         if (err)
4075                                 ret = err;
4076                 } else {
4077                         unlock_page(page);
4078                 }
4079         }
4080
4081         if (bio) {
4082                 err = submit_one_bio(READ, bio, mirror_num, bio_flags);
4083                 BUG_ON(err < 0);
4084         }
4085
4086         if (ret || wait != WAIT_COMPLETE)
4087                 return ret;
4088
4089         for (i = start_i; i < num_pages; i++) {
4090                 page = extent_buffer_page(eb, i);
4091                 wait_on_page_locked(page);
4092                 if (!PageUptodate(page))
4093                         ret = -EIO;
4094         }
4095
4096         if (!ret)
4097                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4098         return ret;
4099
4100 unlock_exit:
4101         i = start_i;
4102         while (locked_pages > 0) {
4103                 page = extent_buffer_page(eb, i);
4104                 i++;
4105                 unlock_page(page);
4106                 locked_pages--;
4107         }
4108         return ret;
4109 }
4110
4111 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4112                         unsigned long start,
4113                         unsigned long len)
4114 {
4115         size_t cur;
4116         size_t offset;
4117         struct page *page;
4118         char *kaddr;
4119         char *dst = (char *)dstv;
4120         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4121         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4122
4123         WARN_ON(start > eb->len);
4124         WARN_ON(start + len > eb->start + eb->len);
4125
4126         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4127
4128         while (len > 0) {
4129                 page = extent_buffer_page(eb, i);
4130
4131                 cur = min(len, (PAGE_CACHE_SIZE - offset));
4132                 kaddr = page_address(page);
4133                 memcpy(dst, kaddr + offset, cur);
4134
4135                 dst += cur;
4136                 len -= cur;
4137                 offset = 0;
4138                 i++;
4139         }
4140 }
4141
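/*
 * Map min_len bytes of the buffer starting at offset start, but only if the
 * range fits within a single page; otherwise return -EINVAL.
 */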
4142 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4143                                unsigned long min_len, char **map,
4144                                unsigned long *map_start,
4145                                unsigned long *map_len)
4146 {
4147         size_t offset = start & (PAGE_CACHE_SIZE - 1);
4148         char *kaddr;
4149         struct page *p;
4150         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4151         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4152         unsigned long end_i = (start_offset + start + min_len - 1) >>
4153                 PAGE_CACHE_SHIFT;
4154
4155         if (i != end_i)
4156                 return -EINVAL;
4157
4158         if (i == 0) {
4159                 offset = start_offset;
4160                 *map_start = 0;
4161         } else {
4162                 offset = 0;
4163                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4164         }
4165
4166         if (start + min_len > eb->len) {
4167                 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4168                        "wanted %lu %lu\n", (unsigned long long)eb->start,
4169                        eb->len, start, min_len);
4170                 WARN_ON(1);
4171                 return -EINVAL;
4172         }
4173
4174         p = extent_buffer_page(eb, i);
4175         kaddr = page_address(p);
4176         *map = kaddr + offset;
4177         *map_len = PAGE_CACHE_SIZE - offset;
4178         return 0;
4179 }
4180
4181 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4182                           unsigned long start,
4183                           unsigned long len)
4184 {
4185         size_t cur;
4186         size_t offset;
4187         struct page *page;
4188         char *kaddr;
4189         char *ptr = (char *)ptrv;
4190         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4191         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4192         int ret = 0;
4193
4194         WARN_ON(start > eb->len);
4195         WARN_ON(start + len > eb->start + eb->len);
4196
4197         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4198
4199         while (len > 0) {
4200                 page = extent_buffer_page(eb, i);
4201
4202                 cur = min(len, (PAGE_CACHE_SIZE - offset));
4203
4204                 kaddr = page_address(page);
4205                 ret = memcmp(ptr, kaddr + offset, cur);
4206                 if (ret)
4207                         break;
4208
4209                 ptr += cur;
4210                 len -= cur;
4211                 offset = 0;
4212                 i++;
4213         }
4214         return ret;
4215 }
4216
4217 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4218                          unsigned long start, unsigned long len)
4219 {
4220         size_t cur;
4221         size_t offset;
4222         struct page *page;
4223         char *kaddr;
4224         char *src = (char *)srcv;
4225         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4226         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4227
4228         WARN_ON(start > eb->len);
4229         WARN_ON(start + len > eb->start + eb->len);
4230
4231         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4232
4233         while (len > 0) {
4234                 page = extent_buffer_page(eb, i);
4235                 WARN_ON(!PageUptodate(page));
4236
4237                 cur = min(len, PAGE_CACHE_SIZE - offset);
4238                 kaddr = page_address(page);
4239                 memcpy(kaddr + offset, src, cur);
4240
4241                 src += cur;
4242                 len -= cur;
4243                 offset = 0;
4244                 i++;
4245         }
4246 }
4247
4248 void memset_extent_buffer(struct extent_buffer *eb, char c,
4249                           unsigned long start, unsigned long len)
4250 {
4251         size_t cur;
4252         size_t offset;
4253         struct page *page;
4254         char *kaddr;
4255         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4256         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4257
4258         WARN_ON(start > eb->len);
4259         WARN_ON(start + len > eb->start + eb->len);
4260
4261         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4262
4263         while (len > 0) {
4264                 page = extent_buffer_page(eb, i);
4265                 WARN_ON(!PageUptodate(page));
4266
4267                 cur = min(len, PAGE_CACHE_SIZE - offset);
4268                 kaddr = page_address(page);
4269                 memset(kaddr + offset, c, cur);
4270
4271                 len -= cur;
4272                 offset = 0;
4273                 i++;
4274         }
4275 }
4276
4277 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4278                         unsigned long dst_offset, unsigned long src_offset,
4279                         unsigned long len)
4280 {
4281         u64 dst_len = dst->len;
4282         size_t cur;
4283         size_t offset;
4284         struct page *page;
4285         char *kaddr;
4286         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4287         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4288
4289         WARN_ON(src->len != dst_len);
4290
4291         offset = (start_offset + dst_offset) &
4292                 ((unsigned long)PAGE_CACHE_SIZE - 1);
4293
4294         while (len > 0) {
4295                 page = extent_buffer_page(dst, i);
4296                 WARN_ON(!PageUptodate(page));
4297
4298                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4299
4300                 kaddr = page_address(page);
4301                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
4302
4303                 src_offset += cur;
4304                 len -= cur;
4305                 offset = 0;
4306                 i++;
4307         }
4308 }
4309
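/*
 * memmove()-style copy into a single destination page.  If source and
 * destination share the page, fall back to memmove(); otherwise copy the
 * bytes one at a time, from the last byte toward the first.
 */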
4310 static void move_pages(struct page *dst_page, struct page *src_page,
4311                        unsigned long dst_off, unsigned long src_off,
4312                        unsigned long len)
4313 {
4314         char *dst_kaddr = page_address(dst_page);
4315         if (dst_page == src_page) {
4316                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4317         } else {
4318                 char *src_kaddr = page_address(src_page);
4319                 char *p = dst_kaddr + dst_off + len;
4320                 char *s = src_kaddr + src_off + len;
4321
4322                 while (len--)
4323                         *--p = *--s;
4324         }
4325 }
4326
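/* return true if [src, src + len) and [dst, dst + len) overlap */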
4327 static inline bool areas_overlap(unsigned long src, unsigned long dst,
                                      unsigned long len)
4328 {
4329         unsigned long distance = (src > dst) ? src - dst : dst - src;
4330         return distance < len;
4331 }
4332
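/*
 * memcpy()-style copy into a single destination page.  Overlapping
 * ranges within the same page are a bug here; moves that may overlap
 * must go through move_pages() instead.
 */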
4333 static void copy_pages(struct page *dst_page, struct page *src_page,
4334                        unsigned long dst_off, unsigned long src_off,
4335                        unsigned long len)
4336 {
4337         char *dst_kaddr = page_address(dst_page);
4338         char *src_kaddr;
4339
4340         if (dst_page != src_page) {
4341                 src_kaddr = page_address(src_page);
4342         } else {
4343                 src_kaddr = dst_kaddr;
4344                 BUG_ON(areas_overlap(src_off, dst_off, len));
4345         }
4346
4347         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4348 }
4349
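/*
 * memcpy() within a single extent buffer: copy @len bytes from
 * @src_offset to @dst_offset in @dst.  The ranges must stay inside the
 * buffer and must not overlap; use memmove_extent_buffer() if they might.
 */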
4350 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4351                            unsigned long src_offset, unsigned long len)
4352 {
4353         size_t cur;
4354         size_t dst_off_in_page;
4355         size_t src_off_in_page;
4356         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4357         unsigned long dst_i;
4358         unsigned long src_i;
4359
4360         if (src_offset + len > dst->len) {
4361                 printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
4362                        "len %lu dst len %lu\n", src_offset, len, dst->len);
4363                 BUG_ON(1);
4364         }
4365         if (dst_offset + len > dst->len) {
4366                 printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
4367                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
4368                 BUG_ON(1);
4369         }
4370
4371         while (len > 0) {
4372                 dst_off_in_page = (start_offset + dst_offset) &
4373                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4374                 src_off_in_page = (start_offset + src_offset) &
4375                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4376
4377                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4378                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4379
4380                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4381                                                src_off_in_page));
4382                 cur = min_t(unsigned long, cur,
4383                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4384
4385                 copy_pages(extent_buffer_page(dst, dst_i),
4386                            extent_buffer_page(dst, src_i),
4387                            dst_off_in_page, src_off_in_page, cur);
4388
4389                 src_offset += cur;
4390                 dst_offset += cur;
4391                 len -= cur;
4392         }
4393 }
4394
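/*
 * memmove() within a single extent buffer: like memcpy_extent_buffer(),
 * but the source and destination ranges may overlap.  Non-overlapping
 * moves are handed to memcpy_extent_buffer(); overlapping ones are
 * copied backwards, page by page, starting from the end of the range.
 */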
4395 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4396                            unsigned long src_offset, unsigned long len)
4397 {
4398         size_t cur;
4399         size_t dst_off_in_page;
4400         size_t src_off_in_page;
4401         unsigned long dst_end = dst_offset + len - 1;
4402         unsigned long src_end = src_offset + len - 1;
4403         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4404         unsigned long dst_i;
4405         unsigned long src_i;
4406
4407         if (src_offset + len > dst->len) {
4408                 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4409                        "len %lu dst len %lu\n", src_offset, len, dst->len);
4410                 BUG_ON(1);
4411         }
4412         if (dst_offset + len > dst->len) {
4413                 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4414                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
4415                 BUG_ON(1);
4416         }
4417         if (!areas_overlap(src_offset, dst_offset, len)) {
4418                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4419                 return;
4420         }
4421         while (len > 0) {
4422                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4423                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4424
4425                 dst_off_in_page = (start_offset + dst_end) &
4426                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4427                 src_off_in_page = (start_offset + src_end) &
4428                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4429
4430                 cur = min_t(unsigned long, len, src_off_in_page + 1);
4431                 cur = min(cur, dst_off_in_page + 1);
4432                 move_pages(extent_buffer_page(dst, dst_i),
4433                            extent_buffer_page(dst, src_i),
4434                            dst_off_in_page - cur + 1,
4435                            src_off_in_page - cur + 1, cur);
4436
4437                 dst_end -= cur;
4438                 src_end -= cur;
4439                 len -= cur;
4440         }
4441 }
4442
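/* RCU callback that frees the extent buffer once the grace period has passed */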
4443 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4444 {
4445         struct extent_buffer *eb =
4446                         container_of(head, struct extent_buffer, rcu_head);
4447
4448         btrfs_release_extent_buffer(eb);
4449 }
4450
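/*
 * Try to drop the extent buffer that backs @page from the tree's radix
 * tree.  Returns 1 if no buffer was found or its last reference was
 * dropped (the buffer is then freed via RCU), and 0 if the buffer is
 * dirty or still referenced by someone else.
 */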
4451 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4452 {
4453         u64 start = page_offset(page);
4454         struct extent_buffer *eb;
4455         int ret = 1;
4456
4457         spin_lock(&tree->buffer_lock);
4458         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4459         if (!eb) {
4460                 spin_unlock(&tree->buffer_lock);
4461                 return ret;
4462         }
4463
4464         if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4465                 ret = 0;
4466                 goto out;
4467         }
4468
4469         /*
4470          * Set @eb->refs to 0 only if it is currently 1, then free @eb below.
4471          * Otherwise someone else still holds a reference; back off.
4472          */
4473         if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
4474                 ret = 0;
4475                 goto out;
4476         }
4477
4478         radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4479 out:
4480         spin_unlock(&tree->buffer_lock);
4481
4482         /* at this point we can safely release the extent buffer */
4483         if (atomic_read(&eb->refs) == 0)
4484                 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4485         return ret;
4486 }