Btrfs: extent_io and extent_state optimizations
linux-2.6-block.git: fs/btrfs/extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17
18 /* temporary define until extent_map moves out of btrfs */
19 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
20                                        unsigned long extra_flags,
21                                        void (*ctor)(void *, struct kmem_cache *,
22                                                     unsigned long));
23
24 static struct kmem_cache *extent_state_cache;
25 static struct kmem_cache *extent_buffer_cache;
26
27 static LIST_HEAD(buffers);
28 static LIST_HEAD(states);
29
30 #define BUFFER_LRU_MAX 64
31
32 struct tree_entry {
33         u64 start;
34         u64 end;
35         struct rb_node rb_node;
36 };
37
38 struct extent_page_data {
39         struct bio *bio;
40         struct extent_io_tree *tree;
41         get_extent_t *get_extent;
42 };
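
/*
 * struct tree_entry is expected to mirror the leading fields (start,
 * end, rb_node) of struct extent_state, so the rb-tree helpers below
 * can use rb_entry() on nodes that really belong to extent_state
 * structs.  struct extent_page_data carries the in-flight bio, the
 * tree and the get_extent callback that the writepage paths later in
 * this file thread through those helpers.
 */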
43
44 int __init extent_io_init(void)
45 {
46         extent_state_cache = btrfs_cache_create("extent_state",
47                                             sizeof(struct extent_state), 0,
48                                             NULL);
49         if (!extent_state_cache)
50                 return -ENOMEM;
51
52         extent_buffer_cache = btrfs_cache_create("extent_buffers",
53                                             sizeof(struct extent_buffer), 0,
54                                             NULL);
55         if (!extent_buffer_cache)
56                 goto free_state_cache;
57         return 0;
58
59 free_state_cache:
60         kmem_cache_destroy(extent_state_cache);
61         return -ENOMEM;
62 }
63
64 void extent_io_exit(void)
65 {
66         struct extent_state *state;
67
68         while (!list_empty(&states)) {
69                 state = list_entry(states.next, struct extent_state, list);
70                 printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
71                 list_del(&state->list);
72                 kmem_cache_free(extent_state_cache, state);
73
74         }
75
76         if (extent_state_cache)
77                 kmem_cache_destroy(extent_state_cache);
78         if (extent_buffer_cache)
79                 kmem_cache_destroy(extent_buffer_cache);
80 }
81
82 void extent_io_tree_init(struct extent_io_tree *tree,
83                           struct address_space *mapping, gfp_t mask)
84 {
85         tree->state.rb_node = NULL;
86         tree->ops = NULL;
87         tree->dirty_bytes = 0;
88         spin_lock_init(&tree->lock);
89         spin_lock_init(&tree->lru_lock);
90         tree->mapping = mapping;
91         INIT_LIST_HEAD(&tree->buffer_lru);
92         tree->lru_size = 0;
93 }
94 EXPORT_SYMBOL(extent_io_tree_init);
95
96 void extent_io_tree_empty_lru(struct extent_io_tree *tree)
97 {
98         struct extent_buffer *eb;
99         while(!list_empty(&tree->buffer_lru)) {
100                 eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
101                                 lru);
102                 list_del_init(&eb->lru);
103                 free_extent_buffer(eb);
104         }
105 }
106 EXPORT_SYMBOL(extent_io_tree_empty_lru);
107
108 struct extent_state *alloc_extent_state(gfp_t mask)
109 {
110         struct extent_state *state;
111
112         state = kmem_cache_alloc(extent_state_cache, mask);
113         if (!state || IS_ERR(state))
114                 return state;
115         state->state = 0;
116         state->private = 0;
117         state->tree = NULL;
118
119         atomic_set(&state->refs, 1);
120         init_waitqueue_head(&state->wq);
121         return state;
122 }
123 EXPORT_SYMBOL(alloc_extent_state);
124
125 void free_extent_state(struct extent_state *state)
126 {
127         if (!state)
128                 return;
129         if (atomic_dec_and_test(&state->refs)) {
130                 WARN_ON(state->tree);
131                 kmem_cache_free(extent_state_cache, state);
132         }
133 }
134 EXPORT_SYMBOL(free_extent_state);
135
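/*
 * The rb-tree is keyed by byte offset: every node covers an inclusive
 * [start, end] range and ranges never overlap, so a search for any
 * offset inside a range lands on that node.  tree_insert() returns the
 * existing node if 'offset' already falls inside one, or NULL once the
 * new node has been linked in.
 */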
136 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
137                                    struct rb_node *node)
138 {
139         struct rb_node ** p = &root->rb_node;
140         struct rb_node * parent = NULL;
141         struct tree_entry *entry;
142
143         while(*p) {
144                 parent = *p;
145                 entry = rb_entry(parent, struct tree_entry, rb_node);
146
147                 if (offset < entry->start)
148                         p = &(*p)->rb_left;
149                 else if (offset > entry->end)
150                         p = &(*p)->rb_right;
151                 else
152                         return parent;
153         }
154
155         entry = rb_entry(node, struct tree_entry, rb_node);
156         rb_link_node(node, parent, p);
157         rb_insert_color(node, root);
158         return NULL;
159 }
160
161 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
162                                      struct rb_node **prev_ret,
163                                      struct rb_node **next_ret)
164 {
165         struct rb_node * n = root->rb_node;
166         struct rb_node *prev = NULL;
167         struct rb_node *orig_prev = NULL;
168         struct tree_entry *entry;
169         struct tree_entry *prev_entry = NULL;
170
171         while(n) {
172                 entry = rb_entry(n, struct tree_entry, rb_node);
173                 prev = n;
174                 prev_entry = entry;
175
176                 if (offset < entry->start)
177                         n = n->rb_left;
178                 else if (offset > entry->end)
179                         n = n->rb_right;
180                 else
181                         return n;
182         }
183
184         if (prev_ret) {
185                 orig_prev = prev;
186                 while(prev && offset > prev_entry->end) {
187                         prev = rb_next(prev);
188                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
189                 }
190                 *prev_ret = prev;
191                 prev = orig_prev;
192         }
193
194         if (next_ret) {
195                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
196                 while(prev && offset < prev_entry->start) {
197                         prev = rb_prev(prev);
198                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
199                 }
200                 *next_ret = prev;
201         }
202         return NULL;
203 }
204
205 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
206 {
207         struct rb_node *prev = NULL;
208         struct rb_node *ret;
209
210         ret = __tree_search(root, offset, &prev, NULL);
211         if (!ret)
212                 return prev;
213         return ret;
214 }
215
216 /*
217  * utility function to look for merge candidates inside a given range.
218  * Any extents with matching state are merged together into a single
219  * extent in the tree.  Extents with EXTENT_IOBITS (locked/writeback) set
220  * in their state field are not merged, because the end_io handlers need
221  * to operate on them without sleeping (or doing allocations/splits).
222  *
223  * This should be called with the tree lock held.
224  */
225 static int merge_state(struct extent_io_tree *tree,
226                        struct extent_state *state)
227 {
228         struct extent_state *other;
229         struct rb_node *other_node;
230
231         if (state->state & EXTENT_IOBITS)
232                 return 0;
233
234         other_node = rb_prev(&state->rb_node);
235         if (other_node) {
236                 other = rb_entry(other_node, struct extent_state, rb_node);
237                 if (other->end == state->start - 1 &&
238                     other->state == state->state) {
239                         state->start = other->start;
240                         other->tree = NULL;
241                         rb_erase(&other->rb_node, &tree->state);
242                         free_extent_state(other);
243                 }
244         }
245         other_node = rb_next(&state->rb_node);
246         if (other_node) {
247                 other = rb_entry(other_node, struct extent_state, rb_node);
248                 if (other->start == state->end + 1 &&
249                     other->state == state->state) {
250                         other->start = state->start;
251                         state->tree = NULL;
252                         rb_erase(&state->rb_node, &tree->state);
253                         free_extent_state(state);
254                 }
255         }
256         return 0;
257 }
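
/*
 * Illustrative example (not from the original source): two adjacent
 * states [0, 4095] and [4096, 8191] that both have only EXTENT_DIRTY
 * set are collapsed by merge_state() into a single [0, 8191] state,
 * and the now redundant struct is erased from the tree and freed.
 */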
258
259 /*
260  * insert an extent_state struct into the tree.  'bits' are set on the
261  * struct before it is inserted.
262  *
263  * This may return -EEXIST if the extent is already there, in which case the
264  * state struct is freed.
265  *
266  * The tree lock is not taken internally.  This is a utility function and
267  * probably isn't what you want to call (see set/clear_extent_bit).
268  */
269 static int insert_state(struct extent_io_tree *tree,
270                         struct extent_state *state, u64 start, u64 end,
271                         int bits)
272 {
273         struct rb_node *node;
274
275         if (end < start) {
276                 printk("end < start %Lu %Lu\n", end, start);
277                 WARN_ON(1);
278         }
279         if (bits & EXTENT_DIRTY)
280                 tree->dirty_bytes += end - start + 1;
281         state->state |= bits;
282         state->start = start;
283         state->end = end;
284         node = tree_insert(&tree->state, end, &state->rb_node);
285         if (node) {
286                 struct extent_state *found;
287                 found = rb_entry(node, struct extent_state, rb_node);
288                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
289                 free_extent_state(state);
290                 return -EEXIST;
291         }
292         state->tree = tree;
293         merge_state(tree, state);
294         return 0;
295 }
296
297 /*
298  * split a given extent state struct in two, inserting the preallocated
299  * struct 'prealloc' as the newly created second half.  'split' indicates an
300  * offset inside 'orig' where it should be split.
301  *
302  * Before calling,
303  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
304  * are two extent state structs in the tree:
305  * prealloc: [orig->start, split - 1]
306  * orig: [ split, orig->end ]
307  *
308  * The tree locks are not taken by this function. They need to be held
309  * by the caller.
310  */
311 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
312                        struct extent_state *prealloc, u64 split)
313 {
314         struct rb_node *node;
315         prealloc->start = orig->start;
316         prealloc->end = split - 1;
317         prealloc->state = orig->state;
318         orig->start = split;
319
320         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
321         if (node) {
322                 struct extent_state *found;
323                 found = rb_entry(node, struct extent_state, rb_node);
324                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
325                 free_extent_state(prealloc);
326                 return -EEXIST;
327         }
328         prealloc->tree = tree;
329         return 0;
330 }
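
/*
 * Illustrative example: splitting a state that covers [0, 8191] at
 * offset 4096 leaves 'prealloc' owning [0, 4095] and 'orig' owning
 * [4096, 8191], with both linked into the tree.
 */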
331
332 /*
333  * utility function to clear some bits in an extent state struct.
334  * it will optionally wake up any one waiting on this state (wake == 1), or
335  * forcibly remove the state from the tree (delete == 1).
336  *
337  * If no bits are set on the state struct after clearing things, the
338  * struct is freed and removed from the tree
339  */
340 static int clear_state_bit(struct extent_io_tree *tree,
341                             struct extent_state *state, int bits, int wake,
342                             int delete)
343 {
344         int ret = state->state & bits;
345
346         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
347                 u64 range = state->end - state->start + 1;
348                 WARN_ON(range > tree->dirty_bytes);
349                 tree->dirty_bytes -= range;
350         }
351         state->state &= ~bits;
352         if (wake)
353                 wake_up(&state->wq);
354         if (delete || state->state == 0) {
355                 if (state->tree) {
356                         rb_erase(&state->rb_node, &tree->state);
357                         state->tree = NULL;
358                         free_extent_state(state);
359                 } else {
360                         WARN_ON(1);
361                 }
362         } else {
363                 merge_state(tree, state);
364         }
365         return ret;
366 }
367
368 /*
369  * clear some bits on a range in the tree.  This may require splitting
370  * or inserting elements in the tree, so the gfp mask is used to
371  * indicate which allocations or sleeping are allowed.
372  *
373  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
374  * the given range from the tree regardless of state (i.e. for truncate).
375  *
376  * the range [start, end] is inclusive.
377  *
378  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
379  * bits were already set, or zero if none of the bits were already set.
380  */
381 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
382                      int bits, int wake, int delete, gfp_t mask)
383 {
384         struct extent_state *state;
385         struct extent_state *prealloc = NULL;
386         struct rb_node *node;
387         unsigned long flags;
388         int err;
389         int set = 0;
390
391 again:
392         if (!prealloc && (mask & __GFP_WAIT)) {
393                 prealloc = alloc_extent_state(mask);
394                 if (!prealloc)
395                         return -ENOMEM;
396         }
397
398         spin_lock_irqsave(&tree->lock, flags);
399         /*
400          * this search will find the extents that end after
401          * our range starts
402          */
403         node = tree_search(&tree->state, start);
404         if (!node)
405                 goto out;
406         state = rb_entry(node, struct extent_state, rb_node);
407         if (state->start > end)
408                 goto out;
409         WARN_ON(state->end < start);
410
411         /*
412          *     | ---- desired range ---- |
413          *  | state | or
414          *  | ------------- state -------------- |
415          *
416          * We need to split the extent we found, and may flip
417          * bits on second half.
418          *
419          * If the extent we found extends past our range, we
420          * just split and search again.  It'll get split again
421          * the next time though.
422          *
423          * If the extent we found is inside our range, we clear
424          * the desired bit on it.
425          */
426
427         if (state->start < start) {
428                 if (!prealloc)
429                         prealloc = alloc_extent_state(GFP_ATOMIC);
430                 err = split_state(tree, state, prealloc, start);
431                 BUG_ON(err == -EEXIST);
432                 prealloc = NULL;
433                 if (err)
434                         goto out;
435                 if (state->end <= end) {
436                         start = state->end + 1;
437                         set |= clear_state_bit(tree, state, bits,
438                                         wake, delete);
439                 } else {
440                         start = state->start;
441                 }
442                 goto search_again;
443         }
444         /*
445          * | ---- desired range ---- |
446          *                        | state |
447          * We need to split the extent, and clear the bit
448          * on the first half
449          */
450         if (state->start <= end && state->end > end) {
451                 if (!prealloc)
452                         prealloc = alloc_extent_state(GFP_ATOMIC);
453                 err = split_state(tree, state, prealloc, end + 1);
454                 BUG_ON(err == -EEXIST);
455
456                 if (wake)
457                         wake_up(&state->wq);
458                 set |= clear_state_bit(tree, prealloc, bits,
459                                        wake, delete);
460                 prealloc = NULL;
461                 goto out;
462         }
463
464         start = state->end + 1;
465         set |= clear_state_bit(tree, state, bits, wake, delete);
466         goto search_again;
467
468 out:
469         spin_unlock_irqrestore(&tree->lock, flags);
470         if (prealloc)
471                 free_extent_state(prealloc);
472
473         return set;
474
475 search_again:
476         if (start > end)
477                 goto out;
478         spin_unlock_irqrestore(&tree->lock, flags);
479         if (mask & __GFP_WAIT)
480                 cond_resched();
481         goto again;
482 }
483 EXPORT_SYMBOL(clear_extent_bit);
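
/*
 * Illustrative example: if [0, 16383] is a single EXTENT_DIRTY state,
 * then clear_extent_bit(tree, 4096, 8191, EXTENT_DIRTY, 0, 0, GFP_NOFS)
 * splits it so that [0, 4095] and [8192, 16383] stay dirty, the middle
 * piece loses its last bit and is removed from the tree, and the call
 * returns a value > 0 because EXTENT_DIRTY was set inside the range.
 */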
484
485 static int wait_on_state(struct extent_io_tree *tree,
486                          struct extent_state *state)
487 {
488         DEFINE_WAIT(wait);
489         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
490         spin_unlock_irq(&tree->lock);
491         schedule();
492         spin_lock_irq(&tree->lock);
493         finish_wait(&state->wq, &wait);
494         return 0;
495 }
496
497 /*
498  * waits for one or more bits to clear on a range in the state tree.
499  * The range [start, end] is inclusive.
500  * The tree lock is taken by this function
501  */
502 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
503 {
504         struct extent_state *state;
505         struct rb_node *node;
506
507         spin_lock_irq(&tree->lock);
508 again:
509         while (1) {
510                 /*
511                  * this search will find all the extents that end after
512                  * our range starts
513                  */
514                 node = tree_search(&tree->state, start);
515                 if (!node)
516                         break;
517
518                 state = rb_entry(node, struct extent_state, rb_node);
519
520                 if (state->start > end)
521                         goto out;
522
523                 if (state->state & bits) {
524                         start = state->start;
525                         atomic_inc(&state->refs);
526                         wait_on_state(tree, state);
527                         free_extent_state(state);
528                         goto again;
529                 }
530                 start = state->end + 1;
531
532                 if (start > end)
533                         break;
534
535                 if (need_resched()) {
536                         spin_unlock_irq(&tree->lock);
537                         cond_resched();
538                         spin_lock_irq(&tree->lock);
539                 }
540         }
541 out:
542         spin_unlock_irq(&tree->lock);
543         return 0;
544 }
545 EXPORT_SYMBOL(wait_extent_bit);
546
547 static void set_state_bits(struct extent_io_tree *tree,
548                            struct extent_state *state,
549                            int bits)
550 {
551         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
552                 u64 range = state->end - state->start + 1;
553                 tree->dirty_bytes += range;
554         }
555         state->state |= bits;
556 }
557
558 /*
559  * set some bits on a range in the tree.  This may require allocations
560  * or sleeping, so the gfp mask is used to indicate what is allowed.
561  *
562  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
563  * range already has the desired bits set.  The start of the existing
564  * range is returned in failed_start in this case.
565  *
566  * [start, end] is inclusive
567  * This takes the tree lock.
568  */
569 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
570                    int exclusive, u64 *failed_start, gfp_t mask)
571 {
572         struct extent_state *state;
573         struct extent_state *prealloc = NULL;
574         struct rb_node *node;
575         unsigned long flags;
576         int err = 0;
577         int set;
578         u64 last_start;
579         u64 last_end;
580 again:
581         if (!prealloc && (mask & __GFP_WAIT)) {
582                 prealloc = alloc_extent_state(mask);
583                 if (!prealloc)
584                         return -ENOMEM;
585         }
586
587         spin_lock_irqsave(&tree->lock, flags);
588         /*
589          * this search will find all the extents that end after
590          * our range starts.
591          */
592         node = tree_search(&tree->state, start);
593         if (!node) {
594                 err = insert_state(tree, prealloc, start, end, bits);
595                 prealloc = NULL;
596                 BUG_ON(err == -EEXIST);
597                 goto out;
598         }
599
600         state = rb_entry(node, struct extent_state, rb_node);
601         last_start = state->start;
602         last_end = state->end;
603
604         /*
605          * | ---- desired range ---- |
606          * | state |
607          *
608          * Just lock what we found and keep going
609          */
610         if (state->start == start && state->end <= end) {
611                 set = state->state & bits;
612                 if (set && exclusive) {
613                         *failed_start = state->start;
614                         err = -EEXIST;
615                         goto out;
616                 }
617                 set_state_bits(tree, state, bits);
618                 start = state->end + 1;
619                 merge_state(tree, state);
620                 goto search_again;
621         }
622
623         /*
624          *     | ---- desired range ---- |
625          * | state |
626          *   or
627          * | ------------- state -------------- |
628          *
629          * We need to split the extent we found, and may flip bits on
630          * second half.
631          *
632          * If the extent we found extends past our
633          * range, we just split and search again.  It'll get split
634          * again the next time though.
635          *
636          * If the extent we found is inside our range, we set the
637          * desired bit on it.
638          */
639         if (state->start < start) {
640                 set = state->state & bits;
641                 if (exclusive && set) {
642                         *failed_start = start;
643                         err = -EEXIST;
644                         goto out;
645                 }
646                 err = split_state(tree, state, prealloc, start);
647                 BUG_ON(err == -EEXIST);
648                 prealloc = NULL;
649                 if (err)
650                         goto out;
651                 if (state->end <= end) {
652                         set_state_bits(tree, state, bits);
653                         start = state->end + 1;
654                         merge_state(tree, state);
655                 } else {
656                         start = state->start;
657                 }
658                 goto search_again;
659         }
660         /*
661          * | ---- desired range ---- |
662          *     | state | or               | state |
663          *
664          * There's a hole, we need to insert something in it and
665          * ignore the extent we found.
666          */
667         if (state->start > start) {
668                 u64 this_end;
669                 if (end < last_start)
670                         this_end = end;
671                 else
672                         this_end = last_start - 1;
673                 err = insert_state(tree, prealloc, start, this_end,
674                                    bits);
675                 prealloc = NULL;
676                 BUG_ON(err == -EEXIST);
677                 if (err)
678                         goto out;
679                 start = this_end + 1;
680                 goto search_again;
681         }
682         /*
683          * | ---- desired range ---- |
684          *                        | state |
685          * We need to split the extent, and set the bit
686          * on the first half
687          */
688         if (state->start <= end && state->end > end) {
689                 set = state->state & bits;
690                 if (exclusive && set) {
691                         *failed_start = start;
692                         err = -EEXIST;
693                         goto out;
694                 }
695                 err = split_state(tree, state, prealloc, end + 1);
696                 BUG_ON(err == -EEXIST);
697
698                 set_state_bits(tree, prealloc, bits);
699                 merge_state(tree, prealloc);
700                 prealloc = NULL;
701                 goto out;
702         }
703
704         goto search_again;
705
706 out:
707         spin_unlock_irqrestore(&tree->lock, flags);
708         if (prealloc)
709                 free_extent_state(prealloc);
710
711         return err;
712
713 search_again:
714         if (start > end)
715                 goto out;
716         spin_unlock_irqrestore(&tree->lock, flags);
717         if (mask & __GFP_WAIT)
718                 cond_resched();
719         goto again;
720 }
721 EXPORT_SYMBOL(set_extent_bit);
722
723 /* wrappers around set/clear extent bit */
724 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
725                      gfp_t mask)
726 {
727         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
728                               mask);
729 }
730 EXPORT_SYMBOL(set_extent_dirty);
731
732 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
733                     int bits, gfp_t mask)
734 {
735         return set_extent_bit(tree, start, end, bits, 0, NULL,
736                               mask);
737 }
738 EXPORT_SYMBOL(set_extent_bits);
739
740 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
741                       int bits, gfp_t mask)
742 {
743         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
744 }
745 EXPORT_SYMBOL(clear_extent_bits);
746
747 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
748                      gfp_t mask)
749 {
750         return set_extent_bit(tree, start, end,
751                               EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
752                               mask);
753 }
754 EXPORT_SYMBOL(set_extent_delalloc);
755
756 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
757                        gfp_t mask)
758 {
759         return clear_extent_bit(tree, start, end,
760                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
761 }
762 EXPORT_SYMBOL(clear_extent_dirty);
763
764 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
765                      gfp_t mask)
766 {
767         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
768                               mask);
769 }
770 EXPORT_SYMBOL(set_extent_new);
771
772 int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
773                        gfp_t mask)
774 {
775         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
776 }
777 EXPORT_SYMBOL(clear_extent_new);
778
779 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
780                         gfp_t mask)
781 {
782         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
783                               mask);
784 }
785 EXPORT_SYMBOL(set_extent_uptodate);
786
787 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
788                           gfp_t mask)
789 {
790         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
791 }
792 EXPORT_SYMBOL(clear_extent_uptodate);
793
794 int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
795                          gfp_t mask)
796 {
797         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
798                               0, NULL, mask);
799 }
800 EXPORT_SYMBOL(set_extent_writeback);
801
802 int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
803                            gfp_t mask)
804 {
805         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
806 }
807 EXPORT_SYMBOL(clear_extent_writeback);
808
809 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
810 {
811         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
812 }
813 EXPORT_SYMBOL(wait_on_extent_writeback);
814
815 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
816 {
817         int err;
818         u64 failed_start;
819         while (1) {
820                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
821                                      &failed_start, mask);
822                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
823                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
824                         start = failed_start;
825                 } else {
826                         break;
827                 }
828                 WARN_ON(start > end);
829         }
830         return err;
831 }
832 EXPORT_SYMBOL(lock_extent);
833
834 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
835                   gfp_t mask)
836 {
837         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
838 }
839 EXPORT_SYMBOL(unlock_extent);
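
/*
 * Typical usage (a sketch, not taken from this file): callers bracket
 * work on a byte range with the helpers above, for example
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or dirty the pages covering [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * where start and end are inclusive byte offsets within the file.
 */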
840
841 /*
842  * helper function to set pages and extents in the tree dirty
843  */
844 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
845 {
846         unsigned long index = start >> PAGE_CACHE_SHIFT;
847         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
848         struct page *page;
849
850         while (index <= end_index) {
851                 page = find_get_page(tree->mapping, index);
852                 BUG_ON(!page);
853                 __set_page_dirty_nobuffers(page);
854                 page_cache_release(page);
855                 index++;
856         }
857         set_extent_dirty(tree, start, end, GFP_NOFS);
858         return 0;
859 }
860 EXPORT_SYMBOL(set_range_dirty);
861
862 /*
863  * helper function to set both pages and extents in the tree writeback
864  */
865 int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
866 {
867         unsigned long index = start >> PAGE_CACHE_SHIFT;
868         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
869         struct page *page;
870
871         while (index <= end_index) {
872                 page = find_get_page(tree->mapping, index);
873                 BUG_ON(!page);
874                 set_page_writeback(page);
875                 page_cache_release(page);
876                 index++;
877         }
878         set_extent_writeback(tree, start, end, GFP_NOFS);
879         return 0;
880 }
881 EXPORT_SYMBOL(set_range_writeback);
882
883 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
884                           u64 *start_ret, u64 *end_ret, int bits)
885 {
886         struct rb_node *node;
887         struct extent_state *state;
888         int ret = 1;
889
890         spin_lock_irq(&tree->lock);
891         /*
892          * this search will find all the extents that end after
893          * our range starts.
894          */
895         node = tree_search(&tree->state, start);
896         if (!node || IS_ERR(node)) {
897                 goto out;
898         }
899
900         while(1) {
901                 state = rb_entry(node, struct extent_state, rb_node);
902                 if (state->end >= start && (state->state & bits)) {
903                         *start_ret = state->start;
904                         *end_ret = state->end;
905                         ret = 0;
906                         break;
907                 }
908                 node = rb_next(node);
909                 if (!node)
910                         break;
911         }
912 out:
913         spin_unlock_irq(&tree->lock);
914         return ret;
915 }
916 EXPORT_SYMBOL(find_first_extent_bit);
917
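/*
 * find a contiguous run of EXTENT_DELALLOC states that includes the
 * offset *start: walk back to the first state in the run, lock each
 * state in turn (waiting and retrying if one is already locked), and
 * set *start and *end to the byte range covered.  The scan stops once
 * roughly max_bytes have been accumulated; the number of states locked
 * is returned.
 */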
918 u64 find_lock_delalloc_range(struct extent_io_tree *tree,
919                              u64 *start, u64 *end, u64 max_bytes)
920 {
921         struct rb_node *node;
922         struct extent_state *state;
923         u64 cur_start = *start;
924         u64 found = 0;
925         u64 total_bytes = 0;
926
927         spin_lock_irq(&tree->lock);
928         /*
929          * this search will find all the extents that end after
930          * our range starts.
931          */
932 search_again:
933         node = tree_search(&tree->state, cur_start);
934         if (!node || IS_ERR(node)) {
935                 *end = (u64)-1;
936                 goto out;
937         }
938
939         while(1) {
940                 state = rb_entry(node, struct extent_state, rb_node);
941                 if (found && state->start != cur_start) {
942                         goto out;
943                 }
944                 if (!(state->state & EXTENT_DELALLOC)) {
945                         if (!found)
946                                 *end = state->end;
947                         goto out;
948                 }
949                 if (!found) {
950                         struct extent_state *prev_state;
951                         struct rb_node *prev_node = node;
952                         while(1) {
953                                 prev_node = rb_prev(prev_node);
954                                 if (!prev_node)
955                                         break;
956                                 prev_state = rb_entry(prev_node,
957                                                       struct extent_state,
958                                                       rb_node);
959                                 if (!(prev_state->state & EXTENT_DELALLOC))
960                                         break;
961                                 state = prev_state;
962                                 node = prev_node;
963                         }
964                 }
965                 if (state->state & EXTENT_LOCKED) {
966                         DEFINE_WAIT(wait);
967                         atomic_inc(&state->refs);
968                         prepare_to_wait(&state->wq, &wait,
969                                         TASK_UNINTERRUPTIBLE);
970                         spin_unlock_irq(&tree->lock);
971                         schedule();
972                         spin_lock_irq(&tree->lock);
973                         finish_wait(&state->wq, &wait);
974                         free_extent_state(state);
975                         goto search_again;
976                 }
977                 state->state |= EXTENT_LOCKED;
978                 if (!found)
979                         *start = state->start;
980                 found++;
981                 *end = state->end;
982                 cur_start = state->end + 1;
983                 node = rb_next(node);
984                 if (!node)
985                         break;
986                 total_bytes += state->end - state->start + 1;
987                 if (total_bytes >= max_bytes)
988                         break;
989         }
990 out:
991         spin_unlock_irq(&tree->lock);
992         return found;
993 }
994
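/*
 * count how many bytes inside [*start, search_end] are covered by
 * states with any of 'bits' set, stopping once max_bytes is reached.
 * *start is moved to the start of the first matching state (when one
 * is found).  A query for EXTENT_DIRTY that begins at offset zero
 * short-circuits to tree->dirty_bytes.
 */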
995 u64 count_range_bits(struct extent_io_tree *tree,
996                      u64 *start, u64 search_end, u64 max_bytes,
997                      unsigned long bits)
998 {
999         struct rb_node *node;
1000         struct extent_state *state;
1001         u64 cur_start = *start;
1002         u64 total_bytes = 0;
1003         int found = 0;
1004
1005         if (search_end <= cur_start) {
1006                 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1007                 WARN_ON(1);
1008                 return 0;
1009         }
1010
1011         spin_lock_irq(&tree->lock);
1012         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1013                 total_bytes = tree->dirty_bytes;
1014                 goto out;
1015         }
1016         /*
1017          * this search will find all the extents that end after
1018          * our range starts.
1019          */
1020         node = tree_search(&tree->state, cur_start);
1021         if (!node || IS_ERR(node)) {
1022                 goto out;
1023         }
1024
1025         while(1) {
1026                 state = rb_entry(node, struct extent_state, rb_node);
1027                 if (state->start > search_end)
1028                         break;
1029                 if (state->end >= cur_start && (state->state & bits)) {
1030                         total_bytes += min(search_end, state->end) + 1 -
1031                                        max(cur_start, state->start);
1032                         if (total_bytes >= max_bytes)
1033                                 break;
1034                         if (!found) {
1035                                 *start = state->start;
1036                                 found = 1;
1037                         }
1038                 }
1039                 node = rb_next(node);
1040                 if (!node)
1041                         break;
1042         }
1043 out:
1044         spin_unlock_irq(&tree->lock);
1045         return total_bytes;
1046 }
1047 /*
1048  * helper function to lock both pages and extents in the tree.
1049  * pages must be locked first.
1050  */
1051 int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1052 {
1053         unsigned long index = start >> PAGE_CACHE_SHIFT;
1054         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1055         struct page *page;
1056         int err;
1057
1058         while (index <= end_index) {
1059                 page = grab_cache_page(tree->mapping, index);
1060                 if (!page) {
1061                         err = -ENOMEM;
1062                         goto failed;
1063                 }
1064                 if (IS_ERR(page)) {
1065                         err = PTR_ERR(page);
1066                         goto failed;
1067                 }
1068                 index++;
1069         }
1070         lock_extent(tree, start, end, GFP_NOFS);
1071         return 0;
1072
1073 failed:
1074         /*
1075          * we failed above in getting the page at 'index', so we undo here
1076          * up to but not including the page at 'index'
1077          */
1078         end_index = index;
1079         index = start >> PAGE_CACHE_SHIFT;
1080         while (index < end_index) {
1081                 page = find_get_page(tree->mapping, index);
1082                 unlock_page(page);
1083                 page_cache_release(page);
1084                 index++;
1085         }
1086         return err;
1087 }
1088 EXPORT_SYMBOL(lock_range);
1089
1090 /*
1091  * helper function to unlock both pages and extents in the tree.
1092  */
1093 int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1094 {
1095         unsigned long index = start >> PAGE_CACHE_SHIFT;
1096         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1097         struct page *page;
1098
1099         while (index <= end_index) {
1100                 page = find_get_page(tree->mapping, index);
1101                 unlock_page(page);
1102                 page_cache_release(page);
1103                 index++;
1104         }
1105         unlock_extent(tree, start, end, GFP_NOFS);
1106         return 0;
1107 }
1108 EXPORT_SYMBOL(unlock_range);
1109
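/*
 * set_state_private/get_state_private stash and look up a caller
 * defined u64 on the extent_state whose range begins exactly at
 * 'start'; btrfs uses this for per-extent bookkeeping such as
 * checksums.  -ENOENT is returned if no such state exists.
 */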
1110 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1111 {
1112         struct rb_node *node;
1113         struct extent_state *state;
1114         int ret = 0;
1115
1116         spin_lock_irq(&tree->lock);
1117         /*
1118          * this search will find all the extents that end after
1119          * our range starts.
1120          */
1121         node = tree_search(&tree->state, start);
1122         if (!node || IS_ERR(node)) {
1123                 ret = -ENOENT;
1124                 goto out;
1125         }
1126         state = rb_entry(node, struct extent_state, rb_node);
1127         if (state->start != start) {
1128                 ret = -ENOENT;
1129                 goto out;
1130         }
1131         state->private = private;
1132 out:
1133         spin_unlock_irq(&tree->lock);
1134         return ret;
1135 }
1136
1137 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1138 {
1139         struct rb_node *node;
1140         struct extent_state *state;
1141         int ret = 0;
1142
1143         spin_lock_irq(&tree->lock);
1144         /*
1145          * this search will find all the extents that end after
1146          * our range starts.
1147          */
1148         node = tree_search(&tree->state, start);
1149         if (!node || IS_ERR(node)) {
1150                 ret = -ENOENT;
1151                 goto out;
1152         }
1153         state = rb_entry(node, struct extent_state, rb_node);
1154         if (state->start != start) {
1155                 ret = -ENOENT;
1156                 goto out;
1157         }
1158         *private = state->private;
1159 out:
1160         spin_unlock_irq(&tree->lock);
1161         return ret;
1162 }
1163
1164 /*
1165  * searches a range in the state tree for a given mask.
1166  * If 'filled' == 1, this returns 1 only if every extent in the tree
1167  * has the bits set.  Otherwise, 1 is returned if any bit in the
1168  * range is found set.
1169  */
1170 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1171                    int bits, int filled)
1172 {
1173         struct extent_state *state = NULL;
1174         struct rb_node *node;
1175         int bitset = 0;
1176         unsigned long flags;
1177
1178         spin_lock_irqsave(&tree->lock, flags);
1179         node = tree_search(&tree->state, start);
1180         while (node && start <= end) {
1181                 state = rb_entry(node, struct extent_state, rb_node);
1182
1183                 if (filled && state->start > start) {
1184                         bitset = 0;
1185                         break;
1186                 }
1187
1188                 if (state->start > end)
1189                         break;
1190
1191                 if (state->state & bits) {
1192                         bitset = 1;
1193                         if (!filled)
1194                                 break;
1195                 } else if (filled) {
1196                         bitset = 0;
1197                         break;
1198                 }
1199                 start = state->end + 1;
1200                 if (start > end)
1201                         break;
1202                 node = rb_next(node);
1203                 if (!node) {
1204                         if (filled)
1205                                 bitset = 0;
1206                         break;
1207                 }
1208         }
1209         spin_unlock_irqrestore(&tree->lock, flags);
1210         return bitset;
1211 }
1212 EXPORT_SYMBOL(test_range_bit);
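
/*
 * Illustrative example: with EXTENT_UPTODATE set on [0, 4095] but not
 * on [4096, 8191], test_range_bit(tree, 0, 8191, EXTENT_UPTODATE, 1)
 * returns 0 because 'filled' requires the whole range to be covered,
 * while the same call with 'filled' == 0 returns 1 because at least
 * one extent in the range has the bit set.
 */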
1213
1214 /*
1215  * helper function to set a given page up to date if all the
1216  * extents in the tree for that page are up to date
1217  */
1218 static int check_page_uptodate(struct extent_io_tree *tree,
1219                                struct page *page)
1220 {
1221         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1222         u64 end = start + PAGE_CACHE_SIZE - 1;
1223         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1224                 SetPageUptodate(page);
1225         return 0;
1226 }
1227
1228 /*
1229  * helper function to unlock a page if all the extents in the tree
1230  * for that page are unlocked
1231  */
1232 static int check_page_locked(struct extent_io_tree *tree,
1233                              struct page *page)
1234 {
1235         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1236         u64 end = start + PAGE_CACHE_SIZE - 1;
1237         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1238                 unlock_page(page);
1239         return 0;
1240 }
1241
1242 /*
1243  * helper function to end page writeback if all the extents
1244  * in the tree for that page are done with writeback
1245  */
1246 static int check_page_writeback(struct extent_io_tree *tree,
1247                              struct page *page)
1248 {
1249         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1250         u64 end = start + PAGE_CACHE_SIZE - 1;
1251         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1252                 end_page_writeback(page);
1253         return 0;
1254 }
1255
1256 /* lots and lots of room for performance fixes in the end_bio funcs */
1257
1258 /*
1259  * after a writepage IO is done, we need to:
1260  * clear the uptodate bits on error
1261  * clear the writeback bits in the extent tree for this IO
1262  * end_page_writeback if the page has no more pending IO
1263  *
1264  * Scheduling is not allowed, so the extent state tree is expected
1265  * to have one and only one object corresponding to this IO.
1266  */
1267 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1268 static void end_bio_extent_writepage(struct bio *bio, int err)
1269 #else
1270 static int end_bio_extent_writepage(struct bio *bio,
1271                                    unsigned int bytes_done, int err)
1272 #endif
1273 {
1274         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1275         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1276         struct extent_state *state = bio->bi_private;
1277         struct extent_io_tree *tree = state->tree;
1278         struct rb_node *node;
1279         u64 start;
1280         u64 end;
1281         u64 cur;
1282         int whole_page;
1283         unsigned long flags;
1284
1285 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1286         if (bio->bi_size)
1287                 return 1;
1288 #endif
1289         do {
1290                 struct page *page = bvec->bv_page;
1291                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1292                          bvec->bv_offset;
1293                 end = start + bvec->bv_len - 1;
1294
1295                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1296                         whole_page = 1;
1297                 else
1298                         whole_page = 0;
1299
1300                 if (--bvec >= bio->bi_io_vec)
1301                         prefetchw(&bvec->bv_page->flags);
1302
1303                 if (!uptodate) {
1304                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1305                         ClearPageUptodate(page);
1306                         SetPageError(page);
1307                 }
1308
1309                 if (tree->ops && tree->ops->writepage_end_io_hook) {
1310                         tree->ops->writepage_end_io_hook(page, start, end,
1311                                                          state);
1312                 }
1313
1314                 /*
1315                  * bios can get merged in funny ways, and so we need to
1316                  * be careful with the state variable.  We know the
1317                  * state won't be merged with others because it has
1318                  * WRITEBACK set, but we can't be sure each biovec is
1319                  * sequential in the file.  So, if our cached state
1320                  * doesn't match the expected end, search the tree
1321                  * for the correct one.
1322                  */
1323
1324                 spin_lock_irqsave(&tree->lock, flags);
1325                 if (!state || state->end != end) {
1326                         state = NULL;
1327                         node = __tree_search(&tree->state, start, NULL, NULL);
1328                         if (node) {
1329                                 state = rb_entry(node, struct extent_state,
1330                                                  rb_node);
1331                                 if (state->end != end ||
1332                                     !(state->state & EXTENT_WRITEBACK))
1333                                         state = NULL;
1334                         }
1335                         if (!state) {
1336                                 spin_unlock_irqrestore(&tree->lock, flags);
1337                                 clear_extent_writeback(tree, start,
1338                                                        end, GFP_ATOMIC);
1339                                 goto next_io;
1340                         }
1341                 }
1342                 cur = end;
1343                 while(1) {
1344                         struct extent_state *clear = state;
1345                         cur = state->start;
1346                         node = rb_prev(&state->rb_node);
1347                         if (node) {
1348                                 state = rb_entry(node,
1349                                                  struct extent_state,
1350                                                  rb_node);
1351                         } else {
1352                                 state = NULL;
1353                         }
1354
1355                         clear_state_bit(tree, clear, EXTENT_WRITEBACK,
1356                                         1, 0);
1357                         if (cur == start)
1358                                 break;
1359                         if (cur < start) {
1360                                 WARN_ON(1);
1361                                 break;
1362                         }
1363                         if (!node)
1364                                 break;
1365                 }
1366                 /* before releasing the lock, make sure the next state
1367                  * variable has the expected bits set and corresponds
1368                  * to the correct offsets in the file
1369                  */
1370                 if (state && (state->end + 1 != start ||
1371                     !(state->state & EXTENT_WRITEBACK))) {
1372                         state = NULL;
1373                 }
1374                 spin_unlock_irqrestore(&tree->lock, flags);
1375 next_io:
1376
1377                 if (whole_page)
1378                         end_page_writeback(page);
1379                 else
1380                         check_page_writeback(tree, page);
1381         } while (bvec >= bio->bi_io_vec);
1382         bio_put(bio);
1383 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1384         return 0;
1385 #endif
1386 }
1387
1388 /*
1389  * after a readpage IO is done, we need to:
1390  * clear the uptodate bits on error
1391  * set the uptodate bits if things worked
1392  * set the page up to date if all extents in the tree are uptodate
1393  * clear the lock bit in the extent tree
1394  * unlock the page if there are no other extents locked for it
1395  *
1396  * Scheduling is not allowed, so the extent state tree is expected
1397  * to have one and only one object corresponding to this IO.
1398  */
1399 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1400 static void end_bio_extent_readpage(struct bio *bio, int err)
1401 #else
1402 static int end_bio_extent_readpage(struct bio *bio,
1403                                    unsigned int bytes_done, int err)
1404 #endif
1405 {
1406         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1407         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1408         struct extent_state *state = bio->bi_private;
1409         struct extent_io_tree *tree = state->tree;
1410         struct rb_node *node;
1411         u64 start;
1412         u64 end;
1413         u64 cur;
1414         unsigned long flags;
1415         int whole_page;
1416         int ret;
1417
1418 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1419         if (bio->bi_size)
1420                 return 1;
1421 #endif
1422
1423         do {
1424                 struct page *page = bvec->bv_page;
1425                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1426                         bvec->bv_offset;
1427                 end = start + bvec->bv_len - 1;
1428
1429                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1430                         whole_page = 1;
1431                 else
1432                         whole_page = 0;
1433
1434                 if (--bvec >= bio->bi_io_vec)
1435                         prefetchw(&bvec->bv_page->flags);
1436
1437                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1438                         ret = tree->ops->readpage_end_io_hook(page, start, end,
1439                                                               state);
1440                         if (ret)
1441                                 uptodate = 0;
1442                 }
1443
1444                 spin_lock_irqsave(&tree->lock, flags);
1445                 if (!state || state->end != end) {
1446                         state = NULL;
1447                         node = __tree_search(&tree->state, start, NULL, NULL);
1448                         if (node) {
1449                                 state = rb_entry(node, struct extent_state,
1450                                                  rb_node);
1451                                 if (state->end != end ||
1452                                     !(state->state & EXTENT_LOCKED))
1453                                         state = NULL;
1454                         }
1455                         if (!state) {
1456                                 spin_unlock_irqrestore(&tree->lock, flags);
1457                                 set_extent_uptodate(tree, start, end,
1458                                                     GFP_ATOMIC);
1459                                 unlock_extent(tree, start, end, GFP_ATOMIC);
1460                                 goto next_io;
1461                         }
1462                 }
1463
1464                 cur = end;
1465                 while(1) {
1466                         struct extent_state *clear = state;
1467                         cur = state->start;
1468                         node = rb_prev(&state->rb_node);
1469                         if (node) {
1470                                 state = rb_entry(node,
1471                                          struct extent_state,
1472                                          rb_node);
1473                         } else {
1474                                 state = NULL;
1475                         }
1476                         clear->state |= EXTENT_UPTODATE;
1477                         clear_state_bit(tree, clear, EXTENT_LOCKED,
1478                                         1, 0);
1479                         if (cur == start)
1480                                 break;
1481                         if (cur < start) {
1482                                 WARN_ON(1);
1483                                 break;
1484                         }
1485                         if (!node)
1486                                 break;
1487                 }
1488                 /* before releasing the lock, make sure the next state
1489                  * variable has the expected bits set and corresponds
1490                  * to the correct offsets in the file
1491                  */
1492                 if (state && (state->end + 1 != start ||
1493                     !(state->state & EXTENT_LOCKED))) {
1494                         state = NULL;
1495                 }
1496                 spin_unlock_irqrestore(&tree->lock, flags);
1497 next_io:
1498                 if (whole_page) {
1499                         if (uptodate) {
1500                                 SetPageUptodate(page);
1501                         } else {
1502                                 ClearPageUptodate(page);
1503                                 SetPageError(page);
1504                         }
1505                         unlock_page(page);
1506                 } else {
1507                         if (uptodate) {
1508                                 check_page_uptodate(tree, page);
1509                         } else {
1510                                 ClearPageUptodate(page);
1511                                 SetPageError(page);
1512                         }
1513                         check_page_locked(tree, page);
1514                 }
1515         } while (bvec >= bio->bi_io_vec);
1516
1517         bio_put(bio);
1518 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1519         return 0;
1520 #endif
1521 }
1522
1523 /*
1524  * IO done from prepare_write is pretty simple: we just unlock
1525  * the structs in the extent tree when done, and set the uptodate bits
1526  * as appropriate.
1527  */
1528 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1529 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1530 #else
1531 static int end_bio_extent_preparewrite(struct bio *bio,
1532                                        unsigned int bytes_done, int err)
1533 #endif
1534 {
1535         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1536         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1537         struct extent_state *state = bio->bi_private;
1538         struct extent_io_tree *tree = state->tree;
1539         u64 start;
1540         u64 end;
1541
1542 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1543         if (bio->bi_size)
1544                 return 1;
1545 #endif
1546
1547         do {
1548                 struct page *page = bvec->bv_page;
1549                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1550                         bvec->bv_offset;
1551                 end = start + bvec->bv_len - 1;
1552
1553                 if (--bvec >= bio->bi_io_vec)
1554                         prefetchw(&bvec->bv_page->flags);
1555
1556                 if (uptodate) {
1557                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1558                 } else {
1559                         ClearPageUptodate(page);
1560                         SetPageError(page);
1561                 }
1562
1563                 unlock_extent(tree, start, end, GFP_ATOMIC);
1564
1565         } while (bvec >= bio->bi_io_vec);
1566
1567         bio_put(bio);
1568 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1569         return 0;
1570 #endif
1571 }
1572
1573 static struct bio *
1574 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1575                  gfp_t gfp_flags)
1576 {
1577         struct bio *bio;
1578
1579         bio = bio_alloc(gfp_flags, nr_vecs);
1580
1581         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1582                 while (!bio && (nr_vecs /= 2))
1583                         bio = bio_alloc(gfp_flags, nr_vecs);
1584         }
1585
1586         if (bio) {
1587                 bio->bi_bdev = bdev;
1588                 bio->bi_sector = first_sector;
1589         }
1590         return bio;
1591 }
1592
1593 static int submit_one_bio(int rw, struct bio *bio)
1594 {
1595         u64 maxsector;
1596         int ret = 0;
1597         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1598         struct page *page = bvec->bv_page;
1599         struct extent_io_tree *tree = bio->bi_private;
1600         struct rb_node *node;
1601         struct extent_state *state;
1602         u64 start;
1603         u64 end;
1604
1605         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1606         end = start + bvec->bv_len - 1;
1607
1608         spin_lock_irq(&tree->lock);
1609         node = __tree_search(&tree->state, start, NULL, NULL);
1610         BUG_ON(!node);
1611         state = rb_entry(node, struct extent_state, rb_node);
1612         while(state->end < end) {
1613                 node = rb_next(node);
1614                 state = rb_entry(node, struct extent_state, rb_node);
1615         }
1616         BUG_ON(state->end != end);
1617         spin_unlock_irq(&tree->lock);
1618
1619         bio->bi_private = state;
1620
1621         bio_get(bio);
1622
1623         maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1624         if (maxsector < bio->bi_sector) {
1625                 printk("sector too large max %Lu got %llu\n", maxsector,
1626                         (unsigned long long)bio->bi_sector);
1627                 WARN_ON(1);
1628         }
1629
1630         submit_bio(rw, bio);
1631         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1632                 ret = -EOPNOTSUPP;
1633         bio_put(bio);
1634         return ret;
1635 }
1636
1637 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1638                               struct page *page, sector_t sector,
1639                               size_t size, unsigned long offset,
1640                               struct block_device *bdev,
1641                               struct bio **bio_ret,
1642                               unsigned long max_pages,
1643                               bio_end_io_t end_io_func)
1644 {
1645         int ret = 0;
1646         struct bio *bio;
1647         int nr;
1648
1649         if (bio_ret && *bio_ret) {
1650                 bio = *bio_ret;
1651                 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1652                     bio_add_page(bio, page, size, offset) < size) {
1653                         ret = submit_one_bio(rw, bio);
1654                         bio = NULL;
1655                 } else {
1656                         return 0;
1657                 }
1658         }
1659         nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
1660         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1661         if (!bio) {
1662                 printk("failed to allocate bio nr %d\n", nr);
1663                 return -ENOMEM;
1664         }
1665
1666         bio_add_page(bio, page, size, offset);
1667         bio->bi_end_io = end_io_func;
1668         bio->bi_private = tree;
1669
1670         if (bio_ret) {
1671                 *bio_ret = bio;
1672         } else {
1673                 ret = submit_one_bio(rw, bio);
1674         }
1675
1676         return ret;
1677 }
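
/*
 * Note on the bio batching done by submit_extent_page(): callers that read or
 * write several contiguous pages keep one struct bio pointer alive across
 * calls and only force submission at the end.  A minimal sketch of that
 * calling pattern (illustrative only; example_io_one_page() is a hypothetical
 * helper, not part of this file):
 *
 *        struct bio *bio = NULL;
 *
 *        for (i = 0; i < nr_pages; i++)
 *                example_io_one_page(tree, pages[i], &bio);
 *        if (bio)
 *                submit_one_bio(READ, bio);
 *
 * Each call either adds the page to *bio (when the sectors are contiguous and
 * bio_add_page() succeeds) or submits the old bio and starts a new one.
 */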
1678
1679 void set_page_extent_mapped(struct page *page)
1680 {
1681         if (!PagePrivate(page)) {
1682                 SetPagePrivate(page);
1683                 WARN_ON(!page->mapping->a_ops->invalidatepage);
1684                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1685                 page_cache_get(page);
1686         }
1687 }
1688
1689 void set_page_extent_head(struct page *page, unsigned long len)
1690 {
1691         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1692 }
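
/*
 * Sketch of the page->private encoding used by the two helpers above (a
 * reading of the code, not a formal contract, and it assumes the private
 * flag values occupy only the low two bits): plain extent pages carry
 * EXTENT_PAGE_PRIVATE, while the head page of an extent buffer carries
 * EXTENT_PAGE_PRIVATE_FIRST_PAGE with the buffer length stored shifted left
 * by two.  Recovering the length from a head page could then look like
 * (hypothetical helper):
 *
 *        static inline unsigned long extent_head_len(struct page *page)
 *        {
 *                return page_private(page) >> 2;
 *        }
 */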
1693
1694 /*
1695  * basic readpage implementation.  Locked extent state structs are inserted
1696  * into the tree; they are removed when the IO is done (by the end_io
1697  * handlers)
1698  */
1699 static int __extent_read_full_page(struct extent_io_tree *tree,
1700                                    struct page *page,
1701                                    get_extent_t *get_extent,
1702                                    struct bio **bio)
1703 {
1704         struct inode *inode = page->mapping->host;
1705         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1706         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1707         u64 end;
1708         u64 cur = start;
1709         u64 extent_offset;
1710         u64 last_byte = i_size_read(inode);
1711         u64 block_start;
1712         u64 cur_end;
1713         sector_t sector;
1714         struct extent_map *em;
1715         struct block_device *bdev;
1716         int ret;
1717         int nr = 0;
1718         size_t page_offset = 0;
1719         size_t iosize;
1720         size_t blocksize = inode->i_sb->s_blocksize;
1721
1722         set_page_extent_mapped(page);
1723
1724         end = page_end;
1725         lock_extent(tree, start, end, GFP_NOFS);
1726
1727         while (cur <= end) {
1728                 if (cur >= last_byte) {
1729                         char *userpage;
1730                         iosize = PAGE_CACHE_SIZE - page_offset;
1731                         userpage = kmap_atomic(page, KM_USER0);
1732                         memset(userpage + page_offset, 0, iosize);
1733                         flush_dcache_page(page);
1734                         kunmap_atomic(userpage, KM_USER0);
1735                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1736                                             GFP_NOFS);
1737                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1738                         break;
1739                 }
1740                 em = get_extent(inode, page, page_offset, cur,
1741                                 end - cur + 1, 0);
1742                 if (IS_ERR(em) || !em) {
1743                         SetPageError(page);
1744                         unlock_extent(tree, cur, end, GFP_NOFS);
1745                         break;
1746                 }
1747
1748                 extent_offset = cur - em->start;
1749                 BUG_ON(extent_map_end(em) <= cur);
1750                 BUG_ON(end < cur);
1751
1752                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1753                 cur_end = min(extent_map_end(em) - 1, end);
1754                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1755                 sector = (em->block_start + extent_offset) >> 9;
1756                 bdev = em->bdev;
1757                 block_start = em->block_start;
1758                 free_extent_map(em);
1759                 em = NULL;
1760
1761                 /* we've found a hole, just zero and go on */
1762                 if (block_start == EXTENT_MAP_HOLE) {
1763                         char *userpage;
1764                         userpage = kmap_atomic(page, KM_USER0);
1765                         memset(userpage + page_offset, 0, iosize);
1766                         flush_dcache_page(page);
1767                         kunmap_atomic(userpage, KM_USER0);
1768
1769                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1770                                             GFP_NOFS);
1771                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1772                         cur = cur + iosize;
1773                         page_offset += iosize;
1774                         continue;
1775                 }
1776                 /* the get_extent function already copied into the page */
1777                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1778                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1779                         cur = cur + iosize;
1780                         page_offset += iosize;
1781                         continue;
1782                 }
1783                 /* we have an inline extent but it didn't get marked up
1784                  * to date.  Error out
1785                  */
1786                 if (block_start == EXTENT_MAP_INLINE) {
1787                         SetPageError(page);
1788                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1789                         cur = cur + iosize;
1790                         page_offset += iosize;
1791                         continue;
1792                 }
1793
1794                 ret = 0;
1795                 if (tree->ops && tree->ops->readpage_io_hook) {
1796                         ret = tree->ops->readpage_io_hook(page, cur,
1797                                                           cur + iosize - 1);
1798                 }
1799                 if (!ret) {
1800                         unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1801                         nr -= page->index;
1802                         ret = submit_extent_page(READ, tree, page,
1803                                          sector, iosize, page_offset,
1804                                          bdev, bio, nr,
1805                                          end_bio_extent_readpage);
1806                 }
1807                 if (ret)
1808                         SetPageError(page);
1809                 cur = cur + iosize;
1810                 page_offset += iosize;
1811                 nr++;
1812         }
1813         if (!nr) {
1814                 if (!PageError(page))
1815                         SetPageUptodate(page);
1816                 unlock_page(page);
1817         }
1818         return 0;
1819 }
1820
1821 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1822                             get_extent_t *get_extent)
1823 {
1824         struct bio *bio = NULL;
1825         int ret;
1826
1827         ret = __extent_read_full_page(tree, page, get_extent, &bio);
1828         if (bio)
1829                 submit_one_bio(READ, bio);
1830         return ret;
1831 }
1832 EXPORT_SYMBOL(extent_read_full_page);
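
/*
 * Illustrative use of extent_read_full_page() (a sketch only; the io tree
 * lookup and the example_* names are assumptions, not part of this file):
 * a filesystem readpage address_space op can be a thin wrapper:
 *
 *        static int example_readpage(struct file *file, struct page *page)
 *        {
 *                struct extent_io_tree *tree = example_page_io_tree(page);
 *
 *                return extent_read_full_page(tree, page, example_get_extent);
 *        }
 */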
1833
1834 /*
1835  * the writepage semantics are similar to regular writepage.  extent
1836  * records are inserted to lock ranges in the tree, and as dirty areas
1837  * are found, they are marked writeback.  Then the lock bits are removed
1838  * and the end_io handler clears the writeback ranges
1839  */
1840 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1841                               void *data)
1842 {
1843         struct inode *inode = page->mapping->host;
1844         struct extent_page_data *epd = data;
1845         struct extent_io_tree *tree = epd->tree;
1846         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1847         u64 delalloc_start;
1848         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1849         u64 end;
1850         u64 cur = start;
1851         u64 extent_offset;
1852         u64 last_byte = i_size_read(inode);
1853         u64 block_start;
1854         u64 iosize;
1855         sector_t sector;
1856         struct extent_map *em;
1857         struct block_device *bdev;
1858         int ret;
1859         int nr = 0;
1860         size_t page_offset = 0;
1861         size_t blocksize;
1862         loff_t i_size = i_size_read(inode);
1863         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1864         u64 nr_delalloc;
1865         u64 delalloc_end;
1866
1867         WARN_ON(!PageLocked(page));
1868         if (page->index > end_index) {
1869                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1870                 unlock_page(page);
1871                 return 0;
1872         }
1873
1874         if (page->index == end_index) {
1875                 char *userpage;
1876
1877                 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1878
1879                 userpage = kmap_atomic(page, KM_USER0);
1880                 memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
1881                 flush_dcache_page(page);
1882                 kunmap_atomic(userpage, KM_USER0);
1883         }
1884
1885         set_page_extent_mapped(page);
1886
1887         delalloc_start = start;
1888         delalloc_end = 0;
1889         while(delalloc_end < page_end) {
1890                 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1891                                                        &delalloc_end,
1892                                                        128 * 1024 * 1024);
1893                 if (nr_delalloc == 0) {
1894                         delalloc_start = delalloc_end + 1;
1895                         continue;
1896                 }
1897                 tree->ops->fill_delalloc(inode, delalloc_start,
1898                                          delalloc_end);
1899                 clear_extent_bit(tree, delalloc_start,
1900                                  delalloc_end,
1901                                  EXTENT_LOCKED | EXTENT_DELALLOC,
1902                                  1, 0, GFP_NOFS);
1903                 delalloc_start = delalloc_end + 1;
1904         }
1905         lock_extent(tree, start, page_end, GFP_NOFS);
1906
1907         end = page_end;
1908         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1909                 printk("found delalloc bits after lock_extent\n");
1910         }
1911
1912         if (last_byte <= start) {
1913                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1914                 goto done;
1915         }
1916
1917         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1918         blocksize = inode->i_sb->s_blocksize;
1919
1920         while (cur <= end) {
1921                 if (cur >= last_byte) {
1922                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1923                         break;
1924                 }
1925                 em = epd->get_extent(inode, page, page_offset, cur,
1926                                      end - cur + 1, 1);
1927                 if (IS_ERR(em) || !em) {
1928                         SetPageError(page);
1929                         break;
1930                 }
1931
1932                 extent_offset = cur - em->start;
1933                 BUG_ON(extent_map_end(em) <= cur);
1934                 BUG_ON(end < cur);
1935                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1936                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1937                 sector = (em->block_start + extent_offset) >> 9;
1938                 bdev = em->bdev;
1939                 block_start = em->block_start;
1940                 free_extent_map(em);
1941                 em = NULL;
1942
1943                 if (block_start == EXTENT_MAP_HOLE ||
1944                     block_start == EXTENT_MAP_INLINE) {
1945                         clear_extent_dirty(tree, cur,
1946                                            cur + iosize - 1, GFP_NOFS);
1947                         cur = cur + iosize;
1948                         page_offset += iosize;
1949                         continue;
1950                 }
1951
1952                 /* leave this out until we have a page_mkwrite call */
1953                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1954                                    EXTENT_DIRTY, 0)) {
1955                         cur = cur + iosize;
1956                         page_offset += iosize;
1957                         continue;
1958                 }
1959                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1960                 if (tree->ops && tree->ops->writepage_io_hook) {
1961                         ret = tree->ops->writepage_io_hook(page, cur,
1962                                                 cur + iosize - 1);
1963                 } else {
1964                         ret = 0;
1965                 }
1966                 if (ret)
1967                         SetPageError(page);
1968                 else {
1969                         unsigned long max_nr = end_index + 1;
1970                         set_range_writeback(tree, cur, cur + iosize - 1);
1971                         if (!PageWriteback(page)) {
1972                                 printk("warning page %lu not writeback, "
1973                                        "cur %llu end %llu\n", page->index,
1974                                        (unsigned long long)cur,
1975                                        (unsigned long long)end);
1976                         }
1977
1978                         ret = submit_extent_page(WRITE, tree, page, sector,
1979                                                  iosize, page_offset, bdev,
1980                                                  &epd->bio, max_nr,
1981                                                  end_bio_extent_writepage);
1982                         if (ret)
1983                                 SetPageError(page);
1984                 }
1985                 cur = cur + iosize;
1986                 page_offset += iosize;
1987                 nr++;
1988         }
1989 done:
1990         if (nr == 0) {
1991                 /* make sure the mapping tag for page dirty gets cleared */
1992                 set_page_writeback(page);
1993                 end_page_writeback(page);
1994         }
1995         unlock_extent(tree, start, page_end, GFP_NOFS);
1996         unlock_page(page);
1997         return 0;
1998 }
1999
2000 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
2001
2002 /* Taken directly from 2.6.23 for 2.6.18 back port */
2003 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
2004                                 void *data);
2005
2006 /**
2007  * write_cache_pages - walk the list of dirty pages of the given address space
2008  * and write all of them.
2009  * @mapping: address space structure to write
2010  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2011  * @writepage: function called for each page
2012  * @data: data passed to writepage function
2013  *
2014  * If a page is already under I/O, write_cache_pages() skips it, even
2015  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2016  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2017  * and msync() need to guarantee that all the data which was dirty at the time
2018  * the call was made get new I/O started against them.  If wbc->sync_mode is
2019  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2020  * existing IO to complete.
2021  */
2022 static int write_cache_pages(struct address_space *mapping,
2023                       struct writeback_control *wbc, writepage_t writepage,
2024                       void *data)
2025 {
2026         struct backing_dev_info *bdi = mapping->backing_dev_info;
2027         int ret = 0;
2028         int done = 0;
2029         struct pagevec pvec;
2030         int nr_pages;
2031         pgoff_t index;
2032         pgoff_t end;            /* Inclusive */
2033         int scanned = 0;
2034         int range_whole = 0;
2035
2036         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2037                 wbc->encountered_congestion = 1;
2038                 return 0;
2039         }
2040
2041         pagevec_init(&pvec, 0);
2042         if (wbc->range_cyclic) {
2043                 index = mapping->writeback_index; /* Start from prev offset */
2044                 end = -1;
2045         } else {
2046                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2047                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2048                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2049                         range_whole = 1;
2050                 scanned = 1;
2051         }
2052 retry:
2053         while (!done && (index <= end) &&
2054                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2055                                               PAGECACHE_TAG_DIRTY,
2056                                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2057                 unsigned i;
2058
2059                 scanned = 1;
2060                 for (i = 0; i < nr_pages; i++) {
2061                         struct page *page = pvec.pages[i];
2062
2063                         /*
2064                          * At this point we hold neither mapping->tree_lock nor
2065                          * lock on the page itself: the page may be truncated or
2066                          * invalidated (changing page->mapping to NULL), or even
2067                          * swizzled back from swapper_space to tmpfs file
2068                          * mapping
2069                          */
2070                         lock_page(page);
2071
2072                         if (unlikely(page->mapping != mapping)) {
2073                                 unlock_page(page);
2074                                 continue;
2075                         }
2076
2077                         if (!wbc->range_cyclic && page->index > end) {
2078                                 done = 1;
2079                                 unlock_page(page);
2080                                 continue;
2081                         }
2082
2083                         if (wbc->sync_mode != WB_SYNC_NONE)
2084                                 wait_on_page_writeback(page);
2085
2086                         if (PageWriteback(page) ||
2087                             !clear_page_dirty_for_io(page)) {
2088                                 unlock_page(page);
2089                                 continue;
2090                         }
2091
2092                         ret = (*writepage)(page, wbc, data);
2093
2094                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2095                                 unlock_page(page);
2096                                 ret = 0;
2097                         }
2098                         if (ret || (--(wbc->nr_to_write) <= 0))
2099                                 done = 1;
2100                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2101                                 wbc->encountered_congestion = 1;
2102                                 done = 1;
2103                         }
2104                 }
2105                 pagevec_release(&pvec);
2106                 cond_resched();
2107         }
2108         if (!scanned && !done) {
2109                 /*
2110                  * We hit the last page and there is more work to be done: wrap
2111                  * back to the start of the file
2112                  */
2113                 scanned = 1;
2114                 index = 0;
2115                 goto retry;
2116         }
2117         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2118                 mapping->writeback_index = index;
2119         return ret;
2120 }
2121 #endif
2122
2123 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2124                           get_extent_t *get_extent,
2125                           struct writeback_control *wbc)
2126 {
2127         int ret;
2128         struct address_space *mapping = page->mapping;
2129         struct extent_page_data epd = {
2130                 .bio = NULL,
2131                 .tree = tree,
2132                 .get_extent = get_extent,
2133         };
2134         struct writeback_control wbc_writepages = {
2135                 .bdi            = wbc->bdi,
2136                 .sync_mode      = WB_SYNC_NONE,
2137                 .older_than_this = NULL,
2138                 .nr_to_write    = 64,
2139                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2140                 .range_end      = (loff_t)-1,
2141         };
2142
2143
2144         ret = __extent_writepage(page, wbc, &epd);
2145
2146         write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
2147         if (epd.bio) {
2148                 submit_one_bio(WRITE, epd.bio);
2149         }
2150         return ret;
2151 }
2152 EXPORT_SYMBOL(extent_write_full_page);
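
/*
 * Illustrative use of extent_write_full_page() (sketch; the example_* names
 * are assumptions): a writepage address_space op just forwards the page, its
 * io tree and the get_extent callback:
 *
 *        static int example_writepage(struct page *page,
 *                                     struct writeback_control *wbc)
 *        {
 *                struct extent_io_tree *tree = example_page_io_tree(page);
 *
 *                return extent_write_full_page(tree, page, example_get_extent,
 *                                              wbc);
 *        }
 */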
2153
2154
2155 int extent_writepages(struct extent_io_tree *tree,
2156                       struct address_space *mapping,
2157                       get_extent_t *get_extent,
2158                       struct writeback_control *wbc)
2159 {
2160         int ret = 0;
2161         struct extent_page_data epd = {
2162                 .bio = NULL,
2163                 .tree = tree,
2164                 .get_extent = get_extent,
2165         };
2166
2167         ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2168         if (epd.bio) {
2169                 submit_one_bio(WRITE, epd.bio);
2170         }
2171         return ret;
2172 }
2173 EXPORT_SYMBOL(extent_writepages);
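
/*
 * Illustrative use of extent_writepages() (sketch; the example_* names are
 * assumptions): a writepages address_space op passes its mapping and wbc
 * straight through so multiple dirty pages can share bios:
 *
 *        static int example_writepages(struct address_space *mapping,
 *                                      struct writeback_control *wbc)
 *        {
 *                struct extent_io_tree *tree = example_mapping_io_tree(mapping);
 *
 *                return extent_writepages(tree, mapping, example_get_extent,
 *                                         wbc);
 *        }
 */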
2174
2175 int extent_readpages(struct extent_io_tree *tree,
2176                      struct address_space *mapping,
2177                      struct list_head *pages, unsigned nr_pages,
2178                      get_extent_t get_extent)
2179 {
2180         struct bio *bio = NULL;
2181         unsigned page_idx;
2182         struct pagevec pvec;
2183
2184         pagevec_init(&pvec, 0);
2185         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2186                 struct page *page = list_entry(pages->prev, struct page, lru);
2187
2188                 prefetchw(&page->flags);
2189                 list_del(&page->lru);
2190                 /*
2191                  * what we want to do here is call add_to_page_cache_lru,
2192                  * but that isn't exported, so we reproduce it here
2193                  */
2194                 if (!add_to_page_cache(page, mapping,
2195                                         page->index, GFP_KERNEL)) {
2196
2197                         /* open coding of lru_cache_add, also not exported */
2198                         page_cache_get(page);
2199                         if (!pagevec_add(&pvec, page))
2200                                 __pagevec_lru_add(&pvec);
2201                         __extent_read_full_page(tree, page, get_extent, &bio);
2202                 }
2203                 page_cache_release(page);
2204         }
2205         if (pagevec_count(&pvec))
2206                 __pagevec_lru_add(&pvec);
2207         BUG_ON(!list_empty(pages));
2208         if (bio)
2209                 submit_one_bio(READ, bio);
2210         return 0;
2211 }
2212 EXPORT_SYMBOL(extent_readpages);
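
/*
 * Illustrative use of extent_readpages() (sketch; the example_* names are
 * assumptions): a readpages address_space op hands the whole page list over
 * so the reads can be batched into shared bios:
 *
 *        static int example_readpages(struct file *file,
 *                                     struct address_space *mapping,
 *                                     struct list_head *pages,
 *                                     unsigned nr_pages)
 *        {
 *                struct extent_io_tree *tree = example_mapping_io_tree(mapping);
 *
 *                return extent_readpages(tree, mapping, pages, nr_pages,
 *                                        example_get_extent);
 *        }
 */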
2213
2214 /*
2215  * basic invalidatepage code; it waits on any locked or writeback
2216  * ranges corresponding to the page, and then deletes any extent state
2217  * records from the tree
2218  */
2219 int extent_invalidatepage(struct extent_io_tree *tree,
2220                           struct page *page, unsigned long offset)
2221 {
2222         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2223         u64 end = start + PAGE_CACHE_SIZE - 1;
2224         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2225
2226         start += (offset + blocksize - 1) & ~(blocksize - 1);
2227         if (start > end)
2228                 return 0;
2229
2230         lock_extent(tree, start, end, GFP_NOFS);
2231         wait_on_extent_writeback(tree, start, end);
2232         clear_extent_bit(tree, start, end,
2233                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2234                          1, 1, GFP_NOFS);
2235         return 0;
2236 }
2237 EXPORT_SYMBOL(extent_invalidatepage);
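
/*
 * Illustrative use of extent_invalidatepage() (sketch; the example_* names
 * are assumptions): an invalidatepage address_space op forwards the partial
 * truncation offset:
 *
 *        static void example_invalidatepage(struct page *page,
 *                                           unsigned long offset)
 *        {
 *                struct extent_io_tree *tree = example_page_io_tree(page);
 *
 *                extent_invalidatepage(tree, page, offset);
 *        }
 */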
2238
2239 /*
2240  * simple commit_write call; the page is marked dirty and i_size is
2241  * updated when the write extends the file
2242  */
2243 int extent_commit_write(struct extent_io_tree *tree,
2244                         struct inode *inode, struct page *page,
2245                         unsigned from, unsigned to)
2246 {
2247         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2248
2249         set_page_extent_mapped(page);
2250         set_page_dirty(page);
2251
2252         if (pos > inode->i_size) {
2253                 i_size_write(inode, pos);
2254                 mark_inode_dirty(inode);
2255         }
2256         return 0;
2257 }
2258 EXPORT_SYMBOL(extent_commit_write);
2259
2260 int extent_prepare_write(struct extent_io_tree *tree,
2261                          struct inode *inode, struct page *page,
2262                          unsigned from, unsigned to, get_extent_t *get_extent)
2263 {
2264         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2265         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2266         u64 block_start;
2267         u64 orig_block_start;
2268         u64 block_end;
2269         u64 cur_end;
2270         struct extent_map *em;
2271         unsigned blocksize = 1 << inode->i_blkbits;
2272         size_t page_offset = 0;
2273         size_t block_off_start;
2274         size_t block_off_end;
2275         int err = 0;
2276         int iocount = 0;
2277         int ret = 0;
2278         int isnew;
2279
2280         set_page_extent_mapped(page);
2281
2282         block_start = (page_start + from) & ~((u64)blocksize - 1);
2283         block_end = (page_start + to - 1) | (blocksize - 1);
2284         orig_block_start = block_start;
2285
2286         lock_extent(tree, page_start, page_end, GFP_NOFS);
2287         while(block_start <= block_end) {
2288                 em = get_extent(inode, page, page_offset, block_start,
2289                                 block_end - block_start + 1, 1);
2290                 if (IS_ERR(em) || !em) {
2291                         goto err;
2292                 }
2293                 cur_end = min(block_end, extent_map_end(em) - 1);
2294                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2295                 block_off_end = block_off_start + blocksize;
2296                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2297
2298                 if (!PageUptodate(page) && isnew &&
2299                     (block_off_end > to || block_off_start < from)) {
2300                         void *kaddr;
2301
2302                         kaddr = kmap_atomic(page, KM_USER0);
2303                         if (block_off_end > to)
2304                                 memset(kaddr + to, 0, block_off_end - to);
2305                         if (block_off_start < from)
2306                                 memset(kaddr + block_off_start, 0,
2307                                        from - block_off_start);
2308                         flush_dcache_page(page);
2309                         kunmap_atomic(kaddr, KM_USER0);
2310                 }
2311                 if ((em->block_start != EXTENT_MAP_HOLE &&
2312                      em->block_start != EXTENT_MAP_INLINE) &&
2313                     !isnew && !PageUptodate(page) &&
2314                     (block_off_end > to || block_off_start < from) &&
2315                     !test_range_bit(tree, block_start, cur_end,
2316                                     EXTENT_UPTODATE, 1)) {
2317                         u64 sector;
2318                         u64 extent_offset = block_start - em->start;
2319                         size_t iosize;
2320                         sector = (em->block_start + extent_offset) >> 9;
2321                         iosize = (cur_end - block_start + blocksize) &
2322                                 ~((u64)blocksize - 1);
2323                         /*
2324                          * we've already got the extent locked, but we
2325                          * need to split the state such that our end_bio
2326                          * handler can clear the lock.
2327                          */
2328                         set_extent_bit(tree, block_start,
2329                                        block_start + iosize - 1,
2330                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2331                         ret = submit_extent_page(READ, tree, page,
2332                                          sector, iosize, page_offset, em->bdev,
2333                                          NULL, 1,
2334                                          end_bio_extent_preparewrite);
2335                         iocount++;
2336                         block_start = block_start + iosize;
2337                 } else {
2338                         set_extent_uptodate(tree, block_start, cur_end,
2339                                             GFP_NOFS);
2340                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2341                         block_start = cur_end + 1;
2342                 }
2343                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2344                 free_extent_map(em);
2345         }
2346         if (iocount) {
2347                 wait_extent_bit(tree, orig_block_start,
2348                                 block_end, EXTENT_LOCKED);
2349         }
2350         check_page_uptodate(tree, page);
2351 err:
2352         /* FIXME, zero out newly allocated blocks on error */
2353         return err;
2354 }
2355 EXPORT_SYMBOL(extent_prepare_write);
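
/*
 * Illustrative pairing of extent_prepare_write() and extent_commit_write()
 * (sketch; the example_* names are assumptions): the prepare_write and
 * commit_write address_space ops wrap them with the inode taken from the
 * page mapping:
 *
 *        static int example_prepare_write(struct file *file, struct page *page,
 *                                         unsigned from, unsigned to)
 *        {
 *                struct inode *inode = page->mapping->host;
 *                struct extent_io_tree *tree = example_page_io_tree(page);
 *
 *                return extent_prepare_write(tree, inode, page, from, to,
 *                                            example_get_extent);
 *        }
 *
 *        static int example_commit_write(struct file *file, struct page *page,
 *                                        unsigned from, unsigned to)
 *        {
 *                struct inode *inode = page->mapping->host;
 *                struct extent_io_tree *tree = example_page_io_tree(page);
 *
 *                return extent_commit_write(tree, inode, page, from, to);
 *        }
 */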
2356
2357 /*
2358  * a helper for releasepage.  As long as there are no locked extents
2359  * in the range corresponding to the page, both state records and extent
2360  * map records are removed
2361  */
2362 int try_release_extent_mapping(struct extent_map_tree *map,
2363                                struct extent_io_tree *tree, struct page *page,
2364                                gfp_t mask)
2365 {
2366         struct extent_map *em;
2367         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2368         u64 end = start + PAGE_CACHE_SIZE - 1;
2369         u64 orig_start = start;
2370         int ret = 1;
2371
2372         if ((mask & __GFP_WAIT) &&
2373             page->mapping->host->i_size > 16 * 1024 * 1024) {
2374                 while (start <= end) {
2375                         spin_lock(&map->lock);
2376                         em = lookup_extent_mapping(map, start, end);
2377                         if (!em || IS_ERR(em)) {
2378                                 spin_unlock(&map->lock);
2379                                 break;
2380                         }
2381                         if (em->start != start) {
2382                                 spin_unlock(&map->lock);
2383                                 free_extent_map(em);
2384                                 break;
2385                         }
2386                         if (!test_range_bit(tree, em->start,
2387                                             extent_map_end(em) - 1,
2388                                             EXTENT_LOCKED, 0)) {
2389                                 remove_extent_mapping(map, em);
2390                                 /* once for the rb tree */
2391                                 free_extent_map(em);
2392                         }
2393                         start = extent_map_end(em);
2394                         spin_unlock(&map->lock);
2395
2396                         /* once for us */
2397                         free_extent_map(em);
2398                 }
2399         }
2400         if (test_range_bit(tree, orig_start, end, EXTENT_IOBITS, 0))
2401                 ret = 0;
2402         else {
2403                 if ((mask & GFP_NOFS) == GFP_NOFS)
2404                         mask = GFP_NOFS;
2405                 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
2406                                  1, 1, mask);
2407         }
2408         return ret;
2409 }
2410 EXPORT_SYMBOL(try_release_extent_mapping);
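
/*
 * Illustrative use of try_release_extent_mapping() (sketch; the example_*
 * names are assumptions): a releasepage address_space op asks both the state
 * tree and the extent map tree whether the page can go away:
 *
 *        static int example_releasepage(struct page *page, gfp_t gfp_flags)
 *        {
 *                struct extent_io_tree *tree = example_page_io_tree(page);
 *                struct extent_map_tree *map = example_page_map_tree(page);
 *
 *                return try_release_extent_mapping(map, tree, page, gfp_flags);
 *        }
 */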
2411
2412 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2413                 get_extent_t *get_extent)
2414 {
2415         struct inode *inode = mapping->host;
2416         u64 start = iblock << inode->i_blkbits;
2417         sector_t sector = 0;
2418         struct extent_map *em;
2419
2420         em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2421         if (!em || IS_ERR(em))
2422                 return 0;
2423
2424         if (em->block_start == EXTENT_MAP_INLINE ||
2425             em->block_start == EXTENT_MAP_HOLE)
2426                 goto out;
2427
2428         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2429 printk("bmap finds %Lu %Lu block %Lu\n", em->start, em->len, em->block_start);
2430 out:
2431         free_extent_map(em);
2432         return sector;
2433 }
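
/*
 * Illustrative use of extent_bmap() (sketch; example_get_extent is an
 * assumption): a bmap address_space op is a direct passthrough:
 *
 *        static sector_t example_bmap(struct address_space *mapping,
 *                                     sector_t block)
 *        {
 *                return extent_bmap(mapping, block, example_get_extent);
 *        }
 */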
2434
2435 static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
2436 {
2437         if (list_empty(&eb->lru)) {
2438                 extent_buffer_get(eb);
2439                 list_add(&eb->lru, &tree->buffer_lru);
2440                 tree->lru_size++;
2441                 if (tree->lru_size >= BUFFER_LRU_MAX) {
2442                         struct extent_buffer *rm;
2443                         rm = list_entry(tree->buffer_lru.prev,
2444                                         struct extent_buffer, lru);
2445                         tree->lru_size--;
2446                         list_del_init(&rm->lru);
2447                         free_extent_buffer(rm);
2448                 }
2449         } else
2450                 list_move(&eb->lru, &tree->buffer_lru);
2451         return 0;
2452 }
2453 static struct extent_buffer *find_lru(struct extent_io_tree *tree,
2454                                       u64 start, unsigned long len)
2455 {
2456         struct list_head *lru = &tree->buffer_lru;
2457         struct list_head *cur = lru->next;
2458         struct extent_buffer *eb;
2459
2460         if (list_empty(lru))
2461                 return NULL;
2462
2463         do {
2464                 eb = list_entry(cur, struct extent_buffer, lru);
2465                 if (eb->start == start && eb->len == len) {
2466                         extent_buffer_get(eb);
2467                         return eb;
2468                 }
2469                 cur = cur->next;
2470         } while (cur != lru);
2471         return NULL;
2472 }
2473
2474 static inline unsigned long num_extent_pages(u64 start, u64 len)
2475 {
2476         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2477                 (start >> PAGE_CACHE_SHIFT);
2478 }
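
/*
 * Worked example for num_extent_pages(), assuming 4K pages: a buffer with
 * start == 12288 and len == 16384 ends at byte 28671, so it touches page
 * indexes 3 through 6 and the helper returns
 * ((12288 + 16384 + 4095) >> 12) - (12288 >> 12) == 7 - 3 == 4.
 */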
2479
2480 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2481                                               unsigned long i)
2482 {
2483         struct page *p;
2484         struct address_space *mapping;
2485
2486         if (i == 0)
2487                 return eb->first_page;
2488         i += eb->start >> PAGE_CACHE_SHIFT;
2489         mapping = eb->first_page->mapping;
2490         read_lock_irq(&mapping->tree_lock);
2491         p = radix_tree_lookup(&mapping->page_tree, i);
2492         read_unlock_irq(&mapping->tree_lock);
2493         return p;
2494 }
2495
2496 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2497                                                    u64 start,
2498                                                    unsigned long len,
2499                                                    gfp_t mask)
2500 {
2501         struct extent_buffer *eb = NULL;
2502
2503         spin_lock(&tree->lru_lock);
2504         eb = find_lru(tree, start, len);
2505         spin_unlock(&tree->lru_lock);
2506         if (eb)
2507                 return eb;
2508         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2509         if (!eb)
2510                 return NULL;
2511         INIT_LIST_HEAD(&eb->lru);
2512         eb->start = start;
2513         eb->len = len;
2514         atomic_set(&eb->refs, 1);
2515
2516         return eb;
2517 }
2518
2519 static void __free_extent_buffer(struct extent_buffer *eb)
2520 {
2521         kmem_cache_free(extent_buffer_cache, eb);
2522 }
2523
2524 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2525                                           u64 start, unsigned long len,
2526                                           struct page *page0,
2527                                           gfp_t mask)
2528 {
2529         unsigned long num_pages = num_extent_pages(start, len);
2530         unsigned long i;
2531         unsigned long index = start >> PAGE_CACHE_SHIFT;
2532         struct extent_buffer *eb;
2533         struct page *p;
2534         struct address_space *mapping = tree->mapping;
2535         int uptodate = 1;
2536
2537         eb = __alloc_extent_buffer(tree, start, len, mask);
2538         if (!eb || IS_ERR(eb))
2539                 return NULL;
2540
2541         if (eb->flags & EXTENT_BUFFER_FILLED)
2542                 goto lru_add;
2543
2544         if (page0) {
2545                 eb->first_page = page0;
2546                 i = 1;
2547                 index++;
2548                 page_cache_get(page0);
2549                 mark_page_accessed(page0);
2550                 set_page_extent_mapped(page0);
2551                 WARN_ON(!PageUptodate(page0));
2552                 set_page_extent_head(page0, len);
2553         } else {
2554                 i = 0;
2555         }
2556         for (; i < num_pages; i++, index++) {
2557                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2558                 if (!p) {
2559                         WARN_ON(1);
2560                         goto fail;
2561                 }
2562                 set_page_extent_mapped(p);
2563                 mark_page_accessed(p);
2564                 if (i == 0) {
2565                         eb->first_page = p;
2566                         set_page_extent_head(p, len);
2567                 } else {
2568                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2569                 }
2570                 if (!PageUptodate(p))
2571                         uptodate = 0;
2572                 unlock_page(p);
2573         }
2574         if (uptodate)
2575                 eb->flags |= EXTENT_UPTODATE;
2576         eb->flags |= EXTENT_BUFFER_FILLED;
2577
2578 lru_add:
2579         spin_lock(&tree->lru_lock);
2580         add_lru(tree, eb);
2581         spin_unlock(&tree->lru_lock);
2582         return eb;
2583
2584 fail:
2585         spin_lock(&tree->lru_lock);
2586         list_del_init(&eb->lru);
2587         spin_unlock(&tree->lru_lock);
2588         if (!atomic_dec_and_test(&eb->refs))
2589                 return NULL;
2590         for (index = 1; index < i; index++) {
2591                 page_cache_release(extent_buffer_page(eb, index));
2592         }
2593         if (i > 0)
2594                 page_cache_release(extent_buffer_page(eb, 0));
2595         __free_extent_buffer(eb);
2596         return NULL;
2597 }
2598 EXPORT_SYMBOL(alloc_extent_buffer);
2599
2600 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2601                                          u64 start, unsigned long len,
2602                                           gfp_t mask)
2603 {
2604         unsigned long num_pages = num_extent_pages(start, len);
2605         unsigned long i;
2606         unsigned long index = start >> PAGE_CACHE_SHIFT;
2607         struct extent_buffer *eb;
2608         struct page *p;
2609         struct address_space *mapping = tree->mapping;
2610         int uptodate = 1;
2611
2612         eb = __alloc_extent_buffer(tree, start, len, mask);
2613         if (!eb || IS_ERR(eb))
2614                 return NULL;
2615
2616         if (eb->flags & EXTENT_BUFFER_FILLED)
2617                 goto lru_add;
2618
2619         for (i = 0; i < num_pages; i++, index++) {
2620                 p = find_lock_page(mapping, index);
2621                 if (!p) {
2622                         goto fail;
2623                 }
2624                 set_page_extent_mapped(p);
2625                 mark_page_accessed(p);
2626
2627                 if (i == 0) {
2628                         eb->first_page = p;
2629                         set_page_extent_head(p, len);
2630                 } else {
2631                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2632                 }
2633
2634                 if (!PageUptodate(p))
2635                         uptodate = 0;
2636                 unlock_page(p);
2637         }
2638         if (uptodate)
2639                 eb->flags |= EXTENT_UPTODATE;
2640         eb->flags |= EXTENT_BUFFER_FILLED;
2641
2642 lru_add:
2643         spin_lock(&tree->lru_lock);
2644         add_lru(tree, eb);
2645         spin_unlock(&tree->lru_lock);
2646         return eb;
2647 fail:
2648         spin_lock(&tree->lru_lock);
2649         list_del_init(&eb->lru);
2650         spin_unlock(&tree->lru_lock);
2651         if (!atomic_dec_and_test(&eb->refs))
2652                 return NULL;
2653         for (index = 1; index < i; index++) {
2654                 page_cache_release(extent_buffer_page(eb, index));
2655         }
2656         if (i > 0)
2657                 page_cache_release(extent_buffer_page(eb, 0));
2658         __free_extent_buffer(eb);
2659         return NULL;
2660 }
2661 EXPORT_SYMBOL(find_extent_buffer);
2662
2663 void free_extent_buffer(struct extent_buffer *eb)
2664 {
2665         unsigned long i;
2666         unsigned long num_pages;
2667
2668         if (!eb)
2669                 return;
2670
2671         if (!atomic_dec_and_test(&eb->refs))
2672                 return;
2673
2674         WARN_ON(!list_empty(&eb->lru));
2675         num_pages = num_extent_pages(eb->start, eb->len);
2676
2677         for (i = 1; i < num_pages; i++) {
2678                 page_cache_release(extent_buffer_page(eb, i));
2679         }
2680         page_cache_release(extent_buffer_page(eb, 0));
2681         __free_extent_buffer(eb);
2682 }
2683 EXPORT_SYMBOL(free_extent_buffer);
2684
2685 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2686                               struct extent_buffer *eb)
2687 {
2688         int set;
2689         unsigned long i;
2690         unsigned long num_pages;
2691         struct page *page;
2692
2693         u64 start = eb->start;
2694         u64 end = start + eb->len - 1;
2695
2696         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2697         num_pages = num_extent_pages(eb->start, eb->len);
2698
2699         for (i = 0; i < num_pages; i++) {
2700                 page = extent_buffer_page(eb, i);
2701                 lock_page(page);
2702                 if (i == 0)
2703                         set_page_extent_head(page, eb->len);
2704                 else
2705                         set_page_private(page, EXTENT_PAGE_PRIVATE);
2706
2707                 /*
2708                  * if we're on the last page or the first page and the
2709                  * block isn't aligned on a page boundary, do extra checks
2710                  * to make sure we don't clean page that is partially dirty
2711                  * to make sure we don't clean a page that is partially dirty
2712                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2713                     ((i == num_pages - 1) &&
2714                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2715                         start = (u64)page->index << PAGE_CACHE_SHIFT;
2716                         end  = start + PAGE_CACHE_SIZE - 1;
2717                         if (test_range_bit(tree, start, end,
2718                                            EXTENT_DIRTY, 0)) {
2719                                 unlock_page(page);
2720                                 continue;
2721                         }
2722                 }
2723                 clear_page_dirty_for_io(page);
2724                 read_lock_irq(&page->mapping->tree_lock);
2725                 if (!PageDirty(page)) {
2726                         radix_tree_tag_clear(&page->mapping->page_tree,
2727                                                 page_index(page),
2728                                                 PAGECACHE_TAG_DIRTY);
2729                 }
2730                 read_unlock_irq(&page->mapping->tree_lock);
2731                 unlock_page(page);
2732         }
2733         return 0;
2734 }
2735 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2736
2737 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2738                                     struct extent_buffer *eb)
2739 {
2740         return wait_on_extent_writeback(tree, eb->start,
2741                                         eb->start + eb->len - 1);
2742 }
2743 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2744
2745 int set_extent_buffer_dirty(struct extent_io_tree *tree,
2746                              struct extent_buffer *eb)
2747 {
2748         unsigned long i;
2749         unsigned long num_pages;
2750
2751         num_pages = num_extent_pages(eb->start, eb->len);
2752         for (i = 0; i < num_pages; i++) {
2753                 struct page *page = extent_buffer_page(eb, i);
2754                 /* writepage may need to do something special for the
2755                  * first page, so we have to make sure page->private is
2756                  * properly set.  releasepage may drop page->private
2757                  * on us if the page isn't already dirty.
2758                  */
2759                 if (i == 0) {
2760                         lock_page(page);
2761                         set_page_extent_head(page, eb->len);
2762                 } else if (PagePrivate(page) &&
2763                            page->private != EXTENT_PAGE_PRIVATE) {
2764                         lock_page(page);
2765                         set_page_extent_mapped(page);
2766                         unlock_page(page);
2767                 }
2768                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2769                 if (i == 0)
2770                         unlock_page(page);
2771         }
2772         return set_extent_dirty(tree, eb->start,
2773                                 eb->start + eb->len - 1, GFP_NOFS);
2774 }
2775 EXPORT_SYMBOL(set_extent_buffer_dirty);
2776
2777 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2778                                 struct extent_buffer *eb)
2779 {
2780         unsigned long i;
2781         struct page *page;
2782         unsigned long num_pages;
2783
2784         num_pages = num_extent_pages(eb->start, eb->len);
2785
2786         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2787                             GFP_NOFS);
2788         for (i = 0; i < num_pages; i++) {
2789                 page = extent_buffer_page(eb, i);
2790                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2791                     ((i == num_pages - 1) &&
2792                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2793                         check_page_uptodate(tree, page);
2794                         continue;
2795                 }
2796                 SetPageUptodate(page);
2797         }
2798         return 0;
2799 }
2800 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2801
2802 int extent_buffer_uptodate(struct extent_io_tree *tree,
2803                              struct extent_buffer *eb)
2804 {
2805         if (eb->flags & EXTENT_UPTODATE)
2806                 return 1;
2807         return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2808                            EXTENT_UPTODATE, 1);
2809 }
2810 EXPORT_SYMBOL(extent_buffer_uptodate);
2811
2812 int read_extent_buffer_pages(struct extent_io_tree *tree,
2813                              struct extent_buffer *eb,
2814                              u64 start,
2815                              int wait)
2816 {
2817         unsigned long i;
2818         unsigned long start_i;
2819         struct page *page;
2820         int err;
2821         int ret = 0;
2822         unsigned long num_pages;
2823
2824         if (eb->flags & EXTENT_UPTODATE)
2825                 return 0;
2826
2827         if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2828                            EXTENT_UPTODATE, 1)) {
2829                 return 0;
2830         }
2831
2832         if (start) {
2833                 WARN_ON(start < eb->start);
2834                 start_i = (start >> PAGE_CACHE_SHIFT) -
2835                         (eb->start >> PAGE_CACHE_SHIFT);
2836         } else {
2837                 start_i = 0;
2838         }
2839
2840         num_pages = num_extent_pages(eb->start, eb->len);
2841         for (i = start_i; i < num_pages; i++) {
2842                 page = extent_buffer_page(eb, i);
2843                 if (PageUptodate(page)) {
2844                         continue;
2845                 }
2846                 if (!wait) {
2847                         if (TestSetPageLocked(page)) {
2848                                 continue;
2849                         }
2850                 } else {
2851                         lock_page(page);
2852                 }
2853                 if (!PageUptodate(page)) {
2854                         err = page->mapping->a_ops->readpage(NULL, page);
2855                         if (err) {
2856                                 ret = err;
2857                         }
2858                 } else {
2859                         unlock_page(page);
2860                 }
2861         }
2862
2863         if (ret || !wait) {
2864                 return ret;
2865         }
2866
2867         for (i = start_i; i < num_pages; i++) {
2868                 page = extent_buffer_page(eb, i);
2869                 wait_on_page_locked(page);
2870                 if (!PageUptodate(page)) {
2871                         ret = -EIO;
2872                 }
2873         }
2874         if (!ret)
2875                 eb->flags |= EXTENT_UPTODATE;
2876         return ret;
2877 }
2878 EXPORT_SYMBOL(read_extent_buffer_pages);
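
/*
 * Typical calling pattern for the extent buffer API as used by a btree
 * reader (a sketch under assumptions: blocksize, error handling and the
 * surrounding locking are simplified):
 *
 *        struct extent_buffer *eb;
 *
 *        eb = alloc_extent_buffer(tree, bytenr, blocksize, NULL, GFP_NOFS);
 *        if (!eb)
 *                return NULL;
 *        read_extent_buffer_pages(tree, eb, 0, 1);
 *        if (!extent_buffer_uptodate(tree, eb)) {
 *                free_extent_buffer(eb);
 *                return NULL;
 *        }
 *        return eb;
 *
 * Passing wait == 1 makes read_extent_buffer_pages() block until every page
 * has been read; wait == 0 only kicks off readpage on pages it can trylock.
 */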
2879
2880 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2881                         unsigned long start,
2882                         unsigned long len)
2883 {
2884         size_t cur;
2885         size_t offset;
2886         struct page *page;
2887         char *kaddr;
2888         char *dst = (char *)dstv;
2889         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2890         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2891         unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2892
2893         WARN_ON(start > eb->len);
2894         WARN_ON(start + len > eb->start + eb->len);
2895
2896         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2897
2898         while (len > 0) {
2899                 page = extent_buffer_page(eb, i);
2900                 if (!PageUptodate(page)) {
2901                         printk(KERN_ERR "page %lu not up to date i %lu, "
                                   "total %lu, len %lu\n",
                                   page->index, i, num_pages, eb->len);
2902                         WARN_ON(1);
2903                 }
2905
2906                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2907                 kaddr = kmap_atomic(page, KM_USER1);
2908                 memcpy(dst, kaddr + offset, cur);
2909                 kunmap_atomic(kaddr, KM_USER1);
2910
2911                 dst += cur;
2912                 len -= cur;
2913                 offset = 0;
2914                 i++;
2915         }
2916 }
2917 EXPORT_SYMBOL(read_extent_buffer);
2918
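/*
 * map_private_extent_buffer - kmap_atomic the single page of @eb that
 * holds [@start, @start + @min_len).  Returns -EINVAL when that range
 * crosses a page boundary.  On success *@token is the value to hand to
 * unmap_extent_buffer(), *@map points at buffer offset *@map_start and
 * *@map_len bytes are addressable from there.
 */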
2919 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2920                                unsigned long min_len, char **token, char **map,
2921                                unsigned long *map_start,
2922                                unsigned long *map_len, int km)
2923 {
2924         size_t offset;
2925         char *kaddr;
2926         struct page *p;
2927         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2928         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2929         unsigned long end_i = (start_offset + start + min_len - 1) >>
2930                 PAGE_CACHE_SHIFT;
2931
2932         if (i != end_i)
2933                 return -EINVAL;
2934
2935         if (i == 0) {
2936                 offset = start_offset;
2937                 *map_start = 0;
2938         } else {
2939                 offset = 0;
2940                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
2941         }
2942         if (start + min_len > eb->len) {
2943                 printk(KERN_ERR "bad mapping eb start %Lu len %lu, "
                           "wanted %lu %lu\n",
                           eb->start, eb->len, start, min_len);
2944                 WARN_ON(1);
2945         }
2946
2947         p = extent_buffer_page(eb, i);
2948         WARN_ON(!PageUptodate(p));
2949         kaddr = kmap_atomic(p, km);
2950         *token = kaddr;
2951         *map = kaddr + offset;
2952         *map_len = PAGE_CACHE_SIZE - offset;
2953         return 0;
2954 }
2955 EXPORT_SYMBOL(map_private_extent_buffer);
2956
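/*
 * map_extent_buffer - like map_private_extent_buffer(), but if @eb
 * already carries a cached mapping (eb->map_token) that mapping is
 * dropped first and, on success, replaced with the new one so that
 * eb->kaddr, eb->map_start and eb->map_len stay usable.
 */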
2957 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2958                       unsigned long min_len,
2959                       char **token, char **map,
2960                       unsigned long *map_start,
2961                       unsigned long *map_len, int km)
2962 {
2963         int err;
2964         int save = 0;
2965         if (eb->map_token) {
2966                 unmap_extent_buffer(eb, eb->map_token, km);
2967                 eb->map_token = NULL;
2968                 save = 1;
2969         }
2970         err = map_private_extent_buffer(eb, start, min_len, token, map,
2971                                        map_start, map_len, km);
2972         if (!err && save) {
2973                 eb->map_token = *token;
2974                 eb->kaddr = *map;
2975                 eb->map_start = *map_start;
2976                 eb->map_len = *map_len;
2977         }
2978         return err;
2979 }
2980 EXPORT_SYMBOL(map_extent_buffer);
2981
2982 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2983 {
2984         kunmap_atomic(token, km);
2985 }
2986 EXPORT_SYMBOL(unmap_extent_buffer);
2987
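/*
 * memcmp_extent_buffer - memcmp() the range [@start, @start + @len) of
 * @eb against the memory at @ptrv, a page at a time.  Returns 0 when the
 * ranges match, otherwise the memcmp() result of the first differing
 * chunk.
 */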
2988 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2989                           unsigned long start,
2990                           unsigned long len)
2991 {
2992         size_t cur;
2993         size_t offset;
2994         struct page *page;
2995         char *kaddr;
2996         char *ptr = (char *)ptrv;
2997         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2998         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2999         int ret = 0;
3000
3001         WARN_ON(start > eb->len);
3002         WARN_ON(start + len > eb->start + eb->len);
3003
3004         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3005
3006         while (len > 0) {
3007                 page = extent_buffer_page(eb, i);
3008                 WARN_ON(!PageUptodate(page));
3009
3010                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3011
3012                 kaddr = kmap_atomic(page, KM_USER0);
3013                 ret = memcmp(ptr, kaddr + offset, cur);
3014                 kunmap_atomic(kaddr, KM_USER0);
3015                 if (ret)
3016                         break;
3017
3018                 ptr += cur;
3019                 len -= cur;
3020                 offset = 0;
3021                 i++;
3022         }
3023         return ret;
3024 }
3025 EXPORT_SYMBOL(memcmp_extent_buffer);
3026
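/*
 * write_extent_buffer - copy @len bytes from @srcv into @eb starting at
 * byte offset @start.  The backing pages must already be uptodate;
 * marking them dirty is left to the caller.
 */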
3027 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3028                          unsigned long start, unsigned long len)
3029 {
3030         size_t cur;
3031         size_t offset;
3032         struct page *page;
3033         char *kaddr;
3034         char *src = (char *)srcv;
3035         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3036         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3037
3038         WARN_ON(start > eb->len);
3039         WARN_ON(start + len > eb->start + eb->len);
3040
3041         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3042
3043         while (len > 0) {
3044                 page = extent_buffer_page(eb, i);
3045                 WARN_ON(!PageUptodate(page));
3046
3047                 cur = min(len, PAGE_CACHE_SIZE - offset);
3048                 kaddr = kmap_atomic(page, KM_USER1);
3049                 memcpy(kaddr + offset, src, cur);
3050                 kunmap_atomic(kaddr, KM_USER1);
3051
3052                 src += cur;
3053                 len -= cur;
3054                 offset = 0;
3055                 i++;
3056         }
3057 }
3058 EXPORT_SYMBOL(write_extent_buffer);
3059
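/*
 * memset_extent_buffer - fill [@start, @start + @len) of @eb with the
 * byte @c, a page at a time.
 */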
3060 void memset_extent_buffer(struct extent_buffer *eb, char c,
3061                           unsigned long start, unsigned long len)
3062 {
3063         size_t cur;
3064         size_t offset;
3065         struct page *page;
3066         char *kaddr;
3067         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3068         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3069
3070         WARN_ON(start > eb->len);
3071         WARN_ON(start + len > eb->start + eb->len);
3072
3073         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3074
3075         while (len > 0) {
3076                 page = extent_buffer_page(eb, i);
3077                 WARN_ON(!PageUptodate(page));
3078
3079                 cur = min(len, PAGE_CACHE_SIZE - offset);
3080                 kaddr = kmap_atomic(page, KM_USER0);
3081                 memset(kaddr + offset, c, cur);
3082                 kunmap_atomic(kaddr, KM_USER0);
3083
3084                 len -= cur;
3085                 offset = 0;
3086                 i++;
3087         }
3088 }
3089 EXPORT_SYMBOL(memset_extent_buffer);
3090
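/*
 * copy_extent_buffer - copy @len bytes from @src at @src_offset into
 * @dst at @dst_offset.  The destination pages are mapped directly and
 * filled via read_extent_buffer() on the source buffer.
 */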
3091 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3092                         unsigned long dst_offset, unsigned long src_offset,
3093                         unsigned long len)
3094 {
3095         u64 dst_len = dst->len;
3096         size_t cur;
3097         size_t offset;
3098         struct page *page;
3099         char *kaddr;
3100         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3101         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3102
3103         WARN_ON(src->len != dst_len);
3104
3105         offset = (start_offset + dst_offset) &
3106                 ((unsigned long)PAGE_CACHE_SIZE - 1);
3107
3108         while (len > 0) {
3109                 page = extent_buffer_page(dst, i);
3110                 WARN_ON(!PageUptodate(page));
3111
3112                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3113
3114                 kaddr = kmap_atomic(page, KM_USER0);
3115                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3116                 kunmap_atomic(kaddr, KM_USER0);
3117
3118                 src_offset += cur;
3119                 len -= cur;
3120                 offset = 0;
3121                 i++;
3122         }
3123 }
3124 EXPORT_SYMBOL(copy_extent_buffer);
3125
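/*
 * move_pages - memmove()-style helper for one destination page.  When
 * source and destination are the same page a plain memmove() is used;
 * otherwise the bytes are copied backwards so that overlapping ranges
 * spanning two pages are moved correctly.
 */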
3126 static void move_pages(struct page *dst_page, struct page *src_page,
3127                        unsigned long dst_off, unsigned long src_off,
3128                        unsigned long len)
3129 {
3130         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3131         if (dst_page == src_page) {
3132                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3133         } else {
3134                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3135                 char *p = dst_kaddr + dst_off + len;
3136                 char *s = src_kaddr + src_off + len;
3137
3138                 while (len--)
3139                         *--p = *--s;
3140
3141                 kunmap_atomic(src_kaddr, KM_USER1);
3142         }
3143         kunmap_atomic(dst_kaddr, KM_USER0);
3144 }
3145
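/*
 * copy_pages - memcpy()-style helper between two (possibly identical)
 * pages.  Callers are expected to keep the source and destination
 * ranges from overlapping within a single page.
 */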
3146 static void copy_pages(struct page *dst_page, struct page *src_page,
3147                        unsigned long dst_off, unsigned long src_off,
3148                        unsigned long len)
3149 {
3150         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3151         char *src_kaddr;
3152
3153         if (dst_page != src_page)
3154                 src_kaddr = kmap_atomic(src_page, KM_USER1);
3155         else
3156                 src_kaddr = dst_kaddr;
3157
3158         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3159         kunmap_atomic(dst_kaddr, KM_USER0);
3160         if (dst_page != src_page)
3161                 kunmap_atomic(src_kaddr, KM_USER1);
3162 }
3163
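/*
 * memcpy_extent_buffer - copy @len bytes inside @dst from @src_offset to
 * @dst_offset, walking forward a page at a time.  Both ranges must fall
 * inside the buffer; moves between overlapping ranges should go through
 * memmove_extent_buffer() instead.
 */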
3164 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3165                            unsigned long src_offset, unsigned long len)
3166 {
3167         size_t cur;
3168         size_t dst_off_in_page;
3169         size_t src_off_in_page;
3170         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3171         unsigned long dst_i;
3172         unsigned long src_i;
3173
3174         if (src_offset + len > dst->len) {
3175                 printk(KERN_ERR "memcpy bogus src_offset %lu move len %lu "
                            "dst len %lu\n",
3176                        src_offset, len, dst->len);
3177                 BUG_ON(1);
3178         }
3179         if (dst_offset + len > dst->len) {
3180                 printk(KERN_ERR "memcpy bogus dst_offset %lu move len %lu "
                            "dst len %lu\n",
3181                        dst_offset, len, dst->len);
3182                 BUG_ON(1);
3183         }
3184
3185         while (len > 0) {
3186                 dst_off_in_page = (start_offset + dst_offset) &
3187                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3188                 src_off_in_page = (start_offset + src_offset) &
3189                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3190
3191                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3192                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3193
3194                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3195                                                src_off_in_page));
3196                 cur = min_t(unsigned long, cur,
3197                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3198
3199                 copy_pages(extent_buffer_page(dst, dst_i),
3200                            extent_buffer_page(dst, src_i),
3201                            dst_off_in_page, src_off_in_page, cur);
3202
3203                 src_offset += cur;
3204                 dst_offset += cur;
3205                 len -= cur;
3206         }
3207 }
3208 EXPORT_SYMBOL(memcpy_extent_buffer);
3209
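/*
 * memmove_extent_buffer - overlap-safe move of @len bytes inside @dst
 * from @src_offset to @dst_offset.  Moves toward lower offsets are
 * handed to memcpy_extent_buffer(); moves toward higher offsets are
 * done from the tail of the ranges with move_pages().
 */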
3210 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3211                            unsigned long src_offset, unsigned long len)
3212 {
3213         size_t cur;
3214         size_t dst_off_in_page;
3215         size_t src_off_in_page;
3216         unsigned long dst_end = dst_offset + len - 1;
3217         unsigned long src_end = src_offset + len - 1;
3218         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3219         unsigned long dst_i;
3220         unsigned long src_i;
3221
3222         if (src_offset + len > dst->len) {
3223                 printk(KERN_ERR "memmove bogus src_offset %lu move len %lu "
                            "dst len %lu\n",
3224                        src_offset, len, dst->len);
3225                 BUG_ON(1);
3226         }
3227         if (dst_offset + len > dst->len) {
3228                 printk(KERN_ERR "memmove bogus dst_offset %lu move len %lu "
                            "dst len %lu\n",
3229                        dst_offset, len, dst->len);
3230                 BUG_ON(1);
3231         }
3232         if (dst_offset < src_offset) {
3233                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3234                 return;
3235         }
3236         while (len > 0) {
3237                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3238                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3239
3240                 dst_off_in_page = (start_offset + dst_end) &
3241                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3242                 src_off_in_page = (start_offset + src_end) &
3243                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3244
3245                 cur = min_t(unsigned long, len, src_off_in_page + 1);
3246                 cur = min(cur, dst_off_in_page + 1);
3247                 move_pages(extent_buffer_page(dst, dst_i),
3248                            extent_buffer_page(dst, src_i),
3249                            dst_off_in_page - cur + 1,
3250                            src_off_in_page - cur + 1, cur);
3251
3252                 dst_end -= cur;
3253                 src_end -= cur;
3254                 len -= cur;
3255         }
3256 }
3257 EXPORT_SYMBOL(memmove_extent_buffer);