Btrfs: Split the extent_map code into two parts
fs/btrfs/extent_io.c (linux-2.6-block.git)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17
18 /* temporary prototype until extent_map moves out of btrfs */
19 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
20                                        unsigned long extra_flags,
21                                        void (*ctor)(void *, struct kmem_cache *,
22                                                     unsigned long));
23
24 static struct kmem_cache *extent_state_cache;
25 static struct kmem_cache *extent_buffer_cache;
26
27 static LIST_HEAD(buffers);
28 static LIST_HEAD(states);
29
30 static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
31 #define BUFFER_LRU_MAX 64
32
33 struct tree_entry {
34         u64 start;
35         u64 end;
36         int in_tree;
37         struct rb_node rb_node;
38 };
39
40 struct extent_page_data {
41         struct bio *bio;
42         struct extent_io_tree *tree;
43         get_extent_t *get_extent;
44 };
45
46 int __init extent_io_init(void)
47 {
48         extent_state_cache = btrfs_cache_create("extent_state",
49                                             sizeof(struct extent_state), 0,
50                                             NULL);
51         if (!extent_state_cache)
52                 return -ENOMEM;
53
54         extent_buffer_cache = btrfs_cache_create("extent_buffers",
55                                             sizeof(struct extent_buffer), 0,
56                                             NULL);
57         if (!extent_buffer_cache)
58                 goto free_state_cache;
59         return 0;
60
61 free_state_cache:
62         kmem_cache_destroy(extent_state_cache);
63         return -ENOMEM;
64 }
65
66 void extent_io_exit(void)
67 {
68         struct extent_state *state;
69
70         while (!list_empty(&states)) {
71                 state = list_entry(states.next, struct extent_state, list);
72                 printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
73                 list_del(&state->list);
74                 kmem_cache_free(extent_state_cache, state);
75
76         }
77
78         if (extent_state_cache)
79                 kmem_cache_destroy(extent_state_cache);
80         if (extent_buffer_cache)
81                 kmem_cache_destroy(extent_buffer_cache);
82 }
83
84 void extent_io_tree_init(struct extent_io_tree *tree,
85                           struct address_space *mapping, gfp_t mask)
86 {
87         tree->state.rb_node = NULL;
88         tree->ops = NULL;
89         tree->dirty_bytes = 0;
90         rwlock_init(&tree->lock);
91         spin_lock_init(&tree->lru_lock);
92         tree->mapping = mapping;
93         INIT_LIST_HEAD(&tree->buffer_lru);
94         tree->lru_size = 0;
95 }
96 EXPORT_SYMBOL(extent_io_tree_init);
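/*
 * Editorial sketch (not part of the original file): a typical way a caller
 * would wire up one of these trees.  The inode, mapping and ops-table names
 * below are illustrative assumptions; btrfs does the equivalent when it
 * initializes an in-memory inode.
 *
 *	struct extent_io_tree io_tree;
 *
 *	extent_io_tree_init(&io_tree, inode->i_mapping, GFP_NOFS);
 *	io_tree.ops = my_extent_io_ops;	   optional struct extent_io_ops
 *					   callback table, assumed name
 */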
97
98 void extent_io_tree_empty_lru(struct extent_io_tree *tree)
99 {
100         struct extent_buffer *eb;
101         while(!list_empty(&tree->buffer_lru)) {
102                 eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
103                                 lru);
104                 list_del_init(&eb->lru);
105                 free_extent_buffer(eb);
106         }
107 }
108 EXPORT_SYMBOL(extent_io_tree_empty_lru);
109
110 struct extent_state *alloc_extent_state(gfp_t mask)
111 {
112         struct extent_state *state;
113         unsigned long flags;
114
115         state = kmem_cache_alloc(extent_state_cache, mask);
116         if (!state || IS_ERR(state))
117                 return state;
118         state->state = 0;
119         state->in_tree = 0;
120         state->private = 0;
121
122         spin_lock_irqsave(&state_lock, flags);
123         list_add(&state->list, &states);
124         spin_unlock_irqrestore(&state_lock, flags);
125
126         atomic_set(&state->refs, 1);
127         init_waitqueue_head(&state->wq);
128         return state;
129 }
130 EXPORT_SYMBOL(alloc_extent_state);
131
132 void free_extent_state(struct extent_state *state)
133 {
134         unsigned long flags;
135         if (!state)
136                 return;
137         if (atomic_dec_and_test(&state->refs)) {
138                 WARN_ON(state->in_tree);
139                 spin_lock_irqsave(&state_lock, flags);
140                 list_del(&state->list);
141                 spin_unlock_irqrestore(&state_lock, flags);
142                 kmem_cache_free(extent_state_cache, state);
143         }
144 }
145 EXPORT_SYMBOL(free_extent_state);
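/*
 * Editorial sketch (not part of the original file): the reference counting
 * contract for extent_state.  alloc_extent_state() hands back a state with a
 * single reference; code that stashes a pointer (a waiter, for example)
 * takes its own reference and drops it with free_extent_state():
 *
 *	struct extent_state *state = alloc_extent_state(GFP_NOFS);
 *
 *	if (!state)
 *		return -ENOMEM;
 *	...
 *	atomic_inc(&state->refs);	extra reference held by a waiter
 *	...
 *	free_extent_state(state);	waiter drops its reference
 *	free_extent_state(state);	last put returns it to the cache
 */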
146
147 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
148                                    struct rb_node *node)
149 {
150         struct rb_node ** p = &root->rb_node;
151         struct rb_node * parent = NULL;
152         struct tree_entry *entry;
153
154         while(*p) {
155                 parent = *p;
156                 entry = rb_entry(parent, struct tree_entry, rb_node);
157
158                 if (offset < entry->start)
159                         p = &(*p)->rb_left;
160                 else if (offset > entry->end)
161                         p = &(*p)->rb_right;
162                 else
163                         return parent;
164         }
165
166         entry = rb_entry(node, struct tree_entry, rb_node);
167         entry->in_tree = 1;
168         rb_link_node(node, parent, p);
169         rb_insert_color(node, root);
170         return NULL;
171 }
172
173 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
174                                      struct rb_node **prev_ret,
175                                      struct rb_node **next_ret)
176 {
177         struct rb_node * n = root->rb_node;
178         struct rb_node *prev = NULL;
179         struct rb_node *orig_prev = NULL;
180         struct tree_entry *entry;
181         struct tree_entry *prev_entry = NULL;
182
183         while(n) {
184                 entry = rb_entry(n, struct tree_entry, rb_node);
185                 prev = n;
186                 prev_entry = entry;
187
188                 if (offset < entry->start)
189                         n = n->rb_left;
190                 else if (offset > entry->end)
191                         n = n->rb_right;
192                 else
193                         return n;
194         }
195
196         if (prev_ret) {
197                 orig_prev = prev;
198                 while(prev && offset > prev_entry->end) {
199                         prev = rb_next(prev);
200                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
201                 }
202                 *prev_ret = prev;
203                 prev = orig_prev;
204         }
205
206         if (next_ret) {
207                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
208                 while(prev && offset < prev_entry->start) {
209                         prev = rb_prev(prev);
210                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
211                 }
212                 *next_ret = prev;
213         }
214         return NULL;
215 }
216
217 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
218 {
219         struct rb_node *prev;
220         struct rb_node *ret;
221         ret = __tree_search(root, offset, &prev, NULL);
222         if (!ret)
223                 return prev;
224         return ret;
225 }
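/*
 * Editorial note (not part of the original file): the rb-tree holds
 * non-overlapping [start, end] ranges, so tree_search(root, offset) returns
 * the node whose range contains 'offset' when there is one, and otherwise
 * the first node that ends after 'offset' (or NULL when nothing does).
 * Callers in this file then walk forward with rb_next(), roughly:
 *
 *	struct rb_node *node = tree_search(&tree->state, start);
 *
 *	while (node) {
 *		struct extent_state *state;
 *
 *		state = rb_entry(node, struct extent_state, rb_node);
 *		if (state->start > end)
 *			break;
 *		... operate on the overlap with [start, end] ...
 *		node = rb_next(node);
 *	}
 */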
226
227 /*
228  * utility function to look for merge candidates inside a given range.
229  * Any extents with matching state are merged together into a single
230  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
231  * are not merged because the end_io handlers need to be able to do
232  * operations on them without sleeping (or doing allocations/splits).
233  *
234  * This should be called with the tree lock held.
235  */
236 static int merge_state(struct extent_io_tree *tree,
237                        struct extent_state *state)
238 {
239         struct extent_state *other;
240         struct rb_node *other_node;
241
242         if (state->state & EXTENT_IOBITS)
243                 return 0;
244
245         other_node = rb_prev(&state->rb_node);
246         if (other_node) {
247                 other = rb_entry(other_node, struct extent_state, rb_node);
248                 if (other->end == state->start - 1 &&
249                     other->state == state->state) {
250                         state->start = other->start;
251                         other->in_tree = 0;
252                         rb_erase(&other->rb_node, &tree->state);
253                         free_extent_state(other);
254                 }
255         }
256         other_node = rb_next(&state->rb_node);
257         if (other_node) {
258                 other = rb_entry(other_node, struct extent_state, rb_node);
259                 if (other->start == state->end + 1 &&
260                     other->state == state->state) {
261                         other->start = state->start;
262                         state->in_tree = 0;
263                         rb_erase(&state->rb_node, &tree->state);
264                         free_extent_state(state);
265                 }
266         }
267         return 0;
268 }
269
270 /*
271  * insert an extent_state struct into the tree.  'bits' are set on the
272  * struct before it is inserted.
273  *
274  * This may return -EEXIST if the extent is already there, in which case the
275  * state struct is freed.
276  *
277  * The tree lock is not taken internally.  This is a utility function and
278  * probably isn't what you want to call (see set/clear_extent_bit).
279  */
280 static int insert_state(struct extent_io_tree *tree,
281                         struct extent_state *state, u64 start, u64 end,
282                         int bits)
283 {
284         struct rb_node *node;
285
286         if (end < start) {
287                 printk("end < start %Lu %Lu\n", end, start);
288                 WARN_ON(1);
289         }
290         if (bits & EXTENT_DIRTY)
291                 tree->dirty_bytes += end - start + 1;
292         state->state |= bits;
293         state->start = start;
294         state->end = end;
295         node = tree_insert(&tree->state, end, &state->rb_node);
296         if (node) {
297                 struct extent_state *found;
298                 found = rb_entry(node, struct extent_state, rb_node);
299                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
300                 free_extent_state(state);
301                 return -EEXIST;
302         }
303         merge_state(tree, state);
304         return 0;
305 }
306
307 /*
308  * split a given extent state struct in two, inserting the preallocated
309  * struct 'prealloc' as the newly created second half.  'split' indicates an
310  * offset inside 'orig' where it should be split.
311  *
312  * Before calling,
313  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
314  * are two extent state structs in the tree:
315  * prealloc: [orig->start, split - 1]
316  * orig: [ split, orig->end ]
317  *
318  * The tree locks are not taken by this function. They need to be held
319  * by the caller.
320  */
321 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
322                        struct extent_state *prealloc, u64 split)
323 {
324         struct rb_node *node;
325         prealloc->start = orig->start;
326         prealloc->end = split - 1;
327         prealloc->state = orig->state;
328         orig->start = split;
329
330         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
331         if (node) {
332                 struct extent_state *found;
333                 found = rb_entry(node, struct extent_state, rb_node);
334                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
335                 free_extent_state(prealloc);
336                 return -EEXIST;
337         }
338         return 0;
339 }
340
341 /*
342  * utility function to clear some bits in an extent state struct.
344  * it will optionally wake up anyone waiting on this state (wake == 1), or
344  * forcibly remove the state from the tree (delete == 1).
345  *
346  * If no bits are set on the state struct after clearing things, the
347  * struct is freed and removed from the tree
348  */
349 static int clear_state_bit(struct extent_io_tree *tree,
350                             struct extent_state *state, int bits, int wake,
351                             int delete)
352 {
353         int ret = state->state & bits;
354
355         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
356                 u64 range = state->end - state->start + 1;
357                 WARN_ON(range > tree->dirty_bytes);
358                 tree->dirty_bytes -= range;
359         }
360         state->state &= ~bits;
361         if (wake)
362                 wake_up(&state->wq);
363         if (delete || state->state == 0) {
364                 if (state->in_tree) {
365                         rb_erase(&state->rb_node, &tree->state);
366                         state->in_tree = 0;
367                         free_extent_state(state);
368                 } else {
369                         WARN_ON(1);
370                 }
371         } else {
372                 merge_state(tree, state);
373         }
374         return ret;
375 }
376
377 /*
378  * clear some bits on a range in the tree.  This may require splitting
379  * or inserting elements in the tree, so the gfp mask is used to
380  * indicate which allocations or sleeping are allowed.
381  *
382  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
383  * the given range from the tree regardless of state (ie for truncate).
384  *
385  * the range [start, end] is inclusive.
386  *
387  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
388  * bits were already set, or zero if none of the bits were already set.
389  */
390 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
391                      int bits, int wake, int delete, gfp_t mask)
392 {
393         struct extent_state *state;
394         struct extent_state *prealloc = NULL;
395         struct rb_node *node;
396         unsigned long flags;
397         int err;
398         int set = 0;
399
400 again:
401         if (!prealloc && (mask & __GFP_WAIT)) {
402                 prealloc = alloc_extent_state(mask);
403                 if (!prealloc)
404                         return -ENOMEM;
405         }
406
407         write_lock_irqsave(&tree->lock, flags);
408         /*
409          * this search will find the extents that end after
410          * our range starts
411          */
412         node = tree_search(&tree->state, start);
413         if (!node)
414                 goto out;
415         state = rb_entry(node, struct extent_state, rb_node);
416         if (state->start > end)
417                 goto out;
418         WARN_ON(state->end < start);
419
420         /*
421          *     | ---- desired range ---- |
422          *  | state | or
423          *  | ------------- state -------------- |
424          *
425          * We need to split the extent we found, and may flip
426          * bits on second half.
427          *
428          * If the extent we found extends past our range, we
429          * just split and search again.  It'll get split again
430          * the next time though.
431          *
432          * If the extent we found is inside our range, we clear
433          * the desired bit on it.
434          */
435
436         if (state->start < start) {
437                 err = split_state(tree, state, prealloc, start);
438                 BUG_ON(err == -EEXIST);
439                 prealloc = NULL;
440                 if (err)
441                         goto out;
442                 if (state->end <= end) {
443                         start = state->end + 1;
444                         set |= clear_state_bit(tree, state, bits,
445                                         wake, delete);
446                 } else {
447                         start = state->start;
448                 }
449                 goto search_again;
450         }
451         /*
452          * | ---- desired range ---- |
453          *                        | state |
454          * We need to split the extent, and clear the bit
455          * on the first half
456          */
457         if (state->start <= end && state->end > end) {
458                 err = split_state(tree, state, prealloc, end + 1);
459                 BUG_ON(err == -EEXIST);
460
461                 if (wake)
462                         wake_up(&state->wq);
463                 set |= clear_state_bit(tree, prealloc, bits,
464                                        wake, delete);
465                 prealloc = NULL;
466                 goto out;
467         }
468
469         start = state->end + 1;
470         set |= clear_state_bit(tree, state, bits, wake, delete);
471         goto search_again;
472
473 out:
474         write_unlock_irqrestore(&tree->lock, flags);
475         if (prealloc)
476                 free_extent_state(prealloc);
477
478         return set;
479
480 search_again:
481         if (start > end)
482                 goto out;
483         write_unlock_irqrestore(&tree->lock, flags);
484         if (mask & __GFP_WAIT)
485                 cond_resched();
486         goto again;
487 }
488 EXPORT_SYMBOL(clear_extent_bit);
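/*
 * Editorial sketch (not part of the original file): a typical call, matching
 * what the wrappers further down do.  The return value is > 0 if any of the
 * requested bits were actually set somewhere in the range:
 *
 *	int was_dirty;
 *
 *	was_dirty = clear_extent_bit(tree, start, start + len - 1,
 *				     EXTENT_DIRTY | EXTENT_DELALLOC,
 *				     0, 0, GFP_NOFS);
 *
 * 'start + len - 1' reflects the inclusive range convention; 'len' here is
 * an assumed byte count, not something defined in this file.
 */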
489
490 static int wait_on_state(struct extent_io_tree *tree,
491                          struct extent_state *state)
492 {
493         DEFINE_WAIT(wait);
494         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
495         read_unlock_irq(&tree->lock);
496         schedule();
497         read_lock_irq(&tree->lock);
498         finish_wait(&state->wq, &wait);
499         return 0;
500 }
501
502 /*
503  * waits for one or more bits to clear on a range in the state tree.
504  * The range [start, end] is inclusive.
505  * The tree lock is taken by this function
506  */
507 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
508 {
509         struct extent_state *state;
510         struct rb_node *node;
511
512         read_lock_irq(&tree->lock);
513 again:
514         while (1) {
515                 /*
516                  * this search will find all the extents that end after
517                  * our range starts
518                  */
519                 node = tree_search(&tree->state, start);
520                 if (!node)
521                         break;
522
523                 state = rb_entry(node, struct extent_state, rb_node);
524
525                 if (state->start > end)
526                         goto out;
527
528                 if (state->state & bits) {
529                         start = state->start;
530                         atomic_inc(&state->refs);
531                         wait_on_state(tree, state);
532                         free_extent_state(state);
533                         goto again;
534                 }
535                 start = state->end + 1;
536
537                 if (start > end)
538                         break;
539
540                 if (need_resched()) {
541                         read_unlock_irq(&tree->lock);
542                         cond_resched();
543                         read_lock_irq(&tree->lock);
544                 }
545         }
546 out:
547         read_unlock_irq(&tree->lock);
548         return 0;
549 }
550 EXPORT_SYMBOL(wait_extent_bit);
551
552 static void set_state_bits(struct extent_io_tree *tree,
553                            struct extent_state *state,
554                            int bits)
555 {
556         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
557                 u64 range = state->end - state->start + 1;
558                 tree->dirty_bytes += range;
559         }
560         state->state |= bits;
561 }
562
563 /*
564  * set some bits on a range in the tree.  This may require allocations
565  * or sleeping, so the gfp mask is used to indicate what is allowed.
566  *
567  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
568  * range already has the desired bits set.  The start of the existing
569  * range is returned in failed_start in this case.
570  *
571  * [start, end] is inclusive
572  * This takes the tree lock.
573  */
574 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
575                    int exclusive, u64 *failed_start, gfp_t mask)
576 {
577         struct extent_state *state;
578         struct extent_state *prealloc = NULL;
579         struct rb_node *node;
580         unsigned long flags;
581         int err = 0;
582         int set;
583         u64 last_start;
584         u64 last_end;
585 again:
586         if (!prealloc && (mask & __GFP_WAIT)) {
587                 prealloc = alloc_extent_state(mask);
588                 if (!prealloc)
589                         return -ENOMEM;
590         }
591
592         write_lock_irqsave(&tree->lock, flags);
593         /*
594          * this search will find all the extents that end after
595          * our range starts.
596          */
597         node = tree_search(&tree->state, start);
598         if (!node) {
599                 err = insert_state(tree, prealloc, start, end, bits);
600                 prealloc = NULL;
601                 BUG_ON(err == -EEXIST);
602                 goto out;
603         }
604
605         state = rb_entry(node, struct extent_state, rb_node);
606         last_start = state->start;
607         last_end = state->end;
608
609         /*
610          * | ---- desired range ---- |
611          * | state |
612          *
613          * Just lock what we found and keep going
614          */
615         if (state->start == start && state->end <= end) {
616                 set = state->state & bits;
617                 if (set && exclusive) {
618                         *failed_start = state->start;
619                         err = -EEXIST;
620                         goto out;
621                 }
622                 set_state_bits(tree, state, bits);
623                 start = state->end + 1;
624                 merge_state(tree, state);
625                 goto search_again;
626         }
627
628         /*
629          *     | ---- desired range ---- |
630          * | state |
631          *   or
632          * | ------------- state -------------- |
633          *
634          * We need to split the extent we found, and may flip bits on
635          * second half.
636          *
637          * If the extent we found extends past our
638          * range, we just split and search again.  It'll get split
639          * again the next time though.
640          *
641          * If the extent we found is inside our range, we set the
642          * desired bit on it.
643          */
644         if (state->start < start) {
645                 set = state->state & bits;
646                 if (exclusive && set) {
647                         *failed_start = start;
648                         err = -EEXIST;
649                         goto out;
650                 }
651                 err = split_state(tree, state, prealloc, start);
652                 BUG_ON(err == -EEXIST);
653                 prealloc = NULL;
654                 if (err)
655                         goto out;
656                 if (state->end <= end) {
657                         set_state_bits(tree, state, bits);
658                         start = state->end + 1;
659                         merge_state(tree, state);
660                 } else {
661                         start = state->start;
662                 }
663                 goto search_again;
664         }
665         /*
666          * | ---- desired range ---- |
667          *     | state | or               | state |
668          *
669          * There's a hole, we need to insert something in it and
670          * ignore the extent we found.
671          */
672         if (state->start > start) {
673                 u64 this_end;
674                 if (end < last_start)
675                         this_end = end;
676                 else
677                         this_end = last_start - 1;
678                 err = insert_state(tree, prealloc, start, this_end,
679                                    bits);
680                 prealloc = NULL;
681                 BUG_ON(err == -EEXIST);
682                 if (err)
683                         goto out;
684                 start = this_end + 1;
685                 goto search_again;
686         }
687         /*
688          * | ---- desired range ---- |
689          *                        | state |
690          * We need to split the extent, and set the bit
691          * on the first half
692          */
693         if (state->start <= end && state->end > end) {
694                 set = state->state & bits;
695                 if (exclusive && set) {
696                         *failed_start = start;
697                         err = -EEXIST;
698                         goto out;
699                 }
700                 err = split_state(tree, state, prealloc, end + 1);
701                 BUG_ON(err == -EEXIST);
702
703                 set_state_bits(tree, prealloc, bits);
704                 merge_state(tree, prealloc);
705                 prealloc = NULL;
706                 goto out;
707         }
708
709         goto search_again;
710
711 out:
712         write_unlock_irqrestore(&tree->lock, flags);
713         if (prealloc)
714                 free_extent_state(prealloc);
715
716         return err;
717
718 search_again:
719         if (start > end)
720                 goto out;
721         write_unlock_irqrestore(&tree->lock, flags);
722         if (mask & __GFP_WAIT)
723                 cond_resched();
724         goto again;
725 }
726 EXPORT_SYMBOL(set_extent_bit);
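/*
 * Editorial sketch (not part of the original file): the exclusive mode is
 * how extent locking is built below.  With 'exclusive' == 1, a conflict
 * returns -EEXIST and reports where the existing range begins:
 *
 *	u64 failed_start;
 *	int err;
 *
 *	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
 *			     &failed_start, GFP_NOFS);
 *	if (err == -EEXIST) {
 *		... [failed_start, end] already has EXTENT_LOCKED set; wait
 *		    for it to clear and retry from failed_start, the way
 *		    lock_extent() does further down ...
 *	}
 */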
727
728 /* wrappers around set/clear extent bit */
729 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
730                      gfp_t mask)
731 {
732         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
733                               mask);
734 }
735 EXPORT_SYMBOL(set_extent_dirty);
736
737 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
738                     int bits, gfp_t mask)
739 {
740         return set_extent_bit(tree, start, end, bits, 0, NULL,
741                               mask);
742 }
743 EXPORT_SYMBOL(set_extent_bits);
744
745 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
746                       int bits, gfp_t mask)
747 {
748         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
749 }
750 EXPORT_SYMBOL(clear_extent_bits);
751
752 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
753                      gfp_t mask)
754 {
755         return set_extent_bit(tree, start, end,
756                               EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
757                               mask);
758 }
759 EXPORT_SYMBOL(set_extent_delalloc);
760
761 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
762                        gfp_t mask)
763 {
764         return clear_extent_bit(tree, start, end,
765                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
766 }
767 EXPORT_SYMBOL(clear_extent_dirty);
768
769 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
770                      gfp_t mask)
771 {
772         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
773                               mask);
774 }
775 EXPORT_SYMBOL(set_extent_new);
776
777 int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
778                        gfp_t mask)
779 {
780         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
781 }
782 EXPORT_SYMBOL(clear_extent_new);
783
784 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
785                         gfp_t mask)
786 {
787         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
788                               mask);
789 }
790 EXPORT_SYMBOL(set_extent_uptodate);
791
792 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
793                           gfp_t mask)
794 {
795         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
796 }
797 EXPORT_SYMBOL(clear_extent_uptodate);
798
799 int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
800                          gfp_t mask)
801 {
802         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
803                               0, NULL, mask);
804 }
805 EXPORT_SYMBOL(set_extent_writeback);
806
807 int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
808                            gfp_t mask)
809 {
810         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
811 }
812 EXPORT_SYMBOL(clear_extent_writeback);
813
814 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
815 {
816         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
817 }
818 EXPORT_SYMBOL(wait_on_extent_writeback);
819
820 /*
821  * locks a range in ascending order, waiting for any locked regions
822  * it hits on the way.  [start,end] are inclusive, and this will sleep.
823  */
824 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
825 {
826         int err;
827         u64 failed_start;
828         while (1) {
829                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
830                                      &failed_start, mask);
831                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
832                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
833                         start = failed_start;
834                 } else {
835                         break;
836                 }
837                 WARN_ON(start > end);
838         }
839         return err;
840 }
841 EXPORT_SYMBOL(lock_extent);
842
843 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
844                   gfp_t mask)
845 {
846         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
847 }
848 EXPORT_SYMBOL(unlock_extent);
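/*
 * Editorial sketch (not part of the original file): lock_extent() and
 * unlock_extent() pair up over an inclusive byte range; the readpage and
 * writepage paths in this file use them per page like this:
 *
 *	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 *	u64 end = start + PAGE_CACHE_SIZE - 1;
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or update the extents covering [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */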
849
850 /*
851  * helper function to set pages and extents in the tree dirty
852  */
853 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
854 {
855         unsigned long index = start >> PAGE_CACHE_SHIFT;
856         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
857         struct page *page;
858
859         while (index <= end_index) {
860                 page = find_get_page(tree->mapping, index);
861                 BUG_ON(!page);
862                 __set_page_dirty_nobuffers(page);
863                 page_cache_release(page);
864                 index++;
865         }
866         set_extent_dirty(tree, start, end, GFP_NOFS);
867         return 0;
868 }
869 EXPORT_SYMBOL(set_range_dirty);
870
871 /*
872  * helper function to set both pages and extents in the tree writeback
873  */
874 int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
875 {
876         unsigned long index = start >> PAGE_CACHE_SHIFT;
877         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
878         struct page *page;
879
880         while (index <= end_index) {
881                 page = find_get_page(tree->mapping, index);
882                 BUG_ON(!page);
883                 set_page_writeback(page);
884                 page_cache_release(page);
885                 index++;
886         }
887         set_extent_writeback(tree, start, end, GFP_NOFS);
888         return 0;
889 }
890 EXPORT_SYMBOL(set_range_writeback);
891
892 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
893                           u64 *start_ret, u64 *end_ret, int bits)
894 {
895         struct rb_node *node;
896         struct extent_state *state;
897         int ret = 1;
898
899         read_lock_irq(&tree->lock);
900         /*
901          * this search will find all the extents that end after
902          * our range starts.
903          */
904         node = tree_search(&tree->state, start);
905         if (!node || IS_ERR(node)) {
906                 goto out;
907         }
908
909         while(1) {
910                 state = rb_entry(node, struct extent_state, rb_node);
911                 if (state->end >= start && (state->state & bits)) {
912                         *start_ret = state->start;
913                         *end_ret = state->end;
914                         ret = 0;
915                         break;
916                 }
917                 node = rb_next(node);
918                 if (!node)
919                         break;
920         }
921 out:
922         read_unlock_irq(&tree->lock);
923         return ret;
924 }
925 EXPORT_SYMBOL(find_first_extent_bit);
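/*
 * Editorial sketch (not part of the original file): find_first_extent_bit()
 * returns 0 when it finds a matching extent and 1 otherwise, so a caller can
 * scan forward through every range carrying a bit:
 *
 *	u64 found_start;
 *	u64 found_end;
 *	u64 cur = 0;
 *
 *	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				      EXTENT_DIRTY)) {
 *		... handle [found_start, found_end] ...
 *		cur = found_end + 1;
 *	}
 */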
926
927 u64 find_lock_delalloc_range(struct extent_io_tree *tree,
928                              u64 *start, u64 *end, u64 max_bytes)
929 {
930         struct rb_node *node;
931         struct extent_state *state;
932         u64 cur_start = *start;
933         u64 found = 0;
934         u64 total_bytes = 0;
935
936         write_lock_irq(&tree->lock);
937         /*
938          * this search will find all the extents that end after
939          * our range starts.
940          */
941 search_again:
942         node = tree_search(&tree->state, cur_start);
943         if (!node || IS_ERR(node)) {
944                 *end = (u64)-1;
945                 goto out;
946         }
947
948         while(1) {
949                 state = rb_entry(node, struct extent_state, rb_node);
950                 if (found && state->start != cur_start) {
951                         goto out;
952                 }
953                 if (!(state->state & EXTENT_DELALLOC)) {
954                         if (!found)
955                                 *end = state->end;
956                         goto out;
957                 }
958                 if (!found) {
959                         struct extent_state *prev_state;
960                         struct rb_node *prev_node = node;
961                         while(1) {
962                                 prev_node = rb_prev(prev_node);
963                                 if (!prev_node)
964                                         break;
965                                 prev_state = rb_entry(prev_node,
966                                                       struct extent_state,
967                                                       rb_node);
968                                 if (!(prev_state->state & EXTENT_DELALLOC))
969                                         break;
970                                 state = prev_state;
971                                 node = prev_node;
972                         }
973                 }
974                 if (state->state & EXTENT_LOCKED) {
975                         DEFINE_WAIT(wait);
976                         atomic_inc(&state->refs);
977                         prepare_to_wait(&state->wq, &wait,
978                                         TASK_UNINTERRUPTIBLE);
979                         write_unlock_irq(&tree->lock);
980                         schedule();
981                         write_lock_irq(&tree->lock);
982                         finish_wait(&state->wq, &wait);
983                         free_extent_state(state);
984                         goto search_again;
985                 }
986                 state->state |= EXTENT_LOCKED;
987                 if (!found)
988                         *start = state->start;
989                 found++;
990                 *end = state->end;
991                 cur_start = state->end + 1;
992                 node = rb_next(node);
993                 if (!node)
994                         break;
995                 total_bytes += state->end - state->start + 1;
996                 if (total_bytes >= max_bytes)
997                         break;
998         }
999 out:
1000         write_unlock_irq(&tree->lock);
1001         return found;
1002 }
1003
1004 u64 count_range_bits(struct extent_io_tree *tree,
1005                      u64 *start, u64 search_end, u64 max_bytes,
1006                      unsigned long bits)
1007 {
1008         struct rb_node *node;
1009         struct extent_state *state;
1010         u64 cur_start = *start;
1011         u64 total_bytes = 0;
1012         int found = 0;
1013
1014         if (search_end <= cur_start) {
1015                 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1016                 WARN_ON(1);
1017                 return 0;
1018         }
1019
1020         write_lock_irq(&tree->lock);
1021         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1022                 total_bytes = tree->dirty_bytes;
1023                 goto out;
1024         }
1025         /*
1026          * this search will find all the extents that end after
1027          * our range starts.
1028          */
1029         node = tree_search(&tree->state, cur_start);
1030         if (!node || IS_ERR(node)) {
1031                 goto out;
1032         }
1033
1034         while(1) {
1035                 state = rb_entry(node, struct extent_state, rb_node);
1036                 if (state->start > search_end)
1037                         break;
1038                 if (state->end >= cur_start && (state->state & bits)) {
1039                         total_bytes += min(search_end, state->end) + 1 -
1040                                        max(cur_start, state->start);
1041                         if (total_bytes >= max_bytes)
1042                                 break;
1043                         if (!found) {
1044                                 *start = state->start;
1045                                 found = 1;
1046                         }
1047                 }
1048                 node = rb_next(node);
1049                 if (!node)
1050                         break;
1051         }
1052 out:
1053         write_unlock_irq(&tree->lock);
1054         return total_bytes;
1055 }
1056 /*
1057  * helper function to lock both pages and extents in the tree.
1058  * pages must be locked first.
1059  */
1060 int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1061 {
1062         unsigned long index = start >> PAGE_CACHE_SHIFT;
1063         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1064         struct page *page;
1065         int err;
1066
1067         while (index <= end_index) {
1068                 page = grab_cache_page(tree->mapping, index);
1069                 if (!page) {
1070                         err = -ENOMEM;
1071                         goto failed;
1072                 }
1073                 if (IS_ERR(page)) {
1074                         err = PTR_ERR(page);
1075                         goto failed;
1076                 }
1077                 index++;
1078         }
1079         lock_extent(tree, start, end, GFP_NOFS);
1080         return 0;
1081
1082 failed:
1083         /*
1084          * we failed above in getting the page at 'index', so we undo here
1085          * up to but not including the page at 'index'
1086          */
1087         end_index = index;
1088         index = start >> PAGE_CACHE_SHIFT;
1089         while (index < end_index) {
1090                 page = find_get_page(tree->mapping, index);
1091                 unlock_page(page);
1092                 page_cache_release(page);
1093                 index++;
1094         }
1095         return err;
1096 }
1097 EXPORT_SYMBOL(lock_range);
1098
1099 /*
1100  * helper function to unlock both pages and extents in the tree.
1101  */
1102 int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1103 {
1104         unsigned long index = start >> PAGE_CACHE_SHIFT;
1105         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1106         struct page *page;
1107
1108         while (index <= end_index) {
1109                 page = find_get_page(tree->mapping, index);
1110                 unlock_page(page);
1111                 page_cache_release(page);
1112                 index++;
1113         }
1114         unlock_extent(tree, start, end, GFP_NOFS);
1115         return 0;
1116 }
1117 EXPORT_SYMBOL(unlock_range);
1118
1119 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1120 {
1121         struct rb_node *node;
1122         struct extent_state *state;
1123         int ret = 0;
1124
1125         write_lock_irq(&tree->lock);
1126         /*
1127          * this search will find all the extents that end after
1128          * our range starts.
1129          */
1130         node = tree_search(&tree->state, start);
1131         if (!node || IS_ERR(node)) {
1132                 ret = -ENOENT;
1133                 goto out;
1134         }
1135         state = rb_entry(node, struct extent_state, rb_node);
1136         if (state->start != start) {
1137                 ret = -ENOENT;
1138                 goto out;
1139         }
1140         state->private = private;
1141 out:
1142         write_unlock_irq(&tree->lock);
1143         return ret;
1144 }
1145
1146 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1147 {
1148         struct rb_node *node;
1149         struct extent_state *state;
1150         int ret = 0;
1151
1152         read_lock_irq(&tree->lock);
1153         /*
1154          * this search will find all the extents that end after
1155          * our range starts.
1156          */
1157         node = tree_search(&tree->state, start);
1158         if (!node || IS_ERR(node)) {
1159                 ret = -ENOENT;
1160                 goto out;
1161         }
1162         state = rb_entry(node, struct extent_state, rb_node);
1163         if (state->start != start) {
1164                 ret = -ENOENT;
1165                 goto out;
1166         }
1167         *private = state->private;
1168 out:
1169         read_unlock_irq(&tree->lock);
1170         return ret;
1171 }
1172
1173 /*
1174  * searches a range in the state tree for a given mask.
1175  * If 'filled' == 1, this returns 1 only if every extent in the range
1176  * has the bits set.  Otherwise, 1 is returned if any bit in the
1177  * range is found set.
1178  */
1179 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1180                    int bits, int filled)
1181 {
1182         struct extent_state *state = NULL;
1183         struct rb_node *node;
1184         int bitset = 0;
1185         unsigned long flags;
1186
1187         read_lock_irqsave(&tree->lock, flags);
1188         node = tree_search(&tree->state, start);
1189         while (node && start <= end) {
1190                 state = rb_entry(node, struct extent_state, rb_node);
1191
1192                 if (filled && state->start > start) {
1193                         bitset = 0;
1194                         break;
1195                 }
1196
1197                 if (state->start > end)
1198                         break;
1199
1200                 if (state->state & bits) {
1201                         bitset = 1;
1202                         if (!filled)
1203                                 break;
1204                 } else if (filled) {
1205                         bitset = 0;
1206                         break;
1207                 }
1208                 start = state->end + 1;
1209                 if (start > end)
1210                         break;
1211                 node = rb_next(node);
1212                 if (!node) {
1213                         if (filled)
1214                                 bitset = 0;
1215                         break;
1216                 }
1217         }
1218         read_unlock_irqrestore(&tree->lock, flags);
1219         return bitset;
1220 }
1221 EXPORT_SYMBOL(test_range_bit);
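/*
 * Editorial sketch (not part of the original file): the two 'filled' modes,
 * matching the page helpers right below.  filled == 1 asks "is the whole
 * range covered by these bits", filled == 0 asks "is any part of it":
 *
 *	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
 *		SetPageUptodate(page);		the whole page is uptodate
 *
 *	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
 *		unlock_page(page);		no byte of the page is locked
 */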
1222
1223 /*
1224  * helper function to set a given page up to date if all the
1225  * extents in the tree for that page are up to date
1226  */
1227 static int check_page_uptodate(struct extent_io_tree *tree,
1228                                struct page *page)
1229 {
1230         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1231         u64 end = start + PAGE_CACHE_SIZE - 1;
1232         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1233                 SetPageUptodate(page);
1234         return 0;
1235 }
1236
1237 /*
1238  * helper function to unlock a page if all the extents in the tree
1239  * for that page are unlocked
1240  */
1241 static int check_page_locked(struct extent_io_tree *tree,
1242                              struct page *page)
1243 {
1244         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1245         u64 end = start + PAGE_CACHE_SIZE - 1;
1246         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1247                 unlock_page(page);
1248         return 0;
1249 }
1250
1251 /*
1252  * helper function to end page writeback if all the extents
1253  * in the tree for that page are done with writeback
1254  */
1255 static int check_page_writeback(struct extent_io_tree *tree,
1256                              struct page *page)
1257 {
1258         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1259         u64 end = start + PAGE_CACHE_SIZE - 1;
1260         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1261                 end_page_writeback(page);
1262         return 0;
1263 }
1264
1265 /* lots and lots of room for performance fixes in the end_bio funcs */
1266
1267 /*
1268  * after a writepage IO is done, we need to:
1269  * clear the uptodate bits on error
1270  * clear the writeback bits in the extent tree for this IO
1271  * end_page_writeback if the page has no more pending IO
1272  *
1273  * Scheduling is not allowed, so the extent state tree is expected
1274  * to have one and only one object corresponding to this IO.
1275  */
1276 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1277 static void end_bio_extent_writepage(struct bio *bio, int err)
1278 #else
1279 static int end_bio_extent_writepage(struct bio *bio,
1280                                    unsigned int bytes_done, int err)
1281 #endif
1282 {
1283         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1284         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1285         struct extent_io_tree *tree = bio->bi_private;
1286         u64 start;
1287         u64 end;
1288         int whole_page;
1289
1290 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1291         if (bio->bi_size)
1292                 return 1;
1293 #endif
1294
1295         do {
1296                 struct page *page = bvec->bv_page;
1297                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1298                          bvec->bv_offset;
1299                 end = start + bvec->bv_len - 1;
1300
1301                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1302                         whole_page = 1;
1303                 else
1304                         whole_page = 0;
1305
1306                 if (--bvec >= bio->bi_io_vec)
1307                         prefetchw(&bvec->bv_page->flags);
1308
1309                 if (!uptodate) {
1310                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1311                         ClearPageUptodate(page);
1312                         SetPageError(page);
1313                 }
1314                 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1315
1316                 if (whole_page)
1317                         end_page_writeback(page);
1318                 else
1319                         check_page_writeback(tree, page);
1320                 if (tree->ops && tree->ops->writepage_end_io_hook)
1321                         tree->ops->writepage_end_io_hook(page, start, end);
1322         } while (bvec >= bio->bi_io_vec);
1323
1324         bio_put(bio);
1325 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1326         return 0;
1327 #endif
1328 }
1329
1330 /*
1331  * after a readpage IO is done, we need to:
1332  * clear the uptodate bits on error
1333  * set the uptodate bits if things worked
1334  * set the page up to date if all extents in the tree are uptodate
1335  * clear the lock bit in the extent tree
1336  * unlock the page if there are no other extents locked for it
1337  *
1338  * Scheduling is not allowed, so the extent state tree is expected
1339  * to have one and only one object corresponding to this IO.
1340  */
1341 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1342 static void end_bio_extent_readpage(struct bio *bio, int err)
1343 #else
1344 static int end_bio_extent_readpage(struct bio *bio,
1345                                    unsigned int bytes_done, int err)
1346 #endif
1347 {
1348         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1349         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1350         struct extent_io_tree *tree = bio->bi_private;
1351         u64 start;
1352         u64 end;
1353         int whole_page;
1354         int ret;
1355
1356 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1357         if (bio->bi_size)
1358                 return 1;
1359 #endif
1360
1361         do {
1362                 struct page *page = bvec->bv_page;
1363                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1364                         bvec->bv_offset;
1365                 end = start + bvec->bv_len - 1;
1366
1367                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1368                         whole_page = 1;
1369                 else
1370                         whole_page = 0;
1371
1372                 if (--bvec >= bio->bi_io_vec)
1373                         prefetchw(&bvec->bv_page->flags);
1374
1375                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1376                         ret = tree->ops->readpage_end_io_hook(page, start, end);
1377                         if (ret)
1378                                 uptodate = 0;
1379                 }
1380                 if (uptodate) {
1381                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1382                         if (whole_page)
1383                                 SetPageUptodate(page);
1384                         else
1385                                 check_page_uptodate(tree, page);
1386                 } else {
1387                         ClearPageUptodate(page);
1388                         SetPageError(page);
1389                 }
1390
1391                 unlock_extent(tree, start, end, GFP_ATOMIC);
1392
1393                 if (whole_page)
1394                         unlock_page(page);
1395                 else
1396                         check_page_locked(tree, page);
1397         } while (bvec >= bio->bi_io_vec);
1398
1399         bio_put(bio);
1400 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1401         return 0;
1402 #endif
1403 }
1404
1405 /*
1406  * IO done from prepare_write is pretty simple: we just unlock
1407  * the structs in the extent tree when done, and set the uptodate bits
1408  * as appropriate.
1409  */
1410 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1411 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1412 #else
1413 static int end_bio_extent_preparewrite(struct bio *bio,
1414                                        unsigned int bytes_done, int err)
1415 #endif
1416 {
1417         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1418         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1419         struct extent_io_tree *tree = bio->bi_private;
1420         u64 start;
1421         u64 end;
1422
1423 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1424         if (bio->bi_size)
1425                 return 1;
1426 #endif
1427
1428         do {
1429                 struct page *page = bvec->bv_page;
1430                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1431                         bvec->bv_offset;
1432                 end = start + bvec->bv_len - 1;
1433
1434                 if (--bvec >= bio->bi_io_vec)
1435                         prefetchw(&bvec->bv_page->flags);
1436
1437                 if (uptodate) {
1438                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1439                 } else {
1440                         ClearPageUptodate(page);
1441                         SetPageError(page);
1442                 }
1443
1444                 unlock_extent(tree, start, end, GFP_ATOMIC);
1445
1446         } while (bvec >= bio->bi_io_vec);
1447
1448         bio_put(bio);
1449 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1450         return 0;
1451 #endif
1452 }
1453
1454 static struct bio *
1455 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1456                  gfp_t gfp_flags)
1457 {
1458         struct bio *bio;
1459
1460         bio = bio_alloc(gfp_flags, nr_vecs);
1461
1462         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1463                 while (!bio && (nr_vecs /= 2))
1464                         bio = bio_alloc(gfp_flags, nr_vecs);
1465         }
1466
1467         if (bio) {
1468                 bio->bi_bdev = bdev;
1469                 bio->bi_sector = first_sector;
1470         }
1471         return bio;
1472 }
1473
1474 static int submit_one_bio(int rw, struct bio *bio)
1475 {
1476         u64 maxsector;
1477         int ret = 0;
1478
1479         bio_get(bio);
1480
1481         maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1482         if (maxsector < bio->bi_sector) {
1483                 printk("sector too large max %Lu got %llu\n", maxsector,
1484                         (unsigned long long)bio->bi_sector);
1485                 WARN_ON(1);
1486         }
1487
1488         submit_bio(rw, bio);
1489         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1490                 ret = -EOPNOTSUPP;
1491         bio_put(bio);
1492         return ret;
1493 }
1494
1495 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1496                               struct page *page, sector_t sector,
1497                               size_t size, unsigned long offset,
1498                               struct block_device *bdev,
1499                               struct bio **bio_ret,
1500                               unsigned long max_pages,
1501                               bio_end_io_t end_io_func)
1502 {
1503         int ret = 0;
1504         struct bio *bio;
1505         int nr;
1506
1507         if (bio_ret && *bio_ret) {
1508                 bio = *bio_ret;
1509                 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1510                     bio_add_page(bio, page, size, offset) < size) {
1511                         ret = submit_one_bio(rw, bio);
1512                         bio = NULL;
1513                 } else {
1514                         return 0;
1515                 }
1516         }
1517         nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
1518         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1519         if (!bio) {
1520                 printk("failed to allocate bio nr %d\n", nr);
1521         }
1522         bio_add_page(bio, page, size, offset);
1523         bio->bi_end_io = end_io_func;
1524         bio->bi_private = tree;
1525         if (bio_ret) {
1526                 *bio_ret = bio;
1527         } else {
1528                 ret = submit_one_bio(rw, bio);
1529         }
1530
1531         return ret;
1532 }
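/*
 * Editorial note (not part of the original file): submit_extent_page()
 * batches contiguous pages into a single bio.  A caller passes the address
 * of a bio pointer that starts out NULL; pages that continue at the next
 * sector are added to the same bio, which is only submitted once a
 * discontiguous page shows up (or bio_add_page() refuses).  The caller owns
 * whatever partial bio is left at the end, roughly:
 *
 *	struct bio *bio = NULL;
 *
 *	for each page to read:
 *		submit_extent_page(READ, tree, page, sector, iosize, offset,
 *				   bdev, &bio, nr_pages,
 *				   end_bio_extent_readpage);
 *	if (bio)
 *		submit_one_bio(READ, bio);
 */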
1533
1534 void set_page_extent_mapped(struct page *page)
1535 {
1536         if (!PagePrivate(page)) {
1537                 SetPagePrivate(page);
1538                 WARN_ON(!page->mapping->a_ops->invalidatepage);
1539                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1540                 page_cache_get(page);
1541         }
1542 }
1543
1544 void set_page_extent_head(struct page *page, unsigned long len)
1545 {
1546         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1547 }
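/*
 * Editorial note (not part of the original file): page->private doubles as a
 * tag here.  set_page_extent_mapped() stores EXTENT_PAGE_PRIVATE to mark a
 * page that belongs to an extent_io_tree, while set_page_extent_head() marks
 * the first page of an extent_buffer and stores the buffer length shifted
 * left by two so it shares the word with the low flag bits; a reader would
 * recover the length on a head page with:
 *
 *	unsigned long len = page_private(page) >> 2;
 */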
1548
1549 /*
1550  * basic readpage implementation.  Locked extent state structs are inserted
1551  * into the tree; they are removed when the IO is done (by the end_io
1552  * handlers)
1553  */
1554 static int __extent_read_full_page(struct extent_io_tree *tree,
1555                                    struct page *page,
1556                                    get_extent_t *get_extent,
1557                                    struct bio **bio)
1558 {
1559         struct inode *inode = page->mapping->host;
1560         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1561         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1562         u64 end;
1563         u64 cur = start;
1564         u64 extent_offset;
1565         u64 last_byte = i_size_read(inode);
1566         u64 block_start;
1567         u64 cur_end;
1568         sector_t sector;
1569         struct extent_map *em;
1570         struct block_device *bdev;
1571         int ret;
1572         int nr = 0;
1573         size_t page_offset = 0;
1574         size_t iosize;
1575         size_t blocksize = inode->i_sb->s_blocksize;
1576
1577         set_page_extent_mapped(page);
1578
1579         end = page_end;
1580         lock_extent(tree, start, end, GFP_NOFS);
1581
1582         while (cur <= end) {
1583                 if (cur >= last_byte) {
1584                         char *userpage;
1585                         iosize = PAGE_CACHE_SIZE - page_offset;
1586                         userpage = kmap_atomic(page, KM_USER0);
1587                         memset(userpage + page_offset, 0, iosize);
1588                         flush_dcache_page(page);
1589                         kunmap_atomic(userpage, KM_USER0);
1590                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1591                                             GFP_NOFS);
1592                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1593                         break;
1594                 }
1595                 em = get_extent(inode, page, page_offset, cur,
1596                                 end - cur + 1, 0);
1597                 if (IS_ERR(em) || !em) {
1598                         SetPageError(page);
1599                         unlock_extent(tree, cur, end, GFP_NOFS);
1600                         break;
1601                 }
1602
1603                 extent_offset = cur - em->start;
1604                 BUG_ON(extent_map_end(em) <= cur);
1605                 BUG_ON(end < cur);
1606
1607                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1608                 cur_end = min(extent_map_end(em) - 1, end);
1609                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1610                 sector = (em->block_start + extent_offset) >> 9;
1611                 bdev = em->bdev;
1612                 block_start = em->block_start;
1613                 free_extent_map(em);
1614                 em = NULL;
1615
1616                 /* we've found a hole, just zero and go on */
1617                 if (block_start == EXTENT_MAP_HOLE) {
1618                         char *userpage;
1619                         userpage = kmap_atomic(page, KM_USER0);
1620                         memset(userpage + page_offset, 0, iosize);
1621                         flush_dcache_page(page);
1622                         kunmap_atomic(userpage, KM_USER0);
1623
1624                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1625                                             GFP_NOFS);
1626                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1627                         cur = cur + iosize;
1628                         page_offset += iosize;
1629                         continue;
1630                 }
1631                 /* get_extent already copied the data into the page */
1632                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1633                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1634                         cur = cur + iosize;
1635                         page_offset += iosize;
1636                         continue;
1637                 }
1638
1639                 ret = 0;
1640                 if (tree->ops && tree->ops->readpage_io_hook) {
1641                         ret = tree->ops->readpage_io_hook(page, cur,
1642                                                           cur + iosize - 1);
1643                 }
1644                 if (!ret) {
1645                         unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1646                         nr -= page->index;
1647                         ret = submit_extent_page(READ, tree, page,
1648                                          sector, iosize, page_offset,
1649                                          bdev, bio, nr,
1650                                          end_bio_extent_readpage);
1651                 }
1652                 if (ret)
1653                         SetPageError(page);
1654                 cur = cur + iosize;
1655                 page_offset += iosize;
1656                 nr++;
1657         }
1658         if (!nr) {
1659                 if (!PageError(page))
1660                         SetPageUptodate(page);
1661                 unlock_page(page);
1662         }
1663         return 0;
1664 }
1665
1666 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1667                             get_extent_t *get_extent)
1668 {
1669         struct bio *bio = NULL;
1670         int ret;
1671
1672         ret = __extent_read_full_page(tree, page, get_extent, &bio);
1673         if (bio)
1674                 submit_one_bio(READ, bio);
1675         return ret;
1676 }
1677 EXPORT_SYMBOL(extent_read_full_page);
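/*
 * Usage sketch (illustrative only, not part of this file): a filesystem
 * built on this code would normally point its ->readpage address_space
 * op at a thin wrapper like the one below.  MY_I() and my_get_extent()
 * are hypothetical placeholders for the filesystem's in-memory inode
 * container and its get_extent_t callback.
 *
 *      static int my_readpage(struct file *file, struct page *page)
 *      {
 *              struct extent_io_tree *tree;
 *
 *              tree = &MY_I(page->mapping->host)->io_tree;
 *              return extent_read_full_page(tree, page, my_get_extent);
 *      }
 */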
1678
1679 /*
1680  * the writepage semantics are similar to regular writepage.  extent
1681  * records are inserted to lock ranges in the tree, and as dirty areas
1682  * are found, they are marked writeback.  Then the lock bits are removed
1683  * and the end_io handler clears the writeback ranges
1684  */
1685 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1686                               void *data)
1687 {
1688         struct inode *inode = page->mapping->host;
1689         struct extent_page_data *epd = data;
1690         struct extent_io_tree *tree = epd->tree;
1691         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1692         u64 delalloc_start;
1693         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1694         u64 end;
1695         u64 cur = start;
1696         u64 extent_offset;
1697         u64 last_byte = i_size_read(inode);
1698         u64 block_start;
1699         u64 iosize;
1700         sector_t sector;
1701         struct extent_map *em;
1702         struct block_device *bdev;
1703         int ret;
1704         int nr = 0;
1705         size_t page_offset = 0;
1706         size_t blocksize;
1707         loff_t i_size = i_size_read(inode);
1708         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1709         u64 nr_delalloc;
1710         u64 delalloc_end;
1711
1712         WARN_ON(!PageLocked(page));
1713         if (page->index > end_index) {
1714                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1715                 unlock_page(page);
1716                 return 0;
1717         }
1718
1719         if (page->index == end_index) {
1720                 char *userpage;
1721
1722                 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1723
1724                 userpage = kmap_atomic(page, KM_USER0);
1725                 memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
1726                 flush_dcache_page(page);
1727                 kunmap_atomic(userpage, KM_USER0);
1728         }
1729
1730         set_page_extent_mapped(page);
1731
1732         delalloc_start = start;
1733         delalloc_end = 0;
1734         while(delalloc_end < page_end) {
1735                 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1736                                                        &delalloc_end,
1737                                                        128 * 1024 * 1024);
1738                 if (nr_delalloc == 0) {
1739                         delalloc_start = delalloc_end + 1;
1740                         continue;
1741                 }
1742                 tree->ops->fill_delalloc(inode, delalloc_start,
1743                                          delalloc_end);
1744                 clear_extent_bit(tree, delalloc_start,
1745                                  delalloc_end,
1746                                  EXTENT_LOCKED | EXTENT_DELALLOC,
1747                                  1, 0, GFP_NOFS);
1748                 delalloc_start = delalloc_end + 1;
1749         }
1750         lock_extent(tree, start, page_end, GFP_NOFS);
1751
1752         end = page_end;
1753         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1754                 printk("found delalloc bits after lock_extent\n");
1755         }
1756
1757         if (last_byte <= start) {
1758                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1759                 goto done;
1760         }
1761
1762         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1763         blocksize = inode->i_sb->s_blocksize;
1764
1765         while (cur <= end) {
1766                 if (cur >= last_byte) {
1767                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1768                         break;
1769                 }
1770                 em = epd->get_extent(inode, page, page_offset, cur,
1771                                      end - cur + 1, 1);
1772                 if (IS_ERR(em) || !em) {
1773                         SetPageError(page);
1774                         break;
1775                 }
1776
1777                 extent_offset = cur - em->start;
1778                 BUG_ON(extent_map_end(em) <= cur);
1779                 BUG_ON(end < cur);
1780                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1781                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1782                 sector = (em->block_start + extent_offset) >> 9;
1783                 bdev = em->bdev;
1784                 block_start = em->block_start;
1785                 free_extent_map(em);
1786                 em = NULL;
1787
1788                 if (block_start == EXTENT_MAP_HOLE ||
1789                     block_start == EXTENT_MAP_INLINE) {
1790                         clear_extent_dirty(tree, cur,
1791                                            cur + iosize - 1, GFP_NOFS);
1792                         cur = cur + iosize;
1793                         page_offset += iosize;
1794                         continue;
1795                 }
1796
1797                 /* leave this out until we have a page_mkwrite call */
1798                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1799                                    EXTENT_DIRTY, 0)) {
1800                         cur = cur + iosize;
1801                         page_offset += iosize;
1802                         continue;
1803                 }
1804                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1805                 if (tree->ops && tree->ops->writepage_io_hook) {
1806                         ret = tree->ops->writepage_io_hook(page, cur,
1807                                                 cur + iosize - 1);
1808                 } else {
1809                         ret = 0;
1810                 }
1811                 if (ret)
1812                         SetPageError(page);
1813                 else {
1814                         unsigned long max_nr = end_index + 1;
1815                         set_range_writeback(tree, cur, cur + iosize - 1);
1816                         if (!PageWriteback(page)) {
1817                                 printk("warning page %lu not writeback, "
1818                                        "cur %llu end %llu\n", page->index,
1819                                        (unsigned long long)cur,
1820                                        (unsigned long long)end);
1821                         }
1822
1823                         ret = submit_extent_page(WRITE, tree, page, sector,
1824                                                  iosize, page_offset, bdev,
1825                                                  &epd->bio, max_nr,
1826                                                  end_bio_extent_writepage);
1827                         if (ret)
1828                                 SetPageError(page);
1829                 }
1830                 cur = cur + iosize;
1831                 page_offset += iosize;
1832                 nr++;
1833         }
1834 done:
1835         if (nr == 0) {
1836                 /* make sure the mapping tag for page dirty gets cleared */
1837                 set_page_writeback(page);
1838                 end_page_writeback(page);
1839         }
1840         unlock_extent(tree, start, page_end, GFP_NOFS);
1841         unlock_page(page);
1842         return 0;
1843 }
1844
1845 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
1846
1847 /* Taken directly from 2.6.23 for 2.6.18 back port */
1848 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
1849                                 void *data);
1850
1851 /**
1852  * write_cache_pages - walk the list of dirty pages of the given address space
1853  * and write all of them.
1854  * @mapping: address space structure to write
1855  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1856  * @writepage: function called for each page
1857  * @data: data passed to writepage function
1858  *
1859  * If a page is already under I/O, write_cache_pages() skips it, even
1860  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
1861  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
1862  * and msync() need to guarantee that all the data which was dirty at the time
1863  * the call was made get new I/O started against them.  If wbc->sync_mode is
1864  * WB_SYNC_ALL then we were called for data integrity and we must wait for
1865  * existing IO to complete.
1866  */
1867 static int write_cache_pages(struct address_space *mapping,
1868                       struct writeback_control *wbc, writepage_t writepage,
1869                       void *data)
1870 {
1871         struct backing_dev_info *bdi = mapping->backing_dev_info;
1872         int ret = 0;
1873         int done = 0;
1874         struct pagevec pvec;
1875         int nr_pages;
1876         pgoff_t index;
1877         pgoff_t end;            /* Inclusive */
1878         int scanned = 0;
1879         int range_whole = 0;
1880
1881         if (wbc->nonblocking && bdi_write_congested(bdi)) {
1882                 wbc->encountered_congestion = 1;
1883                 return 0;
1884         }
1885
1886         pagevec_init(&pvec, 0);
1887         if (wbc->range_cyclic) {
1888                 index = mapping->writeback_index; /* Start from prev offset */
1889                 end = -1;
1890         } else {
1891                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
1892                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
1893                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
1894                         range_whole = 1;
1895                 scanned = 1;
1896         }
1897 retry:
1898         while (!done && (index <= end) &&
1899                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1900                                               PAGECACHE_TAG_DIRTY,
1901                                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
1902                 unsigned i;
1903
1904                 scanned = 1;
1905                 for (i = 0; i < nr_pages; i++) {
1906                         struct page *page = pvec.pages[i];
1907
1908                         /*
1909                          * At this point we hold neither mapping->tree_lock nor
1910                          * lock on the page itself: the page may be truncated or
1911                          * invalidated (changing page->mapping to NULL), or even
1912                          * swizzled back from swapper_space to tmpfs file
1913                          * mapping
1914                          */
1915                         lock_page(page);
1916
1917                         if (unlikely(page->mapping != mapping)) {
1918                                 unlock_page(page);
1919                                 continue;
1920                         }
1921
1922                         if (!wbc->range_cyclic && page->index > end) {
1923                                 done = 1;
1924                                 unlock_page(page);
1925                                 continue;
1926                         }
1927
1928                         if (wbc->sync_mode != WB_SYNC_NONE)
1929                                 wait_on_page_writeback(page);
1930
1931                         if (PageWriteback(page) ||
1932                             !clear_page_dirty_for_io(page)) {
1933                                 unlock_page(page);
1934                                 continue;
1935                         }
1936
1937                         ret = (*writepage)(page, wbc, data);
1938
1939                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
1940                                 unlock_page(page);
1941                                 ret = 0;
1942                         }
1943                         if (ret || (--(wbc->nr_to_write) <= 0))
1944                                 done = 1;
1945                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
1946                                 wbc->encountered_congestion = 1;
1947                                 done = 1;
1948                         }
1949                 }
1950                 pagevec_release(&pvec);
1951                 cond_resched();
1952         }
1953         if (!scanned && !done) {
1954                 /*
1955                  * We hit the last page and there is more work to be done: wrap
1956                  * back to the start of the file
1957                  */
1958                 scanned = 1;
1959                 index = 0;
1960                 goto retry;
1961         }
1962         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
1963                 mapping->writeback_index = index;
1964         return ret;
1965 }
1966 #endif
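/*
 * Illustrative call (my_writepage and my_data are hypothetical
 * placeholders): as the comment above explains, a data-integrity caller
 * must use WB_SYNC_ALL so pages that are already under IO are waited on
 * rather than skipped.
 *
 *      struct writeback_control wbc = {
 *              .sync_mode      = WB_SYNC_ALL,
 *              .nr_to_write    = LONG_MAX,
 *              .range_start    = 0,
 *              .range_end      = LLONG_MAX,
 *      };
 *
 *      ret = write_cache_pages(mapping, &wbc, my_writepage, my_data);
 */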
1967
1968 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
1969                           get_extent_t *get_extent,
1970                           struct writeback_control *wbc)
1971 {
1972         int ret;
1973         struct address_space *mapping = page->mapping;
1974         struct extent_page_data epd = {
1975                 .bio = NULL,
1976                 .tree = tree,
1977                 .get_extent = get_extent,
1978         };
1979         struct writeback_control wbc_writepages = {
1980                 .bdi            = wbc->bdi,
1981                 .sync_mode      = WB_SYNC_NONE,
1982                 .older_than_this = NULL,
1983                 .nr_to_write    = 64,
1984                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
1985                 .range_end      = (loff_t)-1,
1986         };
1987
1988
1989         ret = __extent_writepage(page, wbc, &epd);
1990
1991         write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
1992         if (epd.bio) {
1993                 submit_one_bio(WRITE, epd.bio);
1994         }
1995         return ret;
1996 }
1997 EXPORT_SYMBOL(extent_write_full_page);
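/*
 * Usage sketch (illustrative only): the matching ->writepage op is again
 * a thin wrapper.  MY_I() and my_get_extent() are hypothetical
 * placeholders, as in the readpage example above.
 *
 *      static int my_writepage(struct page *page, struct writeback_control *wbc)
 *      {
 *              struct extent_io_tree *tree;
 *
 *              tree = &MY_I(page->mapping->host)->io_tree;
 *              return extent_write_full_page(tree, page, my_get_extent, wbc);
 *      }
 */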
1998
1999
2000 int extent_writepages(struct extent_io_tree *tree,
2001                       struct address_space *mapping,
2002                       get_extent_t *get_extent,
2003                       struct writeback_control *wbc)
2004 {
2005         int ret = 0;
2006         struct extent_page_data epd = {
2007                 .bio = NULL,
2008                 .tree = tree,
2009                 .get_extent = get_extent,
2010         };
2011
2012         ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2013         if (epd.bio) {
2014                 submit_one_bio(WRITE, epd.bio);
2015         }
2016         return ret;
2017 }
2018 EXPORT_SYMBOL(extent_writepages);
2019
2020 int extent_readpages(struct extent_io_tree *tree,
2021                      struct address_space *mapping,
2022                      struct list_head *pages, unsigned nr_pages,
2023                      get_extent_t get_extent)
2024 {
2025         struct bio *bio = NULL;
2026         unsigned page_idx;
2027         struct pagevec pvec;
2028
2029         pagevec_init(&pvec, 0);
2030         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2031                 struct page *page = list_entry(pages->prev, struct page, lru);
2032
2033                 prefetchw(&page->flags);
2034                 list_del(&page->lru);
2035                 /*
2036                  * what we want to do here is call add_to_page_cache_lru,
2037                  * but that isn't exported, so we reproduce it here
2038                  */
2039                 if (!add_to_page_cache(page, mapping,
2040                                         page->index, GFP_KERNEL)) {
2041
2042                         /* open coding of lru_cache_add, also not exported */
2043                         page_cache_get(page);
2044                         if (!pagevec_add(&pvec, page))
2045                                 __pagevec_lru_add(&pvec);
2046                         __extent_read_full_page(tree, page, get_extent, &bio);
2047                 }
2048                 page_cache_release(page);
2049         }
2050         if (pagevec_count(&pvec))
2051                 __pagevec_lru_add(&pvec);
2052         BUG_ON(!list_empty(pages));
2053         if (bio)
2054                 submit_one_bio(READ, bio);
2055         return 0;
2056 }
2057 EXPORT_SYMBOL(extent_readpages);
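/*
 * Usage sketch (illustrative only): ->readpages takes the list of pages
 * assembled by the readahead code and hands it straight to
 * extent_readpages().  MY_I() and my_get_extent() are hypothetical
 * placeholders.
 *
 *      static int my_readpages(struct file *file, struct address_space *mapping,
 *                              struct list_head *pages, unsigned nr_pages)
 *      {
 *              struct extent_io_tree *tree;
 *
 *              tree = &MY_I(mapping->host)->io_tree;
 *              return extent_readpages(tree, mapping, pages, nr_pages,
 *                                      my_get_extent);
 *      }
 */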
2058
2059 /*
2060  * basic invalidatepage code.  It waits on any locked or writeback
2061  * ranges corresponding to the page, and then deletes any extent state
2062  * records from the tree
2063  */
2064 int extent_invalidatepage(struct extent_io_tree *tree,
2065                           struct page *page, unsigned long offset)
2066 {
2067         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2068         u64 end = start + PAGE_CACHE_SIZE - 1;
2069         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2070
2071         start += (offset + blocksize - 1) & ~(blocksize - 1);
2072         if (start > end)
2073                 return 0;
2074
2075         lock_extent(tree, start, end, GFP_NOFS);
2076         wait_on_extent_writeback(tree, start, end);
2077         clear_extent_bit(tree, start, end,
2078                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2079                          1, 1, GFP_NOFS);
2080         return 0;
2081 }
2082 EXPORT_SYMBOL(extent_invalidatepage);
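/*
 * Usage sketch (illustrative only): the ->invalidatepage address_space
 * op returns void, so a wrapper just forwards the partial-page offset
 * and ignores the return value.  MY_I() is a hypothetical placeholder.
 *
 *      static void my_invalidatepage(struct page *page, unsigned long offset)
 *      {
 *              struct extent_io_tree *tree;
 *
 *              tree = &MY_I(page->mapping->host)->io_tree;
 *              extent_invalidatepage(tree, page, offset);
 *      }
 */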
2083
2084 /*
2085  * simple commit_write call.  The page is marked dirty and the inode's
2086  * i_size is updated when the write extends past the old i_size
2087  */
2088 int extent_commit_write(struct extent_io_tree *tree,
2089                         struct inode *inode, struct page *page,
2090                         unsigned from, unsigned to)
2091 {
2092         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2093
2094         set_page_extent_mapped(page);
2095         set_page_dirty(page);
2096
2097         if (pos > inode->i_size) {
2098                 i_size_write(inode, pos);
2099                 mark_inode_dirty(inode);
2100         }
2101         return 0;
2102 }
2103 EXPORT_SYMBOL(extent_commit_write);
2104
2105 int extent_prepare_write(struct extent_io_tree *tree,
2106                          struct inode *inode, struct page *page,
2107                          unsigned from, unsigned to, get_extent_t *get_extent)
2108 {
2109         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2110         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2111         u64 block_start;
2112         u64 orig_block_start;
2113         u64 block_end;
2114         u64 cur_end;
2115         struct extent_map *em;
2116         unsigned blocksize = 1 << inode->i_blkbits;
2117         size_t page_offset = 0;
2118         size_t block_off_start;
2119         size_t block_off_end;
2120         int err = 0;
2121         int iocount = 0;
2122         int ret = 0;
2123         int isnew;
2124
2125         set_page_extent_mapped(page);
2126
2127         block_start = (page_start + from) & ~((u64)blocksize - 1);
2128         block_end = (page_start + to - 1) | (blocksize - 1);
2129         orig_block_start = block_start;
2130
2131         lock_extent(tree, page_start, page_end, GFP_NOFS);
2132         while(block_start <= block_end) {
2133                 em = get_extent(inode, page, page_offset, block_start,
2134                                 block_end - block_start + 1, 1);
2135                 if (IS_ERR(em) || !em) {
2136                         goto err;
2137                 }
2138                 cur_end = min(block_end, extent_map_end(em) - 1);
2139                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2140                 block_off_end = block_off_start + blocksize;
2141                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2142
2143                 if (!PageUptodate(page) && isnew &&
2144                     (block_off_end > to || block_off_start < from)) {
2145                         void *kaddr;
2146
2147                         kaddr = kmap_atomic(page, KM_USER0);
2148                         if (block_off_end > to)
2149                                 memset(kaddr + to, 0, block_off_end - to);
2150                         if (block_off_start < from)
2151                                 memset(kaddr + block_off_start, 0,
2152                                        from - block_off_start);
2153                         flush_dcache_page(page);
2154                         kunmap_atomic(kaddr, KM_USER0);
2155                 }
2156                 if ((em->block_start != EXTENT_MAP_HOLE &&
2157                      em->block_start != EXTENT_MAP_INLINE) &&
2158                     !isnew && !PageUptodate(page) &&
2159                     (block_off_end > to || block_off_start < from) &&
2160                     !test_range_bit(tree, block_start, cur_end,
2161                                     EXTENT_UPTODATE, 1)) {
2162                         u64 sector;
2163                         u64 extent_offset = block_start - em->start;
2164                         size_t iosize;
2165                         sector = (em->block_start + extent_offset) >> 9;
2166                         iosize = (cur_end - block_start + blocksize) &
2167                                 ~((u64)blocksize - 1);
2168                         /*
2169                          * we've already got the extent locked, but we
2170                          * need to split the state such that our end_bio
2171                          * handler can clear the lock.
2172                          */
2173                         set_extent_bit(tree, block_start,
2174                                        block_start + iosize - 1,
2175                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2176                         ret = submit_extent_page(READ, tree, page,
2177                                          sector, iosize, page_offset, em->bdev,
2178                                          NULL, 1,
2179                                          end_bio_extent_preparewrite);
2180                         iocount++;
2181                         block_start = block_start + iosize;
2182                 } else {
2183                         set_extent_uptodate(tree, block_start, cur_end,
2184                                             GFP_NOFS);
2185                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2186                         block_start = cur_end + 1;
2187                 }
2188                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2189                 free_extent_map(em);
2190         }
2191         if (iocount) {
2192                 wait_extent_bit(tree, orig_block_start,
2193                                 block_end, EXTENT_LOCKED);
2194         }
2195         check_page_uptodate(tree, page);
2196 err:
2197         /* FIXME, zero out newly allocated blocks on error */
2198         return err;
2199 }
2200 EXPORT_SYMBOL(extent_prepare_write);
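/*
 * Usage sketch (illustrative only): the prepare_write/commit_write pair
 * of address_space ops maps directly onto the two helpers above.
 * MY_I() and my_get_extent() are hypothetical placeholders.
 *
 *      static int my_prepare_write(struct file *file, struct page *page,
 *                                  unsigned from, unsigned to)
 *      {
 *              struct inode *inode = page->mapping->host;
 *
 *              return extent_prepare_write(&MY_I(inode)->io_tree, inode,
 *                                          page, from, to, my_get_extent);
 *      }
 *
 *      static int my_commit_write(struct file *file, struct page *page,
 *                                 unsigned from, unsigned to)
 *      {
 *              struct inode *inode = page->mapping->host;
 *
 *              return extent_commit_write(&MY_I(inode)->io_tree, inode,
 *                                         page, from, to);
 *      }
 */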
2201
2202 /*
2203  * a helper for releasepage.  As long as there are no locked extents
2204  * in the range corresponding to the page, both state records and extent
2205  * map records are removed
2206  */
2207 int try_release_extent_mapping(struct extent_map_tree *map,
2208                                struct extent_io_tree *tree, struct page *page)
2209 {
2210         struct extent_map *em;
2211         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2212         u64 end = start + PAGE_CACHE_SIZE - 1;
2213         u64 orig_start = start;
2214         int ret = 1;
2215
2216         while (start <= end) {
2217                 spin_lock(&map->lock);
2218                 em = lookup_extent_mapping(map, start, end);
2219                 if (!em || IS_ERR(em)) {
2220                         spin_unlock(&map->lock);
2221                         break;
2222                 }
2223                 if (!test_range_bit(tree, em->start, extent_map_end(em) - 1,
2224                                     EXTENT_LOCKED, 0)) {
2225                         remove_extent_mapping(map, em);
2226                         /* once for the rb tree */
2227                         free_extent_map(em);
2228                 }
2229                 start = extent_map_end(em);
2230                 spin_unlock(&map->lock);
2231
2232                 /* once for us */
2233                 free_extent_map(em);
2234         }
2235         if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
2236                 ret = 0;
2237         else
2238                 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
2239                                  1, 1, GFP_NOFS);
2240         return ret;
2241 }
2242 EXPORT_SYMBOL(try_release_extent_mapping);
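/*
 * Usage sketch (illustrative only): ->releasepage can forward to
 * try_release_extent_mapping() with both the extent_map tree and the
 * extent_io tree for the inode; a filesystem would typically refuse to
 * release dirty or writeback pages first.  MY_I(), ->extent_tree and
 * ->io_tree are hypothetical placeholders for wherever the filesystem
 * keeps those trees.
 *
 *      static int my_releasepage(struct page *page, gfp_t gfp_flags)
 *      {
 *              struct inode *inode = page->mapping->host;
 *
 *              if (PageWriteback(page) || PageDirty(page))
 *                      return 0;
 *              return try_release_extent_mapping(&MY_I(inode)->extent_tree,
 *                                                &MY_I(inode)->io_tree, page);
 *      }
 */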
2243
2244 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2245                 get_extent_t *get_extent)
2246 {
2247         struct inode *inode = mapping->host;
2248         u64 start = iblock << inode->i_blkbits;
2249         sector_t sector = 0;
2250         struct extent_map *em;
2251
2252         em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2253         if (!em || IS_ERR(em))
2254                 return 0;
2255
2256         if (em->block_start == EXTENT_MAP_INLINE ||
2257             em->block_start == EXTENT_MAP_HOLE)
2258                 goto out;
2259
2260         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2261 printk("bmap finds %Lu %Lu block %Lu\n", em->start, em->len, em->block_start);
2262 out:
2263         free_extent_map(em);
2264         return sector;
2265 }
2266
2267 static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
2268 {
2269         if (list_empty(&eb->lru)) {
2270                 extent_buffer_get(eb);
2271                 list_add(&eb->lru, &tree->buffer_lru);
2272                 tree->lru_size++;
2273                 if (tree->lru_size >= BUFFER_LRU_MAX) {
2274                         struct extent_buffer *rm;
2275                         rm = list_entry(tree->buffer_lru.prev,
2276                                         struct extent_buffer, lru);
2277                         tree->lru_size--;
2278                         list_del_init(&rm->lru);
2279                         free_extent_buffer(rm);
2280                 }
2281         } else
2282                 list_move(&eb->lru, &tree->buffer_lru);
2283         return 0;
2284 }
2285 static struct extent_buffer *find_lru(struct extent_io_tree *tree,
2286                                       u64 start, unsigned long len)
2287 {
2288         struct list_head *lru = &tree->buffer_lru;
2289         struct list_head *cur = lru->next;
2290         struct extent_buffer *eb;
2291
2292         if (list_empty(lru))
2293                 return NULL;
2294
2295         do {
2296                 eb = list_entry(cur, struct extent_buffer, lru);
2297                 if (eb->start == start && eb->len == len) {
2298                         extent_buffer_get(eb);
2299                         return eb;
2300                 }
2301                 cur = cur->next;
2302         } while (cur != lru);
2303         return NULL;
2304 }
2305
2306 static inline unsigned long num_extent_pages(u64 start, u64 len)
2307 {
2308         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2309                 (start >> PAGE_CACHE_SHIFT);
2310 }
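/*
 * Worked example (illustrative): with 4K pages, an extent buffer with
 * start == 6144 and len == 8192 ends at byte 14335 and touches pages
 * 1, 2 and 3, so num_extent_pages() returns
 * ((6144 + 8192 + 4095) >> 12) - (6144 >> 12) == 4 - 1 == 3.
 */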
2311
2312 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2313                                               unsigned long i)
2314 {
2315         struct page *p;
2316         struct address_space *mapping;
2317
2318         if (i == 0)
2319                 return eb->first_page;
2320         i += eb->start >> PAGE_CACHE_SHIFT;
2321         mapping = eb->first_page->mapping;
2322         read_lock_irq(&mapping->tree_lock);
2323         p = radix_tree_lookup(&mapping->page_tree, i);
2324         read_unlock_irq(&mapping->tree_lock);
2325         return p;
2326 }
2327
2328 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2329                                                    u64 start,
2330                                                    unsigned long len,
2331                                                    gfp_t mask)
2332 {
2333         struct extent_buffer *eb = NULL;
2334
2335         spin_lock(&tree->lru_lock);
2336         eb = find_lru(tree, start, len);
2337         spin_unlock(&tree->lru_lock);
2338         if (eb) {
2339                 return eb;
2340         }
2341
2342         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
             if (!eb)
                     return NULL; /* callers check for a NULL return */
2343         INIT_LIST_HEAD(&eb->lru);
2344         eb->start = start;
2345         eb->len = len;
2346         atomic_set(&eb->refs, 1);
2347
2348         return eb;
2349 }
2350
2351 static void __free_extent_buffer(struct extent_buffer *eb)
2352 {
2353         kmem_cache_free(extent_buffer_cache, eb);
2354 }
2355
2356 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2357                                           u64 start, unsigned long len,
2358                                           struct page *page0,
2359                                           gfp_t mask)
2360 {
2361         unsigned long num_pages = num_extent_pages(start, len);
2362         unsigned long i;
2363         unsigned long index = start >> PAGE_CACHE_SHIFT;
2364         struct extent_buffer *eb;
2365         struct page *p;
2366         struct address_space *mapping = tree->mapping;
2367         int uptodate = 1;
2368
2369         eb = __alloc_extent_buffer(tree, start, len, mask);
2370         if (!eb || IS_ERR(eb))
2371                 return NULL;
2372
2373         if (eb->flags & EXTENT_BUFFER_FILLED)
2374                 goto lru_add;
2375
2376         if (page0) {
2377                 eb->first_page = page0;
2378                 i = 1;
2379                 index++;
2380                 page_cache_get(page0);
2381                 mark_page_accessed(page0);
2382                 set_page_extent_mapped(page0);
2383                 WARN_ON(!PageUptodate(page0));
2384                 set_page_extent_head(page0, len);
2385         } else {
2386                 i = 0;
2387         }
2388         for (; i < num_pages; i++, index++) {
2389                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2390                 if (!p) {
2391                         WARN_ON(1);
2392                         goto fail;
2393                 }
2394                 set_page_extent_mapped(p);
2395                 mark_page_accessed(p);
2396                 if (i == 0) {
2397                         eb->first_page = p;
2398                         set_page_extent_head(p, len);
2399                 } else {
2400                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2401                 }
2402                 if (!PageUptodate(p))
2403                         uptodate = 0;
2404                 unlock_page(p);
2405         }
2406         if (uptodate)
2407                 eb->flags |= EXTENT_UPTODATE;
2408         eb->flags |= EXTENT_BUFFER_FILLED;
2409
2410 lru_add:
2411         spin_lock(&tree->lru_lock);
2412         add_lru(tree, eb);
2413         spin_unlock(&tree->lru_lock);
2414         return eb;
2415
2416 fail:
2417         spin_lock(&tree->lru_lock);
2418         list_del_init(&eb->lru);
2419         spin_unlock(&tree->lru_lock);
2420         if (!atomic_dec_and_test(&eb->refs))
2421                 return NULL;
2422         for (index = 1; index < i; index++) {
2423                 page_cache_release(extent_buffer_page(eb, index));
2424         }
2425         if (i > 0)
2426                 page_cache_release(extent_buffer_page(eb, 0));
2427         __free_extent_buffer(eb);
2428         return NULL;
2429 }
2430 EXPORT_SYMBOL(alloc_extent_buffer);
2431
2432 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2433                                          u64 start, unsigned long len,
2434                                           gfp_t mask)
2435 {
2436         unsigned long num_pages = num_extent_pages(start, len);
2437         unsigned long i;
2438         unsigned long index = start >> PAGE_CACHE_SHIFT;
2439         struct extent_buffer *eb;
2440         struct page *p;
2441         struct address_space *mapping = tree->mapping;
2442         int uptodate = 1;
2443
2444         eb = __alloc_extent_buffer(tree, start, len, mask);
2445         if (!eb || IS_ERR(eb))
2446                 return NULL;
2447
2448         if (eb->flags & EXTENT_BUFFER_FILLED)
2449                 goto lru_add;
2450
2451         for (i = 0; i < num_pages; i++, index++) {
2452                 p = find_lock_page(mapping, index);
2453                 if (!p) {
2454                         goto fail;
2455                 }
2456                 set_page_extent_mapped(p);
2457                 mark_page_accessed(p);
2458
2459                 if (i == 0) {
2460                         eb->first_page = p;
2461                         set_page_extent_head(p, len);
2462                 } else {
2463                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2464                 }
2465
2466                 if (!PageUptodate(p))
2467                         uptodate = 0;
2468                 unlock_page(p);
2469         }
2470         if (uptodate)
2471                 eb->flags |= EXTENT_UPTODATE;
2472         eb->flags |= EXTENT_BUFFER_FILLED;
2473
2474 lru_add:
2475         spin_lock(&tree->lru_lock);
2476         add_lru(tree, eb);
2477         spin_unlock(&tree->lru_lock);
2478         return eb;
2479 fail:
2480         spin_lock(&tree->lru_lock);
2481         list_del_init(&eb->lru);
2482         spin_unlock(&tree->lru_lock);
2483         if (!atomic_dec_and_test(&eb->refs))
2484                 return NULL;
2485         for (index = 1; index < i; index++) {
2486                 page_cache_release(extent_buffer_page(eb, index));
2487         }
2488         if (i > 0)
2489                 page_cache_release(extent_buffer_page(eb, 0));
2490         __free_extent_buffer(eb);
2491         return NULL;
2492 }
2493 EXPORT_SYMBOL(find_extent_buffer);
2494
2495 void free_extent_buffer(struct extent_buffer *eb)
2496 {
2497         unsigned long i;
2498         unsigned long num_pages;
2499
2500         if (!eb)
2501                 return;
2502
2503         if (!atomic_dec_and_test(&eb->refs))
2504                 return;
2505
2506         WARN_ON(!list_empty(&eb->lru));
2507         num_pages = num_extent_pages(eb->start, eb->len);
2508
2509         for (i = 1; i < num_pages; i++) {
2510                 page_cache_release(extent_buffer_page(eb, i));
2511         }
2512         page_cache_release(extent_buffer_page(eb, 0));
2513         __free_extent_buffer(eb);
2514 }
2515 EXPORT_SYMBOL(free_extent_buffer);
2516
2517 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2518                               struct extent_buffer *eb)
2519 {
2520         int set;
2521         unsigned long i;
2522         unsigned long num_pages;
2523         struct page *page;
2524
2525         u64 start = eb->start;
2526         u64 end = start + eb->len - 1;
2527
2528         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2529         num_pages = num_extent_pages(eb->start, eb->len);
2530
2531         for (i = 0; i < num_pages; i++) {
2532                 page = extent_buffer_page(eb, i);
2533                 lock_page(page);
2534                 if (i == 0)
2535                         set_page_extent_head(page, eb->len);
2536                 else
2537                         set_page_private(page, EXTENT_PAGE_PRIVATE);
2538
2539                 /*
2540                  * if this is the first or last page and the extent buffer
2541                  * isn't page aligned at that end, do extra checks to make
2542                  * sure we don't clean a page that is partially dirty
2543                  */
2544                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2545                     ((i == num_pages - 1) &&
2546                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2547                         start = (u64)page->index << PAGE_CACHE_SHIFT;
2548                         end  = start + PAGE_CACHE_SIZE - 1;
2549                         if (test_range_bit(tree, start, end,
2550                                            EXTENT_DIRTY, 0)) {
2551                                 unlock_page(page);
2552                                 continue;
2553                         }
2554                 }
2555                 clear_page_dirty_for_io(page);
2556                 write_lock_irq(&page->mapping->tree_lock);
2557                 if (!PageDirty(page)) {
2558                         radix_tree_tag_clear(&page->mapping->page_tree,
2559                                                 page_index(page),
2560                                                 PAGECACHE_TAG_DIRTY);
2561                 }
2562                 write_unlock_irq(&page->mapping->tree_lock);
2563                 unlock_page(page);
2564         }
2565         return 0;
2566 }
2567 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2568
2569 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2570                                     struct extent_buffer *eb)
2571 {
2572         return wait_on_extent_writeback(tree, eb->start,
2573                                         eb->start + eb->len - 1);
2574 }
2575 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2576
2577 int set_extent_buffer_dirty(struct extent_io_tree *tree,
2578                              struct extent_buffer *eb)
2579 {
2580         unsigned long i;
2581         unsigned long num_pages;
2582
2583         num_pages = num_extent_pages(eb->start, eb->len);
2584         for (i = 0; i < num_pages; i++) {
2585                 struct page *page = extent_buffer_page(eb, i);
2586                 /* writepage may need to do something special for the
2587                  * first page, so we have to make sure page->private is
2588                  * properly set.  releasepage may drop page->private
2589                  * on us if the page isn't already dirty.
2590                  */
2591                 if (i == 0) {
2592                         lock_page(page);
2593                         set_page_extent_head(page, eb->len);
2594                 } else if (PagePrivate(page) &&
2595                            page->private != EXTENT_PAGE_PRIVATE) {
2596                         lock_page(page);
2597                         set_page_extent_mapped(page);
2598                         unlock_page(page);
2599                 }
2600                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2601                 if (i == 0)
2602                         unlock_page(page);
2603         }
2604         return set_extent_dirty(tree, eb->start,
2605                                 eb->start + eb->len - 1, GFP_NOFS);
2606 }
2607 EXPORT_SYMBOL(set_extent_buffer_dirty);
2608
2609 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2610                                 struct extent_buffer *eb)
2611 {
2612         unsigned long i;
2613         struct page *page;
2614         unsigned long num_pages;
2615
2616         num_pages = num_extent_pages(eb->start, eb->len);
2617
2618         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2619                             GFP_NOFS);
2620         for (i = 0; i < num_pages; i++) {
2621                 page = extent_buffer_page(eb, i);
2622                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2623                     ((i == num_pages - 1) &&
2624                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2625                         check_page_uptodate(tree, page);
2626                         continue;
2627                 }
2628                 SetPageUptodate(page);
2629         }
2630         return 0;
2631 }
2632 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2633
2634 int extent_buffer_uptodate(struct extent_io_tree *tree,
2635                              struct extent_buffer *eb)
2636 {
2637         if (eb->flags & EXTENT_UPTODATE)
2638                 return 1;
2639         return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2640                            EXTENT_UPTODATE, 1);
2641 }
2642 EXPORT_SYMBOL(extent_buffer_uptodate);
2643
2644 int read_extent_buffer_pages(struct extent_io_tree *tree,
2645                              struct extent_buffer *eb,
2646                              u64 start,
2647                              int wait)
2648 {
2649         unsigned long i;
2650         unsigned long start_i;
2651         struct page *page;
2652         int err;
2653         int ret = 0;
2654         unsigned long num_pages;
2655
2656         if (eb->flags & EXTENT_UPTODATE)
2657                 return 0;
2658
2659         if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2660                            EXTENT_UPTODATE, 1)) {
2661                 return 0;
2662         }
2663
2664         if (start) {
2665                 WARN_ON(start < eb->start);
2666                 start_i = (start >> PAGE_CACHE_SHIFT) -
2667                         (eb->start >> PAGE_CACHE_SHIFT);
2668         } else {
2669                 start_i = 0;
2670         }
2671
2672         num_pages = num_extent_pages(eb->start, eb->len);
2673         for (i = start_i; i < num_pages; i++) {
2674                 page = extent_buffer_page(eb, i);
2675                 if (PageUptodate(page)) {
2676                         continue;
2677                 }
2678                 if (!wait) {
2679                         if (TestSetPageLocked(page)) {
2680                                 continue;
2681                         }
2682                 } else {
2683                         lock_page(page);
2684                 }
2685                 if (!PageUptodate(page)) {
2686                         err = page->mapping->a_ops->readpage(NULL, page);
2687                         if (err) {
2688                                 ret = err;
2689                         }
2690                 } else {
2691                         unlock_page(page);
2692                 }
2693         }
2694
2695         if (ret || !wait) {
2696                 return ret;
2697         }
2698
2699         for (i = start_i; i < num_pages; i++) {
2700                 page = extent_buffer_page(eb, i);
2701                 wait_on_page_locked(page);
2702                 if (!PageUptodate(page)) {
2703                         ret = -EIO;
2704                 }
2705         }
2706         if (!ret)
2707                 eb->flags |= EXTENT_UPTODATE;
2708         return ret;
2709 }
2710 EXPORT_SYMBOL(read_extent_buffer_pages);
2711
2712 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2713                         unsigned long start,
2714                         unsigned long len)
2715 {
2716         size_t cur;
2717         size_t offset;
2718         struct page *page;
2719         char *kaddr;
2720         char *dst = (char *)dstv;
2721         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2722         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2723         unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2724
2725         WARN_ON(start > eb->len);
2726         WARN_ON(start + len > eb->start + eb->len);
2727
2728         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2729
2730         while(len > 0) {
2731                 page = extent_buffer_page(eb, i);
2732                 if (!PageUptodate(page)) {
2733                         printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
2734                         WARN_ON(1);
2735                 }
2736                 WARN_ON(!PageUptodate(page));
2737
2738                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2739                 kaddr = kmap_atomic(page, KM_USER1);
2740                 memcpy(dst, kaddr + offset, cur);
2741                 kunmap_atomic(kaddr, KM_USER1);
2742
2743                 dst += cur;
2744                 len -= cur;
2745                 offset = 0;
2746                 i++;
2747         }
2748 }
2749 EXPORT_SYMBOL(read_extent_buffer);
2750
2751 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2752                                unsigned long min_len, char **token, char **map,
2753                                unsigned long *map_start,
2754                                unsigned long *map_len, int km)
2755 {
2756         size_t offset = start & (PAGE_CACHE_SIZE - 1);
2757         char *kaddr;
2758         struct page *p;
2759         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2760         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2761         unsigned long end_i = (start_offset + start + min_len - 1) >>
2762                 PAGE_CACHE_SHIFT;
2763
2764         if (i != end_i)
2765                 return -EINVAL;
2766
2767         if (i == 0) {
2768                 offset = start_offset;
2769                 *map_start = 0;
2770         } else {
2771                 offset = 0;
2772                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
2773         }
2774         if (start + min_len > eb->len) {
2775 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
2776                 WARN_ON(1);
2777         }
2778
2779         p = extent_buffer_page(eb, i);
2780         WARN_ON(!PageUptodate(p));
2781         kaddr = kmap_atomic(p, km);
2782         *token = kaddr;
2783         *map = kaddr + offset;
2784         *map_len = PAGE_CACHE_SIZE - offset;
2785         return 0;
2786 }
2787 EXPORT_SYMBOL(map_private_extent_buffer);
2788
2789 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2790                       unsigned long min_len,
2791                       char **token, char **map,
2792                       unsigned long *map_start,
2793                       unsigned long *map_len, int km)
2794 {
2795         int err;
2796         int save = 0;
2797         if (eb->map_token) {
2798                 unmap_extent_buffer(eb, eb->map_token, km);
2799                 eb->map_token = NULL;
2800                 save = 1;
2801         }
2802         err = map_private_extent_buffer(eb, start, min_len, token, map,
2803                                        map_start, map_len, km);
2804         if (!err && save) {
2805                 eb->map_token = *token;
2806                 eb->kaddr = *map;
2807                 eb->map_start = *map_start;
2808                 eb->map_len = *map_len;
2809         }
2810         return err;
2811 }
2812 EXPORT_SYMBOL(map_extent_buffer);
2813
2814 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2815 {
2816         kunmap_atomic(token, km);
2817 }
2818 EXPORT_SYMBOL(unmap_extent_buffer);
2819
2820 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2821                           unsigned long start,
2822                           unsigned long len)
2823 {
2824         size_t cur;
2825         size_t offset;
2826         struct page *page;
2827         char *kaddr;
2828         char *ptr = (char *)ptrv;
2829         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2830         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2831         int ret = 0;
2832
2833         WARN_ON(start > eb->len);
2834         WARN_ON(start + len > eb->start + eb->len);
2835
2836         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2837
2838         while(len > 0) {
2839                 page = extent_buffer_page(eb, i);
2840                 WARN_ON(!PageUptodate(page));
2841
2842                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2843
2844                 kaddr = kmap_atomic(page, KM_USER0);
2845                 ret = memcmp(ptr, kaddr + offset, cur);
2846                 kunmap_atomic(kaddr, KM_USER0);
2847                 if (ret)
2848                         break;
2849
2850                 ptr += cur;
2851                 len -= cur;
2852                 offset = 0;
2853                 i++;
2854         }
2855         return ret;
2856 }
2857 EXPORT_SYMBOL(memcmp_extent_buffer);
2858
2859 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
2860                          unsigned long start, unsigned long len)
2861 {
2862         size_t cur;
2863         size_t offset;
2864         struct page *page;
2865         char *kaddr;
2866         char *src = (char *)srcv;
2867         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2868         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2869
2870         WARN_ON(start > eb->len);
2871         WARN_ON(start + len > eb->start + eb->len);
2872
2873         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2874
2875         while(len > 0) {
2876                 page = extent_buffer_page(eb, i);
2877                 WARN_ON(!PageUptodate(page));
2878
2879                 cur = min(len, PAGE_CACHE_SIZE - offset);
2880                 kaddr = kmap_atomic(page, KM_USER1);
2881                 memcpy(kaddr + offset, src, cur);
2882                 kunmap_atomic(kaddr, KM_USER1);
2883
2884                 src += cur;
2885                 len -= cur;
2886                 offset = 0;
2887                 i++;
2888         }
2889 }
2890 EXPORT_SYMBOL(write_extent_buffer);
2891
2892 void memset_extent_buffer(struct extent_buffer *eb, char c,
2893                           unsigned long start, unsigned long len)
2894 {
2895         size_t cur;
2896         size_t offset;
2897         struct page *page;
2898         char *kaddr;
2899         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2900         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2901
2902         WARN_ON(start > eb->len);
2903         WARN_ON(start + len > eb->start + eb->len);
2904
2905         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2906
2907         while(len > 0) {
2908                 page = extent_buffer_page(eb, i);
2909                 WARN_ON(!PageUptodate(page));
2910
2911                 cur = min(len, PAGE_CACHE_SIZE - offset);
2912                 kaddr = kmap_atomic(page, KM_USER0);
2913                 memset(kaddr + offset, c, cur);
2914                 kunmap_atomic(kaddr, KM_USER0);
2915
2916                 len -= cur;
2917                 offset = 0;
2918                 i++;
2919         }
2920 }
2921 EXPORT_SYMBOL(memset_extent_buffer);
2922
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
                        unsigned long dst_offset, unsigned long src_offset,
                        unsigned long len)
{
        u64 dst_len = dst->len;
        size_t cur;
        size_t offset;
        struct page *page;
        char *kaddr;
        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;

        WARN_ON(src->len != dst_len);

        offset = (start_offset + dst_offset) &
                ((unsigned long)PAGE_CACHE_SIZE - 1);

        while (len > 0) {
                page = extent_buffer_page(dst, i);
                WARN_ON(!PageUptodate(page));

                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));

                kaddr = kmap_atomic(page, KM_USER0);
                read_extent_buffer(src, kaddr + offset, src_offset, cur);
                kunmap_atomic(kaddr, KM_USER0);

                src_offset += cur;
                len -= cur;
                offset = 0;
                i++;
        }
}
EXPORT_SYMBOL(copy_extent_buffer);

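/*
 * memmove-style helper for a single destination/source page pair.  When
 * both offsets live in the same page a plain memmove handles any overlap;
 * otherwise the bytes are copied back to front between the two mappings,
 * matching the reverse walk done by memmove_extent_buffer() below.
 */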
static void move_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
{
        char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
        if (dst_page == src_page) {
                memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
        } else {
                char *src_kaddr = kmap_atomic(src_page, KM_USER1);
                char *p = dst_kaddr + dst_off + len;
                char *s = src_kaddr + src_off + len;

                while (len--)
                        *--p = *--s;

                kunmap_atomic(src_kaddr, KM_USER1);
        }
        kunmap_atomic(dst_kaddr, KM_USER0);
}

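/*
 * memcpy-style helper for a single destination/source page pair.  A
 * plain memcpy is used, so overlapping ranges within one page are not
 * given any special handling here.
 */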
static void copy_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
{
        char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
        char *src_kaddr;

        if (dst_page != src_page)
                src_kaddr = kmap_atomic(src_page, KM_USER1);
        else
                src_kaddr = dst_kaddr;

        memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
        kunmap_atomic(dst_kaddr, KM_USER0);
        if (dst_page != src_page)
                kunmap_atomic(src_kaddr, KM_USER1);
}

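/*
 * Copy len bytes within the dst extent buffer from src_offset to
 * dst_offset, walking the pages forward.  Both ranges must fall inside
 * dst->len; overlapping moves should go through memmove_extent_buffer().
 */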
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                           unsigned long src_offset, unsigned long len)
{
        size_t cur;
        size_t dst_off_in_page;
        size_t src_off_in_page;
        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long dst_i;
        unsigned long src_i;

        if (src_offset + len > dst->len) {
                printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
                       src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
                printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
                       dst_offset, len, dst->len);
                BUG_ON(1);
        }

        while (len > 0) {
                dst_off_in_page = (start_offset + dst_offset) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);
                src_off_in_page = (start_offset + src_offset) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);

                dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
                src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
                                               src_off_in_page));
                cur = min_t(unsigned long, cur,
                        (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));

                copy_pages(extent_buffer_page(dst, dst_i),
                           extent_buffer_page(dst, src_i),
                           dst_off_in_page, src_off_in_page, cur);

                src_offset += cur;
                dst_offset += cur;
                len -= cur;
        }
}
EXPORT_SYMBOL(memcpy_extent_buffer);

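/*
 * Overlap-safe variant of memcpy_extent_buffer(): when the destination
 * starts below the source the copy degenerates to a forward memcpy;
 * otherwise the ranges are walked from their last byte backwards so an
 * overlapping source is not clobbered before it is read.
 */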
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                           unsigned long src_offset, unsigned long len)
{
        size_t cur;
        size_t dst_off_in_page;
        size_t src_off_in_page;
        unsigned long dst_end = dst_offset + len - 1;
        unsigned long src_end = src_offset + len - 1;
        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long dst_i;
        unsigned long src_i;

        if (src_offset + len > dst->len) {
                printk("memmove bogus src_offset %lu move len %lu len %lu\n",
                       src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
                printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
                       dst_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset < src_offset) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
        while (len > 0) {
                dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
                src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

                dst_off_in_page = (start_offset + dst_end) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);
                src_off_in_page = (start_offset + src_end) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);

                cur = min_t(unsigned long, len, src_off_in_page + 1);
                cur = min(cur, dst_off_in_page + 1);
                move_pages(extent_buffer_page(dst, dst_i),
                           extent_buffer_page(dst, src_i),
                           dst_off_in_page - cur + 1,
                           src_off_in_page - cur + 1, cur);

                dst_end -= cur;
                src_end -= cur;
                len -= cur;
        }
}
EXPORT_SYMBOL(memmove_extent_buffer);