Btrfs: allocate extent state and check the result properly
[linux-2.6-block.git] fs/btrfs/extent_io.c
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping, gfp_t mask)
{
	tree->state = RB_ROOT;
	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree. Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_io_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return 0;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			other->start = state->start;
			state->tree = NULL;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
			state = NULL;
		}
	}

	return 0;
}

static int set_state_cb(struct extent_io_tree *tree,
			struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->set_bit_hook) {
		return tree->ops->set_bit_hook(tree->mapping->host,
					       state, bits);
	}

	return 0;
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

/*
 * insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;
	int bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;
	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;

	if (bits_to_set & EXTENT_DIRTY)
		tree->dirty_bytes += end - start + 1;
	state->state |= bits_to_set;
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		free_extent_state(state);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		    u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		return tree->ops->split_extent_hook(tree->mapping->host,
						    orig, split);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits, int wake)
{
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret = state->state & bits_to_clear;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *next_node;
	struct rb_node *node;
	u64 last_end;
	int err;
	int set = 0;
	int clear = 0;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && cached->tree && cached->start == start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set |= clear_state_bit(tree, state, &bits, wake);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);
		if (wake)
			wake_up(&state->wq);

		set |= clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	if (state->end < end && prealloc && !need_resched())
		next_node = rb_next(&state->rb_node);
	else
		next_node = NULL;

	set |= clear_state_bit(tree, state, &bits, wake);
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && next_node) {
		state = rb_entry(next_node, struct extent_state,
				 rb_node);
		if (state->start == start)
			goto hit_next;
	}
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
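
/*
 * Editor's note -- a minimal usage sketch, not from the original file;
 * 'tree', 'start' and 'end' here are hypothetical caller values.  Clearing
 * a plain bit such as EXTENT_DIRTY over an inclusive byte range looks like:
 *
 *	int was_set;
 *
 *	was_set = clear_extent_bit(tree, start, end, EXTENT_DIRTY,
 *				   0, 0, NULL, GFP_NOFS);
 *
 * A return > 0 means at least one of the requested bits was actually set
 * somewhere in [start, end]; 0 means nothing had to be cleared.  Passing a
 * cached_state pointer (as unlock_extent_cached() does below) lets the
 * function skip the rbtree search when the caller already holds a reference
 * on the matching extent_state.
 */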

static int wait_on_state(struct extent_io_tree *tree,
			 struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (need_resched()) {
			spin_unlock(&tree->lock);
			cond_resched();
			spin_lock(&tree->lock);
		}
	}
out:
	spin_unlock(&tree->lock);
	return 0;
}

static int set_state_bits(struct extent_io_tree *tree,
			  struct extent_state *state,
			  int *bits)
{
	int ret;
	int bits_to_set = *bits & ~EXTENT_CTLBITS;

	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits_to_set;

	return 0;
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
			*cached_ptr = state;
			atomic_inc(&state->refs);
		}
	}
}

static void uncache_state(struct extent_state **cached_ptr)
{
	if (cached_ptr && (*cached_ptr)) {
		struct extent_state *state = *cached_ptr;
		*cached_ptr = NULL;
		free_extent_state(state);
	}
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start == start && state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		struct rb_node *next_node;
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		err = set_state_bits(tree, state, &bits);
		if (err)
			goto out;

		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;

		start = last_end + 1;
		if (start < end && prealloc && !need_resched()) {
			next_node = rb_next(node);
			if (next_node) {
				state = rb_entry(next_node, struct extent_state,
						 rb_node);
				if (state->start == start)
					goto hit_next;
			}
		}
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			err = set_state_bits(tree, state, &bits);
			if (err)
				goto out;
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		BUG_ON(err == -EEXIST);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		err = set_state_bits(tree, prealloc, &bits);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
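
/*
 * Editor's note -- a hedged sketch (caller values hypothetical) of how the
 * exclusive_bits argument is meant to be used:
 *
 *	u64 failed_start;
 *	int err;
 *
 *	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 *			     &failed_start, NULL, GFP_NOFS);
 *	if (err == -EEXIST)
 *		...some byte in [failed_start, end] already had EXTENT_LOCKED
 *		   set; back off or wait on it...
 *
 * This is the pattern the lock_extent*() helpers below wrap up, including
 * the wait-and-retry loop in lock_extent_bits().
 */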

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
			      0, NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      NULL, mask);
}

static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
			    gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
				NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
			      NULL, cached_state, mask);
}

static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state **cached_state,
				 gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}

int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}

/*
 * either insert or lock state struct between start and end use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return lock_extent_bits(tree, start, end, 0, NULL, mask);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, mask);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL, mask);
		return 0;
	}
	return 1;
}

int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				mask);
}
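
/*
 * Editor's note -- the typical lock/unlock pairing, sketched with
 * hypothetical caller values.  Using the cached_state variant on both
 * sides avoids a second rbtree search at unlock time:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(tree, start, end, 0, &cached, GFP_NOFS);
 *	...do work on [start, end]...
 *	unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
 *
 * lock_extent()/unlock_extent() are the same calls with no cached state.
 */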

/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * helper function to set both pages and extents in the tree writeback
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned, < 0 on error
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits)) {
			*start_ret = state->start;
			*end_ret = state->end;
			ret = 0;
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will returned if
 * nothing was found after 'start'
 */
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
						 u64 start, int bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static noinline int __unlock_for_delalloc(struct inode *inode,
					  struct page *locked_page,
					  u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	if (index == locked_page->index && end_index == index)
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
					    min_t(unsigned long, nr_pages,
					    ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
					    min_t(unsigned long,
					    nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			       PAGE_CACHE_SHIFT);
	}
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_lock_delalloc_range(struct inode *inode,
					     struct extent_io_tree *tree,
					     struct page *locked_page,
					     u64 *start, u64 *end,
					     u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return found;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 * if we're looping.
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
			max_bytes = PAGE_CACHE_SIZE - offset;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret);

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end,
			 0, &cached_state, GFP_NOFS);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}
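
/*
 * Editor's note -- a hedged sketch of a caller; the names and the max_bytes
 * value are hypothetical (the real caller sits in the writepage path further
 * down in extent_io.c, outside this excerpt):
 *
 *	u64 delalloc_start = page_start;
 *	u64 delalloc_end = 0;
 *	u64 found;
 *
 *	found = find_lock_delalloc_range(inode, tree, locked_page,
 *					 &delalloc_start, &delalloc_end,
 *					 128 * 1024 * 1024);
 *
 * When 'found' is non-zero, both the pages and the EXTENT_LOCKED state over
 * [delalloc_start, delalloc_end] are held, and the caller is responsible
 * for releasing them.
 */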

int extent_clear_unlock_delalloc(struct inode *inode,
				 struct extent_io_tree *tree,
				 u64 start, u64 end, struct page *locked_page,
				 unsigned long op)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int clear_bits = 0;

	if (op & EXTENT_CLEAR_UNLOCK)
		clear_bits |= EXTENT_LOCKED;
	if (op & EXTENT_CLEAR_DIRTY)
		clear_bits |= EXTENT_DIRTY;

	if (op & EXTENT_CLEAR_DELALLOC)
		clear_bits |= EXTENT_DELALLOC;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
		    EXTENT_SET_PRIVATE2)))
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (op & EXTENT_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				page_cache_release(pages[i]);
				continue;
			}
			if (op & EXTENT_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (op & EXTENT_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (op & EXTENT_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (search_end <= cur_start) {
		WARN_ON(1);
		return 0;
	}

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = state->start;
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}
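
/*
 * Editor's note -- usage sketch with hypothetical values.  Counting bytes
 * with EXTENT_DELALLOC set in an arbitrary window:
 *
 *	u64 start = range_start;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(tree, &start, range_end, (u64)-1,
 *				 EXTENT_DELALLOC, 0);
 *
 * On return 'start' points at the first matching extent.  The special case
 * above (start == 0 and bits == EXTENT_DIRTY) is answered straight from
 * tree->dirty_bytes without walking the rbtree, and contig == 1 stops the
 * count at the first gap between matching extents.
 */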

/*
 * set the private field for a given byte offset in the tree.  If there isn't
 * an extent_state there already, this does nothing.
 */
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	spin_unlock(&tree->lock);
	return ret;
}
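
/*
 * Editor's note -- a hedged sketch of the private-field round trip, with
 * hypothetical variables.  Both calls require that an extent_state begins
 * exactly at 'start'; otherwise they return -ENOENT:
 *
 *	ret = set_state_private(tree, start, (u64)value);
 *	...
 *	ret = get_state_private(tree, start, &private);
 *	if (!ret)
 *		value = private;
 *
 * The btrfs I/O paths use this slot to stash small per-range data (for
 * example an expected checksum) on the locked state covering an I/O.
 */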

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the tree
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && cached->tree && cached->start == start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}
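
/*
 * Editor's note -- the 'filled' flag in one sketch (range hypothetical):
 *
 *	all_uptodate = test_range_bit(tree, start, end,
 *				      EXTENT_UPTODATE, 1, NULL);
 *	any_locked   = test_range_bit(tree, start, end,
 *				      EXTENT_LOCKED, 0, NULL);
 *
 * filled == 1 only returns 1 when every byte of [start, end] is covered by
 * states carrying the bits; filled == 0 returns 1 as soon as any byte has
 * them.  check_page_uptodate() and check_page_locked() right below use
 * exactly these two forms.
 */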
d1310b2e
CM
1629
1630/*
1631 * helper function to set a given page up to date if all the
1632 * extents in the tree for that page are up to date
1633 */
1634static int check_page_uptodate(struct extent_io_tree *tree,
1635 struct page *page)
1636{
1637 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1638 u64 end = start + PAGE_CACHE_SIZE - 1;
9655d298 1639 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
d1310b2e
CM
1640 SetPageUptodate(page);
1641 return 0;
1642}
1643
1644/*
1645 * helper function to unlock a page if all the extents in the tree
1646 * for that page are unlocked
1647 */
1648static int check_page_locked(struct extent_io_tree *tree,
1649 struct page *page)
1650{
1651 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1652 u64 end = start + PAGE_CACHE_SIZE - 1;
9655d298 1653 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
d1310b2e
CM
1654 unlock_page(page);
1655 return 0;
1656}
1657
1658/*
1659 * helper function to end page writeback if all the extents
1660 * in the tree for that page are done with writeback
1661 */
1662static int check_page_writeback(struct extent_io_tree *tree,
1663 struct page *page)
1664{
1edbb734 1665 end_page_writeback(page);
d1310b2e
CM
1666 return 0;
1667}
1668
1669/* lots and lots of room for performance fixes in the end_bio funcs */
1670
1671/*
1672 * after a writepage IO is done, we need to:
1673 * clear the uptodate bits on error
1674 * clear the writeback bits in the extent tree for this IO
1675 * end_page_writeback if the page has no more pending IO
1676 *
1677 * Scheduling is not allowed, so the extent state tree is expected
1678 * to have one and only one object corresponding to this IO.
1679 */
d1310b2e 1680static void end_bio_extent_writepage(struct bio *bio, int err)
d1310b2e 1681{
1259ab75 1682 int uptodate = err == 0;
d1310b2e 1683 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
902b22f3 1684 struct extent_io_tree *tree;
d1310b2e
CM
1685 u64 start;
1686 u64 end;
1687 int whole_page;
1259ab75 1688 int ret;
d1310b2e 1689
d1310b2e
CM
1690 do {
1691 struct page *page = bvec->bv_page;
902b22f3
DW
1692 tree = &BTRFS_I(page->mapping->host)->io_tree;
1693
d1310b2e
CM
1694 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1695 bvec->bv_offset;
1696 end = start + bvec->bv_len - 1;
1697
1698 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1699 whole_page = 1;
1700 else
1701 whole_page = 0;
1702
1703 if (--bvec >= bio->bi_io_vec)
1704 prefetchw(&bvec->bv_page->flags);
1259ab75
CM
1705 if (tree->ops && tree->ops->writepage_end_io_hook) {
1706 ret = tree->ops->writepage_end_io_hook(page, start,
902b22f3 1707 end, NULL, uptodate);
1259ab75
CM
1708 if (ret)
1709 uptodate = 0;
1710 }
1711
1712 if (!uptodate && tree->ops &&
1713 tree->ops->writepage_io_failed_hook) {
1714 ret = tree->ops->writepage_io_failed_hook(bio, page,
902b22f3 1715 start, end, NULL);
1259ab75 1716 if (ret == 0) {
1259ab75
CM
1717 uptodate = (err == 0);
1718 continue;
1719 }
1720 }
1721
d1310b2e 1722 if (!uptodate) {
2ac55d41 1723 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
d1310b2e
CM
1724 ClearPageUptodate(page);
1725 SetPageError(page);
1726 }
70dec807 1727
d1310b2e
CM
1728 if (whole_page)
1729 end_page_writeback(page);
1730 else
1731 check_page_writeback(tree, page);
d1310b2e 1732 } while (bvec >= bio->bi_io_vec);
2b1f55b0 1733
d1310b2e 1734 bio_put(bio);
d1310b2e
CM
1735}
1736
1737/*
1738 * after a readpage IO is done, we need to:
1739 * clear the uptodate bits on error
1740 * set the uptodate bits if things worked
1741 * set the page up to date if all extents in the tree are uptodate
1742 * clear the lock bit in the extent tree
1743 * unlock the page if there are no other extents locked for it
1744 *
1745 * Scheduling is not allowed, so the extent state tree is expected
1746 * to have one and only one object corresponding to this IO.
1747 */
d1310b2e 1748static void end_bio_extent_readpage(struct bio *bio, int err)
d1310b2e
CM
1749{
1750 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4125bf76
CM
1751 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1752 struct bio_vec *bvec = bio->bi_io_vec;
902b22f3 1753 struct extent_io_tree *tree;
d1310b2e
CM
1754 u64 start;
1755 u64 end;
1756 int whole_page;
1757 int ret;
1758
d20f7043
CM
1759 if (err)
1760 uptodate = 0;
1761
d1310b2e
CM
1762 do {
1763 struct page *page = bvec->bv_page;
507903b8
AJ
1764 struct extent_state *cached = NULL;
1765 struct extent_state *state;
1766
902b22f3
DW
1767 tree = &BTRFS_I(page->mapping->host)->io_tree;
1768
d1310b2e
CM
1769 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1770 bvec->bv_offset;
1771 end = start + bvec->bv_len - 1;
1772
1773 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1774 whole_page = 1;
1775 else
1776 whole_page = 0;
1777
4125bf76 1778 if (++bvec <= bvec_end)
d1310b2e
CM
1779 prefetchw(&bvec->bv_page->flags);
1780
507903b8 1781 spin_lock(&tree->lock);
0d399205 1782 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
109b36a2 1783 if (state && state->start == start) {
507903b8
AJ
1784 /*
1785 * take a reference on the state, unlock will drop
1786 * the ref
1787 */
1788 cache_state(state, &cached);
1789 }
1790 spin_unlock(&tree->lock);
1791
d1310b2e 1792 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
70dec807 1793 ret = tree->ops->readpage_end_io_hook(page, start, end,
507903b8 1794 state);
d1310b2e
CM
1795 if (ret)
1796 uptodate = 0;
1797 }
7e38326f
CM
1798 if (!uptodate && tree->ops &&
1799 tree->ops->readpage_io_failed_hook) {
1800 ret = tree->ops->readpage_io_failed_hook(bio, page,
902b22f3 1801 start, end, NULL);
7e38326f 1802 if (ret == 0) {
3b951516
CM
1803 uptodate =
1804 test_bit(BIO_UPTODATE, &bio->bi_flags);
d20f7043
CM
1805 if (err)
1806 uptodate = 0;
507903b8 1807 uncache_state(&cached);
7e38326f
CM
1808 continue;
1809 }
1810 }
d1310b2e 1811
771ed689 1812 if (uptodate) {
507903b8 1813 set_extent_uptodate(tree, start, end, &cached,
902b22f3 1814 GFP_ATOMIC);
771ed689 1815 }
507903b8 1816 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
d1310b2e 1817
70dec807
CM
1818 if (whole_page) {
1819 if (uptodate) {
1820 SetPageUptodate(page);
1821 } else {
1822 ClearPageUptodate(page);
1823 SetPageError(page);
1824 }
d1310b2e 1825 unlock_page(page);
70dec807
CM
1826 } else {
1827 if (uptodate) {
1828 check_page_uptodate(tree, page);
1829 } else {
1830 ClearPageUptodate(page);
1831 SetPageError(page);
1832 }
d1310b2e 1833 check_page_locked(tree, page);
70dec807 1834 }
4125bf76 1835 } while (bvec <= bvec_end);
d1310b2e
CM
1836
1837 bio_put(bio);
d1310b2e
CM
1838}
1839
1840/*
1841 * IO done from prepare_write is pretty simple, we just unlock
1842 * the structs in the extent tree when done, and set the uptodate bits
1843 * as appropriate.
1844 */
d1310b2e 1845static void end_bio_extent_preparewrite(struct bio *bio, int err)
d1310b2e
CM
1846{
1847 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1848 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
902b22f3 1849 struct extent_io_tree *tree;
d1310b2e
CM
1850 u64 start;
1851 u64 end;
1852
d1310b2e
CM
1853 do {
1854 struct page *page = bvec->bv_page;
507903b8 1855 struct extent_state *cached = NULL;
902b22f3
DW
1856 tree = &BTRFS_I(page->mapping->host)->io_tree;
1857
d1310b2e
CM
1858 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1859 bvec->bv_offset;
1860 end = start + bvec->bv_len - 1;
1861
1862 if (--bvec >= bio->bi_io_vec)
1863 prefetchw(&bvec->bv_page->flags);
1864
1865 if (uptodate) {
507903b8
AJ
1866 set_extent_uptodate(tree, start, end, &cached,
1867 GFP_ATOMIC);
d1310b2e
CM
1868 } else {
1869 ClearPageUptodate(page);
1870 SetPageError(page);
1871 }
1872
507903b8 1873 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
d1310b2e
CM
1874
1875 } while (bvec >= bio->bi_io_vec);
1876
1877 bio_put(bio);
d1310b2e
CM
1878}
1879
88f794ed
MX
1880struct bio *
1881btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1882 gfp_t gfp_flags)
d1310b2e
CM
1883{
1884 struct bio *bio;
1885
1886 bio = bio_alloc(gfp_flags, nr_vecs);
1887
1888 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1889 while (!bio && (nr_vecs /= 2))
1890 bio = bio_alloc(gfp_flags, nr_vecs);
1891 }
1892
1893 if (bio) {
e1c4b745 1894 bio->bi_size = 0;
d1310b2e
CM
1895 bio->bi_bdev = bdev;
1896 bio->bi_sector = first_sector;
1897 }
1898 return bio;
1899}
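/*
 * Illustrative caller sketch, not part of this file: one way the helper
 * above could be used to build and submit a single-page read.
 * example_submit_page_read(), "dev_bdev" and "sector" are made-up names;
 * real callers in this file go through submit_extent_page() instead.
 */
static void example_submit_page_read(struct block_device *dev_bdev,
				     sector_t sector, struct page *page,
				     bio_end_io_t end_io)
{
	struct bio *bio;

	/* ask for as many vecs as the device allows; btrfs_bio_alloc
	 * halves the request on its own when memory is tight */
	bio = btrfs_bio_alloc(dev_bdev, sector, bio_get_nr_vecs(dev_bdev),
			      GFP_NOFS);
	if (!bio)
		return;

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio->bi_end_io = end_io;
	submit_bio(READ, bio);
}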
1900
c8b97818
CM
1901static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1902 unsigned long bio_flags)
d1310b2e 1903{
d1310b2e 1904 int ret = 0;
70dec807
CM
1905 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1906 struct page *page = bvec->bv_page;
1907 struct extent_io_tree *tree = bio->bi_private;
70dec807 1908 u64 start;
70dec807
CM
1909
1910 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
70dec807 1911
902b22f3 1912 bio->bi_private = NULL;
d1310b2e
CM
1913
1914 bio_get(bio);
1915
065631f6 1916 if (tree->ops && tree->ops->submit_bio_hook)
6b82ce8d 1917 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
eaf25d93 1918 mirror_num, bio_flags, start);
0b86a832
CM
1919 else
1920 submit_bio(rw, bio);
d1310b2e
CM
1921 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1922 ret = -EOPNOTSUPP;
1923 bio_put(bio);
1924 return ret;
1925}
1926
1927static int submit_extent_page(int rw, struct extent_io_tree *tree,
1928 struct page *page, sector_t sector,
1929 size_t size, unsigned long offset,
1930 struct block_device *bdev,
1931 struct bio **bio_ret,
1932 unsigned long max_pages,
f188591e 1933 bio_end_io_t end_io_func,
c8b97818
CM
1934 int mirror_num,
1935 unsigned long prev_bio_flags,
1936 unsigned long bio_flags)
d1310b2e
CM
1937{
1938 int ret = 0;
1939 struct bio *bio;
1940 int nr;
c8b97818
CM
1941 int contig = 0;
1942 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1943 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
5b050f04 1944 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
d1310b2e
CM
1945
1946 if (bio_ret && *bio_ret) {
1947 bio = *bio_ret;
c8b97818
CM
1948 if (old_compressed)
1949 contig = bio->bi_sector == sector;
1950 else
1951 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1952 sector;
1953
1954 if (prev_bio_flags != bio_flags || !contig ||
239b14b3 1955 (tree->ops && tree->ops->merge_bio_hook &&
c8b97818
CM
1956 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1957 bio_flags)) ||
1958 bio_add_page(bio, page, page_size, offset) < page_size) {
1959 ret = submit_one_bio(rw, bio, mirror_num,
1960 prev_bio_flags);
d1310b2e
CM
1961 bio = NULL;
1962 } else {
1963 return 0;
1964 }
1965 }
c8b97818
CM
1966 if (this_compressed)
1967 nr = BIO_MAX_PAGES;
1968 else
1969 nr = bio_get_nr_vecs(bdev);
1970
88f794ed 1971 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
5df67083
TI
1972 if (!bio)
1973 return -ENOMEM;
70dec807 1974
c8b97818 1975 bio_add_page(bio, page, page_size, offset);
d1310b2e
CM
1976 bio->bi_end_io = end_io_func;
1977 bio->bi_private = tree;
70dec807 1978
d397712b 1979 if (bio_ret)
d1310b2e 1980 *bio_ret = bio;
d397712b 1981 else
c8b97818 1982 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
d1310b2e
CM
1983
1984 return ret;
1985}
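/*
 * Worked example of the contiguity test above (sectors are 512 bytes):
 * if the pending bio starts at sector 1024 and already holds 8192 bytes,
 * then bio->bi_sector + (bio->bi_size >> 9) == 1024 + 16 == 1040, so only
 * a page mapped at sector 1040 can be appended.  Anything else, or a
 * refusal from merge_bio_hook/bio_add_page, forces the pending bio to be
 * submitted first.
 */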
1986
1987void set_page_extent_mapped(struct page *page)
1988{
1989 if (!PagePrivate(page)) {
1990 SetPagePrivate(page);
d1310b2e 1991 page_cache_get(page);
6af118ce 1992 set_page_private(page, EXTENT_PAGE_PRIVATE);
d1310b2e
CM
1993 }
1994}
1995
b2950863 1996static void set_page_extent_head(struct page *page, unsigned long len)
d1310b2e 1997{
eb14ab8e 1998 WARN_ON(!PagePrivate(page));
d1310b2e
CM
1999 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
2000}
2001
2002/*
 2003 * basic readpage implementation. Locked extent state structs are inserted
 2004 * into the tree and removed when the IO is done (by the end_io
 2005 * handlers)
2006 */
2007static int __extent_read_full_page(struct extent_io_tree *tree,
2008 struct page *page,
2009 get_extent_t *get_extent,
c8b97818
CM
2010 struct bio **bio, int mirror_num,
2011 unsigned long *bio_flags)
d1310b2e
CM
2012{
2013 struct inode *inode = page->mapping->host;
2014 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2015 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2016 u64 end;
2017 u64 cur = start;
2018 u64 extent_offset;
2019 u64 last_byte = i_size_read(inode);
2020 u64 block_start;
2021 u64 cur_end;
2022 sector_t sector;
2023 struct extent_map *em;
2024 struct block_device *bdev;
11c65dcc 2025 struct btrfs_ordered_extent *ordered;
d1310b2e
CM
2026 int ret;
2027 int nr = 0;
2028 size_t page_offset = 0;
2029 size_t iosize;
c8b97818 2030 size_t disk_io_size;
d1310b2e 2031 size_t blocksize = inode->i_sb->s_blocksize;
c8b97818 2032 unsigned long this_bio_flag = 0;
d1310b2e
CM
2033
2034 set_page_extent_mapped(page);
2035
2036 end = page_end;
11c65dcc
JB
2037 while (1) {
2038 lock_extent(tree, start, end, GFP_NOFS);
2039 ordered = btrfs_lookup_ordered_extent(inode, start);
2040 if (!ordered)
2041 break;
2042 unlock_extent(tree, start, end, GFP_NOFS);
2043 btrfs_start_ordered_extent(inode, ordered, 1);
2044 btrfs_put_ordered_extent(ordered);
2045 }
d1310b2e 2046
c8b97818
CM
2047 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2048 char *userpage;
2049 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2050
2051 if (zero_offset) {
2052 iosize = PAGE_CACHE_SIZE - zero_offset;
2053 userpage = kmap_atomic(page, KM_USER0);
2054 memset(userpage + zero_offset, 0, iosize);
2055 flush_dcache_page(page);
2056 kunmap_atomic(userpage, KM_USER0);
2057 }
2058 }
d1310b2e
CM
2059 while (cur <= end) {
2060 if (cur >= last_byte) {
2061 char *userpage;
507903b8
AJ
2062 struct extent_state *cached = NULL;
2063
d1310b2e
CM
2064 iosize = PAGE_CACHE_SIZE - page_offset;
2065 userpage = kmap_atomic(page, KM_USER0);
2066 memset(userpage + page_offset, 0, iosize);
2067 flush_dcache_page(page);
2068 kunmap_atomic(userpage, KM_USER0);
2069 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8
AJ
2070 &cached, GFP_NOFS);
2071 unlock_extent_cached(tree, cur, cur + iosize - 1,
2072 &cached, GFP_NOFS);
d1310b2e
CM
2073 break;
2074 }
2075 em = get_extent(inode, page, page_offset, cur,
2076 end - cur + 1, 0);
2077 if (IS_ERR(em) || !em) {
2078 SetPageError(page);
2079 unlock_extent(tree, cur, end, GFP_NOFS);
2080 break;
2081 }
d1310b2e
CM
2082 extent_offset = cur - em->start;
2083 BUG_ON(extent_map_end(em) <= cur);
2084 BUG_ON(end < cur);
2085
261507a0 2086 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
c8b97818 2087 this_bio_flag = EXTENT_BIO_COMPRESSED;
261507a0
LZ
2088 extent_set_compress_type(&this_bio_flag,
2089 em->compress_type);
2090 }
c8b97818 2091
d1310b2e
CM
2092 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2093 cur_end = min(extent_map_end(em) - 1, end);
2094 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
c8b97818
CM
2095 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2096 disk_io_size = em->block_len;
2097 sector = em->block_start >> 9;
2098 } else {
2099 sector = (em->block_start + extent_offset) >> 9;
2100 disk_io_size = iosize;
2101 }
d1310b2e
CM
2102 bdev = em->bdev;
2103 block_start = em->block_start;
d899e052
YZ
2104 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2105 block_start = EXTENT_MAP_HOLE;
d1310b2e
CM
2106 free_extent_map(em);
2107 em = NULL;
2108
2109 /* we've found a hole, just zero and go on */
2110 if (block_start == EXTENT_MAP_HOLE) {
2111 char *userpage;
507903b8
AJ
2112 struct extent_state *cached = NULL;
2113
d1310b2e
CM
2114 userpage = kmap_atomic(page, KM_USER0);
2115 memset(userpage + page_offset, 0, iosize);
2116 flush_dcache_page(page);
2117 kunmap_atomic(userpage, KM_USER0);
2118
2119 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8
AJ
2120 &cached, GFP_NOFS);
2121 unlock_extent_cached(tree, cur, cur + iosize - 1,
2122 &cached, GFP_NOFS);
d1310b2e
CM
2123 cur = cur + iosize;
2124 page_offset += iosize;
2125 continue;
2126 }
2127 /* the get_extent function already copied into the page */
9655d298
CM
2128 if (test_range_bit(tree, cur, cur_end,
2129 EXTENT_UPTODATE, 1, NULL)) {
a1b32a59 2130 check_page_uptodate(tree, page);
d1310b2e
CM
2131 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2132 cur = cur + iosize;
2133 page_offset += iosize;
2134 continue;
2135 }
70dec807
CM
2136 /* we have an inline extent but it didn't get marked up
2137 * to date. Error out
2138 */
2139 if (block_start == EXTENT_MAP_INLINE) {
2140 SetPageError(page);
2141 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2142 cur = cur + iosize;
2143 page_offset += iosize;
2144 continue;
2145 }
d1310b2e
CM
2146
2147 ret = 0;
2148 if (tree->ops && tree->ops->readpage_io_hook) {
2149 ret = tree->ops->readpage_io_hook(page, cur,
2150 cur + iosize - 1);
2151 }
2152 if (!ret) {
89642229
CM
2153 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2154 pnr -= page->index;
d1310b2e 2155 ret = submit_extent_page(READ, tree, page,
c8b97818 2156 sector, disk_io_size, page_offset,
89642229 2157 bdev, bio, pnr,
c8b97818
CM
2158 end_bio_extent_readpage, mirror_num,
2159 *bio_flags,
2160 this_bio_flag);
89642229 2161 nr++;
c8b97818 2162 *bio_flags = this_bio_flag;
d1310b2e
CM
2163 }
2164 if (ret)
2165 SetPageError(page);
2166 cur = cur + iosize;
2167 page_offset += iosize;
d1310b2e
CM
2168 }
2169 if (!nr) {
2170 if (!PageError(page))
2171 SetPageUptodate(page);
2172 unlock_page(page);
2173 }
2174 return 0;
2175}
2176
2177int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2178 get_extent_t *get_extent)
2179{
2180 struct bio *bio = NULL;
c8b97818 2181 unsigned long bio_flags = 0;
d1310b2e
CM
2182 int ret;
2183
c8b97818
CM
2184 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2185 &bio_flags);
d1310b2e 2186 if (bio)
6b82ce8d 2187 ret = submit_one_bio(READ, bio, 0, bio_flags);
d1310b2e
CM
2188 return ret;
2189}
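/*
 * Sketch of how an address_space ->readpage op can sit on top of the
 * helper above; this mirrors the btrfs wrapper in inode.c, but
 * example_readpage() is a made-up name and the choice of get_extent
 * callback is an assumption made for illustration.
 */
static int example_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btrfs_get_extent);
}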
d1310b2e 2190
11c8349b
CM
2191static noinline void update_nr_written(struct page *page,
2192 struct writeback_control *wbc,
2193 unsigned long nr_written)
2194{
2195 wbc->nr_to_write -= nr_written;
2196 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2197 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2198 page->mapping->writeback_index = page->index + nr_written;
2199}
2200
d1310b2e
CM
2201/*
2202 * the writepage semantics are similar to regular writepage. extent
2203 * records are inserted to lock ranges in the tree, and as dirty areas
2204 * are found, they are marked writeback. Then the lock bits are removed
2205 * and the end_io handler clears the writeback ranges
2206 */
2207static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2208 void *data)
2209{
2210 struct inode *inode = page->mapping->host;
2211 struct extent_page_data *epd = data;
2212 struct extent_io_tree *tree = epd->tree;
2213 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2214 u64 delalloc_start;
2215 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2216 u64 end;
2217 u64 cur = start;
2218 u64 extent_offset;
2219 u64 last_byte = i_size_read(inode);
2220 u64 block_start;
2221 u64 iosize;
2222 sector_t sector;
2c64c53d 2223 struct extent_state *cached_state = NULL;
d1310b2e
CM
2224 struct extent_map *em;
2225 struct block_device *bdev;
2226 int ret;
2227 int nr = 0;
7f3c74fb 2228 size_t pg_offset = 0;
d1310b2e
CM
2229 size_t blocksize;
2230 loff_t i_size = i_size_read(inode);
2231 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2232 u64 nr_delalloc;
2233 u64 delalloc_end;
c8b97818
CM
2234 int page_started;
2235 int compressed;
ffbd517d 2236 int write_flags;
771ed689 2237 unsigned long nr_written = 0;
d1310b2e 2238
ffbd517d 2239 if (wbc->sync_mode == WB_SYNC_ALL)
721a9602 2240 write_flags = WRITE_SYNC;
ffbd517d
CM
2241 else
2242 write_flags = WRITE;
2243
1abe9b8a 2244 trace___extent_writepage(page, inode, wbc);
2245
d1310b2e 2246 WARN_ON(!PageLocked(page));
7f3c74fb 2247 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
211c17f5 2248 if (page->index > end_index ||
7f3c74fb 2249 (page->index == end_index && !pg_offset)) {
39be25cd 2250 page->mapping->a_ops->invalidatepage(page, 0);
d1310b2e
CM
2251 unlock_page(page);
2252 return 0;
2253 }
2254
2255 if (page->index == end_index) {
2256 char *userpage;
2257
d1310b2e 2258 userpage = kmap_atomic(page, KM_USER0);
7f3c74fb
CM
2259 memset(userpage + pg_offset, 0,
2260 PAGE_CACHE_SIZE - pg_offset);
d1310b2e 2261 kunmap_atomic(userpage, KM_USER0);
211c17f5 2262 flush_dcache_page(page);
d1310b2e 2263 }
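	/*
	 * Worked example, assuming 4K pages: with i_size == 10000,
	 * end_index is 10000 >> 12 == 2 and pg_offset is
	 * 10000 & 4095 == 1808, so the tail of page index 2
	 * (bytes 1808..4095) is zeroed above before writeback, while
	 * pages past end_index were invalidated earlier.
	 */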
7f3c74fb 2264 pg_offset = 0;
d1310b2e
CM
2265
2266 set_page_extent_mapped(page);
2267
2268 delalloc_start = start;
2269 delalloc_end = 0;
c8b97818 2270 page_started = 0;
771ed689 2271 if (!epd->extent_locked) {
f85d7d6c 2272 u64 delalloc_to_write = 0;
11c8349b
CM
2273 /*
2274 * make sure the wbc mapping index is at least updated
2275 * to this page.
2276 */
2277 update_nr_written(page, wbc, 0);
2278
d397712b 2279 while (delalloc_end < page_end) {
771ed689 2280 nr_delalloc = find_lock_delalloc_range(inode, tree,
c8b97818
CM
2281 page,
2282 &delalloc_start,
d1310b2e
CM
2283 &delalloc_end,
2284 128 * 1024 * 1024);
771ed689
CM
2285 if (nr_delalloc == 0) {
2286 delalloc_start = delalloc_end + 1;
2287 continue;
2288 }
2289 tree->ops->fill_delalloc(inode, page, delalloc_start,
2290 delalloc_end, &page_started,
2291 &nr_written);
f85d7d6c
CM
2292 /*
2293 * delalloc_end is already one less than the total
2294 * length, so we don't subtract one from
2295 * PAGE_CACHE_SIZE
2296 */
2297 delalloc_to_write += (delalloc_end - delalloc_start +
2298 PAGE_CACHE_SIZE) >>
2299 PAGE_CACHE_SHIFT;
d1310b2e 2300 delalloc_start = delalloc_end + 1;
d1310b2e 2301 }
f85d7d6c
CM
2302 if (wbc->nr_to_write < delalloc_to_write) {
2303 int thresh = 8192;
2304
2305 if (delalloc_to_write < thresh * 2)
2306 thresh = delalloc_to_write;
2307 wbc->nr_to_write = min_t(u64, delalloc_to_write,
2308 thresh);
2309 }
c8b97818 2310
771ed689
CM
2311 /* did the fill delalloc function already unlock and start
2312 * the IO?
2313 */
2314 if (page_started) {
2315 ret = 0;
11c8349b
CM
2316 /*
2317 * we've unlocked the page, so we can't update
2318 * the mapping's writeback index, just update
2319 * nr_to_write.
2320 */
2321 wbc->nr_to_write -= nr_written;
2322 goto done_unlocked;
771ed689 2323 }
c8b97818 2324 }
247e743c 2325 if (tree->ops && tree->ops->writepage_start_hook) {
c8b97818
CM
2326 ret = tree->ops->writepage_start_hook(page, start,
2327 page_end);
247e743c 2328 if (ret == -EAGAIN) {
247e743c 2329 redirty_page_for_writepage(wbc, page);
11c8349b 2330 update_nr_written(page, wbc, nr_written);
247e743c 2331 unlock_page(page);
771ed689 2332 ret = 0;
11c8349b 2333 goto done_unlocked;
247e743c
CM
2334 }
2335 }
2336
11c8349b
CM
2337 /*
2338 * we don't want to touch the inode after unlocking the page,
2339 * so we update the mapping writeback index now
2340 */
2341 update_nr_written(page, wbc, nr_written + 1);
771ed689 2342
d1310b2e 2343 end = page_end;
d1310b2e 2344 if (last_byte <= start) {
e6dcd2dc
CM
2345 if (tree->ops && tree->ops->writepage_end_io_hook)
2346 tree->ops->writepage_end_io_hook(page, start,
2347 page_end, NULL, 1);
d1310b2e
CM
2348 goto done;
2349 }
2350
d1310b2e
CM
2351 blocksize = inode->i_sb->s_blocksize;
2352
2353 while (cur <= end) {
2354 if (cur >= last_byte) {
e6dcd2dc
CM
2355 if (tree->ops && tree->ops->writepage_end_io_hook)
2356 tree->ops->writepage_end_io_hook(page, cur,
2357 page_end, NULL, 1);
d1310b2e
CM
2358 break;
2359 }
7f3c74fb 2360 em = epd->get_extent(inode, page, pg_offset, cur,
d1310b2e
CM
2361 end - cur + 1, 1);
2362 if (IS_ERR(em) || !em) {
2363 SetPageError(page);
2364 break;
2365 }
2366
2367 extent_offset = cur - em->start;
2368 BUG_ON(extent_map_end(em) <= cur);
2369 BUG_ON(end < cur);
2370 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2371 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2372 sector = (em->block_start + extent_offset) >> 9;
2373 bdev = em->bdev;
2374 block_start = em->block_start;
c8b97818 2375 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
d1310b2e
CM
2376 free_extent_map(em);
2377 em = NULL;
2378
c8b97818
CM
2379 /*
2380 * compressed and inline extents are written through other
2381 * paths in the FS
2382 */
2383 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 2384 block_start == EXTENT_MAP_INLINE) {
c8b97818
CM
2385 /*
2386 * end_io notification does not happen here for
2387 * compressed extents
2388 */
2389 if (!compressed && tree->ops &&
2390 tree->ops->writepage_end_io_hook)
e6dcd2dc
CM
2391 tree->ops->writepage_end_io_hook(page, cur,
2392 cur + iosize - 1,
2393 NULL, 1);
c8b97818
CM
2394 else if (compressed) {
2395 /* we don't want to end_page_writeback on
2396 * a compressed extent. this happens
2397 * elsewhere
2398 */
2399 nr++;
2400 }
2401
2402 cur += iosize;
7f3c74fb 2403 pg_offset += iosize;
d1310b2e
CM
2404 continue;
2405 }
d1310b2e
CM
2406 /* leave this out until we have a page_mkwrite call */
2407 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
9655d298 2408 EXTENT_DIRTY, 0, NULL)) {
d1310b2e 2409 cur = cur + iosize;
7f3c74fb 2410 pg_offset += iosize;
d1310b2e
CM
2411 continue;
2412 }
c8b97818 2413
d1310b2e
CM
2414 if (tree->ops && tree->ops->writepage_io_hook) {
2415 ret = tree->ops->writepage_io_hook(page, cur,
2416 cur + iosize - 1);
2417 } else {
2418 ret = 0;
2419 }
1259ab75 2420 if (ret) {
d1310b2e 2421 SetPageError(page);
1259ab75 2422 } else {
d1310b2e 2423 unsigned long max_nr = end_index + 1;
7f3c74fb 2424
d1310b2e
CM
2425 set_range_writeback(tree, cur, cur + iosize - 1);
2426 if (!PageWriteback(page)) {
d397712b
CM
2427 printk(KERN_ERR "btrfs warning page %lu not "
2428 "writeback, cur %llu end %llu\n",
2429 page->index, (unsigned long long)cur,
d1310b2e
CM
2430 (unsigned long long)end);
2431 }
2432
ffbd517d
CM
2433 ret = submit_extent_page(write_flags, tree, page,
2434 sector, iosize, pg_offset,
2435 bdev, &epd->bio, max_nr,
c8b97818
CM
2436 end_bio_extent_writepage,
2437 0, 0, 0);
d1310b2e
CM
2438 if (ret)
2439 SetPageError(page);
2440 }
2441 cur = cur + iosize;
7f3c74fb 2442 pg_offset += iosize;
d1310b2e
CM
2443 nr++;
2444 }
2445done:
2446 if (nr == 0) {
2447 /* make sure the mapping tag for page dirty gets cleared */
2448 set_page_writeback(page);
2449 end_page_writeback(page);
2450 }
d1310b2e 2451 unlock_page(page);
771ed689 2452
11c8349b
CM
2453done_unlocked:
2454
2c64c53d
CM
2455 /* drop our reference on any cached states */
2456 free_extent_state(cached_state);
d1310b2e
CM
2457 return 0;
2458}
2459
d1310b2e 2460/**
4bef0848 2461 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
d1310b2e
CM
2462 * @mapping: address space structure to write
2463 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2464 * @writepage: function called for each page
2465 * @data: data passed to writepage function
2466 *
2467 * If a page is already under I/O, write_cache_pages() skips it, even
2468 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2469 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2470 * and msync() need to guarantee that all the data which was dirty at the time
2471 * the call was made get new I/O started against them. If wbc->sync_mode is
2472 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2473 * existing IO to complete.
2474 */
b2950863 2475static int extent_write_cache_pages(struct extent_io_tree *tree,
4bef0848
CM
2476 struct address_space *mapping,
2477 struct writeback_control *wbc,
d2c3f4f6
CM
2478 writepage_t writepage, void *data,
2479 void (*flush_fn)(void *))
d1310b2e 2480{
d1310b2e
CM
2481 int ret = 0;
2482 int done = 0;
f85d7d6c 2483 int nr_to_write_done = 0;
d1310b2e
CM
2484 struct pagevec pvec;
2485 int nr_pages;
2486 pgoff_t index;
2487 pgoff_t end; /* Inclusive */
2488 int scanned = 0;
d1310b2e 2489
d1310b2e
CM
2490 pagevec_init(&pvec, 0);
2491 if (wbc->range_cyclic) {
2492 index = mapping->writeback_index; /* Start from prev offset */
2493 end = -1;
2494 } else {
2495 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2496 end = wbc->range_end >> PAGE_CACHE_SHIFT;
d1310b2e
CM
2497 scanned = 1;
2498 }
2499retry:
f85d7d6c 2500 while (!done && !nr_to_write_done && (index <= end) &&
d1310b2e 2501 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
d397712b
CM
2502 PAGECACHE_TAG_DIRTY, min(end - index,
2503 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
d1310b2e
CM
2504 unsigned i;
2505
2506 scanned = 1;
2507 for (i = 0; i < nr_pages; i++) {
2508 struct page *page = pvec.pages[i];
2509
2510 /*
2511 * At this point we hold neither mapping->tree_lock nor
2512 * lock on the page itself: the page may be truncated or
2513 * invalidated (changing page->mapping to NULL), or even
2514 * swizzled back from swapper_space to tmpfs file
2515 * mapping
2516 */
4bef0848
CM
2517 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2518 tree->ops->write_cache_pages_lock_hook(page);
2519 else
2520 lock_page(page);
d1310b2e
CM
2521
2522 if (unlikely(page->mapping != mapping)) {
2523 unlock_page(page);
2524 continue;
2525 }
2526
2527 if (!wbc->range_cyclic && page->index > end) {
2528 done = 1;
2529 unlock_page(page);
2530 continue;
2531 }
2532
d2c3f4f6 2533 if (wbc->sync_mode != WB_SYNC_NONE) {
0e6bd956
CM
2534 if (PageWriteback(page))
2535 flush_fn(data);
d1310b2e 2536 wait_on_page_writeback(page);
d2c3f4f6 2537 }
d1310b2e
CM
2538
2539 if (PageWriteback(page) ||
2540 !clear_page_dirty_for_io(page)) {
2541 unlock_page(page);
2542 continue;
2543 }
2544
2545 ret = (*writepage)(page, wbc, data);
2546
2547 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2548 unlock_page(page);
2549 ret = 0;
2550 }
f85d7d6c 2551 if (ret)
d1310b2e 2552 done = 1;
f85d7d6c
CM
2553
2554 /*
2555 * the filesystem may choose to bump up nr_to_write.
2556 * We have to make sure to honor the new nr_to_write
2557 * at any time
2558 */
2559 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
2560 }
2561 pagevec_release(&pvec);
2562 cond_resched();
2563 }
2564 if (!scanned && !done) {
2565 /*
2566 * We hit the last page and there is more work to be done: wrap
2567 * back to the start of the file
2568 */
2569 scanned = 1;
2570 index = 0;
2571 goto retry;
2572 }
d1310b2e
CM
2573 return ret;
2574}
d1310b2e 2575
ffbd517d 2576static void flush_epd_write_bio(struct extent_page_data *epd)
d2c3f4f6 2577{
d2c3f4f6 2578 if (epd->bio) {
ffbd517d
CM
2579 if (epd->sync_io)
2580 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2581 else
2582 submit_one_bio(WRITE, epd->bio, 0, 0);
d2c3f4f6
CM
2583 epd->bio = NULL;
2584 }
2585}
2586
ffbd517d
CM
2587static noinline void flush_write_bio(void *data)
2588{
2589 struct extent_page_data *epd = data;
2590 flush_epd_write_bio(epd);
2591}
2592
d1310b2e
CM
2593int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2594 get_extent_t *get_extent,
2595 struct writeback_control *wbc)
2596{
2597 int ret;
2598 struct address_space *mapping = page->mapping;
2599 struct extent_page_data epd = {
2600 .bio = NULL,
2601 .tree = tree,
2602 .get_extent = get_extent,
771ed689 2603 .extent_locked = 0,
ffbd517d 2604 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
2605 };
2606 struct writeback_control wbc_writepages = {
d313d7a3 2607 .sync_mode = wbc->sync_mode,
d1310b2e
CM
2608 .older_than_this = NULL,
2609 .nr_to_write = 64,
2610 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2611 .range_end = (loff_t)-1,
2612 };
2613
d1310b2e
CM
2614 ret = __extent_writepage(page, wbc, &epd);
2615
4bef0848 2616 extent_write_cache_pages(tree, mapping, &wbc_writepages,
d2c3f4f6 2617 __extent_writepage, &epd, flush_write_bio);
ffbd517d 2618 flush_epd_write_bio(&epd);
d1310b2e
CM
2619 return ret;
2620}
d1310b2e 2621
771ed689
CM
2622int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2623 u64 start, u64 end, get_extent_t *get_extent,
2624 int mode)
2625{
2626 int ret = 0;
2627 struct address_space *mapping = inode->i_mapping;
2628 struct page *page;
2629 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2630 PAGE_CACHE_SHIFT;
2631
2632 struct extent_page_data epd = {
2633 .bio = NULL,
2634 .tree = tree,
2635 .get_extent = get_extent,
2636 .extent_locked = 1,
ffbd517d 2637 .sync_io = mode == WB_SYNC_ALL,
771ed689
CM
2638 };
2639 struct writeback_control wbc_writepages = {
771ed689
CM
2640 .sync_mode = mode,
2641 .older_than_this = NULL,
2642 .nr_to_write = nr_pages * 2,
2643 .range_start = start,
2644 .range_end = end + 1,
2645 };
2646
d397712b 2647 while (start <= end) {
771ed689
CM
2648 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2649 if (clear_page_dirty_for_io(page))
2650 ret = __extent_writepage(page, &wbc_writepages, &epd);
2651 else {
2652 if (tree->ops && tree->ops->writepage_end_io_hook)
2653 tree->ops->writepage_end_io_hook(page, start,
2654 start + PAGE_CACHE_SIZE - 1,
2655 NULL, 1);
2656 unlock_page(page);
2657 }
2658 page_cache_release(page);
2659 start += PAGE_CACHE_SIZE;
2660 }
2661
ffbd517d 2662 flush_epd_write_bio(&epd);
771ed689
CM
2663 return ret;
2664}
d1310b2e
CM
2665
2666int extent_writepages(struct extent_io_tree *tree,
2667 struct address_space *mapping,
2668 get_extent_t *get_extent,
2669 struct writeback_control *wbc)
2670{
2671 int ret = 0;
2672 struct extent_page_data epd = {
2673 .bio = NULL,
2674 .tree = tree,
2675 .get_extent = get_extent,
771ed689 2676 .extent_locked = 0,
ffbd517d 2677 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
2678 };
2679
4bef0848 2680 ret = extent_write_cache_pages(tree, mapping, wbc,
d2c3f4f6
CM
2681 __extent_writepage, &epd,
2682 flush_write_bio);
ffbd517d 2683 flush_epd_write_bio(&epd);
d1310b2e
CM
2684 return ret;
2685}
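/*
 * Sketch of an address_space ->writepages op built on the helper above;
 * modelled on the btrfs wrapper in inode.c, but example_writepages() and
 * the get_extent callback choice are assumptions, not part of this file.
 */
static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(mapping->host)->io_tree;
	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
}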
d1310b2e
CM
2686
2687int extent_readpages(struct extent_io_tree *tree,
2688 struct address_space *mapping,
2689 struct list_head *pages, unsigned nr_pages,
2690 get_extent_t get_extent)
2691{
2692 struct bio *bio = NULL;
2693 unsigned page_idx;
c8b97818 2694 unsigned long bio_flags = 0;
d1310b2e 2695
d1310b2e
CM
2696 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2697 struct page *page = list_entry(pages->prev, struct page, lru);
2698
2699 prefetchw(&page->flags);
2700 list_del(&page->lru);
28ecb609 2701 if (!add_to_page_cache_lru(page, mapping,
43e817a1 2702 page->index, GFP_NOFS)) {
f188591e 2703 __extent_read_full_page(tree, page, get_extent,
c8b97818 2704 &bio, 0, &bio_flags);
d1310b2e
CM
2705 }
2706 page_cache_release(page);
2707 }
d1310b2e
CM
2708 BUG_ON(!list_empty(pages));
2709 if (bio)
c8b97818 2710 submit_one_bio(READ, bio, 0, bio_flags);
d1310b2e
CM
2711 return 0;
2712}
d1310b2e
CM
2713
2714/*
2715 * basic invalidatepage code, this waits on any locked or writeback
2716 * ranges corresponding to the page, and then deletes any extent state
2717 * records from the tree
2718 */
2719int extent_invalidatepage(struct extent_io_tree *tree,
2720 struct page *page, unsigned long offset)
2721{
2ac55d41 2722 struct extent_state *cached_state = NULL;
d1310b2e
CM
2723 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2724 u64 end = start + PAGE_CACHE_SIZE - 1;
2725 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2726
d397712b 2727 start += (offset + blocksize - 1) & ~(blocksize - 1);
d1310b2e
CM
2728 if (start > end)
2729 return 0;
2730
2ac55d41 2731 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
1edbb734 2732 wait_on_page_writeback(page);
d1310b2e 2733 clear_extent_bit(tree, start, end,
32c00aff
JB
2734 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2735 EXTENT_DO_ACCOUNTING,
2ac55d41 2736 1, 1, &cached_state, GFP_NOFS);
d1310b2e
CM
2737 return 0;
2738}
d1310b2e
CM
2739
2740/*
2741 * simple commit_write call, set_range_dirty is used to mark both
2742 * the pages and the extent records as dirty
2743 */
2744int extent_commit_write(struct extent_io_tree *tree,
2745 struct inode *inode, struct page *page,
2746 unsigned from, unsigned to)
2747{
2748 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2749
2750 set_page_extent_mapped(page);
2751 set_page_dirty(page);
2752
2753 if (pos > inode->i_size) {
2754 i_size_write(inode, pos);
2755 mark_inode_dirty(inode);
2756 }
2757 return 0;
2758}
d1310b2e
CM
2759
2760int extent_prepare_write(struct extent_io_tree *tree,
2761 struct inode *inode, struct page *page,
2762 unsigned from, unsigned to, get_extent_t *get_extent)
2763{
2764 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2765 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2766 u64 block_start;
2767 u64 orig_block_start;
2768 u64 block_end;
2769 u64 cur_end;
2770 struct extent_map *em;
2771 unsigned blocksize = 1 << inode->i_blkbits;
2772 size_t page_offset = 0;
2773 size_t block_off_start;
2774 size_t block_off_end;
2775 int err = 0;
2776 int iocount = 0;
2777 int ret = 0;
2778 int isnew;
2779
2780 set_page_extent_mapped(page);
2781
2782 block_start = (page_start + from) & ~((u64)blocksize - 1);
2783 block_end = (page_start + to - 1) | (blocksize - 1);
2784 orig_block_start = block_start;
2785
2786 lock_extent(tree, page_start, page_end, GFP_NOFS);
d397712b 2787 while (block_start <= block_end) {
d1310b2e
CM
2788 em = get_extent(inode, page, page_offset, block_start,
2789 block_end - block_start + 1, 1);
d397712b 2790 if (IS_ERR(em) || !em)
d1310b2e 2791 goto err;
d397712b 2792
d1310b2e
CM
2793 cur_end = min(block_end, extent_map_end(em) - 1);
2794 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2795 block_off_end = block_off_start + blocksize;
2796 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2797
2798 if (!PageUptodate(page) && isnew &&
2799 (block_off_end > to || block_off_start < from)) {
2800 void *kaddr;
2801
2802 kaddr = kmap_atomic(page, KM_USER0);
2803 if (block_off_end > to)
2804 memset(kaddr + to, 0, block_off_end - to);
2805 if (block_off_start < from)
2806 memset(kaddr + block_off_start, 0,
2807 from - block_off_start);
2808 flush_dcache_page(page);
2809 kunmap_atomic(kaddr, KM_USER0);
2810 }
2811 if ((em->block_start != EXTENT_MAP_HOLE &&
2812 em->block_start != EXTENT_MAP_INLINE) &&
2813 !isnew && !PageUptodate(page) &&
2814 (block_off_end > to || block_off_start < from) &&
2815 !test_range_bit(tree, block_start, cur_end,
9655d298 2816 EXTENT_UPTODATE, 1, NULL)) {
d1310b2e
CM
2817 u64 sector;
2818 u64 extent_offset = block_start - em->start;
2819 size_t iosize;
2820 sector = (em->block_start + extent_offset) >> 9;
2821 iosize = (cur_end - block_start + blocksize) &
2822 ~((u64)blocksize - 1);
2823 /*
2824 * we've already got the extent locked, but we
2825 * need to split the state such that our end_bio
2826 * handler can clear the lock.
2827 */
2828 set_extent_bit(tree, block_start,
2829 block_start + iosize - 1,
2c64c53d 2830 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
d1310b2e
CM
2831 ret = submit_extent_page(READ, tree, page,
2832 sector, iosize, page_offset, em->bdev,
2833 NULL, 1,
c8b97818
CM
2834 end_bio_extent_preparewrite, 0,
2835 0, 0);
411fc6bc
AK
2836 if (ret && !err)
2837 err = ret;
d1310b2e
CM
2838 iocount++;
2839 block_start = block_start + iosize;
2840 } else {
507903b8
AJ
2841 struct extent_state *cached = NULL;
2842
2843 set_extent_uptodate(tree, block_start, cur_end, &cached,
d1310b2e 2844 GFP_NOFS);
507903b8
AJ
2845 unlock_extent_cached(tree, block_start, cur_end,
2846 &cached, GFP_NOFS);
d1310b2e
CM
2847 block_start = cur_end + 1;
2848 }
2849 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2850 free_extent_map(em);
2851 }
2852 if (iocount) {
2853 wait_extent_bit(tree, orig_block_start,
2854 block_end, EXTENT_LOCKED);
2855 }
2856 check_page_uptodate(tree, page);
2857err:
2858 /* FIXME, zero out newly allocated blocks on error */
2859 return err;
2860}
d1310b2e 2861
7b13b7b1
CM
2862/*
2863 * a helper for releasepage, this tests for areas of the page that
2864 * are locked or under IO and drops the related state bits if it is safe
2865 * to drop the page.
2866 */
2867int try_release_extent_state(struct extent_map_tree *map,
2868 struct extent_io_tree *tree, struct page *page,
2869 gfp_t mask)
2870{
2871 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2872 u64 end = start + PAGE_CACHE_SIZE - 1;
2873 int ret = 1;
2874
211f90e6 2875 if (test_range_bit(tree, start, end,
8b62b72b 2876 EXTENT_IOBITS, 0, NULL))
7b13b7b1
CM
2877 ret = 0;
2878 else {
2879 if ((mask & GFP_NOFS) == GFP_NOFS)
2880 mask = GFP_NOFS;
11ef160f
CM
2881 /*
2882 * at this point we can safely clear everything except the
2883 * locked bit and the nodatasum bit
2884 */
e3f24cc5 2885 ret = clear_extent_bit(tree, start, end,
11ef160f
CM
2886 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2887 0, 0, NULL, mask);
e3f24cc5
CM
2888
2889 /* if clear_extent_bit failed for enomem reasons,
2890 * we can't allow the release to continue.
2891 */
2892 if (ret < 0)
2893 ret = 0;
2894 else
2895 ret = 1;
7b13b7b1
CM
2896 }
2897 return ret;
2898}
7b13b7b1 2899
d1310b2e
CM
2900/*
2901 * a helper for releasepage. As long as there are no locked extents
2902 * in the range corresponding to the page, both state records and extent
2903 * map records are removed
2904 */
2905int try_release_extent_mapping(struct extent_map_tree *map,
70dec807
CM
2906 struct extent_io_tree *tree, struct page *page,
2907 gfp_t mask)
d1310b2e
CM
2908{
2909 struct extent_map *em;
2910 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2911 u64 end = start + PAGE_CACHE_SIZE - 1;
7b13b7b1 2912
70dec807
CM
2913 if ((mask & __GFP_WAIT) &&
2914 page->mapping->host->i_size > 16 * 1024 * 1024) {
39b5637f 2915 u64 len;
70dec807 2916 while (start <= end) {
39b5637f 2917 len = end - start + 1;
890871be 2918 write_lock(&map->lock);
39b5637f 2919 em = lookup_extent_mapping(map, start, len);
70dec807 2920 if (!em || IS_ERR(em)) {
890871be 2921 write_unlock(&map->lock);
70dec807
CM
2922 break;
2923 }
7f3c74fb
CM
2924 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2925 em->start != start) {
890871be 2926 write_unlock(&map->lock);
70dec807
CM
2927 free_extent_map(em);
2928 break;
2929 }
2930 if (!test_range_bit(tree, em->start,
2931 extent_map_end(em) - 1,
8b62b72b 2932 EXTENT_LOCKED | EXTENT_WRITEBACK,
9655d298 2933 0, NULL)) {
70dec807
CM
2934 remove_extent_mapping(map, em);
2935 /* once for the rb tree */
2936 free_extent_map(em);
2937 }
2938 start = extent_map_end(em);
890871be 2939 write_unlock(&map->lock);
70dec807
CM
2940
2941 /* once for us */
d1310b2e
CM
2942 free_extent_map(em);
2943 }
d1310b2e 2944 }
7b13b7b1 2945 return try_release_extent_state(map, tree, page, mask);
d1310b2e 2946}
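/*
 * Sketch of how a ->releasepage op uses the helper above: when the extent
 * state and extent map records could be dropped, the page's private state
 * is torn down so the VM may free the page.  Modelled on the btrfs wrapper
 * in inode.c; example_releasepage() and the exact teardown steps are
 * assumptions made for illustration.
 */
static int example_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *map = &BTRFS_I(inode)->extent_tree;
	int ret;

	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
	if (ret == 1) {
		/* drop the reference taken in set_page_extent_mapped() */
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
	return ret;
}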
d1310b2e
CM
2947
2948sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2949 get_extent_t *get_extent)
2950{
2951 struct inode *inode = mapping->host;
2ac55d41 2952 struct extent_state *cached_state = NULL;
d1310b2e
CM
2953 u64 start = iblock << inode->i_blkbits;
2954 sector_t sector = 0;
d899e052 2955 size_t blksize = (1 << inode->i_blkbits);
d1310b2e
CM
2956 struct extent_map *em;
2957
2ac55d41
JB
2958 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2959 0, &cached_state, GFP_NOFS);
d899e052 2960 em = get_extent(inode, NULL, 0, start, blksize, 0);
2ac55d41
JB
2961 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2962 start + blksize - 1, &cached_state, GFP_NOFS);
d1310b2e
CM
2963 if (!em || IS_ERR(em))
2964 return 0;
2965
d899e052 2966 if (em->block_start > EXTENT_MAP_LAST_BYTE)
d1310b2e
CM
2967 goto out;
2968
2969 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
d1310b2e
CM
2970out:
2971 free_extent_map(em);
2972 return sector;
2973}
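/*
 * Worked example of the sector math above, with 4K blocks
 * (i_blkbits == 12): for iblock 5, start == 20480.  If the extent maps
 * file offset 16384 to disk byte 1048576, the result is
 *   (1048576 + 20480 - 16384) >> 12 == 257.
 */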
2974
ec29ed5b
CM
2975/*
2976 * helper function for fiemap, which doesn't want to see any holes.
2977 * This maps until we find something past 'last'
2978 */
2979static struct extent_map *get_extent_skip_holes(struct inode *inode,
2980 u64 offset,
2981 u64 last,
2982 get_extent_t *get_extent)
2983{
2984 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2985 struct extent_map *em;
2986 u64 len;
2987
2988 if (offset >= last)
2989 return NULL;
2990
 2991 while (1) {
2992 len = last - offset;
2993 if (len == 0)
2994 break;
2995 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2996 em = get_extent(inode, NULL, 0, offset, len, 0);
2997 if (!em || IS_ERR(em))
2998 return em;
2999
3000 /* if this isn't a hole return it */
3001 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3002 em->block_start != EXTENT_MAP_HOLE) {
3003 return em;
3004 }
3005
3006 /* this is a hole, advance to the next extent */
3007 offset = extent_map_end(em);
3008 free_extent_map(em);
3009 if (offset >= last)
3010 break;
3011 }
3012 return NULL;
3013}
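/*
 * Worked example of the rounding above: with a 4K sectorsize, a remaining
 * len of 5000 bytes is rounded up to (5000 + 4095) & ~4095 == 8192, so
 * get_extent is always asked for whole sectors.
 */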
3014
1506fcc8
YS
3015int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3016 __u64 start, __u64 len, get_extent_t *get_extent)
3017{
975f84fe 3018 int ret = 0;
1506fcc8
YS
3019 u64 off = start;
3020 u64 max = start + len;
3021 u32 flags = 0;
975f84fe
JB
3022 u32 found_type;
3023 u64 last;
ec29ed5b 3024 u64 last_for_get_extent = 0;
1506fcc8 3025 u64 disko = 0;
ec29ed5b 3026 u64 isize = i_size_read(inode);
975f84fe 3027 struct btrfs_key found_key;
1506fcc8 3028 struct extent_map *em = NULL;
2ac55d41 3029 struct extent_state *cached_state = NULL;
975f84fe
JB
3030 struct btrfs_path *path;
3031 struct btrfs_file_extent_item *item;
1506fcc8 3032 int end = 0;
ec29ed5b
CM
3033 u64 em_start = 0;
3034 u64 em_len = 0;
3035 u64 em_end = 0;
1506fcc8 3036 unsigned long emflags;
1506fcc8
YS
3037
3038 if (len == 0)
3039 return -EINVAL;
3040
975f84fe
JB
3041 path = btrfs_alloc_path();
3042 if (!path)
3043 return -ENOMEM;
3044 path->leave_spinning = 1;
3045
ec29ed5b
CM
3046 /*
3047 * lookup the last file extent. We're not using i_size here
3048 * because there might be preallocation past i_size
3049 */
975f84fe
JB
3050 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3051 path, inode->i_ino, -1, 0);
3052 if (ret < 0) {
3053 btrfs_free_path(path);
3054 return ret;
3055 }
3056 WARN_ON(!ret);
3057 path->slots[0]--;
3058 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3059 struct btrfs_file_extent_item);
3060 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3061 found_type = btrfs_key_type(&found_key);
3062
ec29ed5b 3063 /* No extents, but there might be delalloc bits */
975f84fe
JB
3064 if (found_key.objectid != inode->i_ino ||
3065 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
3066 /* have to trust i_size as the end */
3067 last = (u64)-1;
3068 last_for_get_extent = isize;
3069 } else {
3070 /*
3071 * remember the start of the last extent. There are a
3072 * bunch of different factors that go into the length of the
 3073 * extent, so it's much less complex to remember where it started
3074 */
3075 last = found_key.offset;
3076 last_for_get_extent = last + 1;
975f84fe 3077 }
975f84fe
JB
3078 btrfs_free_path(path);
3079
ec29ed5b
CM
3080 /*
3081 * we might have some extents allocated but more delalloc past those
3082 * extents. so, we trust isize unless the start of the last extent is
3083 * beyond isize
3084 */
3085 if (last < isize) {
3086 last = (u64)-1;
3087 last_for_get_extent = isize;
3088 }
3089
2ac55d41
JB
3090 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3091 &cached_state, GFP_NOFS);
ec29ed5b
CM
3092
3093 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3094 get_extent);
1506fcc8
YS
3095 if (!em)
3096 goto out;
3097 if (IS_ERR(em)) {
3098 ret = PTR_ERR(em);
3099 goto out;
3100 }
975f84fe 3101
1506fcc8 3102 while (!end) {
ea8efc74
CM
3103 u64 offset_in_extent;
3104
3105 /* break if the extent we found is outside the range */
3106 if (em->start >= max || extent_map_end(em) < off)
3107 break;
3108
3109 /*
3110 * get_extent may return an extent that starts before our
3111 * requested range. We have to make sure the ranges
3112 * we return to fiemap always move forward and don't
3113 * overlap, so adjust the offsets here
3114 */
3115 em_start = max(em->start, off);
1506fcc8 3116
ea8efc74
CM
3117 /*
3118 * record the offset from the start of the extent
3119 * for adjusting the disk offset below
3120 */
3121 offset_in_extent = em_start - em->start;
ec29ed5b 3122 em_end = extent_map_end(em);
ea8efc74 3123 em_len = em_end - em_start;
ec29ed5b 3124 emflags = em->flags;
1506fcc8
YS
3125 disko = 0;
3126 flags = 0;
3127
ea8efc74
CM
3128 /*
3129 * bump off for our next call to get_extent
3130 */
3131 off = extent_map_end(em);
3132 if (off >= max)
3133 end = 1;
3134
93dbfad7 3135 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
3136 end = 1;
3137 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 3138 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
3139 flags |= (FIEMAP_EXTENT_DATA_INLINE |
3140 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 3141 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
3142 flags |= (FIEMAP_EXTENT_DELALLOC |
3143 FIEMAP_EXTENT_UNKNOWN);
93dbfad7 3144 } else {
ea8efc74 3145 disko = em->block_start + offset_in_extent;
1506fcc8
YS
3146 }
3147 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3148 flags |= FIEMAP_EXTENT_ENCODED;
3149
1506fcc8
YS
3150 free_extent_map(em);
3151 em = NULL;
ec29ed5b
CM
3152 if ((em_start >= last) || em_len == (u64)-1 ||
3153 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
3154 flags |= FIEMAP_EXTENT_LAST;
3155 end = 1;
3156 }
3157
ec29ed5b
CM
3158 /* now scan forward to see if this is really the last extent. */
3159 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3160 get_extent);
3161 if (IS_ERR(em)) {
3162 ret = PTR_ERR(em);
3163 goto out;
3164 }
3165 if (!em) {
975f84fe
JB
3166 flags |= FIEMAP_EXTENT_LAST;
3167 end = 1;
3168 }
ec29ed5b
CM
3169 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3170 em_len, flags);
3171 if (ret)
3172 goto out_free;
1506fcc8
YS
3173 }
3174out_free:
3175 free_extent_map(em);
3176out:
2ac55d41
JB
3177 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3178 &cached_state, GFP_NOFS);
1506fcc8
YS
3179 return ret;
3180}
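/*
 * Sketch of the ioctl-side hookup: a ->fiemap op just forwards to the
 * function above with a get_extent callback.  example_fiemap() is a
 * made-up name, and the callback passed here is an assumption (btrfs
 * uses a fiemap-specific variant of btrfs_get_extent in practice).
 */
static int example_fiemap(struct inode *inode,
			  struct fiemap_extent_info *fieinfo,
			  __u64 start, __u64 len)
{
	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
}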
3181
d1310b2e
CM
3182static inline struct page *extent_buffer_page(struct extent_buffer *eb,
3183 unsigned long i)
3184{
3185 struct page *p;
3186 struct address_space *mapping;
3187
3188 if (i == 0)
3189 return eb->first_page;
3190 i += eb->start >> PAGE_CACHE_SHIFT;
3191 mapping = eb->first_page->mapping;
33958dc6
CM
3192 if (!mapping)
3193 return NULL;
0ee0fda0
SW
3194
3195 /*
3196 * extent_buffer_page is only called after pinning the page
3197 * by increasing the reference count. So we know the page must
3198 * be in the radix tree.
3199 */
0ee0fda0 3200 rcu_read_lock();
d1310b2e 3201 p = radix_tree_lookup(&mapping->page_tree, i);
0ee0fda0 3202 rcu_read_unlock();
2b1f55b0 3203
d1310b2e
CM
3204 return p;
3205}
3206
6af118ce 3207static inline unsigned long num_extent_pages(u64 start, u64 len)
728131d8 3208{
6af118ce
CM
3209 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3210 (start >> PAGE_CACHE_SHIFT);
728131d8
CM
3211}
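/*
 * Worked example, assuming 4K pages: a range at start == 6144 with
 * len == 4096 ends at byte 10239, so it straddles two pages:
 *   ((6144 + 4096 + 4095) >> 12) - (6144 >> 12) == 3 - 1 == 2.
 */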
3212
d1310b2e
CM
3213static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3214 u64 start,
3215 unsigned long len,
3216 gfp_t mask)
3217{
3218 struct extent_buffer *eb = NULL;
3935127c 3219#if LEAK_DEBUG
2d2ae547 3220 unsigned long flags;
4bef0848 3221#endif
d1310b2e 3222
d1310b2e 3223 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
91ca338d
TI
3224 if (eb == NULL)
3225 return NULL;
d1310b2e
CM
3226 eb->start = start;
3227 eb->len = len;
b4ce94de
CM
3228 spin_lock_init(&eb->lock);
3229 init_waitqueue_head(&eb->lock_wq);
3230
3935127c 3231#if LEAK_DEBUG
2d2ae547
CM
3232 spin_lock_irqsave(&leak_lock, flags);
3233 list_add(&eb->leak_list, &buffers);
3234 spin_unlock_irqrestore(&leak_lock, flags);
4bef0848 3235#endif
d1310b2e
CM
3236 atomic_set(&eb->refs, 1);
3237
3238 return eb;
3239}
3240
3241static void __free_extent_buffer(struct extent_buffer *eb)
3242{
3935127c 3243#if LEAK_DEBUG
2d2ae547
CM
3244 unsigned long flags;
3245 spin_lock_irqsave(&leak_lock, flags);
3246 list_del(&eb->leak_list);
3247 spin_unlock_irqrestore(&leak_lock, flags);
4bef0848 3248#endif
d1310b2e
CM
3249 kmem_cache_free(extent_buffer_cache, eb);
3250}
3251
897ca6e9
MX
3252/*
 3253 * Helper for releasing extent buffer pages.
3254 */
3255static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3256 unsigned long start_idx)
3257{
3258 unsigned long index;
3259 struct page *page;
3260
3261 if (!eb->first_page)
3262 return;
3263
3264 index = num_extent_pages(eb->start, eb->len);
3265 if (start_idx >= index)
3266 return;
3267
3268 do {
3269 index--;
3270 page = extent_buffer_page(eb, index);
3271 if (page)
3272 page_cache_release(page);
3273 } while (index != start_idx);
3274}
3275
3276/*
3277 * Helper for releasing the extent buffer.
3278 */
3279static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3280{
3281 btrfs_release_extent_buffer_page(eb, 0);
3282 __free_extent_buffer(eb);
3283}
3284
d1310b2e
CM
3285struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3286 u64 start, unsigned long len,
3287 struct page *page0,
3288 gfp_t mask)
3289{
3290 unsigned long num_pages = num_extent_pages(start, len);
3291 unsigned long i;
3292 unsigned long index = start >> PAGE_CACHE_SHIFT;
3293 struct extent_buffer *eb;
6af118ce 3294 struct extent_buffer *exists = NULL;
d1310b2e
CM
3295 struct page *p;
3296 struct address_space *mapping = tree->mapping;
3297 int uptodate = 1;
19fe0a8b 3298 int ret;
d1310b2e 3299
19fe0a8b
MX
3300 rcu_read_lock();
3301 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3302 if (eb && atomic_inc_not_zero(&eb->refs)) {
3303 rcu_read_unlock();
0f9dd46c 3304 mark_page_accessed(eb->first_page);
6af118ce
CM
3305 return eb;
3306 }
19fe0a8b 3307 rcu_read_unlock();
6af118ce 3308
d1310b2e 3309 eb = __alloc_extent_buffer(tree, start, len, mask);
2b114d1d 3310 if (!eb)
d1310b2e
CM
3311 return NULL;
3312
d1310b2e
CM
3313 if (page0) {
3314 eb->first_page = page0;
3315 i = 1;
3316 index++;
3317 page_cache_get(page0);
3318 mark_page_accessed(page0);
3319 set_page_extent_mapped(page0);
d1310b2e 3320 set_page_extent_head(page0, len);
f188591e 3321 uptodate = PageUptodate(page0);
d1310b2e
CM
3322 } else {
3323 i = 0;
3324 }
3325 for (; i < num_pages; i++, index++) {
3326 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3327 if (!p) {
3328 WARN_ON(1);
6af118ce 3329 goto free_eb;
d1310b2e
CM
3330 }
3331 set_page_extent_mapped(p);
3332 mark_page_accessed(p);
3333 if (i == 0) {
3334 eb->first_page = p;
3335 set_page_extent_head(p, len);
3336 } else {
3337 set_page_private(p, EXTENT_PAGE_PRIVATE);
3338 }
3339 if (!PageUptodate(p))
3340 uptodate = 0;
eb14ab8e
CM
3341
3342 /*
3343 * see below about how we avoid a nasty race with release page
3344 * and why we unlock later
3345 */
3346 if (i != 0)
3347 unlock_page(p);
d1310b2e
CM
3348 }
3349 if (uptodate)
b4ce94de 3350 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
d1310b2e 3351
19fe0a8b
MX
3352 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3353 if (ret)
3354 goto free_eb;
3355
6af118ce 3356 spin_lock(&tree->buffer_lock);
19fe0a8b
MX
3357 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3358 if (ret == -EEXIST) {
3359 exists = radix_tree_lookup(&tree->buffer,
3360 start >> PAGE_CACHE_SHIFT);
6af118ce
CM
3361 /* add one reference for the caller */
3362 atomic_inc(&exists->refs);
3363 spin_unlock(&tree->buffer_lock);
19fe0a8b 3364 radix_tree_preload_end();
6af118ce
CM
3365 goto free_eb;
3366 }
6af118ce
CM
3367 /* add one reference for the tree */
3368 atomic_inc(&eb->refs);
f044ba78 3369 spin_unlock(&tree->buffer_lock);
19fe0a8b 3370 radix_tree_preload_end();
eb14ab8e
CM
3371
3372 /*
3373 * there is a race where release page may have
3374 * tried to find this extent buffer in the radix
3375 * but failed. It will tell the VM it is safe to
3376 * reclaim the, and it will clear the page private bit.
3377 * We must make sure to set the page private bit properly
3378 * after the extent buffer is in the radix tree so
3379 * it doesn't get lost
3380 */
3381 set_page_extent_mapped(eb->first_page);
3382 set_page_extent_head(eb->first_page, eb->len);
3383 if (!page0)
3384 unlock_page(eb->first_page);
d1310b2e
CM
3385 return eb;
3386
6af118ce 3387free_eb:
eb14ab8e
CM
3388 if (eb->first_page && !page0)
3389 unlock_page(eb->first_page);
3390
d1310b2e 3391 if (!atomic_dec_and_test(&eb->refs))
6af118ce 3392 return exists;
897ca6e9 3393 btrfs_release_extent_buffer(eb);
6af118ce 3394 return exists;
d1310b2e 3395}
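/*
 * Sketch of a typical caller: metadata code allocates (or finds) the
 * buffer that covers a tree block by byte number.  This mirrors the
 * disk-io.c helpers, but example_find_create_tree_block() and the
 * btree_inode access pattern are assumptions made for illustration.
 */
static struct extent_buffer *
example_find_create_tree_block(struct btrfs_root *root, u64 bytenr,
			       u32 blocksize)
{
	struct inode *btree_inode = root->fs_info->btree_inode;

	return alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
				   bytenr, blocksize, NULL, GFP_NOFS);
}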
d1310b2e
CM
3396
3397struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3398 u64 start, unsigned long len,
3399 gfp_t mask)
3400{
d1310b2e 3401 struct extent_buffer *eb;
d1310b2e 3402
19fe0a8b
MX
3403 rcu_read_lock();
3404 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3405 if (eb && atomic_inc_not_zero(&eb->refs)) {
3406 rcu_read_unlock();
0f9dd46c 3407 mark_page_accessed(eb->first_page);
19fe0a8b
MX
3408 return eb;
3409 }
3410 rcu_read_unlock();
0f9dd46c 3411
19fe0a8b 3412 return NULL;
d1310b2e 3413}
d1310b2e
CM
3414
3415void free_extent_buffer(struct extent_buffer *eb)
3416{
d1310b2e
CM
3417 if (!eb)
3418 return;
3419
3420 if (!atomic_dec_and_test(&eb->refs))
3421 return;
3422
6af118ce 3423 WARN_ON(1);
d1310b2e 3424}
d1310b2e
CM
3425
3426int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3427 struct extent_buffer *eb)
3428{
d1310b2e
CM
3429 unsigned long i;
3430 unsigned long num_pages;
3431 struct page *page;
3432
d1310b2e
CM
3433 num_pages = num_extent_pages(eb->start, eb->len);
3434
3435 for (i = 0; i < num_pages; i++) {
3436 page = extent_buffer_page(eb, i);
b9473439 3437 if (!PageDirty(page))
d2c3f4f6
CM
3438 continue;
3439
a61e6f29 3440 lock_page(page);
eb14ab8e
CM
3441 WARN_ON(!PagePrivate(page));
3442
3443 set_page_extent_mapped(page);
d1310b2e
CM
3444 if (i == 0)
3445 set_page_extent_head(page, eb->len);
d1310b2e 3446
d1310b2e 3447 clear_page_dirty_for_io(page);
0ee0fda0 3448 spin_lock_irq(&page->mapping->tree_lock);
d1310b2e
CM
3449 if (!PageDirty(page)) {
3450 radix_tree_tag_clear(&page->mapping->page_tree,
3451 page_index(page),
3452 PAGECACHE_TAG_DIRTY);
3453 }
0ee0fda0 3454 spin_unlock_irq(&page->mapping->tree_lock);
a61e6f29 3455 unlock_page(page);
d1310b2e
CM
3456 }
3457 return 0;
3458}
d1310b2e
CM
3459
3460int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3461 struct extent_buffer *eb)
3462{
3463 return wait_on_extent_writeback(tree, eb->start,
3464 eb->start + eb->len - 1);
3465}
d1310b2e
CM
3466
3467int set_extent_buffer_dirty(struct extent_io_tree *tree,
3468 struct extent_buffer *eb)
3469{
3470 unsigned long i;
3471 unsigned long num_pages;
b9473439 3472 int was_dirty = 0;
d1310b2e 3473
b9473439 3474 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
d1310b2e 3475 num_pages = num_extent_pages(eb->start, eb->len);
b9473439 3476 for (i = 0; i < num_pages; i++)
d1310b2e 3477 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
b9473439 3478 return was_dirty;
d1310b2e 3479}
d1310b2e 3480
1259ab75 3481int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
2ac55d41
JB
3482 struct extent_buffer *eb,
3483 struct extent_state **cached_state)
1259ab75
CM
3484{
3485 unsigned long i;
3486 struct page *page;
3487 unsigned long num_pages;
3488
3489 num_pages = num_extent_pages(eb->start, eb->len);
b4ce94de 3490 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
1259ab75
CM
3491
3492 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2ac55d41 3493 cached_state, GFP_NOFS);
1259ab75
CM
3494 for (i = 0; i < num_pages; i++) {
3495 page = extent_buffer_page(eb, i);
33958dc6
CM
3496 if (page)
3497 ClearPageUptodate(page);
1259ab75
CM
3498 }
3499 return 0;
3500}
3501
d1310b2e
CM
3502int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3503 struct extent_buffer *eb)
3504{
3505 unsigned long i;
3506 struct page *page;
3507 unsigned long num_pages;
3508
3509 num_pages = num_extent_pages(eb->start, eb->len);
3510
3511 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
507903b8 3512 NULL, GFP_NOFS);
d1310b2e
CM
3513 for (i = 0; i < num_pages; i++) {
3514 page = extent_buffer_page(eb, i);
3515 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3516 ((i == num_pages - 1) &&
3517 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3518 check_page_uptodate(tree, page);
3519 continue;
3520 }
3521 SetPageUptodate(page);
3522 }
3523 return 0;
3524}
d1310b2e 3525
ce9adaa5
CM
3526int extent_range_uptodate(struct extent_io_tree *tree,
3527 u64 start, u64 end)
3528{
3529 struct page *page;
3530 int ret;
3531 int pg_uptodate = 1;
3532 int uptodate;
3533 unsigned long index;
3534
9655d298 3535 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
ce9adaa5
CM
3536 if (ret)
3537 return 1;
d397712b 3538 while (start <= end) {
ce9adaa5
CM
3539 index = start >> PAGE_CACHE_SHIFT;
3540 page = find_get_page(tree->mapping, index);
3541 uptodate = PageUptodate(page);
3542 page_cache_release(page);
3543 if (!uptodate) {
3544 pg_uptodate = 0;
3545 break;
3546 }
3547 start += PAGE_CACHE_SIZE;
3548 }
3549 return pg_uptodate;
3550}
3551
d1310b2e 3552int extent_buffer_uptodate(struct extent_io_tree *tree,
2ac55d41
JB
3553 struct extent_buffer *eb,
3554 struct extent_state *cached_state)
d1310b2e 3555{
728131d8 3556 int ret = 0;
ce9adaa5
CM
3557 unsigned long num_pages;
3558 unsigned long i;
728131d8
CM
3559 struct page *page;
3560 int pg_uptodate = 1;
3561
b4ce94de 3562 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4235298e 3563 return 1;
728131d8 3564
4235298e 3565 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2ac55d41 3566 EXTENT_UPTODATE, 1, cached_state);
4235298e
CM
3567 if (ret)
3568 return ret;
728131d8
CM
3569
3570 num_pages = num_extent_pages(eb->start, eb->len);
3571 for (i = 0; i < num_pages; i++) {
3572 page = extent_buffer_page(eb, i);
3573 if (!PageUptodate(page)) {
3574 pg_uptodate = 0;
3575 break;
3576 }
3577 }
4235298e 3578 return pg_uptodate;
d1310b2e 3579}
d1310b2e
CM
3580
3581int read_extent_buffer_pages(struct extent_io_tree *tree,
3582 struct extent_buffer *eb,
a86c12c7 3583 u64 start, int wait,
f188591e 3584 get_extent_t *get_extent, int mirror_num)
d1310b2e
CM
3585{
3586 unsigned long i;
3587 unsigned long start_i;
3588 struct page *page;
3589 int err;
3590 int ret = 0;
ce9adaa5
CM
3591 int locked_pages = 0;
3592 int all_uptodate = 1;
3593 int inc_all_pages = 0;
d1310b2e 3594 unsigned long num_pages;
a86c12c7 3595 struct bio *bio = NULL;
c8b97818 3596 unsigned long bio_flags = 0;
a86c12c7 3597
b4ce94de 3598 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
3599 return 0;
3600
ce9adaa5 3601 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
9655d298 3602 EXTENT_UPTODATE, 1, NULL)) {
d1310b2e
CM
3603 return 0;
3604 }
3605
3606 if (start) {
3607 WARN_ON(start < eb->start);
3608 start_i = (start >> PAGE_CACHE_SHIFT) -
3609 (eb->start >> PAGE_CACHE_SHIFT);
3610 } else {
3611 start_i = 0;
3612 }
3613
3614 num_pages = num_extent_pages(eb->start, eb->len);
3615 for (i = start_i; i < num_pages; i++) {
3616 page = extent_buffer_page(eb, i);
d1310b2e 3617 if (!wait) {
2db04966 3618 if (!trylock_page(page))
ce9adaa5 3619 goto unlock_exit;
d1310b2e
CM
3620 } else {
3621 lock_page(page);
3622 }
ce9adaa5 3623 locked_pages++;
d397712b 3624 if (!PageUptodate(page))
ce9adaa5 3625 all_uptodate = 0;
ce9adaa5
CM
3626 }
3627 if (all_uptodate) {
3628 if (start_i == 0)
b4ce94de 3629 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
3630 goto unlock_exit;
3631 }
3632
3633 for (i = start_i; i < num_pages; i++) {
3634 page = extent_buffer_page(eb, i);
eb14ab8e
CM
3635
3636 WARN_ON(!PagePrivate(page));
3637
3638 set_page_extent_mapped(page);
3639 if (i == 0)
3640 set_page_extent_head(page, eb->len);
3641
ce9adaa5
CM
3642 if (inc_all_pages)
3643 page_cache_get(page);
3644 if (!PageUptodate(page)) {
3645 if (start_i == 0)
3646 inc_all_pages = 1;
f188591e 3647 ClearPageError(page);
a86c12c7 3648 err = __extent_read_full_page(tree, page,
f188591e 3649 get_extent, &bio,
c8b97818 3650 mirror_num, &bio_flags);
d397712b 3651 if (err)
d1310b2e 3652 ret = err;
d1310b2e
CM
3653 } else {
3654 unlock_page(page);
3655 }
3656 }
3657
a86c12c7 3658 if (bio)
c8b97818 3659 submit_one_bio(READ, bio, mirror_num, bio_flags);
a86c12c7 3660
d397712b 3661 if (ret || !wait)
d1310b2e 3662 return ret;
d397712b 3663
d1310b2e
CM
3664 for (i = start_i; i < num_pages; i++) {
3665 page = extent_buffer_page(eb, i);
3666 wait_on_page_locked(page);
d397712b 3667 if (!PageUptodate(page))
d1310b2e 3668 ret = -EIO;
d1310b2e 3669 }
d397712b 3670
d1310b2e 3671 if (!ret)
b4ce94de 3672 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
d1310b2e 3673 return ret;
ce9adaa5
CM
3674
3675unlock_exit:
3676 i = start_i;
d397712b 3677 while (locked_pages > 0) {
ce9adaa5
CM
3678 page = extent_buffer_page(eb, i);
3679 i++;
3680 unlock_page(page);
3681 locked_pages--;
3682 }
3683 return ret;
d1310b2e 3684}
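/*
 * Editor's note with example (sketch, not from the original source): the
 * wait argument above controls blocking behaviour.  With wait == 0 the
 * function bails out if any page cannot be trylocked and otherwise only
 * starts the reads; with wait == 1 it locks every page, submits reads for
 * the non-uptodate ones, waits for them and returns -EIO if any page is
 * still not uptodate.  A blocking read of a whole buffer might look like
 * this, with my_get_extent standing in for the filesystem's get_extent_t
 * callback:
 *
 *	err = read_extent_buffer_pages(tree, eb, 0, 1, my_get_extent, 0);
 *	if (err)
 *		return err;
 */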
d1310b2e
CM
3685
3686void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3687 unsigned long start,
3688 unsigned long len)
3689{
3690 size_t cur;
3691 size_t offset;
3692 struct page *page;
3693 char *kaddr;
3694 char *dst = (char *)dstv;
3695 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3696 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
d1310b2e
CM
3697
3698 WARN_ON(start > eb->len);
3699 WARN_ON(start + len > eb->start + eb->len);
3700
3701 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3702
d397712b 3703 while (len > 0) {
d1310b2e 3704 page = extent_buffer_page(eb, i);
d1310b2e
CM
3705
3706 cur = min(len, (PAGE_CACHE_SIZE - offset));
3707 kaddr = kmap_atomic(page, KM_USER1);
3708 memcpy(dst, kaddr + offset, cur);
3709 kunmap_atomic(kaddr, KM_USER1);
3710
3711 dst += cur;
3712 len -= cur;
3713 offset = 0;
3714 i++;
3715 }
3716}
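/*
 * Editor's example (sketch, not from the original source):
 * read_extent_buffer() copies a byte range of the (possibly multi-page)
 * buffer into ordinary memory, hiding the page-by-page kmap_atomic()
 * walk from the caller.  start is relative to the beginning of the
 * buffer, not to eb->start.  The destination array below is invented for
 * the illustration:
 *
 *	u8 header[32];
 *
 *	read_extent_buffer(eb, header, 0, sizeof(header));
 */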
d1310b2e
CM
3717
3718int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3719 unsigned long min_len, char **token, char **map,
3720 unsigned long *map_start,
3721 unsigned long *map_len, int km)
3722{
3723 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3724 char *kaddr;
3725 struct page *p;
3726 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3727 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3728 unsigned long end_i = (start_offset + start + min_len - 1) >>
3729 PAGE_CACHE_SHIFT;
3730
3731 if (i != end_i)
3732 return -EINVAL;
3733
3734 if (i == 0) {
3735 offset = start_offset;
3736 *map_start = 0;
3737 } else {
3738 offset = 0;
3739 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3740 }
d397712b 3741
d1310b2e 3742 if (start + min_len > eb->len) {
d397712b
CM
3743 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3744 "wanted %lu %lu\n", (unsigned long long)eb->start,
3745 eb->len, start, min_len);
d1310b2e 3746 WARN_ON(1);
85026533 3747 return -EINVAL;
d1310b2e
CM
3748 }
3749
3750 p = extent_buffer_page(eb, i);
d1310b2e
CM
3751 kaddr = kmap_atomic(p, km);
3752 *token = kaddr;
3753 *map = kaddr + offset;
3754 *map_len = PAGE_CACHE_SIZE - offset;
3755 return 0;
3756}
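/*
 * Editor's example (sketch, not from the original source):
 * map_private_extent_buffer() only succeeds when the requested range fits
 * inside a single page; it returns -EINVAL if the range straddles a page
 * boundary, so callers usually fall back to read_extent_buffer() in that
 * case.  The field offset below is a placeholder:
 *
 *	char *token, *kaddr;
 *	unsigned long map_start, map_len;
 *	u64 val;
 *
 *	if (!map_private_extent_buffer(eb, offset, sizeof(u64), &token,
 *				       &kaddr, &map_start, &map_len,
 *				       KM_USER1)) {
 *		memcpy(&val, kaddr, sizeof(val));
 *		unmap_extent_buffer(eb, token, KM_USER1);
 *	} else {
 *		read_extent_buffer(eb, &val, offset, sizeof(val));
 *	}
 *	val = le64_to_cpu(val);
 */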
d1310b2e
CM
3757
3758int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3759 unsigned long min_len,
3760 char **token, char **map,
3761 unsigned long *map_start,
3762 unsigned long *map_len, int km)
3763{
3764 int err;
3765 int save = 0;
3766 if (eb->map_token) {
3767 unmap_extent_buffer(eb, eb->map_token, km);
3768 eb->map_token = NULL;
3769 save = 1;
3770 }
3771 err = map_private_extent_buffer(eb, start, min_len, token, map,
3772 map_start, map_len, km);
3773 if (!err && save) {
3774 eb->map_token = *token;
3775 eb->kaddr = *map;
3776 eb->map_start = *map_start;
3777 eb->map_len = *map_len;
3778 }
3779 return err;
3780}
d1310b2e
CM
3781
3782void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3783{
3784 kunmap_atomic(token, km);
3785}
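/*
 * Editor's note with example (sketch, not from the original source):
 * map_extent_buffer() differs from the private variant in that it first
 * drops a mapping already cached in the eb (eb->map_token) and, if it
 * dropped one, caches the new token, kaddr, map_start and map_len in the
 * eb so later accesses to the same page can reuse them.  On success *map
 * points at the requested offset and *map_len bytes are addressable from
 * it:
 *
 *	err = map_extent_buffer(eb, offset, sizeof(u32), &token, &kaddr,
 *				&map_start, &map_len, KM_USER0);
 *
 * offset, token, kaddr, map_start and map_len are caller-provided names
 * made up for the sketch.
 */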
d1310b2e
CM
3786
3787int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3788 unsigned long start,
3789 unsigned long len)
3790{
3791 size_t cur;
3792 size_t offset;
3793 struct page *page;
3794 char *kaddr;
3795 char *ptr = (char *)ptrv;
3796 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3797 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3798 int ret = 0;
3799
3800 WARN_ON(start > eb->len);
3801 WARN_ON(start + len > eb->start + eb->len);
3802
3803 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3804
d397712b 3805 while (len > 0) {
d1310b2e 3806 page = extent_buffer_page(eb, i);
d1310b2e
CM
3807
3808 cur = min(len, (PAGE_CACHE_SIZE - offset));
3809
3810 kaddr = kmap_atomic(page, KM_USER0);
3811 ret = memcmp(ptr, kaddr + offset, cur);
3812 kunmap_atomic(kaddr, KM_USER0);
3813 if (ret)
3814 break;
3815
3816 ptr += cur;
3817 len -= cur;
3818 offset = 0;
3819 i++;
3820 }
3821 return ret;
3822}
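/*
 * Editor's example (sketch, not from the original source):
 * memcmp_extent_buffer() compares ordinary memory against a byte range of
 * the buffer without copying it out first and follows the memcmp()
 * convention (0 means equal).  The key variable and offset are invented
 * for the illustration:
 *
 *	struct btrfs_disk_key disk_key;
 *
 *	if (memcmp_extent_buffer(eb, &disk_key, offset,
 *				 sizeof(disk_key)) == 0) {
 *		... keys match ...
 *	}
 */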
d1310b2e
CM
3823
3824void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3825 unsigned long start, unsigned long len)
3826{
3827 size_t cur;
3828 size_t offset;
3829 struct page *page;
3830 char *kaddr;
3831 char *src = (char *)srcv;
3832 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3833 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3834
3835 WARN_ON(start > eb->len);
3836 WARN_ON(start + len > eb->start + eb->len);
3837
3838 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3839
d397712b 3840 while (len > 0) {
d1310b2e
CM
3841 page = extent_buffer_page(eb, i);
3842 WARN_ON(!PageUptodate(page));
3843
3844 cur = min(len, PAGE_CACHE_SIZE - offset);
3845 kaddr = kmap_atomic(page, KM_USER1);
3846 memcpy(kaddr + offset, src, cur);
3847 kunmap_atomic(kaddr, KM_USER1);
3848
3849 src += cur;
3850 len -= cur;
3851 offset = 0;
3852 i++;
3853 }
3854}
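/*
 * Editor's example (sketch, not from the original source):
 * write_extent_buffer() is the store-side twin of read_extent_buffer();
 * note the WARN_ON(!PageUptodate(page)) above, i.e. callers are expected
 * to write only into buffers whose pages are already uptodate (freshly
 * allocated or previously read).  The source structure and offset are
 * placeholders:
 *
 *	struct btrfs_disk_key disk_key;
 *
 *	... fill in disk_key ...
 *	write_extent_buffer(eb, &disk_key, offset, sizeof(disk_key));
 */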
d1310b2e
CM
3855
3856void memset_extent_buffer(struct extent_buffer *eb, char c,
3857 unsigned long start, unsigned long len)
3858{
3859 size_t cur;
3860 size_t offset;
3861 struct page *page;
3862 char *kaddr;
3863 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3864 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3865
3866 WARN_ON(start > eb->len);
3867 WARN_ON(start + len > eb->start + eb->len);
3868
3869 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3870
d397712b 3871 while (len > 0) {
d1310b2e
CM
3872 page = extent_buffer_page(eb, i);
3873 WARN_ON(!PageUptodate(page));
3874
3875 cur = min(len, PAGE_CACHE_SIZE - offset);
3876 kaddr = kmap_atomic(page, KM_USER0);
3877 memset(kaddr + offset, c, cur);
3878 kunmap_atomic(kaddr, KM_USER0);
3879
3880 len -= cur;
3881 offset = 0;
3882 i++;
3883 }
3884}
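/*
 * Editor's example (sketch, not from the original source):
 * memset_extent_buffer() fills a range of the buffer with a single byte
 * value, walking it page by page; a natural use is zeroing freshly
 * exposed space.  new_start and new_len are placeholder names:
 *
 *	memset_extent_buffer(eb, 0, new_start, new_len);
 */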
d1310b2e
CM
3885
3886void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3887 unsigned long dst_offset, unsigned long src_offset,
3888 unsigned long len)
3889{
3890 u64 dst_len = dst->len;
3891 size_t cur;
3892 size_t offset;
3893 struct page *page;
3894 char *kaddr;
3895 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3896 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3897
3898 WARN_ON(src->len != dst_len);
3899
3900 offset = (start_offset + dst_offset) &
3901 ((unsigned long)PAGE_CACHE_SIZE - 1);
3902
d397712b 3903 while (len > 0) {
d1310b2e
CM
3904 page = extent_buffer_page(dst, i);
3905 WARN_ON(!PageUptodate(page));
3906
3907 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3908
3909 kaddr = kmap_atomic(page, KM_USER0);
3910 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3911 kunmap_atomic(kaddr, KM_USER0);
3912
3913 src_offset += cur;
3914 len -= cur;
3915 offset = 0;
3916 i++;
3917 }
3918}
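/*
 * Editor's example (sketch, not from the original source):
 * copy_extent_buffer() copies a range from one extent buffer into
 * another; it reads the source through read_extent_buffer(), so the two
 * buffers may be split across pages differently, but they are expected to
 * have the same len (see the WARN_ON above) and the destination pages
 * must be uptodate.  All names below are placeholders:
 *
 *	copy_extent_buffer(dst_eb, src_eb, dst_off, src_off, nr_bytes);
 */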
d1310b2e
CM
3919
3920static void move_pages(struct page *dst_page, struct page *src_page,
3921 unsigned long dst_off, unsigned long src_off,
3922 unsigned long len)
3923{
3924 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3925 if (dst_page == src_page) {
3926 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3927 } else {
3928 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3929 char *p = dst_kaddr + dst_off + len;
3930 char *s = src_kaddr + src_off + len;
3931
3932 while (len--)
3933 *--p = *--s;
3934
3935 kunmap_atomic(src_kaddr, KM_USER1);
3936 }
3937 kunmap_atomic(dst_kaddr, KM_USER0);
3938}
3939
3387206f
ST
3940static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3941{
3942 unsigned long distance = (src > dst) ? src - dst : dst - src;
3943 return distance < len;
3944}
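/*
 * Editor's note (worked example, not from the original source): two
 * ranges of length len starting at src and dst overlap exactly when the
 * distance between their starts is smaller than len.  For src = 100,
 * dst = 140 and len = 50 the distance is 40 < 50, so [100,149] and
 * [140,189] overlap; with len = 30 the distance 40 is not < 30 and
 * [100,129] and [140,169] are disjoint.
 */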
3945
d1310b2e
CM
3946static void copy_pages(struct page *dst_page, struct page *src_page,
3947 unsigned long dst_off, unsigned long src_off,
3948 unsigned long len)
3949{
3950 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3951 char *src_kaddr;
3952
3387206f 3953 if (dst_page != src_page) {
d1310b2e 3954 src_kaddr = kmap_atomic(src_page, KM_USER1);
3387206f 3955 } else {
d1310b2e 3956 src_kaddr = dst_kaddr;
3387206f
ST
3957 BUG_ON(areas_overlap(src_off, dst_off, len));
3958 }
d1310b2e
CM
3959
3960 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3961 kunmap_atomic(dst_kaddr, KM_USER0);
3962 if (dst_page != src_page)
3963 kunmap_atomic(src_kaddr, KM_USER1);
3964}
3965
3966void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3967 unsigned long src_offset, unsigned long len)
3968{
3969 size_t cur;
3970 size_t dst_off_in_page;
3971 size_t src_off_in_page;
3972 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3973 unsigned long dst_i;
3974 unsigned long src_i;
3975
3976 if (src_offset + len > dst->len) {
d397712b
CM
3977 printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
3978 "len %lu dst len %lu\n", src_offset, len, dst->len);
d1310b2e
CM
3979 BUG_ON(1);
3980 }
3981 if (dst_offset + len > dst->len) {
d397712b
CM
3982 printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
3983 "len %lu dst len %lu\n", dst_offset, len, dst->len);
d1310b2e
CM
3984 BUG_ON(1);
3985 }
3986
d397712b 3987 while (len > 0) {
d1310b2e
CM
3988 dst_off_in_page = (start_offset + dst_offset) &
3989 ((unsigned long)PAGE_CACHE_SIZE - 1);
3990 src_off_in_page = (start_offset + src_offset) &
3991 ((unsigned long)PAGE_CACHE_SIZE - 1);
3992
3993 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3994 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3995
3996 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3997 src_off_in_page));
3998 cur = min_t(unsigned long, cur,
3999 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4000
4001 copy_pages(extent_buffer_page(dst, dst_i),
4002 extent_buffer_page(dst, src_i),
4003 dst_off_in_page, src_off_in_page, cur);
4004
4005 src_offset += cur;
4006 dst_offset += cur;
4007 len -= cur;
4008 }
4009}
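/*
 * Editor's example (sketch, not from the original source):
 * memcpy_extent_buffer() moves bytes within a single extent buffer and
 * assumes the source and destination ranges do not overlap inside one
 * page (copy_pages() BUG()s on a same-page overlap); overlapping moves
 * should use memmove_extent_buffer() instead.  The names below are
 * placeholders:
 *
 *	memcpy_extent_buffer(leaf, dst_off, src_off, nr_bytes);
 */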
d1310b2e
CM
4010
4011void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4012 unsigned long src_offset, unsigned long len)
4013{
4014 size_t cur;
4015 size_t dst_off_in_page;
4016 size_t src_off_in_page;
4017 unsigned long dst_end = dst_offset + len - 1;
4018 unsigned long src_end = src_offset + len - 1;
4019 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4020 unsigned long dst_i;
4021 unsigned long src_i;
4022
4023 if (src_offset + len > dst->len) {
d397712b
CM
4024 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4025 "len %lu dst len %lu\n", src_offset, len, dst->len);
d1310b2e
CM
4026 BUG_ON(1);
4027 }
4028 if (dst_offset + len > dst->len) {
d397712b
CM
4029 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4030 "len %lu dst len %lu\n", dst_offset, len, dst->len);
d1310b2e
CM
4031 BUG_ON(1);
4032 }
3387206f 4033 if (!areas_overlap(src_offset, dst_offset, len)) {
d1310b2e
CM
4034 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4035 return;
4036 }
d397712b 4037 while (len > 0) {
d1310b2e
CM
4038 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4039 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4040
4041 dst_off_in_page = (start_offset + dst_end) &
4042 ((unsigned long)PAGE_CACHE_SIZE - 1);
4043 src_off_in_page = (start_offset + src_end) &
4044 ((unsigned long)PAGE_CACHE_SIZE - 1);
4045
4046 cur = min_t(unsigned long, len, src_off_in_page + 1);
4047 cur = min(cur, dst_off_in_page + 1);
4048 move_pages(extent_buffer_page(dst, dst_i),
4049 extent_buffer_page(dst, src_i),
4050 dst_off_in_page - cur + 1,
4051 src_off_in_page - cur + 1, cur);
4052
4053 dst_end -= cur;
4054 src_end -= cur;
4055 len -= cur;
4056 }
4057}
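/*
 * Editor's example (sketch, not from the original source):
 * memmove_extent_buffer() handles overlapping ranges by copying from the
 * end of the range backwards through move_pages(); when areas_overlap()
 * says the ranges are disjoint it simply delegates to
 * memcpy_extent_buffer().  A call that shifts a block of existing bytes
 * by ins_len, e.g. to make room for an insertion, might look like this
 * (all names are placeholders):
 *
 *	memmove_extent_buffer(leaf, data_off + ins_len, data_off,
 *			      old_data_len);
 */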
6af118ce 4058
19fe0a8b
MX
4059static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4060{
4061 struct extent_buffer *eb =
4062 container_of(head, struct extent_buffer, rcu_head);
4063
4064 btrfs_release_extent_buffer(eb);
4065}
4066
6af118ce
CM
4067int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4068{
4069 u64 start = page_offset(page);
4070 struct extent_buffer *eb;
4071 int ret = 1;
6af118ce
CM
4072
4073 spin_lock(&tree->buffer_lock);
19fe0a8b 4074 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
45f49bce
CM
4075 if (!eb) {
4076 spin_unlock(&tree->buffer_lock);
4077 return ret;
4078 }
6af118ce 4079
19fe0a8b 4080 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
6af118ce
CM
4081 ret = 0;
4082 goto out;
4083 }
19fe0a8b
MX
4084
4085 /*
4086 * Set @eb->refs to 0 if it is currently 1, i.e. we hold the only
4087 * reference, and then release the @eb.  Otherwise back off.
4088 */
4089 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
b9473439
CM
4090 ret = 0;
4091 goto out;
4092 }
897ca6e9 4093
19fe0a8b 4094 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
6af118ce
CM
4095out:
4096 spin_unlock(&tree->buffer_lock);
19fe0a8b
MX
4097
4098 /* at this point we can safely release the extent buffer */
4099 if (atomic_read(&eb->refs) == 0)
4100 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
6af118ce
CM
4101 return ret;
4102}
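/*
 * Editor's example (sketch, not from the original source):
 * try_release_extent_buffer() is aimed at the releasepage path.  It
 * returns 0 and keeps the buffer when the dirty bit is set or when more
 * than one reference is held; otherwise it removes the buffer from the
 * tree's radix tree and frees it after an RCU grace period, returning 1.
 * A hypothetical caller shaped like a releasepage helper (not a function
 * in this file) could be:
 *
 *	static int example_releasepage(struct extent_io_tree *tree,
 *				       struct page *page)
 *	{
 *		if (PageWriteback(page) || PageDirty(page))
 *			return 0;
 *		return try_release_extent_buffer(tree, page);
 *	}
 */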