Btrfs: process all async extents on compressed write failure
[linux-2.6-block.git] / fs / btrfs / inode.c
CommitLineData
6cbd5570
CM
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
8f18cf13 19#include <linux/kernel.h>
065631f6 20#include <linux/bio.h>
39279cc3 21#include <linux/buffer_head.h>
f2eb0a24 22#include <linux/file.h>
39279cc3
CM
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
39279cc3
CM
29#include <linux/backing-dev.h>
30#include <linux/mpage.h>
31#include <linux/swap.h>
32#include <linux/writeback.h>
33#include <linux/statfs.h>
34#include <linux/compat.h>
a27bb332 35#include <linux/aio.h>
9ebefb18 36#include <linux/bit_spinlock.h>
5103e947 37#include <linux/xattr.h>
33268eaf 38#include <linux/posix_acl.h>
d899e052 39#include <linux/falloc.h>
5a0e3ad6 40#include <linux/slab.h>
7a36ddec 41#include <linux/ratelimit.h>
22c44fe6 42#include <linux/mount.h>
55e301fd 43#include <linux/btrfs.h>
53b381b3 44#include <linux/blkdev.h>
f23b5a59 45#include <linux/posix_acl_xattr.h>
39279cc3
CM
46#include "ctree.h"
47#include "disk-io.h"
48#include "transaction.h"
49#include "btrfs_inode.h"
39279cc3 50#include "print-tree.h"
e6dcd2dc 51#include "ordered-data.h"
95819c05 52#include "xattr.h"
e02119d5 53#include "tree-log.h"
4a54c8c1 54#include "volumes.h"
c8b97818 55#include "compression.h"
b4ce94de 56#include "locking.h"
dc89e982 57#include "free-space-cache.h"
581bb050 58#include "inode-map.h"
38c227d8 59#include "backref.h"
f23b5a59 60#include "hash.h"
63541927 61#include "props.h"
39279cc3
CM
62
63struct btrfs_iget_args {
90d3e592 64 struct btrfs_key *location;
39279cc3
CM
65 struct btrfs_root *root;
66};
67
6e1d5dcc
AD
68static const struct inode_operations btrfs_dir_inode_operations;
69static const struct inode_operations btrfs_symlink_inode_operations;
70static const struct inode_operations btrfs_dir_ro_inode_operations;
71static const struct inode_operations btrfs_special_inode_operations;
72static const struct inode_operations btrfs_file_inode_operations;
7f09410b
AD
73static const struct address_space_operations btrfs_aops;
74static const struct address_space_operations btrfs_symlink_aops;
828c0950 75static const struct file_operations btrfs_dir_file_operations;
d1310b2e 76static struct extent_io_ops btrfs_extent_io_ops;
39279cc3
CM
77
78static struct kmem_cache *btrfs_inode_cachep;
8ccf6f19 79static struct kmem_cache *btrfs_delalloc_work_cachep;
39279cc3
CM
80struct kmem_cache *btrfs_trans_handle_cachep;
81struct kmem_cache *btrfs_transaction_cachep;
39279cc3 82struct kmem_cache *btrfs_path_cachep;
dc89e982 83struct kmem_cache *btrfs_free_space_cachep;
39279cc3
CM
84
85#define S_SHIFT 12
86static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
87 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
88 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
89 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
90 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
91 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
92 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
93 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
94};
95
3972f260 96static int btrfs_setsize(struct inode *inode, struct iattr *attr);
a41ad394 97static int btrfs_truncate(struct inode *inode);
5fd02043 98static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
771ed689
CM
99static noinline int cow_file_range(struct inode *inode,
100 struct page *locked_page,
101 u64 start, u64 end, int *page_started,
102 unsigned long *nr_written, int unlock);
70c8a91c
JB
103static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
104 u64 len, u64 orig_start,
105 u64 block_start, u64 block_len,
cc95bef6
JB
106 u64 orig_block_len, u64 ram_bytes,
107 int type);
7b128766 108
48a3b636 109static int btrfs_dirty_inode(struct inode *inode);
7b128766 110
f34f57a3 111static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
2a7dba39
EP
112 struct inode *inode, struct inode *dir,
113 const struct qstr *qstr)
0279b4cd
JO
114{
115 int err;
116
f34f57a3 117 err = btrfs_init_acl(trans, inode, dir);
0279b4cd 118 if (!err)
2a7dba39 119 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
0279b4cd
JO
120 return err;
121}
122
c8b97818
CM
123/*
124 * this does all the hard work for inserting an inline extent into
125 * the btree. The caller should have done a btrfs_drop_extents so that
126 * no overlapping inline items exist in the btree
127 */
40f76580 128static int insert_inline_extent(struct btrfs_trans_handle *trans,
1acae57b 129 struct btrfs_path *path, int extent_inserted,
c8b97818
CM
130 struct btrfs_root *root, struct inode *inode,
131 u64 start, size_t size, size_t compressed_size,
fe3f566c 132 int compress_type,
c8b97818
CM
133 struct page **compressed_pages)
134{
c8b97818
CM
135 struct extent_buffer *leaf;
136 struct page *page = NULL;
137 char *kaddr;
138 unsigned long ptr;
139 struct btrfs_file_extent_item *ei;
140 int err = 0;
141 int ret;
142 size_t cur_size = size;
c8b97818 143 unsigned long offset;
c8b97818 144
fe3f566c 145 if (compressed_size && compressed_pages)
c8b97818 146 cur_size = compressed_size;
c8b97818 147
1acae57b 148 inode_add_bytes(inode, size);
c8b97818 149
1acae57b
FDBM
150 if (!extent_inserted) {
151 struct btrfs_key key;
152 size_t datasize;
c8b97818 153
1acae57b
FDBM
154 key.objectid = btrfs_ino(inode);
155 key.offset = start;
962a298f 156 key.type = BTRFS_EXTENT_DATA_KEY;
c8b97818 157
1acae57b
FDBM
158 datasize = btrfs_file_extent_calc_inline_size(cur_size);
159 path->leave_spinning = 1;
160 ret = btrfs_insert_empty_item(trans, root, path, &key,
161 datasize);
162 if (ret) {
163 err = ret;
164 goto fail;
165 }
c8b97818
CM
166 }
167 leaf = path->nodes[0];
168 ei = btrfs_item_ptr(leaf, path->slots[0],
169 struct btrfs_file_extent_item);
170 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
171 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
172 btrfs_set_file_extent_encryption(leaf, ei, 0);
173 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
174 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
175 ptr = btrfs_file_extent_inline_start(ei);
176
261507a0 177 if (compress_type != BTRFS_COMPRESS_NONE) {
c8b97818
CM
178 struct page *cpage;
179 int i = 0;
d397712b 180 while (compressed_size > 0) {
c8b97818 181 cpage = compressed_pages[i];
5b050f04 182 cur_size = min_t(unsigned long, compressed_size,
c8b97818
CM
183 PAGE_CACHE_SIZE);
184
7ac687d9 185 kaddr = kmap_atomic(cpage);
c8b97818 186 write_extent_buffer(leaf, kaddr, ptr, cur_size);
7ac687d9 187 kunmap_atomic(kaddr);
c8b97818
CM
188
189 i++;
190 ptr += cur_size;
191 compressed_size -= cur_size;
192 }
193 btrfs_set_file_extent_compression(leaf, ei,
261507a0 194 compress_type);
c8b97818
CM
195 } else {
196 page = find_get_page(inode->i_mapping,
197 start >> PAGE_CACHE_SHIFT);
198 btrfs_set_file_extent_compression(leaf, ei, 0);
7ac687d9 199 kaddr = kmap_atomic(page);
c8b97818
CM
200 offset = start & (PAGE_CACHE_SIZE - 1);
201 write_extent_buffer(leaf, kaddr + offset, ptr, size);
7ac687d9 202 kunmap_atomic(kaddr);
c8b97818
CM
203 page_cache_release(page);
204 }
205 btrfs_mark_buffer_dirty(leaf);
1acae57b 206 btrfs_release_path(path);
c8b97818 207
c2167754
YZ
208 /*
209 * we're an inline extent, so nobody can
210 * extend the file past i_size without locking
211 * a page we already have locked.
212 *
213 * We must do any isize and inode updates
214 * before we unlock the pages. Otherwise we
215 * could end up racing with unlink.
216 */
c8b97818 217 BTRFS_I(inode)->disk_i_size = inode->i_size;
79787eaa 218 ret = btrfs_update_inode(trans, root, inode);
c2167754 219
79787eaa 220 return ret;
c8b97818 221fail:
c8b97818
CM
222 return err;
223}
224
225
226/*
227 * conditionally insert an inline extent into the file. This
228 * does the checks required to make sure the data is small enough
229 * to fit as an inline extent.
230 */
00361589
JB
231static noinline int cow_file_range_inline(struct btrfs_root *root,
232 struct inode *inode, u64 start,
233 u64 end, size_t compressed_size,
234 int compress_type,
235 struct page **compressed_pages)
c8b97818 236{
00361589 237 struct btrfs_trans_handle *trans;
c8b97818
CM
238 u64 isize = i_size_read(inode);
239 u64 actual_end = min(end + 1, isize);
240 u64 inline_len = actual_end - start;
fda2832f 241 u64 aligned_end = ALIGN(end, root->sectorsize);
c8b97818
CM
242 u64 data_len = inline_len;
243 int ret;
1acae57b
FDBM
244 struct btrfs_path *path;
245 int extent_inserted = 0;
246 u32 extent_item_size;
c8b97818
CM
247
248 if (compressed_size)
249 data_len = compressed_size;
250
251 if (start > 0 ||
354877be
WS
252 actual_end > PAGE_CACHE_SIZE ||
253 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
c8b97818
CM
254 (!compressed_size &&
255 (actual_end & (root->sectorsize - 1)) == 0) ||
256 end + 1 < isize ||
257 data_len > root->fs_info->max_inline) {
258 return 1;
259 }
260
1acae57b
FDBM
261 path = btrfs_alloc_path();
262 if (!path)
263 return -ENOMEM;
264
00361589 265 trans = btrfs_join_transaction(root);
1acae57b
FDBM
266 if (IS_ERR(trans)) {
267 btrfs_free_path(path);
00361589 268 return PTR_ERR(trans);
1acae57b 269 }
00361589
JB
270 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
271
1acae57b
FDBM
272 if (compressed_size && compressed_pages)
273 extent_item_size = btrfs_file_extent_calc_inline_size(
274 compressed_size);
275 else
276 extent_item_size = btrfs_file_extent_calc_inline_size(
277 inline_len);
278
279 ret = __btrfs_drop_extents(trans, root, inode, path,
280 start, aligned_end, NULL,
281 1, 1, extent_item_size, &extent_inserted);
00361589
JB
282 if (ret) {
283 btrfs_abort_transaction(trans, root, ret);
284 goto out;
285 }
c8b97818
CM
286
287 if (isize > actual_end)
288 inline_len = min_t(u64, isize, actual_end);
1acae57b
FDBM
289 ret = insert_inline_extent(trans, path, extent_inserted,
290 root, inode, start,
c8b97818 291 inline_len, compressed_size,
fe3f566c 292 compress_type, compressed_pages);
2adcac1a 293 if (ret && ret != -ENOSPC) {
79787eaa 294 btrfs_abort_transaction(trans, root, ret);
00361589 295 goto out;
2adcac1a 296 } else if (ret == -ENOSPC) {
00361589
JB
297 ret = 1;
298 goto out;
79787eaa 299 }
2adcac1a 300
bdc20e67 301 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
0ca1f7ce 302 btrfs_delalloc_release_metadata(inode, end + 1 - start);
a1ed835e 303 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
00361589 304out:
1acae57b 305 btrfs_free_path(path);
00361589
JB
306 btrfs_end_transaction(trans, root);
307 return ret;
c8b97818
CM
308}
309
771ed689
CM
310struct async_extent {
311 u64 start;
312 u64 ram_size;
313 u64 compressed_size;
314 struct page **pages;
315 unsigned long nr_pages;
261507a0 316 int compress_type;
771ed689
CM
317 struct list_head list;
318};
319
320struct async_cow {
321 struct inode *inode;
322 struct btrfs_root *root;
323 struct page *locked_page;
324 u64 start;
325 u64 end;
326 struct list_head extents;
327 struct btrfs_work work;
328};
329
330static noinline int add_async_extent(struct async_cow *cow,
331 u64 start, u64 ram_size,
332 u64 compressed_size,
333 struct page **pages,
261507a0
LZ
334 unsigned long nr_pages,
335 int compress_type)
771ed689
CM
336{
337 struct async_extent *async_extent;
338
339 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
79787eaa 340 BUG_ON(!async_extent); /* -ENOMEM */
771ed689
CM
341 async_extent->start = start;
342 async_extent->ram_size = ram_size;
343 async_extent->compressed_size = compressed_size;
344 async_extent->pages = pages;
345 async_extent->nr_pages = nr_pages;
261507a0 346 async_extent->compress_type = compress_type;
771ed689
CM
347 list_add_tail(&async_extent->list, &cow->extents);
348 return 0;
349}
350
f79707b0
WS
351static inline int inode_need_compress(struct inode *inode)
352{
353 struct btrfs_root *root = BTRFS_I(inode)->root;
354
355 /* force compress */
356 if (btrfs_test_opt(root, FORCE_COMPRESS))
357 return 1;
358 /* bad compression ratios */
359 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
360 return 0;
361 if (btrfs_test_opt(root, COMPRESS) ||
362 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
363 BTRFS_I(inode)->force_compress)
364 return 1;
365 return 0;
366}
367
d352ac68 368/*
771ed689
CM
369 * we create compressed extents in two phases. The first
370 * phase compresses a range of pages that have already been
371 * locked (both pages and state bits are locked).
c8b97818 372 *
771ed689
CM
373 * This is done inside an ordered work queue, and the compression
374 * is spread across many cpus. The actual IO submission is step
375 * two, and the ordered work queue takes care of making sure that
376 * happens in the same order things were put onto the queue by
377 * writepages and friends.
c8b97818 378 *
771ed689
CM
379 * If this code finds it can't get good compression, it puts an
380 * entry onto the work queue to write the uncompressed bytes. This
381 * makes sure that both compressed inodes and uncompressed inodes
b2570314
AB
382 * are written in the same order that the flusher thread sent them
383 * down.
d352ac68 384 */
771ed689
CM
385static noinline int compress_file_range(struct inode *inode,
386 struct page *locked_page,
387 u64 start, u64 end,
388 struct async_cow *async_cow,
389 int *num_added)
b888db2b
CM
390{
391 struct btrfs_root *root = BTRFS_I(inode)->root;
db94535d 392 u64 num_bytes;
db94535d 393 u64 blocksize = root->sectorsize;
c8b97818 394 u64 actual_end;
42dc7bab 395 u64 isize = i_size_read(inode);
e6dcd2dc 396 int ret = 0;
c8b97818
CM
397 struct page **pages = NULL;
398 unsigned long nr_pages;
399 unsigned long nr_pages_ret = 0;
400 unsigned long total_compressed = 0;
401 unsigned long total_in = 0;
402 unsigned long max_compressed = 128 * 1024;
771ed689 403 unsigned long max_uncompressed = 128 * 1024;
c8b97818
CM
404 int i;
405 int will_compress;
261507a0 406 int compress_type = root->fs_info->compress_type;
4adaa611 407 int redirty = 0;
b888db2b 408
4cb13e5d
LB
409 /* if this is a small write inside eof, kick off a defrag */
410 if ((end - start + 1) < 16 * 1024 &&
411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
4cb5300b
CM
412 btrfs_add_inode_defrag(NULL, inode);
413
68bb462d
WS
414 /*
415 * skip compression for a small file range(<=blocksize) that
416 * isn't an inline extent, since it dosen't save disk space at all.
417 */
418 if ((end - start + 1) <= blocksize &&
419 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
420 goto cleanup_and_bail_uncompressed;
421
42dc7bab 422 actual_end = min_t(u64, isize, end + 1);
c8b97818
CM
423again:
424 will_compress = 0;
425 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
426 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
be20aa9d 427
f03d9301
CM
428 /*
429 * we don't want to send crud past the end of i_size through
430 * compression, that's just a waste of CPU time. So, if the
431 * end of the file is before the start of our current
432 * requested range of bytes, we bail out to the uncompressed
433 * cleanup code that can deal with all of this.
434 *
435 * It isn't really the fastest way to fix things, but this is a
436 * very uncommon corner.
437 */
438 if (actual_end <= start)
439 goto cleanup_and_bail_uncompressed;
440
c8b97818
CM
441 total_compressed = actual_end - start;
442
443 /* we want to make sure that amount of ram required to uncompress
444 * an extent is reasonable, so we limit the total size in ram
771ed689
CM
445 * of a compressed extent to 128k. This is a crucial number
446 * because it also controls how easily we can spread reads across
447 * cpus for decompression.
448 *
449 * We also want to make sure the amount of IO required to do
450 * a random read is reasonably small, so we limit the size of
451 * a compressed extent to 128k.
c8b97818
CM
452 */
453 total_compressed = min(total_compressed, max_uncompressed);
fda2832f 454 num_bytes = ALIGN(end - start + 1, blocksize);
be20aa9d 455 num_bytes = max(blocksize, num_bytes);
c8b97818
CM
456 total_in = 0;
457 ret = 0;
db94535d 458
771ed689
CM
459 /*
460 * we do compression for mount -o compress and when the
461 * inode has not been flagged as nocompress. This flag can
462 * change at any time if we discover bad compression ratios.
c8b97818 463 */
f79707b0 464 if (inode_need_compress(inode)) {
c8b97818 465 WARN_ON(pages);
cfbc246e 466 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
560f7d75
LZ
467 if (!pages) {
468 /* just bail out to the uncompressed code */
469 goto cont;
470 }
c8b97818 471
261507a0
LZ
472 if (BTRFS_I(inode)->force_compress)
473 compress_type = BTRFS_I(inode)->force_compress;
474
4adaa611
CM
475 /*
476 * we need to call clear_page_dirty_for_io on each
477 * page in the range. Otherwise applications with the file
478 * mmap'd can wander in and change the page contents while
479 * we are compressing them.
480 *
481 * If the compression fails for any reason, we set the pages
482 * dirty again later on.
483 */
484 extent_range_clear_dirty_for_io(inode, start, end);
485 redirty = 1;
261507a0
LZ
486 ret = btrfs_compress_pages(compress_type,
487 inode->i_mapping, start,
488 total_compressed, pages,
489 nr_pages, &nr_pages_ret,
490 &total_in,
491 &total_compressed,
492 max_compressed);
c8b97818
CM
493
494 if (!ret) {
495 unsigned long offset = total_compressed &
496 (PAGE_CACHE_SIZE - 1);
497 struct page *page = pages[nr_pages_ret - 1];
498 char *kaddr;
499
500 /* zero the tail end of the last page, we might be
501 * sending it down to disk
502 */
503 if (offset) {
7ac687d9 504 kaddr = kmap_atomic(page);
c8b97818
CM
505 memset(kaddr + offset, 0,
506 PAGE_CACHE_SIZE - offset);
7ac687d9 507 kunmap_atomic(kaddr);
c8b97818
CM
508 }
509 will_compress = 1;
510 }
511 }
560f7d75 512cont:
c8b97818
CM
513 if (start == 0) {
514 /* lets try to make an inline extent */
771ed689 515 if (ret || total_in < (actual_end - start)) {
c8b97818 516 /* we didn't compress the entire range, try
771ed689 517 * to make an uncompressed inline extent.
c8b97818 518 */
00361589
JB
519 ret = cow_file_range_inline(root, inode, start, end,
520 0, 0, NULL);
c8b97818 521 } else {
771ed689 522 /* try making a compressed inline extent */
00361589 523 ret = cow_file_range_inline(root, inode, start, end,
fe3f566c
LZ
524 total_compressed,
525 compress_type, pages);
c8b97818 526 }
79787eaa 527 if (ret <= 0) {
151a41bc
JB
528 unsigned long clear_flags = EXTENT_DELALLOC |
529 EXTENT_DEFRAG;
530 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
531
771ed689 532 /*
79787eaa
JM
533 * inline extent creation worked or returned error,
534 * we don't need to create any more async work items.
535 * Unlock and free up our temp pages.
771ed689 536 */
c2790a2e 537 extent_clear_unlock_delalloc(inode, start, end, NULL,
151a41bc 538 clear_flags, PAGE_UNLOCK |
c2790a2e
JB
539 PAGE_CLEAR_DIRTY |
540 PAGE_SET_WRITEBACK |
541 PAGE_END_WRITEBACK);
c8b97818
CM
542 goto free_pages_out;
543 }
544 }
545
546 if (will_compress) {
547 /*
548 * we aren't doing an inline extent round the compressed size
549 * up to a block size boundary so the allocator does sane
550 * things
551 */
fda2832f 552 total_compressed = ALIGN(total_compressed, blocksize);
c8b97818
CM
553
554 /*
555 * one last check to make sure the compression is really a
556 * win, compare the page count read with the blocks on disk
557 */
fda2832f 558 total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
c8b97818
CM
559 if (total_compressed >= total_in) {
560 will_compress = 0;
561 } else {
c8b97818
CM
562 num_bytes = total_in;
563 }
564 }
565 if (!will_compress && pages) {
566 /*
567 * the compression code ran but failed to make things smaller,
568 * free any pages it allocated and our page pointer array
569 */
570 for (i = 0; i < nr_pages_ret; i++) {
70b99e69 571 WARN_ON(pages[i]->mapping);
c8b97818
CM
572 page_cache_release(pages[i]);
573 }
574 kfree(pages);
575 pages = NULL;
576 total_compressed = 0;
577 nr_pages_ret = 0;
578
579 /* flag the file so we don't compress in the future */
1e701a32
CM
580 if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
581 !(BTRFS_I(inode)->force_compress)) {
a555f810 582 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
1e701a32 583 }
c8b97818 584 }
771ed689
CM
585 if (will_compress) {
586 *num_added += 1;
c8b97818 587
771ed689
CM
588 /* the async work queues will take care of doing actual
589 * allocation on disk for these compressed pages,
590 * and will submit them to the elevator.
591 */
592 add_async_extent(async_cow, start, num_bytes,
261507a0
LZ
593 total_compressed, pages, nr_pages_ret,
594 compress_type);
179e29e4 595
24ae6365 596 if (start + num_bytes < end) {
771ed689
CM
597 start += num_bytes;
598 pages = NULL;
599 cond_resched();
600 goto again;
601 }
602 } else {
f03d9301 603cleanup_and_bail_uncompressed:
771ed689
CM
604 /*
605 * No compression, but we still need to write the pages in
606 * the file we've been given so far. redirty the locked
607 * page if it corresponds to our extent and set things up
608 * for the async work queue to run cow_file_range to do
609 * the normal delalloc dance
610 */
611 if (page_offset(locked_page) >= start &&
612 page_offset(locked_page) <= end) {
613 __set_page_dirty_nobuffers(locked_page);
614 /* unlocked later on in the async handlers */
615 }
4adaa611
CM
616 if (redirty)
617 extent_range_redirty_for_io(inode, start, end);
261507a0
LZ
618 add_async_extent(async_cow, start, end - start + 1,
619 0, NULL, 0, BTRFS_COMPRESS_NONE);
771ed689
CM
620 *num_added += 1;
621 }
3b951516 622
771ed689 623out:
79787eaa 624 return ret;
771ed689
CM
625
626free_pages_out:
627 for (i = 0; i < nr_pages_ret; i++) {
628 WARN_ON(pages[i]->mapping);
629 page_cache_release(pages[i]);
630 }
d397712b 631 kfree(pages);
771ed689
CM
632
633 goto out;
634}
635
40ae837b
FM
636static void free_async_extent_pages(struct async_extent *async_extent)
637{
638 int i;
639
640 if (!async_extent->pages)
641 return;
642
643 for (i = 0; i < async_extent->nr_pages; i++) {
644 WARN_ON(async_extent->pages[i]->mapping);
645 page_cache_release(async_extent->pages[i]);
646 }
647 kfree(async_extent->pages);
648 async_extent->nr_pages = 0;
649 async_extent->pages = NULL;
650}
651
771ed689
CM
652/*
653 * phase two of compressed writeback. This is the ordered portion
654 * of the code, which only gets called in the order the work was
655 * queued. We walk all the async extents created by compress_file_range
656 * and send them down to the disk.
657 */
658static noinline int submit_compressed_extents(struct inode *inode,
659 struct async_cow *async_cow)
660{
661 struct async_extent *async_extent;
662 u64 alloc_hint = 0;
771ed689
CM
663 struct btrfs_key ins;
664 struct extent_map *em;
665 struct btrfs_root *root = BTRFS_I(inode)->root;
666 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
667 struct extent_io_tree *io_tree;
f5a84ee3 668 int ret = 0;
771ed689
CM
669
670 if (list_empty(&async_cow->extents))
671 return 0;
672
3e04e7f1 673again:
d397712b 674 while (!list_empty(&async_cow->extents)) {
771ed689
CM
675 async_extent = list_entry(async_cow->extents.next,
676 struct async_extent, list);
677 list_del(&async_extent->list);
c8b97818 678
771ed689
CM
679 io_tree = &BTRFS_I(inode)->io_tree;
680
f5a84ee3 681retry:
771ed689
CM
682 /* did the compression code fall back to uncompressed IO? */
683 if (!async_extent->pages) {
684 int page_started = 0;
685 unsigned long nr_written = 0;
686
687 lock_extent(io_tree, async_extent->start,
2ac55d41 688 async_extent->start +
d0082371 689 async_extent->ram_size - 1);
771ed689
CM
690
691 /* allocate blocks */
f5a84ee3
JB
692 ret = cow_file_range(inode, async_cow->locked_page,
693 async_extent->start,
694 async_extent->start +
695 async_extent->ram_size - 1,
696 &page_started, &nr_written, 0);
771ed689 697
79787eaa
JM
698 /* JDM XXX */
699
771ed689
CM
700 /*
701 * if page_started, cow_file_range inserted an
702 * inline extent and took care of all the unlocking
703 * and IO for us. Otherwise, we need to submit
704 * all those pages down to the drive.
705 */
f5a84ee3 706 if (!page_started && !ret)
771ed689
CM
707 extent_write_locked_range(io_tree,
708 inode, async_extent->start,
d397712b 709 async_extent->start +
771ed689
CM
710 async_extent->ram_size - 1,
711 btrfs_get_extent,
712 WB_SYNC_ALL);
3e04e7f1
JB
713 else if (ret)
714 unlock_page(async_cow->locked_page);
771ed689
CM
715 kfree(async_extent);
716 cond_resched();
717 continue;
718 }
719
720 lock_extent(io_tree, async_extent->start,
d0082371 721 async_extent->start + async_extent->ram_size - 1);
771ed689 722
00361589 723 ret = btrfs_reserve_extent(root,
771ed689
CM
724 async_extent->compressed_size,
725 async_extent->compressed_size,
e570fd27 726 0, alloc_hint, &ins, 1, 1);
f5a84ee3 727 if (ret) {
40ae837b 728 free_async_extent_pages(async_extent);
3e04e7f1 729
fdf8e2ea
JB
730 if (ret == -ENOSPC) {
731 unlock_extent(io_tree, async_extent->start,
732 async_extent->start +
733 async_extent->ram_size - 1);
ce62003f
LB
734
735 /*
736 * we need to redirty the pages if we decide to
737 * fallback to uncompressed IO, otherwise we
738 * will not submit these pages down to lower
739 * layers.
740 */
741 extent_range_redirty_for_io(inode,
742 async_extent->start,
743 async_extent->start +
744 async_extent->ram_size - 1);
745
79787eaa 746 goto retry;
fdf8e2ea 747 }
3e04e7f1 748 goto out_free;
f5a84ee3
JB
749 }
750
c2167754
YZ
751 /*
752 * here we're doing allocation and writeback of the
753 * compressed pages
754 */
755 btrfs_drop_extent_cache(inode, async_extent->start,
756 async_extent->start +
757 async_extent->ram_size - 1, 0);
758
172ddd60 759 em = alloc_extent_map();
b9aa55be
LB
760 if (!em) {
761 ret = -ENOMEM;
3e04e7f1 762 goto out_free_reserve;
b9aa55be 763 }
771ed689
CM
764 em->start = async_extent->start;
765 em->len = async_extent->ram_size;
445a6944 766 em->orig_start = em->start;
2ab28f32
JB
767 em->mod_start = em->start;
768 em->mod_len = em->len;
c8b97818 769
771ed689
CM
770 em->block_start = ins.objectid;
771 em->block_len = ins.offset;
b4939680 772 em->orig_block_len = ins.offset;
cc95bef6 773 em->ram_bytes = async_extent->ram_size;
771ed689 774 em->bdev = root->fs_info->fs_devices->latest_bdev;
261507a0 775 em->compress_type = async_extent->compress_type;
771ed689
CM
776 set_bit(EXTENT_FLAG_PINNED, &em->flags);
777 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
70c8a91c 778 em->generation = -1;
771ed689 779
d397712b 780 while (1) {
890871be 781 write_lock(&em_tree->lock);
09a2a8f9 782 ret = add_extent_mapping(em_tree, em, 1);
890871be 783 write_unlock(&em_tree->lock);
771ed689
CM
784 if (ret != -EEXIST) {
785 free_extent_map(em);
786 break;
787 }
788 btrfs_drop_extent_cache(inode, async_extent->start,
789 async_extent->start +
790 async_extent->ram_size - 1, 0);
791 }
792
3e04e7f1
JB
793 if (ret)
794 goto out_free_reserve;
795
261507a0
LZ
796 ret = btrfs_add_ordered_extent_compress(inode,
797 async_extent->start,
798 ins.objectid,
799 async_extent->ram_size,
800 ins.offset,
801 BTRFS_ORDERED_COMPRESSED,
802 async_extent->compress_type);
d9f85963
FM
803 if (ret) {
804 btrfs_drop_extent_cache(inode, async_extent->start,
805 async_extent->start +
806 async_extent->ram_size - 1, 0);
3e04e7f1 807 goto out_free_reserve;
d9f85963 808 }
771ed689 809
771ed689
CM
810 /*
811 * clear dirty, set writeback and unlock the pages.
812 */
c2790a2e 813 extent_clear_unlock_delalloc(inode, async_extent->start,
a791e35e
CM
814 async_extent->start +
815 async_extent->ram_size - 1,
151a41bc
JB
816 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
817 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
c2790a2e 818 PAGE_SET_WRITEBACK);
771ed689 819 ret = btrfs_submit_compressed_write(inode,
d397712b
CM
820 async_extent->start,
821 async_extent->ram_size,
822 ins.objectid,
823 ins.offset, async_extent->pages,
824 async_extent->nr_pages);
fce2a4e6
FM
825 if (ret) {
826 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
827 struct page *p = async_extent->pages[0];
828 const u64 start = async_extent->start;
829 const u64 end = start + async_extent->ram_size - 1;
830
831 p->mapping = inode->i_mapping;
832 tree->ops->writepage_end_io_hook(p, start, end,
833 NULL, 0);
834 p->mapping = NULL;
835 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
836 PAGE_END_WRITEBACK |
837 PAGE_SET_ERROR);
40ae837b 838 free_async_extent_pages(async_extent);
fce2a4e6 839 }
771ed689
CM
840 alloc_hint = ins.objectid + ins.offset;
841 kfree(async_extent);
842 cond_resched();
843 }
3d7a820f 844 return 0;
3e04e7f1 845out_free_reserve:
e570fd27 846 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
79787eaa 847out_free:
c2790a2e 848 extent_clear_unlock_delalloc(inode, async_extent->start,
3e04e7f1
JB
849 async_extent->start +
850 async_extent->ram_size - 1,
c2790a2e 851 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
151a41bc
JB
852 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
853 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
704de49d
FM
854 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
855 PAGE_SET_ERROR);
40ae837b 856 free_async_extent_pages(async_extent);
79787eaa 857 kfree(async_extent);
3e04e7f1 858 goto again;
771ed689
CM
859}
860
4b46fce2
JB
861static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
862 u64 num_bytes)
863{
864 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
865 struct extent_map *em;
866 u64 alloc_hint = 0;
867
868 read_lock(&em_tree->lock);
869 em = search_extent_mapping(em_tree, start, num_bytes);
870 if (em) {
871 /*
872 * if block start isn't an actual block number then find the
873 * first block in this inode and use that as a hint. If that
874 * block is also bogus then just don't worry about it.
875 */
876 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
877 free_extent_map(em);
878 em = search_extent_mapping(em_tree, 0, 0);
879 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
880 alloc_hint = em->block_start;
881 if (em)
882 free_extent_map(em);
883 } else {
884 alloc_hint = em->block_start;
885 free_extent_map(em);
886 }
887 }
888 read_unlock(&em_tree->lock);
889
890 return alloc_hint;
891}
892
771ed689
CM
893/*
894 * when extent_io.c finds a delayed allocation range in the file,
895 * the call backs end up in this code. The basic idea is to
896 * allocate extents on disk for the range, and create ordered data structs
897 * in ram to track those extents.
898 *
899 * locked_page is the page that writepage had locked already. We use
900 * it to make sure we don't do extra locks or unlocks.
901 *
902 * *page_started is set to one if we unlock locked_page and do everything
903 * required to start IO on it. It may be clean and already done with
904 * IO when we return.
905 */
00361589
JB
906static noinline int cow_file_range(struct inode *inode,
907 struct page *locked_page,
908 u64 start, u64 end, int *page_started,
909 unsigned long *nr_written,
910 int unlock)
771ed689 911{
00361589 912 struct btrfs_root *root = BTRFS_I(inode)->root;
771ed689
CM
913 u64 alloc_hint = 0;
914 u64 num_bytes;
915 unsigned long ram_size;
916 u64 disk_num_bytes;
917 u64 cur_alloc_size;
918 u64 blocksize = root->sectorsize;
771ed689
CM
919 struct btrfs_key ins;
920 struct extent_map *em;
921 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
922 int ret = 0;
923
02ecd2c2
JB
924 if (btrfs_is_free_space_inode(inode)) {
925 WARN_ON_ONCE(1);
29bce2f3
JB
926 ret = -EINVAL;
927 goto out_unlock;
02ecd2c2 928 }
771ed689 929
fda2832f 930 num_bytes = ALIGN(end - start + 1, blocksize);
771ed689
CM
931 num_bytes = max(blocksize, num_bytes);
932 disk_num_bytes = num_bytes;
771ed689 933
4cb5300b 934 /* if this is a small write inside eof, kick off defrag */
4cb13e5d
LB
935 if (num_bytes < 64 * 1024 &&
936 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
00361589 937 btrfs_add_inode_defrag(NULL, inode);
4cb5300b 938
771ed689
CM
939 if (start == 0) {
940 /* lets try to make an inline extent */
00361589
JB
941 ret = cow_file_range_inline(root, inode, start, end, 0, 0,
942 NULL);
771ed689 943 if (ret == 0) {
c2790a2e
JB
944 extent_clear_unlock_delalloc(inode, start, end, NULL,
945 EXTENT_LOCKED | EXTENT_DELALLOC |
151a41bc 946 EXTENT_DEFRAG, PAGE_UNLOCK |
c2790a2e
JB
947 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
948 PAGE_END_WRITEBACK);
c2167754 949
771ed689
CM
950 *nr_written = *nr_written +
951 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
952 *page_started = 1;
771ed689 953 goto out;
79787eaa 954 } else if (ret < 0) {
79787eaa 955 goto out_unlock;
771ed689
CM
956 }
957 }
958
959 BUG_ON(disk_num_bytes >
6c41761f 960 btrfs_super_total_bytes(root->fs_info->super_copy));
771ed689 961
4b46fce2 962 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
771ed689
CM
963 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
964
d397712b 965 while (disk_num_bytes > 0) {
a791e35e
CM
966 unsigned long op;
967
287a0ab9 968 cur_alloc_size = disk_num_bytes;
00361589 969 ret = btrfs_reserve_extent(root, cur_alloc_size,
771ed689 970 root->sectorsize, 0, alloc_hint,
e570fd27 971 &ins, 1, 1);
00361589 972 if (ret < 0)
79787eaa 973 goto out_unlock;
d397712b 974
172ddd60 975 em = alloc_extent_map();
b9aa55be
LB
976 if (!em) {
977 ret = -ENOMEM;
ace68bac 978 goto out_reserve;
b9aa55be 979 }
e6dcd2dc 980 em->start = start;
445a6944 981 em->orig_start = em->start;
771ed689
CM
982 ram_size = ins.offset;
983 em->len = ins.offset;
2ab28f32
JB
984 em->mod_start = em->start;
985 em->mod_len = em->len;
c8b97818 986
e6dcd2dc 987 em->block_start = ins.objectid;
c8b97818 988 em->block_len = ins.offset;
b4939680 989 em->orig_block_len = ins.offset;
cc95bef6 990 em->ram_bytes = ram_size;
e6dcd2dc 991 em->bdev = root->fs_info->fs_devices->latest_bdev;
7f3c74fb 992 set_bit(EXTENT_FLAG_PINNED, &em->flags);
70c8a91c 993 em->generation = -1;
c8b97818 994
d397712b 995 while (1) {
890871be 996 write_lock(&em_tree->lock);
09a2a8f9 997 ret = add_extent_mapping(em_tree, em, 1);
890871be 998 write_unlock(&em_tree->lock);
e6dcd2dc
CM
999 if (ret != -EEXIST) {
1000 free_extent_map(em);
1001 break;
1002 }
1003 btrfs_drop_extent_cache(inode, start,
c8b97818 1004 start + ram_size - 1, 0);
e6dcd2dc 1005 }
ace68bac
LB
1006 if (ret)
1007 goto out_reserve;
e6dcd2dc 1008
98d20f67 1009 cur_alloc_size = ins.offset;
e6dcd2dc 1010 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
771ed689 1011 ram_size, cur_alloc_size, 0);
ace68bac 1012 if (ret)
d9f85963 1013 goto out_drop_extent_cache;
c8b97818 1014
17d217fe
YZ
1015 if (root->root_key.objectid ==
1016 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1017 ret = btrfs_reloc_clone_csums(inode, start,
1018 cur_alloc_size);
00361589 1019 if (ret)
d9f85963 1020 goto out_drop_extent_cache;
17d217fe
YZ
1021 }
1022
d397712b 1023 if (disk_num_bytes < cur_alloc_size)
3b951516 1024 break;
d397712b 1025
c8b97818
CM
1026 /* we're not doing compressed IO, don't unlock the first
1027 * page (which the caller expects to stay locked), don't
1028 * clear any dirty bits and don't set any writeback bits
8b62b72b
CM
1029 *
1030 * Do set the Private2 bit so we know this page was properly
1031 * setup for writepage
c8b97818 1032 */
c2790a2e
JB
1033 op = unlock ? PAGE_UNLOCK : 0;
1034 op |= PAGE_SET_PRIVATE2;
a791e35e 1035
c2790a2e
JB
1036 extent_clear_unlock_delalloc(inode, start,
1037 start + ram_size - 1, locked_page,
1038 EXTENT_LOCKED | EXTENT_DELALLOC,
1039 op);
c8b97818 1040 disk_num_bytes -= cur_alloc_size;
c59f8951
CM
1041 num_bytes -= cur_alloc_size;
1042 alloc_hint = ins.objectid + ins.offset;
1043 start += cur_alloc_size;
b888db2b 1044 }
79787eaa 1045out:
be20aa9d 1046 return ret;
b7d5b0a8 1047
d9f85963
FM
1048out_drop_extent_cache:
1049 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
ace68bac 1050out_reserve:
e570fd27 1051 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
79787eaa 1052out_unlock:
c2790a2e 1053 extent_clear_unlock_delalloc(inode, start, end, locked_page,
151a41bc
JB
1054 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
1055 EXTENT_DELALLOC | EXTENT_DEFRAG,
1056 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1057 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
79787eaa 1058 goto out;
771ed689 1059}
c8b97818 1060
771ed689
CM
1061/*
1062 * work queue call back to started compression on a file and pages
1063 */
1064static noinline void async_cow_start(struct btrfs_work *work)
1065{
1066 struct async_cow *async_cow;
1067 int num_added = 0;
1068 async_cow = container_of(work, struct async_cow, work);
1069
1070 compress_file_range(async_cow->inode, async_cow->locked_page,
1071 async_cow->start, async_cow->end, async_cow,
1072 &num_added);
8180ef88 1073 if (num_added == 0) {
cb77fcd8 1074 btrfs_add_delayed_iput(async_cow->inode);
771ed689 1075 async_cow->inode = NULL;
8180ef88 1076 }
771ed689
CM
1077}
1078
1079/*
1080 * work queue call back to submit previously compressed pages
1081 */
1082static noinline void async_cow_submit(struct btrfs_work *work)
1083{
1084 struct async_cow *async_cow;
1085 struct btrfs_root *root;
1086 unsigned long nr_pages;
1087
1088 async_cow = container_of(work, struct async_cow, work);
1089
1090 root = async_cow->root;
1091 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1092 PAGE_CACHE_SHIFT;
1093
66657b31 1094 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
287082b0 1095 5 * 1024 * 1024 &&
771ed689
CM
1096 waitqueue_active(&root->fs_info->async_submit_wait))
1097 wake_up(&root->fs_info->async_submit_wait);
1098
d397712b 1099 if (async_cow->inode)
771ed689 1100 submit_compressed_extents(async_cow->inode, async_cow);
771ed689 1101}
c8b97818 1102
771ed689
CM
1103static noinline void async_cow_free(struct btrfs_work *work)
1104{
1105 struct async_cow *async_cow;
1106 async_cow = container_of(work, struct async_cow, work);
8180ef88 1107 if (async_cow->inode)
cb77fcd8 1108 btrfs_add_delayed_iput(async_cow->inode);
771ed689
CM
1109 kfree(async_cow);
1110}
1111
1112static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1113 u64 start, u64 end, int *page_started,
1114 unsigned long *nr_written)
1115{
1116 struct async_cow *async_cow;
1117 struct btrfs_root *root = BTRFS_I(inode)->root;
1118 unsigned long nr_pages;
1119 u64 cur_end;
287082b0 1120 int limit = 10 * 1024 * 1024;
771ed689 1121
a3429ab7
CM
1122 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1123 1, 0, NULL, GFP_NOFS);
d397712b 1124 while (start < end) {
771ed689 1125 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
79787eaa 1126 BUG_ON(!async_cow); /* -ENOMEM */
8180ef88 1127 async_cow->inode = igrab(inode);
771ed689
CM
1128 async_cow->root = root;
1129 async_cow->locked_page = locked_page;
1130 async_cow->start = start;
1131
f79707b0
WS
1132 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1133 !btrfs_test_opt(root, FORCE_COMPRESS))
771ed689
CM
1134 cur_end = end;
1135 else
1136 cur_end = min(end, start + 512 * 1024 - 1);
1137
1138 async_cow->end = cur_end;
1139 INIT_LIST_HEAD(&async_cow->extents);
1140
9e0af237
LB
1141 btrfs_init_work(&async_cow->work,
1142 btrfs_delalloc_helper,
1143 async_cow_start, async_cow_submit,
1144 async_cow_free);
771ed689 1145
771ed689
CM
1146 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1147 PAGE_CACHE_SHIFT;
1148 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1149
afe3d242
QW
1150 btrfs_queue_work(root->fs_info->delalloc_workers,
1151 &async_cow->work);
771ed689
CM
1152
1153 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1154 wait_event(root->fs_info->async_submit_wait,
1155 (atomic_read(&root->fs_info->async_delalloc_pages) <
1156 limit));
1157 }
1158
d397712b 1159 while (atomic_read(&root->fs_info->async_submit_draining) &&
771ed689
CM
1160 atomic_read(&root->fs_info->async_delalloc_pages)) {
1161 wait_event(root->fs_info->async_submit_wait,
1162 (atomic_read(&root->fs_info->async_delalloc_pages) ==
1163 0));
1164 }
1165
1166 *nr_written += nr_pages;
1167 start = cur_end + 1;
1168 }
1169 *page_started = 1;
1170 return 0;
be20aa9d
CM
1171}
1172
d397712b 1173static noinline int csum_exist_in_range(struct btrfs_root *root,
17d217fe
YZ
1174 u64 bytenr, u64 num_bytes)
1175{
1176 int ret;
1177 struct btrfs_ordered_sum *sums;
1178 LIST_HEAD(list);
1179
07d400a6 1180 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
a2de733c 1181 bytenr + num_bytes - 1, &list, 0);
17d217fe
YZ
1182 if (ret == 0 && list_empty(&list))
1183 return 0;
1184
1185 while (!list_empty(&list)) {
1186 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1187 list_del(&sums->list);
1188 kfree(sums);
1189 }
1190 return 1;
1191}
1192
d352ac68
CM
1193/*
1194 * when nowcow writeback call back. This checks for snapshots or COW copies
1195 * of the extents that exist in the file, and COWs the file as required.
1196 *
1197 * If no cow copies or snapshots exist, we write directly to the existing
1198 * blocks on disk
1199 */
7f366cfe
CM
1200static noinline int run_delalloc_nocow(struct inode *inode,
1201 struct page *locked_page,
771ed689
CM
1202 u64 start, u64 end, int *page_started, int force,
1203 unsigned long *nr_written)
be20aa9d 1204{
be20aa9d 1205 struct btrfs_root *root = BTRFS_I(inode)->root;
7ea394f1 1206 struct btrfs_trans_handle *trans;
be20aa9d 1207 struct extent_buffer *leaf;
be20aa9d 1208 struct btrfs_path *path;
80ff3856 1209 struct btrfs_file_extent_item *fi;
be20aa9d 1210 struct btrfs_key found_key;
80ff3856
YZ
1211 u64 cow_start;
1212 u64 cur_offset;
1213 u64 extent_end;
5d4f98a2 1214 u64 extent_offset;
80ff3856
YZ
1215 u64 disk_bytenr;
1216 u64 num_bytes;
b4939680 1217 u64 disk_num_bytes;
cc95bef6 1218 u64 ram_bytes;
80ff3856 1219 int extent_type;
79787eaa 1220 int ret, err;
d899e052 1221 int type;
80ff3856
YZ
1222 int nocow;
1223 int check_prev = 1;
82d5902d 1224 bool nolock;
33345d01 1225 u64 ino = btrfs_ino(inode);
be20aa9d
CM
1226
1227 path = btrfs_alloc_path();
17ca04af 1228 if (!path) {
c2790a2e
JB
1229 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1230 EXTENT_LOCKED | EXTENT_DELALLOC |
151a41bc
JB
1231 EXTENT_DO_ACCOUNTING |
1232 EXTENT_DEFRAG, PAGE_UNLOCK |
c2790a2e
JB
1233 PAGE_CLEAR_DIRTY |
1234 PAGE_SET_WRITEBACK |
1235 PAGE_END_WRITEBACK);
d8926bb3 1236 return -ENOMEM;
17ca04af 1237 }
82d5902d 1238
83eea1f1 1239 nolock = btrfs_is_free_space_inode(inode);
82d5902d
LZ
1240
1241 if (nolock)
7a7eaa40 1242 trans = btrfs_join_transaction_nolock(root);
82d5902d 1243 else
7a7eaa40 1244 trans = btrfs_join_transaction(root);
ff5714cc 1245
79787eaa 1246 if (IS_ERR(trans)) {
c2790a2e
JB
1247 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1248 EXTENT_LOCKED | EXTENT_DELALLOC |
151a41bc
JB
1249 EXTENT_DO_ACCOUNTING |
1250 EXTENT_DEFRAG, PAGE_UNLOCK |
c2790a2e
JB
1251 PAGE_CLEAR_DIRTY |
1252 PAGE_SET_WRITEBACK |
1253 PAGE_END_WRITEBACK);
79787eaa
JM
1254 btrfs_free_path(path);
1255 return PTR_ERR(trans);
1256 }
1257
74b21075 1258 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
be20aa9d 1259
80ff3856
YZ
1260 cow_start = (u64)-1;
1261 cur_offset = start;
1262 while (1) {
33345d01 1263 ret = btrfs_lookup_file_extent(trans, root, path, ino,
80ff3856 1264 cur_offset, 0);
d788a349 1265 if (ret < 0)
79787eaa 1266 goto error;
80ff3856
YZ
1267 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1268 leaf = path->nodes[0];
1269 btrfs_item_key_to_cpu(leaf, &found_key,
1270 path->slots[0] - 1);
33345d01 1271 if (found_key.objectid == ino &&
80ff3856
YZ
1272 found_key.type == BTRFS_EXTENT_DATA_KEY)
1273 path->slots[0]--;
1274 }
1275 check_prev = 0;
1276next_slot:
1277 leaf = path->nodes[0];
1278 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1279 ret = btrfs_next_leaf(root, path);
d788a349 1280 if (ret < 0)
79787eaa 1281 goto error;
80ff3856
YZ
1282 if (ret > 0)
1283 break;
1284 leaf = path->nodes[0];
1285 }
be20aa9d 1286
80ff3856
YZ
1287 nocow = 0;
1288 disk_bytenr = 0;
17d217fe 1289 num_bytes = 0;
80ff3856
YZ
1290 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1291
33345d01 1292 if (found_key.objectid > ino ||
80ff3856
YZ
1293 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1294 found_key.offset > end)
1295 break;
1296
1297 if (found_key.offset > cur_offset) {
1298 extent_end = found_key.offset;
e9061e21 1299 extent_type = 0;
80ff3856
YZ
1300 goto out_check;
1301 }
1302
1303 fi = btrfs_item_ptr(leaf, path->slots[0],
1304 struct btrfs_file_extent_item);
1305 extent_type = btrfs_file_extent_type(leaf, fi);
1306
cc95bef6 1307 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
d899e052
YZ
1308 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1309 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
80ff3856 1310 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5d4f98a2 1311 extent_offset = btrfs_file_extent_offset(leaf, fi);
80ff3856
YZ
1312 extent_end = found_key.offset +
1313 btrfs_file_extent_num_bytes(leaf, fi);
b4939680
JB
1314 disk_num_bytes =
1315 btrfs_file_extent_disk_num_bytes(leaf, fi);
80ff3856
YZ
1316 if (extent_end <= start) {
1317 path->slots[0]++;
1318 goto next_slot;
1319 }
17d217fe
YZ
1320 if (disk_bytenr == 0)
1321 goto out_check;
80ff3856
YZ
1322 if (btrfs_file_extent_compression(leaf, fi) ||
1323 btrfs_file_extent_encryption(leaf, fi) ||
1324 btrfs_file_extent_other_encoding(leaf, fi))
1325 goto out_check;
d899e052
YZ
1326 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1327 goto out_check;
d2fb3437 1328 if (btrfs_extent_readonly(root, disk_bytenr))
80ff3856 1329 goto out_check;
33345d01 1330 if (btrfs_cross_ref_exist(trans, root, ino,
5d4f98a2
YZ
1331 found_key.offset -
1332 extent_offset, disk_bytenr))
17d217fe 1333 goto out_check;
5d4f98a2 1334 disk_bytenr += extent_offset;
17d217fe
YZ
1335 disk_bytenr += cur_offset - found_key.offset;
1336 num_bytes = min(end + 1, extent_end) - cur_offset;
e9894fd3
WS
1337 /*
1338 * if there are pending snapshots for this root,
1339 * we fall into common COW way.
1340 */
1341 if (!nolock) {
1342 err = btrfs_start_nocow_write(root);
1343 if (!err)
1344 goto out_check;
1345 }
17d217fe
YZ
1346 /*
1347 * force cow if csum exists in the range.
1348 * this ensure that csum for a given extent are
1349 * either valid or do not exist.
1350 */
1351 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1352 goto out_check;
80ff3856
YZ
1353 nocow = 1;
1354 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1355 extent_end = found_key.offset +
514ac8ad
CM
1356 btrfs_file_extent_inline_len(leaf,
1357 path->slots[0], fi);
80ff3856
YZ
1358 extent_end = ALIGN(extent_end, root->sectorsize);
1359 } else {
1360 BUG_ON(1);
1361 }
1362out_check:
1363 if (extent_end <= start) {
1364 path->slots[0]++;
e9894fd3
WS
1365 if (!nolock && nocow)
1366 btrfs_end_nocow_write(root);
80ff3856
YZ
1367 goto next_slot;
1368 }
1369 if (!nocow) {
1370 if (cow_start == (u64)-1)
1371 cow_start = cur_offset;
1372 cur_offset = extent_end;
1373 if (cur_offset > end)
1374 break;
1375 path->slots[0]++;
1376 goto next_slot;
7ea394f1
YZ
1377 }
1378
b3b4aa74 1379 btrfs_release_path(path);
80ff3856 1380 if (cow_start != (u64)-1) {
00361589
JB
1381 ret = cow_file_range(inode, locked_page,
1382 cow_start, found_key.offset - 1,
1383 page_started, nr_written, 1);
e9894fd3
WS
1384 if (ret) {
1385 if (!nolock && nocow)
1386 btrfs_end_nocow_write(root);
79787eaa 1387 goto error;
e9894fd3 1388 }
80ff3856 1389 cow_start = (u64)-1;
7ea394f1 1390 }
80ff3856 1391
d899e052
YZ
1392 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1393 struct extent_map *em;
1394 struct extent_map_tree *em_tree;
1395 em_tree = &BTRFS_I(inode)->extent_tree;
172ddd60 1396 em = alloc_extent_map();
79787eaa 1397 BUG_ON(!em); /* -ENOMEM */
d899e052 1398 em->start = cur_offset;
70c8a91c 1399 em->orig_start = found_key.offset - extent_offset;
d899e052
YZ
1400 em->len = num_bytes;
1401 em->block_len = num_bytes;
1402 em->block_start = disk_bytenr;
b4939680 1403 em->orig_block_len = disk_num_bytes;
cc95bef6 1404 em->ram_bytes = ram_bytes;
d899e052 1405 em->bdev = root->fs_info->fs_devices->latest_bdev;
2ab28f32
JB
1406 em->mod_start = em->start;
1407 em->mod_len = em->len;
d899e052 1408 set_bit(EXTENT_FLAG_PINNED, &em->flags);
b11e234d 1409 set_bit(EXTENT_FLAG_FILLING, &em->flags);
70c8a91c 1410 em->generation = -1;
d899e052 1411 while (1) {
890871be 1412 write_lock(&em_tree->lock);
09a2a8f9 1413 ret = add_extent_mapping(em_tree, em, 1);
890871be 1414 write_unlock(&em_tree->lock);
d899e052
YZ
1415 if (ret != -EEXIST) {
1416 free_extent_map(em);
1417 break;
1418 }
1419 btrfs_drop_extent_cache(inode, em->start,
1420 em->start + em->len - 1, 0);
1421 }
1422 type = BTRFS_ORDERED_PREALLOC;
1423 } else {
1424 type = BTRFS_ORDERED_NOCOW;
1425 }
80ff3856
YZ
1426
1427 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
d899e052 1428 num_bytes, num_bytes, type);
79787eaa 1429 BUG_ON(ret); /* -ENOMEM */
771ed689 1430
efa56464
YZ
1431 if (root->root_key.objectid ==
1432 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1433 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1434 num_bytes);
e9894fd3
WS
1435 if (ret) {
1436 if (!nolock && nocow)
1437 btrfs_end_nocow_write(root);
79787eaa 1438 goto error;
e9894fd3 1439 }
efa56464
YZ
1440 }
1441
c2790a2e
JB
1442 extent_clear_unlock_delalloc(inode, cur_offset,
1443 cur_offset + num_bytes - 1,
1444 locked_page, EXTENT_LOCKED |
1445 EXTENT_DELALLOC, PAGE_UNLOCK |
1446 PAGE_SET_PRIVATE2);
e9894fd3
WS
1447 if (!nolock && nocow)
1448 btrfs_end_nocow_write(root);
80ff3856
YZ
1449 cur_offset = extent_end;
1450 if (cur_offset > end)
1451 break;
be20aa9d 1452 }
b3b4aa74 1453 btrfs_release_path(path);
80ff3856 1454
17ca04af 1455 if (cur_offset <= end && cow_start == (u64)-1) {
80ff3856 1456 cow_start = cur_offset;
17ca04af
JB
1457 cur_offset = end;
1458 }
1459
80ff3856 1460 if (cow_start != (u64)-1) {
00361589
JB
1461 ret = cow_file_range(inode, locked_page, cow_start, end,
1462 page_started, nr_written, 1);
d788a349 1463 if (ret)
79787eaa 1464 goto error;
80ff3856
YZ
1465 }
1466
79787eaa 1467error:
a698d075 1468 err = btrfs_end_transaction(trans, root);
79787eaa
JM
1469 if (!ret)
1470 ret = err;
1471
17ca04af 1472 if (ret && cur_offset < end)
c2790a2e
JB
1473 extent_clear_unlock_delalloc(inode, cur_offset, end,
1474 locked_page, EXTENT_LOCKED |
151a41bc
JB
1475 EXTENT_DELALLOC | EXTENT_DEFRAG |
1476 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1477 PAGE_CLEAR_DIRTY |
c2790a2e
JB
1478 PAGE_SET_WRITEBACK |
1479 PAGE_END_WRITEBACK);
7ea394f1 1480 btrfs_free_path(path);
79787eaa 1481 return ret;
be20aa9d
CM
1482}
1483
47059d93
WS
1484static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1485{
1486
1487 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1488 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1489 return 0;
1490
1491 /*
1492 * @defrag_bytes is a hint value, no spinlock held here,
1493 * if is not zero, it means the file is defragging.
1494 * Force cow if given extent needs to be defragged.
1495 */
1496 if (BTRFS_I(inode)->defrag_bytes &&
1497 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1498 EXTENT_DEFRAG, 0, NULL))
1499 return 1;
1500
1501 return 0;
1502}
1503
d352ac68
CM
1504/*
1505 * extent_io.c call back to do delayed allocation processing
1506 */
c8b97818 1507static int run_delalloc_range(struct inode *inode, struct page *locked_page,
771ed689
CM
1508 u64 start, u64 end, int *page_started,
1509 unsigned long *nr_written)
be20aa9d 1510{
be20aa9d 1511 int ret;
47059d93 1512 int force_cow = need_force_cow(inode, start, end);
a2135011 1513
47059d93 1514 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
c8b97818 1515 ret = run_delalloc_nocow(inode, locked_page, start, end,
d397712b 1516 page_started, 1, nr_written);
47059d93 1517 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
d899e052 1518 ret = run_delalloc_nocow(inode, locked_page, start, end,
d397712b 1519 page_started, 0, nr_written);
7816030e 1520 } else if (!inode_need_compress(inode)) {
7f366cfe
CM
1521 ret = cow_file_range(inode, locked_page, start, end,
1522 page_started, nr_written, 1);
7ddf5a42
JB
1523 } else {
1524 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1525 &BTRFS_I(inode)->runtime_flags);
771ed689 1526 ret = cow_file_range_async(inode, locked_page, start, end,
d397712b 1527 page_started, nr_written);
7ddf5a42 1528 }
b888db2b
CM
1529 return ret;
1530}
1531
1bf85046
JM
1532static void btrfs_split_extent_hook(struct inode *inode,
1533 struct extent_state *orig, u64 split)
9ed74f2d 1534{
0ca1f7ce 1535 /* not delalloc, ignore it */
9ed74f2d 1536 if (!(orig->state & EXTENT_DELALLOC))
1bf85046 1537 return;
9ed74f2d 1538
9e0baf60
JB
1539 spin_lock(&BTRFS_I(inode)->lock);
1540 BTRFS_I(inode)->outstanding_extents++;
1541 spin_unlock(&BTRFS_I(inode)->lock);
9ed74f2d
JB
1542}
1543
1544/*
1545 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1546 * extents so we can keep track of new extents that are just merged onto old
1547 * extents, such as when we are doing sequential writes, so we can properly
1548 * account for the metadata space we'll need.
1549 */
1bf85046
JM
1550static void btrfs_merge_extent_hook(struct inode *inode,
1551 struct extent_state *new,
1552 struct extent_state *other)
9ed74f2d 1553{
9ed74f2d
JB
1554 /* not delalloc, ignore it */
1555 if (!(other->state & EXTENT_DELALLOC))
1bf85046 1556 return;
9ed74f2d 1557
9e0baf60
JB
1558 spin_lock(&BTRFS_I(inode)->lock);
1559 BTRFS_I(inode)->outstanding_extents--;
1560 spin_unlock(&BTRFS_I(inode)->lock);
9ed74f2d
JB
1561}
1562
eb73c1b7
MX
1563static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1564 struct inode *inode)
1565{
1566 spin_lock(&root->delalloc_lock);
1567 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1568 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1569 &root->delalloc_inodes);
1570 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1571 &BTRFS_I(inode)->runtime_flags);
1572 root->nr_delalloc_inodes++;
1573 if (root->nr_delalloc_inodes == 1) {
1574 spin_lock(&root->fs_info->delalloc_root_lock);
1575 BUG_ON(!list_empty(&root->delalloc_root));
1576 list_add_tail(&root->delalloc_root,
1577 &root->fs_info->delalloc_roots);
1578 spin_unlock(&root->fs_info->delalloc_root_lock);
1579 }
1580 }
1581 spin_unlock(&root->delalloc_lock);
1582}
1583
1584static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1585 struct inode *inode)
1586{
1587 spin_lock(&root->delalloc_lock);
1588 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1589 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1590 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1591 &BTRFS_I(inode)->runtime_flags);
1592 root->nr_delalloc_inodes--;
1593 if (!root->nr_delalloc_inodes) {
1594 spin_lock(&root->fs_info->delalloc_root_lock);
1595 BUG_ON(list_empty(&root->delalloc_root));
1596 list_del_init(&root->delalloc_root);
1597 spin_unlock(&root->fs_info->delalloc_root_lock);
1598 }
1599 }
1600 spin_unlock(&root->delalloc_lock);
1601}
1602
d352ac68
CM
1603/*
1604 * extent_io.c set_bit_hook, used to track delayed allocation
1605 * bytes in this file, and to maintain the list of inodes that
1606 * have pending delalloc work to be done.
1607 */
1bf85046 1608static void btrfs_set_bit_hook(struct inode *inode,
41074888 1609 struct extent_state *state, unsigned long *bits)
291d673e 1610{
9ed74f2d 1611
47059d93
WS
1612 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1613 WARN_ON(1);
75eff68e
CM
1614 /*
1615 * set_bit and clear bit hooks normally require _irqsave/restore
27160b6b 1616 * but in this case, we are only testing for the DELALLOC
75eff68e
CM
1617 * bit, which is only set or cleared with irqs on
1618 */
0ca1f7ce 1619 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
291d673e 1620 struct btrfs_root *root = BTRFS_I(inode)->root;
0ca1f7ce 1621 u64 len = state->end + 1 - state->start;
83eea1f1 1622 bool do_list = !btrfs_is_free_space_inode(inode);
9ed74f2d 1623
9e0baf60 1624 if (*bits & EXTENT_FIRST_DELALLOC) {
0ca1f7ce 1625 *bits &= ~EXTENT_FIRST_DELALLOC;
9e0baf60
JB
1626 } else {
1627 spin_lock(&BTRFS_I(inode)->lock);
1628 BTRFS_I(inode)->outstanding_extents++;
1629 spin_unlock(&BTRFS_I(inode)->lock);
1630 }
287a0ab9 1631
963d678b
MX
1632 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1633 root->fs_info->delalloc_batch);
df0af1a5 1634 spin_lock(&BTRFS_I(inode)->lock);
0ca1f7ce 1635 BTRFS_I(inode)->delalloc_bytes += len;
47059d93
WS
1636 if (*bits & EXTENT_DEFRAG)
1637 BTRFS_I(inode)->defrag_bytes += len;
df0af1a5 1638 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
eb73c1b7
MX
1639 &BTRFS_I(inode)->runtime_flags))
1640 btrfs_add_delalloc_inodes(root, inode);
df0af1a5 1641 spin_unlock(&BTRFS_I(inode)->lock);
291d673e 1642 }
291d673e
CM
1643}
1644
d352ac68
CM
1645/*
1646 * extent_io.c clear_bit_hook, see set_bit_hook for why
1647 */
1bf85046 1648static void btrfs_clear_bit_hook(struct inode *inode,
41074888
DS
1649 struct extent_state *state,
1650 unsigned long *bits)
291d673e 1651{
47059d93
WS
1652 u64 len = state->end + 1 - state->start;
1653
1654 spin_lock(&BTRFS_I(inode)->lock);
1655 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1656 BTRFS_I(inode)->defrag_bytes -= len;
1657 spin_unlock(&BTRFS_I(inode)->lock);
1658
75eff68e
CM
1659 /*
1660 * set_bit and clear bit hooks normally require _irqsave/restore
27160b6b 1661 * but in this case, we are only testing for the DELALLOC
75eff68e
CM
1662 * bit, which is only set or cleared with irqs on
1663 */
0ca1f7ce 1664 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
291d673e 1665 struct btrfs_root *root = BTRFS_I(inode)->root;
83eea1f1 1666 bool do_list = !btrfs_is_free_space_inode(inode);
bcbfce8a 1667
9e0baf60 1668 if (*bits & EXTENT_FIRST_DELALLOC) {
0ca1f7ce 1669 *bits &= ~EXTENT_FIRST_DELALLOC;
9e0baf60
JB
1670 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1671 spin_lock(&BTRFS_I(inode)->lock);
1672 BTRFS_I(inode)->outstanding_extents--;
1673 spin_unlock(&BTRFS_I(inode)->lock);
1674 }
0ca1f7ce 1675
b6d08f06
JB
1676 /*
1677 * We don't reserve metadata space for space cache inodes so we
1678 * don't need to call dellalloc_release_metadata if there is an
1679 * error.
1680 */
1681 if (*bits & EXTENT_DO_ACCOUNTING &&
1682 root != root->fs_info->tree_root)
0ca1f7ce
YZ
1683 btrfs_delalloc_release_metadata(inode, len);
1684
0cb59c99 1685 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
7ee9e440 1686 && do_list && !(state->state & EXTENT_NORESERVE))
0ca1f7ce 1687 btrfs_free_reserved_data_space(inode, len);
9ed74f2d 1688
963d678b
MX
1689 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1690 root->fs_info->delalloc_batch);
df0af1a5 1691 spin_lock(&BTRFS_I(inode)->lock);
0ca1f7ce 1692 BTRFS_I(inode)->delalloc_bytes -= len;
0cb59c99 1693 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
df0af1a5 1694 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
eb73c1b7
MX
1695 &BTRFS_I(inode)->runtime_flags))
1696 btrfs_del_delalloc_inode(root, inode);
df0af1a5 1697 spin_unlock(&BTRFS_I(inode)->lock);
291d673e 1698 }
291d673e
CM
1699}
1700
d352ac68
CM
1701/*
1702 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1703 * we don't create bios that span stripes or chunks
1704 */
64a16701 1705int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
c8b97818
CM
1706 size_t size, struct bio *bio,
1707 unsigned long bio_flags)
239b14b3
CM
1708{
1709 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
4f024f37 1710 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
239b14b3
CM
1711 u64 length = 0;
1712 u64 map_length;
239b14b3
CM
1713 int ret;
1714
771ed689
CM
1715 if (bio_flags & EXTENT_BIO_COMPRESSED)
1716 return 0;
1717
4f024f37 1718 length = bio->bi_iter.bi_size;
239b14b3 1719 map_length = length;
64a16701 1720 ret = btrfs_map_block(root->fs_info, rw, logical,
f188591e 1721 &map_length, NULL, 0);
3ec706c8 1722 /* Will always return 0 with map_multi == NULL */
3444a972 1723 BUG_ON(ret < 0);
d397712b 1724 if (map_length < length + size)
239b14b3 1725 return 1;
3444a972 1726 return 0;
239b14b3
CM
1727}
1728
d352ac68
CM
1729/*
1730 * in order to insert checksums into the metadata in large chunks,
1731 * we wait until bio submission time. All the pages in the bio are
1732 * checksummed and sums are attached onto the ordered extent record.
1733 *
1734 * At IO completion time the cums attached on the ordered extent record
1735 * are inserted into the btree
1736 */
d397712b
CM
1737static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1738 struct bio *bio, int mirror_num,
eaf25d93
CM
1739 unsigned long bio_flags,
1740 u64 bio_offset)
065631f6 1741{
065631f6 1742 struct btrfs_root *root = BTRFS_I(inode)->root;
065631f6 1743 int ret = 0;
e015640f 1744
d20f7043 1745 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
79787eaa 1746 BUG_ON(ret); /* -ENOMEM */
4a69a410
CM
1747 return 0;
1748}
e015640f 1749
4a69a410
CM
1750/*
1751 * in order to insert checksums into the metadata in large chunks,
1752 * we wait until bio submission time. All the pages in the bio are
1753 * checksummed and sums are attached onto the ordered extent record.
1754 *
1755 * At IO completion time the cums attached on the ordered extent record
1756 * are inserted into the btree
1757 */
b2950863 1758static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
eaf25d93
CM
1759 int mirror_num, unsigned long bio_flags,
1760 u64 bio_offset)
4a69a410
CM
1761{
1762 struct btrfs_root *root = BTRFS_I(inode)->root;
61891923
SB
1763 int ret;
1764
1765 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1766 if (ret)
1767 bio_endio(bio, ret);
1768 return ret;
44b8bd7e
CM
1769}
1770
d352ac68 1771/*
cad321ad
CM
1772 * extent_io.c submission hook. This does the right thing for csum calculation
1773 * on write, or reading the csums from the tree before a read
d352ac68 1774 */
b2950863 1775static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
eaf25d93
CM
1776 int mirror_num, unsigned long bio_flags,
1777 u64 bio_offset)
44b8bd7e
CM
1778{
1779 struct btrfs_root *root = BTRFS_I(inode)->root;
1780 int ret = 0;
19b9bdb0 1781 int skip_sum;
0417341e 1782 int metadata = 0;
b812ce28 1783 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
44b8bd7e 1784
6cbff00f 1785 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
cad321ad 1786
83eea1f1 1787 if (btrfs_is_free_space_inode(inode))
0417341e
JM
1788 metadata = 2;
1789
7b6d91da 1790 if (!(rw & REQ_WRITE)) {
5fd02043
JB
1791 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1792 if (ret)
61891923 1793 goto out;
5fd02043 1794
d20f7043 1795 if (bio_flags & EXTENT_BIO_COMPRESSED) {
61891923
SB
1796 ret = btrfs_submit_compressed_read(inode, bio,
1797 mirror_num,
1798 bio_flags);
1799 goto out;
c2db1073
TI
1800 } else if (!skip_sum) {
1801 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1802 if (ret)
61891923 1803 goto out;
c2db1073 1804 }
4d1b5fb4 1805 goto mapit;
b812ce28 1806 } else if (async && !skip_sum) {
17d217fe
YZ
1807 /* csum items have already been cloned */
1808 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1809 goto mapit;
19b9bdb0 1810 /* we're doing a write, do the async checksumming */
61891923 1811 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
44b8bd7e 1812 inode, rw, bio, mirror_num,
eaf25d93
CM
1813 bio_flags, bio_offset,
1814 __btrfs_submit_bio_start,
4a69a410 1815 __btrfs_submit_bio_done);
61891923 1816 goto out;
b812ce28
JB
1817 } else if (!skip_sum) {
1818 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1819 if (ret)
1820 goto out;
19b9bdb0
CM
1821 }
1822
0b86a832 1823mapit:
61891923
SB
1824 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1825
1826out:
1827 if (ret < 0)
1828 bio_endio(bio, ret);
1829 return ret;
065631f6 1830}
6885f308 1831
d352ac68
CM
1832/*
1833 * given a list of ordered sums record them in the inode. This happens
1834 * at IO completion time based on sums calculated at bio submission time.
1835 */
ba1da2f4 1836static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
e6dcd2dc
CM
1837 struct inode *inode, u64 file_offset,
1838 struct list_head *list)
1839{
e6dcd2dc
CM
1840 struct btrfs_ordered_sum *sum;
1841
c6e30871 1842 list_for_each_entry(sum, list, list) {
39847c4d 1843 trans->adding_csums = 1;
d20f7043
CM
1844 btrfs_csum_file_blocks(trans,
1845 BTRFS_I(inode)->root->fs_info->csum_root, sum);
39847c4d 1846 trans->adding_csums = 0;
e6dcd2dc
CM
1847 }
1848 return 0;
1849}
1850
2ac55d41
JB
1851int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1852 struct extent_state **cached_state)
ea8c2819 1853{
6c1500f2 1854 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
ea8c2819 1855 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2ac55d41 1856 cached_state, GFP_NOFS);
ea8c2819
CM
1857}
1858
d352ac68 1859/* see btrfs_writepage_start_hook for details on why this is required */
247e743c
CM
1860struct btrfs_writepage_fixup {
1861 struct page *page;
1862 struct btrfs_work work;
1863};
1864
b2950863 1865static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
247e743c
CM
1866{
1867 struct btrfs_writepage_fixup *fixup;
1868 struct btrfs_ordered_extent *ordered;
2ac55d41 1869 struct extent_state *cached_state = NULL;
247e743c
CM
1870 struct page *page;
1871 struct inode *inode;
1872 u64 page_start;
1873 u64 page_end;
87826df0 1874 int ret;
247e743c
CM
1875
1876 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1877 page = fixup->page;
4a096752 1878again:
247e743c
CM
1879 lock_page(page);
1880 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1881 ClearPageChecked(page);
1882 goto out_page;
1883 }
1884
1885 inode = page->mapping->host;
1886 page_start = page_offset(page);
1887 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1888
2ac55d41 1889 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
d0082371 1890 &cached_state);
4a096752
CM
1891
1892 /* already ordered? We're done */
8b62b72b 1893 if (PagePrivate2(page))
247e743c 1894 goto out;
4a096752
CM
1895
1896 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1897 if (ordered) {
2ac55d41
JB
1898 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1899 page_end, &cached_state, GFP_NOFS);
4a096752
CM
1900 unlock_page(page);
1901 btrfs_start_ordered_extent(inode, ordered, 1);
87826df0 1902 btrfs_put_ordered_extent(ordered);
4a096752
CM
1903 goto again;
1904 }
247e743c 1905
87826df0
JM
1906 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1907 if (ret) {
1908 mapping_set_error(page->mapping, ret);
1909 end_extent_writepage(page, ret, page_start, page_end);
1910 ClearPageChecked(page);
1911 goto out;
1912 }
1913
2ac55d41 1914 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
247e743c 1915 ClearPageChecked(page);
87826df0 1916 set_page_dirty(page);
247e743c 1917out:
2ac55d41
JB
1918 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1919 &cached_state, GFP_NOFS);
247e743c
CM
1920out_page:
1921 unlock_page(page);
1922 page_cache_release(page);
b897abec 1923 kfree(fixup);
247e743c
CM
1924}
1925
1926/*
1927 * There are a few paths in the higher layers of the kernel that directly
1928 * set the page dirty bit without asking the filesystem if it is a
1929 * good idea. This causes problems because we want to make sure COW
1930 * properly happens and the data=ordered rules are followed.
1931 *
c8b97818 1932 * In our case any range that doesn't have the ORDERED bit set
247e743c
CM
1933 * hasn't been properly setup for IO. We kick off an async process
1934 * to fix it up. The async helper will wait for ordered extents, set
1935 * the delalloc bit and make it safe to write the page.
1936 */
b2950863 1937static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
247e743c
CM
1938{
1939 struct inode *inode = page->mapping->host;
1940 struct btrfs_writepage_fixup *fixup;
1941 struct btrfs_root *root = BTRFS_I(inode)->root;
247e743c 1942
8b62b72b
CM
1943 /* this page is properly in the ordered list */
1944 if (TestClearPagePrivate2(page))
247e743c
CM
1945 return 0;
1946
1947 if (PageChecked(page))
1948 return -EAGAIN;
1949
1950 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1951 if (!fixup)
1952 return -EAGAIN;
f421950f 1953
247e743c
CM
1954 SetPageChecked(page);
1955 page_cache_get(page);
9e0af237
LB
1956 btrfs_init_work(&fixup->work, btrfs_fixup_helper,
1957 btrfs_writepage_fixup_worker, NULL, NULL);
247e743c 1958 fixup->page = page;
dc6e3209 1959 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
87826df0 1960 return -EBUSY;
247e743c
CM
1961}
1962
d899e052
YZ
1963static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1964 struct inode *inode, u64 file_pos,
1965 u64 disk_bytenr, u64 disk_num_bytes,
1966 u64 num_bytes, u64 ram_bytes,
1967 u8 compression, u8 encryption,
1968 u16 other_encoding, int extent_type)
1969{
1970 struct btrfs_root *root = BTRFS_I(inode)->root;
1971 struct btrfs_file_extent_item *fi;
1972 struct btrfs_path *path;
1973 struct extent_buffer *leaf;
1974 struct btrfs_key ins;
1acae57b 1975 int extent_inserted = 0;
d899e052
YZ
1976 int ret;
1977
1978 path = btrfs_alloc_path();
d8926bb3
MF
1979 if (!path)
1980 return -ENOMEM;
d899e052 1981
a1ed835e
CM
1982 /*
1983 * we may be replacing one extent in the tree with another.
1984 * The new extent is pinned in the extent map, and we don't want
1985 * to drop it from the cache until it is completely in the btree.
1986 *
1987 * So, tell btrfs_drop_extents to leave this extent in the cache.
1988 * the caller is expected to unpin it and allow it to be merged
1989 * with the others.
1990 */
1acae57b
FDBM
1991 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
1992 file_pos + num_bytes, NULL, 0,
1993 1, sizeof(*fi), &extent_inserted);
79787eaa
JM
1994 if (ret)
1995 goto out;
d899e052 1996
1acae57b
FDBM
1997 if (!extent_inserted) {
1998 ins.objectid = btrfs_ino(inode);
1999 ins.offset = file_pos;
2000 ins.type = BTRFS_EXTENT_DATA_KEY;
2001
2002 path->leave_spinning = 1;
2003 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2004 sizeof(*fi));
2005 if (ret)
2006 goto out;
2007 }
d899e052
YZ
2008 leaf = path->nodes[0];
2009 fi = btrfs_item_ptr(leaf, path->slots[0],
2010 struct btrfs_file_extent_item);
2011 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2012 btrfs_set_file_extent_type(leaf, fi, extent_type);
2013 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2014 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2015 btrfs_set_file_extent_offset(leaf, fi, 0);
2016 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2017 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2018 btrfs_set_file_extent_compression(leaf, fi, compression);
2019 btrfs_set_file_extent_encryption(leaf, fi, encryption);
2020 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
b9473439 2021
d899e052 2022 btrfs_mark_buffer_dirty(leaf);
ce195332 2023 btrfs_release_path(path);
d899e052
YZ
2024
2025 inode_add_bytes(inode, num_bytes);
d899e052
YZ
2026
2027 ins.objectid = disk_bytenr;
2028 ins.offset = disk_num_bytes;
2029 ins.type = BTRFS_EXTENT_ITEM_KEY;
5d4f98a2
YZ
2030 ret = btrfs_alloc_reserved_file_extent(trans, root,
2031 root->root_key.objectid,
33345d01 2032 btrfs_ino(inode), file_pos, &ins);
79787eaa 2033out:
d899e052 2034 btrfs_free_path(path);
b9473439 2035
79787eaa 2036 return ret;
d899e052
YZ
2037}
2038
38c227d8
LB
2039/* snapshot-aware defrag */
2040struct sa_defrag_extent_backref {
2041 struct rb_node node;
2042 struct old_sa_defrag_extent *old;
2043 u64 root_id;
2044 u64 inum;
2045 u64 file_pos;
2046 u64 extent_offset;
2047 u64 num_bytes;
2048 u64 generation;
2049};
2050
2051struct old_sa_defrag_extent {
2052 struct list_head list;
2053 struct new_sa_defrag_extent *new;
2054
2055 u64 extent_offset;
2056 u64 bytenr;
2057 u64 offset;
2058 u64 len;
2059 int count;
2060};
2061
2062struct new_sa_defrag_extent {
2063 struct rb_root root;
2064 struct list_head head;
2065 struct btrfs_path *path;
2066 struct inode *inode;
2067 u64 file_pos;
2068 u64 len;
2069 u64 bytenr;
2070 u64 disk_len;
2071 u8 compress_type;
2072};
2073
2074static int backref_comp(struct sa_defrag_extent_backref *b1,
2075 struct sa_defrag_extent_backref *b2)
2076{
2077 if (b1->root_id < b2->root_id)
2078 return -1;
2079 else if (b1->root_id > b2->root_id)
2080 return 1;
2081
2082 if (b1->inum < b2->inum)
2083 return -1;
2084 else if (b1->inum > b2->inum)
2085 return 1;
2086
2087 if (b1->file_pos < b2->file_pos)
2088 return -1;
2089 else if (b1->file_pos > b2->file_pos)
2090 return 1;
2091
2092 /*
2093 * [------------------------------] ===> (a range of space)
2094 * |<--->| |<---->| =============> (fs/file tree A)
2095 * |<---------------------------->| ===> (fs/file tree B)
2096 *
2097 * A range of space can refer to two file extents in one tree while
2098 * refer to only one file extent in another tree.
2099 *
2100 * So we may process a disk offset more than one time(two extents in A)
2101 * and locate at the same extent(one extent in B), then insert two same
2102 * backrefs(both refer to the extent in B).
2103 */
2104 return 0;
2105}
2106
2107static void backref_insert(struct rb_root *root,
2108 struct sa_defrag_extent_backref *backref)
2109{
2110 struct rb_node **p = &root->rb_node;
2111 struct rb_node *parent = NULL;
2112 struct sa_defrag_extent_backref *entry;
2113 int ret;
2114
2115 while (*p) {
2116 parent = *p;
2117 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2118
2119 ret = backref_comp(backref, entry);
2120 if (ret < 0)
2121 p = &(*p)->rb_left;
2122 else
2123 p = &(*p)->rb_right;
2124 }
2125
2126 rb_link_node(&backref->node, parent, p);
2127 rb_insert_color(&backref->node, root);
2128}
2129
2130/*
2131 * Note the backref might has changed, and in this case we just return 0.
2132 */
2133static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2134 void *ctx)
2135{
2136 struct btrfs_file_extent_item *extent;
2137 struct btrfs_fs_info *fs_info;
2138 struct old_sa_defrag_extent *old = ctx;
2139 struct new_sa_defrag_extent *new = old->new;
2140 struct btrfs_path *path = new->path;
2141 struct btrfs_key key;
2142 struct btrfs_root *root;
2143 struct sa_defrag_extent_backref *backref;
2144 struct extent_buffer *leaf;
2145 struct inode *inode = new->inode;
2146 int slot;
2147 int ret;
2148 u64 extent_offset;
2149 u64 num_bytes;
2150
2151 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2152 inum == btrfs_ino(inode))
2153 return 0;
2154
2155 key.objectid = root_id;
2156 key.type = BTRFS_ROOT_ITEM_KEY;
2157 key.offset = (u64)-1;
2158
2159 fs_info = BTRFS_I(inode)->root->fs_info;
2160 root = btrfs_read_fs_root_no_name(fs_info, &key);
2161 if (IS_ERR(root)) {
2162 if (PTR_ERR(root) == -ENOENT)
2163 return 0;
2164 WARN_ON(1);
2165 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2166 inum, offset, root_id);
2167 return PTR_ERR(root);
2168 }
2169
2170 key.objectid = inum;
2171 key.type = BTRFS_EXTENT_DATA_KEY;
2172 if (offset > (u64)-1 << 32)
2173 key.offset = 0;
2174 else
2175 key.offset = offset;
2176
2177 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
fae7f21c 2178 if (WARN_ON(ret < 0))
38c227d8 2179 return ret;
50f1319c 2180 ret = 0;
38c227d8
LB
2181
2182 while (1) {
2183 cond_resched();
2184
2185 leaf = path->nodes[0];
2186 slot = path->slots[0];
2187
2188 if (slot >= btrfs_header_nritems(leaf)) {
2189 ret = btrfs_next_leaf(root, path);
2190 if (ret < 0) {
2191 goto out;
2192 } else if (ret > 0) {
2193 ret = 0;
2194 goto out;
2195 }
2196 continue;
2197 }
2198
2199 path->slots[0]++;
2200
2201 btrfs_item_key_to_cpu(leaf, &key, slot);
2202
2203 if (key.objectid > inum)
2204 goto out;
2205
2206 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2207 continue;
2208
2209 extent = btrfs_item_ptr(leaf, slot,
2210 struct btrfs_file_extent_item);
2211
2212 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2213 continue;
2214
e68afa49
LB
2215 /*
2216 * 'offset' refers to the exact key.offset,
2217 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2218 * (key.offset - extent_offset).
2219 */
2220 if (key.offset != offset)
38c227d8
LB
2221 continue;
2222
e68afa49 2223 extent_offset = btrfs_file_extent_offset(leaf, extent);
38c227d8 2224 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
e68afa49 2225
38c227d8
LB
2226 if (extent_offset >= old->extent_offset + old->offset +
2227 old->len || extent_offset + num_bytes <=
2228 old->extent_offset + old->offset)
2229 continue;
38c227d8
LB
2230 break;
2231 }
2232
2233 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2234 if (!backref) {
2235 ret = -ENOENT;
2236 goto out;
2237 }
2238
2239 backref->root_id = root_id;
2240 backref->inum = inum;
e68afa49 2241 backref->file_pos = offset;
38c227d8
LB
2242 backref->num_bytes = num_bytes;
2243 backref->extent_offset = extent_offset;
2244 backref->generation = btrfs_file_extent_generation(leaf, extent);
2245 backref->old = old;
2246 backref_insert(&new->root, backref);
2247 old->count++;
2248out:
2249 btrfs_release_path(path);
2250 WARN_ON(ret);
2251 return ret;
2252}
2253
2254static noinline bool record_extent_backrefs(struct btrfs_path *path,
2255 struct new_sa_defrag_extent *new)
2256{
2257 struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2258 struct old_sa_defrag_extent *old, *tmp;
2259 int ret;
2260
2261 new->path = path;
2262
2263 list_for_each_entry_safe(old, tmp, &new->head, list) {
e68afa49
LB
2264 ret = iterate_inodes_from_logical(old->bytenr +
2265 old->extent_offset, fs_info,
38c227d8
LB
2266 path, record_one_backref,
2267 old);
4724b106
JB
2268 if (ret < 0 && ret != -ENOENT)
2269 return false;
38c227d8
LB
2270
2271 /* no backref to be processed for this extent */
2272 if (!old->count) {
2273 list_del(&old->list);
2274 kfree(old);
2275 }
2276 }
2277
2278 if (list_empty(&new->head))
2279 return false;
2280
2281 return true;
2282}
2283
2284static int relink_is_mergable(struct extent_buffer *leaf,
2285 struct btrfs_file_extent_item *fi,
116e0024 2286 struct new_sa_defrag_extent *new)
38c227d8 2287{
116e0024 2288 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
38c227d8
LB
2289 return 0;
2290
2291 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2292 return 0;
2293
116e0024
LB
2294 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2295 return 0;
2296
2297 if (btrfs_file_extent_encryption(leaf, fi) ||
38c227d8
LB
2298 btrfs_file_extent_other_encoding(leaf, fi))
2299 return 0;
2300
2301 return 1;
2302}
2303
2304/*
2305 * Note the backref might has changed, and in this case we just return 0.
2306 */
2307static noinline int relink_extent_backref(struct btrfs_path *path,
2308 struct sa_defrag_extent_backref *prev,
2309 struct sa_defrag_extent_backref *backref)
2310{
2311 struct btrfs_file_extent_item *extent;
2312 struct btrfs_file_extent_item *item;
2313 struct btrfs_ordered_extent *ordered;
2314 struct btrfs_trans_handle *trans;
2315 struct btrfs_fs_info *fs_info;
2316 struct btrfs_root *root;
2317 struct btrfs_key key;
2318 struct extent_buffer *leaf;
2319 struct old_sa_defrag_extent *old = backref->old;
2320 struct new_sa_defrag_extent *new = old->new;
2321 struct inode *src_inode = new->inode;
2322 struct inode *inode;
2323 struct extent_state *cached = NULL;
2324 int ret = 0;
2325 u64 start;
2326 u64 len;
2327 u64 lock_start;
2328 u64 lock_end;
2329 bool merge = false;
2330 int index;
2331
2332 if (prev && prev->root_id == backref->root_id &&
2333 prev->inum == backref->inum &&
2334 prev->file_pos + prev->num_bytes == backref->file_pos)
2335 merge = true;
2336
2337 /* step 1: get root */
2338 key.objectid = backref->root_id;
2339 key.type = BTRFS_ROOT_ITEM_KEY;
2340 key.offset = (u64)-1;
2341
2342 fs_info = BTRFS_I(src_inode)->root->fs_info;
2343 index = srcu_read_lock(&fs_info->subvol_srcu);
2344
2345 root = btrfs_read_fs_root_no_name(fs_info, &key);
2346 if (IS_ERR(root)) {
2347 srcu_read_unlock(&fs_info->subvol_srcu, index);
2348 if (PTR_ERR(root) == -ENOENT)
2349 return 0;
2350 return PTR_ERR(root);
2351 }
38c227d8 2352
bcbba5e6
WS
2353 if (btrfs_root_readonly(root)) {
2354 srcu_read_unlock(&fs_info->subvol_srcu, index);
2355 return 0;
2356 }
2357
38c227d8
LB
2358 /* step 2: get inode */
2359 key.objectid = backref->inum;
2360 key.type = BTRFS_INODE_ITEM_KEY;
2361 key.offset = 0;
2362
2363 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2364 if (IS_ERR(inode)) {
2365 srcu_read_unlock(&fs_info->subvol_srcu, index);
2366 return 0;
2367 }
2368
2369 srcu_read_unlock(&fs_info->subvol_srcu, index);
2370
2371 /* step 3: relink backref */
2372 lock_start = backref->file_pos;
2373 lock_end = backref->file_pos + backref->num_bytes - 1;
2374 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2375 0, &cached);
2376
2377 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2378 if (ordered) {
2379 btrfs_put_ordered_extent(ordered);
2380 goto out_unlock;
2381 }
2382
2383 trans = btrfs_join_transaction(root);
2384 if (IS_ERR(trans)) {
2385 ret = PTR_ERR(trans);
2386 goto out_unlock;
2387 }
2388
2389 key.objectid = backref->inum;
2390 key.type = BTRFS_EXTENT_DATA_KEY;
2391 key.offset = backref->file_pos;
2392
2393 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2394 if (ret < 0) {
2395 goto out_free_path;
2396 } else if (ret > 0) {
2397 ret = 0;
2398 goto out_free_path;
2399 }
2400
2401 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2402 struct btrfs_file_extent_item);
2403
2404 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2405 backref->generation)
2406 goto out_free_path;
2407
2408 btrfs_release_path(path);
2409
2410 start = backref->file_pos;
2411 if (backref->extent_offset < old->extent_offset + old->offset)
2412 start += old->extent_offset + old->offset -
2413 backref->extent_offset;
2414
2415 len = min(backref->extent_offset + backref->num_bytes,
2416 old->extent_offset + old->offset + old->len);
2417 len -= max(backref->extent_offset, old->extent_offset + old->offset);
2418
2419 ret = btrfs_drop_extents(trans, root, inode, start,
2420 start + len, 1);
2421 if (ret)
2422 goto out_free_path;
2423again:
2424 key.objectid = btrfs_ino(inode);
2425 key.type = BTRFS_EXTENT_DATA_KEY;
2426 key.offset = start;
2427
a09a0a70 2428 path->leave_spinning = 1;
38c227d8
LB
2429 if (merge) {
2430 struct btrfs_file_extent_item *fi;
2431 u64 extent_len;
2432 struct btrfs_key found_key;
2433
3c9665df 2434 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
38c227d8
LB
2435 if (ret < 0)
2436 goto out_free_path;
2437
2438 path->slots[0]--;
2439 leaf = path->nodes[0];
2440 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2441
2442 fi = btrfs_item_ptr(leaf, path->slots[0],
2443 struct btrfs_file_extent_item);
2444 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2445
116e0024
LB
2446 if (extent_len + found_key.offset == start &&
2447 relink_is_mergable(leaf, fi, new)) {
38c227d8
LB
2448 btrfs_set_file_extent_num_bytes(leaf, fi,
2449 extent_len + len);
2450 btrfs_mark_buffer_dirty(leaf);
2451 inode_add_bytes(inode, len);
2452
2453 ret = 1;
2454 goto out_free_path;
2455 } else {
2456 merge = false;
2457 btrfs_release_path(path);
2458 goto again;
2459 }
2460 }
2461
2462 ret = btrfs_insert_empty_item(trans, root, path, &key,
2463 sizeof(*extent));
2464 if (ret) {
2465 btrfs_abort_transaction(trans, root, ret);
2466 goto out_free_path;
2467 }
2468
2469 leaf = path->nodes[0];
2470 item = btrfs_item_ptr(leaf, path->slots[0],
2471 struct btrfs_file_extent_item);
2472 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2473 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2474 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2475 btrfs_set_file_extent_num_bytes(leaf, item, len);
2476 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2477 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2478 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2479 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2480 btrfs_set_file_extent_encryption(leaf, item, 0);
2481 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2482
2483 btrfs_mark_buffer_dirty(leaf);
2484 inode_add_bytes(inode, len);
a09a0a70 2485 btrfs_release_path(path);
38c227d8
LB
2486
2487 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2488 new->disk_len, 0,
2489 backref->root_id, backref->inum,
2490 new->file_pos, 0); /* start - extent_offset */
2491 if (ret) {
2492 btrfs_abort_transaction(trans, root, ret);
2493 goto out_free_path;
2494 }
2495
2496 ret = 1;
2497out_free_path:
2498 btrfs_release_path(path);
a09a0a70 2499 path->leave_spinning = 0;
38c227d8
LB
2500 btrfs_end_transaction(trans, root);
2501out_unlock:
2502 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2503 &cached, GFP_NOFS);
2504 iput(inode);
2505 return ret;
2506}
2507
6f519564
LB
2508static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2509{
2510 struct old_sa_defrag_extent *old, *tmp;
2511
2512 if (!new)
2513 return;
2514
2515 list_for_each_entry_safe(old, tmp, &new->head, list) {
2516 list_del(&old->list);
2517 kfree(old);
2518 }
2519 kfree(new);
2520}
2521
38c227d8
LB
2522static void relink_file_extents(struct new_sa_defrag_extent *new)
2523{
2524 struct btrfs_path *path;
38c227d8
LB
2525 struct sa_defrag_extent_backref *backref;
2526 struct sa_defrag_extent_backref *prev = NULL;
2527 struct inode *inode;
2528 struct btrfs_root *root;
2529 struct rb_node *node;
2530 int ret;
2531
2532 inode = new->inode;
2533 root = BTRFS_I(inode)->root;
2534
2535 path = btrfs_alloc_path();
2536 if (!path)
2537 return;
2538
2539 if (!record_extent_backrefs(path, new)) {
2540 btrfs_free_path(path);
2541 goto out;
2542 }
2543 btrfs_release_path(path);
2544
2545 while (1) {
2546 node = rb_first(&new->root);
2547 if (!node)
2548 break;
2549 rb_erase(node, &new->root);
2550
2551 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2552
2553 ret = relink_extent_backref(path, prev, backref);
2554 WARN_ON(ret < 0);
2555
2556 kfree(prev);
2557
2558 if (ret == 1)
2559 prev = backref;
2560 else
2561 prev = NULL;
2562 cond_resched();
2563 }
2564 kfree(prev);
2565
2566 btrfs_free_path(path);
38c227d8 2567out:
6f519564
LB
2568 free_sa_defrag_extent(new);
2569
38c227d8
LB
2570 atomic_dec(&root->fs_info->defrag_running);
2571 wake_up(&root->fs_info->transaction_wait);
38c227d8
LB
2572}
2573
2574static struct new_sa_defrag_extent *
2575record_old_file_extents(struct inode *inode,
2576 struct btrfs_ordered_extent *ordered)
2577{
2578 struct btrfs_root *root = BTRFS_I(inode)->root;
2579 struct btrfs_path *path;
2580 struct btrfs_key key;
6f519564 2581 struct old_sa_defrag_extent *old;
38c227d8
LB
2582 struct new_sa_defrag_extent *new;
2583 int ret;
2584
2585 new = kmalloc(sizeof(*new), GFP_NOFS);
2586 if (!new)
2587 return NULL;
2588
2589 new->inode = inode;
2590 new->file_pos = ordered->file_offset;
2591 new->len = ordered->len;
2592 new->bytenr = ordered->start;
2593 new->disk_len = ordered->disk_len;
2594 new->compress_type = ordered->compress_type;
2595 new->root = RB_ROOT;
2596 INIT_LIST_HEAD(&new->head);
2597
2598 path = btrfs_alloc_path();
2599 if (!path)
2600 goto out_kfree;
2601
2602 key.objectid = btrfs_ino(inode);
2603 key.type = BTRFS_EXTENT_DATA_KEY;
2604 key.offset = new->file_pos;
2605
2606 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2607 if (ret < 0)
2608 goto out_free_path;
2609 if (ret > 0 && path->slots[0] > 0)
2610 path->slots[0]--;
2611
2612 /* find out all the old extents for the file range */
2613 while (1) {
2614 struct btrfs_file_extent_item *extent;
2615 struct extent_buffer *l;
2616 int slot;
2617 u64 num_bytes;
2618 u64 offset;
2619 u64 end;
2620 u64 disk_bytenr;
2621 u64 extent_offset;
2622
2623 l = path->nodes[0];
2624 slot = path->slots[0];
2625
2626 if (slot >= btrfs_header_nritems(l)) {
2627 ret = btrfs_next_leaf(root, path);
2628 if (ret < 0)
6f519564 2629 goto out_free_path;
38c227d8
LB
2630 else if (ret > 0)
2631 break;
2632 continue;
2633 }
2634
2635 btrfs_item_key_to_cpu(l, &key, slot);
2636
2637 if (key.objectid != btrfs_ino(inode))
2638 break;
2639 if (key.type != BTRFS_EXTENT_DATA_KEY)
2640 break;
2641 if (key.offset >= new->file_pos + new->len)
2642 break;
2643
2644 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2645
2646 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2647 if (key.offset + num_bytes < new->file_pos)
2648 goto next;
2649
2650 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2651 if (!disk_bytenr)
2652 goto next;
2653
2654 extent_offset = btrfs_file_extent_offset(l, extent);
2655
2656 old = kmalloc(sizeof(*old), GFP_NOFS);
2657 if (!old)
6f519564 2658 goto out_free_path;
38c227d8
LB
2659
2660 offset = max(new->file_pos, key.offset);
2661 end = min(new->file_pos + new->len, key.offset + num_bytes);
2662
2663 old->bytenr = disk_bytenr;
2664 old->extent_offset = extent_offset;
2665 old->offset = offset - key.offset;
2666 old->len = end - offset;
2667 old->new = new;
2668 old->count = 0;
2669 list_add_tail(&old->list, &new->head);
2670next:
2671 path->slots[0]++;
2672 cond_resched();
2673 }
2674
2675 btrfs_free_path(path);
2676 atomic_inc(&root->fs_info->defrag_running);
2677
2678 return new;
2679
38c227d8
LB
2680out_free_path:
2681 btrfs_free_path(path);
2682out_kfree:
6f519564 2683 free_sa_defrag_extent(new);
38c227d8
LB
2684 return NULL;
2685}
2686
e570fd27
MX
2687static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2688 u64 start, u64 len)
2689{
2690 struct btrfs_block_group_cache *cache;
2691
2692 cache = btrfs_lookup_block_group(root->fs_info, start);
2693 ASSERT(cache);
2694
2695 spin_lock(&cache->lock);
2696 cache->delalloc_bytes -= len;
2697 spin_unlock(&cache->lock);
2698
2699 btrfs_put_block_group(cache);
2700}
2701
d352ac68
CM
2702/* as ordered data IO finishes, this gets called so we can finish
2703 * an ordered extent if the range of bytes in the file it covers are
2704 * fully written.
2705 */
5fd02043 2706static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
e6dcd2dc 2707{
5fd02043 2708 struct inode *inode = ordered_extent->inode;
e6dcd2dc 2709 struct btrfs_root *root = BTRFS_I(inode)->root;
0ca1f7ce 2710 struct btrfs_trans_handle *trans = NULL;
e6dcd2dc 2711 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2ac55d41 2712 struct extent_state *cached_state = NULL;
38c227d8 2713 struct new_sa_defrag_extent *new = NULL;
261507a0 2714 int compress_type = 0;
77cef2ec
JB
2715 int ret = 0;
2716 u64 logical_len = ordered_extent->len;
82d5902d 2717 bool nolock;
77cef2ec 2718 bool truncated = false;
e6dcd2dc 2719
83eea1f1 2720 nolock = btrfs_is_free_space_inode(inode);
0cb59c99 2721
5fd02043
JB
2722 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2723 ret = -EIO;
2724 goto out;
2725 }
2726
f612496b
MX
2727 btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2728 ordered_extent->file_offset +
2729 ordered_extent->len - 1);
2730
77cef2ec
JB
2731 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2732 truncated = true;
2733 logical_len = ordered_extent->truncated_len;
2734 /* Truncated the entire extent, don't bother adding */
2735 if (!logical_len)
2736 goto out;
2737 }
2738
c2167754 2739 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
79787eaa 2740 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
6c760c07
JB
2741 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2742 if (nolock)
2743 trans = btrfs_join_transaction_nolock(root);
2744 else
2745 trans = btrfs_join_transaction(root);
2746 if (IS_ERR(trans)) {
2747 ret = PTR_ERR(trans);
2748 trans = NULL;
2749 goto out;
c2167754 2750 }
6c760c07
JB
2751 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2752 ret = btrfs_update_inode_fallback(trans, root, inode);
2753 if (ret) /* -ENOMEM or corruption */
2754 btrfs_abort_transaction(trans, root, ret);
c2167754
YZ
2755 goto out;
2756 }
e6dcd2dc 2757
2ac55d41
JB
2758 lock_extent_bits(io_tree, ordered_extent->file_offset,
2759 ordered_extent->file_offset + ordered_extent->len - 1,
d0082371 2760 0, &cached_state);
e6dcd2dc 2761
38c227d8
LB
2762 ret = test_range_bit(io_tree, ordered_extent->file_offset,
2763 ordered_extent->file_offset + ordered_extent->len - 1,
2764 EXTENT_DEFRAG, 1, cached_state);
2765 if (ret) {
2766 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
8101c8db 2767 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
38c227d8
LB
2768 /* the inode is shared */
2769 new = record_old_file_extents(inode, ordered_extent);
2770
2771 clear_extent_bit(io_tree, ordered_extent->file_offset,
2772 ordered_extent->file_offset + ordered_extent->len - 1,
2773 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2774 }
2775
0cb59c99 2776 if (nolock)
7a7eaa40 2777 trans = btrfs_join_transaction_nolock(root);
0cb59c99 2778 else
7a7eaa40 2779 trans = btrfs_join_transaction(root);
79787eaa
JM
2780 if (IS_ERR(trans)) {
2781 ret = PTR_ERR(trans);
2782 trans = NULL;
2783 goto out_unlock;
2784 }
a79b7d4b 2785
0ca1f7ce 2786 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
c2167754 2787
c8b97818 2788 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
261507a0 2789 compress_type = ordered_extent->compress_type;
d899e052 2790 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
261507a0 2791 BUG_ON(compress_type);
920bbbfb 2792 ret = btrfs_mark_extent_written(trans, inode,
d899e052
YZ
2793 ordered_extent->file_offset,
2794 ordered_extent->file_offset +
77cef2ec 2795 logical_len);
d899e052 2796 } else {
0af3d00b 2797 BUG_ON(root == root->fs_info->tree_root);
d899e052
YZ
2798 ret = insert_reserved_file_extent(trans, inode,
2799 ordered_extent->file_offset,
2800 ordered_extent->start,
2801 ordered_extent->disk_len,
77cef2ec 2802 logical_len, logical_len,
261507a0 2803 compress_type, 0, 0,
d899e052 2804 BTRFS_FILE_EXTENT_REG);
e570fd27
MX
2805 if (!ret)
2806 btrfs_release_delalloc_bytes(root,
2807 ordered_extent->start,
2808 ordered_extent->disk_len);
d899e052 2809 }
5dc562c5
JB
2810 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2811 ordered_extent->file_offset, ordered_extent->len,
2812 trans->transid);
79787eaa
JM
2813 if (ret < 0) {
2814 btrfs_abort_transaction(trans, root, ret);
5fd02043 2815 goto out_unlock;
79787eaa 2816 }
2ac55d41 2817
e6dcd2dc
CM
2818 add_pending_csums(trans, inode, ordered_extent->file_offset,
2819 &ordered_extent->list);
2820
6c760c07
JB
2821 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2822 ret = btrfs_update_inode_fallback(trans, root, inode);
2823 if (ret) { /* -ENOMEM or corruption */
2824 btrfs_abort_transaction(trans, root, ret);
2825 goto out_unlock;
1ef30be1
JB
2826 }
2827 ret = 0;
5fd02043
JB
2828out_unlock:
2829 unlock_extent_cached(io_tree, ordered_extent->file_offset,
2830 ordered_extent->file_offset +
2831 ordered_extent->len - 1, &cached_state, GFP_NOFS);
c2167754 2832out:
5b0e95bf 2833 if (root != root->fs_info->tree_root)
0cb59c99 2834 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
a698d075
MX
2835 if (trans)
2836 btrfs_end_transaction(trans, root);
0cb59c99 2837
77cef2ec
JB
2838 if (ret || truncated) {
2839 u64 start, end;
2840
2841 if (truncated)
2842 start = ordered_extent->file_offset + logical_len;
2843 else
2844 start = ordered_extent->file_offset;
2845 end = ordered_extent->file_offset + ordered_extent->len - 1;
2846 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2847
2848 /* Drop the cache for the part of the extent we didn't write. */
2849 btrfs_drop_extent_cache(inode, start, end, 0);
5fd02043 2850
0bec9ef5
JB
2851 /*
2852 * If the ordered extent had an IOERR or something else went
2853 * wrong we need to return the space for this ordered extent
77cef2ec
JB
2854 * back to the allocator. We only free the extent in the
2855 * truncated case if we didn't write out the extent at all.
0bec9ef5 2856 */
77cef2ec
JB
2857 if ((ret || !logical_len) &&
2858 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
0bec9ef5
JB
2859 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2860 btrfs_free_reserved_extent(root, ordered_extent->start,
e570fd27 2861 ordered_extent->disk_len, 1);
0bec9ef5
JB
2862 }
2863
2864
5fd02043 2865 /*
8bad3c02
LB
2866 * This needs to be done to make sure anybody waiting knows we are done
2867 * updating everything for this ordered extent.
5fd02043
JB
2868 */
2869 btrfs_remove_ordered_extent(inode, ordered_extent);
2870
38c227d8 2871 /* for snapshot-aware defrag */
6f519564
LB
2872 if (new) {
2873 if (ret) {
2874 free_sa_defrag_extent(new);
2875 atomic_dec(&root->fs_info->defrag_running);
2876 } else {
2877 relink_file_extents(new);
2878 }
2879 }
38c227d8 2880
e6dcd2dc
CM
2881 /* once for us */
2882 btrfs_put_ordered_extent(ordered_extent);
2883 /* once for the tree */
2884 btrfs_put_ordered_extent(ordered_extent);
2885
5fd02043
JB
2886 return ret;
2887}
2888
2889static void finish_ordered_fn(struct btrfs_work *work)
2890{
2891 struct btrfs_ordered_extent *ordered_extent;
2892 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2893 btrfs_finish_ordered_io(ordered_extent);
e6dcd2dc
CM
2894}
2895
b2950863 2896static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
211f90e6
CM
2897 struct extent_state *state, int uptodate)
2898{
5fd02043
JB
2899 struct inode *inode = page->mapping->host;
2900 struct btrfs_root *root = BTRFS_I(inode)->root;
2901 struct btrfs_ordered_extent *ordered_extent = NULL;
9e0af237
LB
2902 struct btrfs_workqueue *wq;
2903 btrfs_work_func_t func;
5fd02043 2904
1abe9b8a 2905 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2906
8b62b72b 2907 ClearPagePrivate2(page);
5fd02043
JB
2908 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2909 end - start + 1, uptodate))
2910 return 0;
2911
9e0af237
LB
2912 if (btrfs_is_free_space_inode(inode)) {
2913 wq = root->fs_info->endio_freespace_worker;
2914 func = btrfs_freespace_write_helper;
2915 } else {
2916 wq = root->fs_info->endio_write_workers;
2917 func = btrfs_endio_write_helper;
2918 }
5fd02043 2919
9e0af237
LB
2920 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
2921 NULL);
2922 btrfs_queue_work(wq, &ordered_extent->work);
5fd02043
JB
2923
2924 return 0;
211f90e6
CM
2925}
2926
dc380aea
MX
2927static int __readpage_endio_check(struct inode *inode,
2928 struct btrfs_io_bio *io_bio,
2929 int icsum, struct page *page,
2930 int pgoff, u64 start, size_t len)
2931{
2932 char *kaddr;
2933 u32 csum_expected;
2934 u32 csum = ~(u32)0;
2935 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2936 DEFAULT_RATELIMIT_BURST);
2937
2938 csum_expected = *(((u32 *)io_bio->csum) + icsum);
2939
2940 kaddr = kmap_atomic(page);
2941 csum = btrfs_csum_data(kaddr + pgoff, csum, len);
2942 btrfs_csum_final(csum, (char *)&csum);
2943 if (csum != csum_expected)
2944 goto zeroit;
2945
2946 kunmap_atomic(kaddr);
2947 return 0;
2948zeroit:
2949 if (__ratelimit(&_rs))
2950 btrfs_info(BTRFS_I(inode)->root->fs_info,
2951 "csum failed ino %llu off %llu csum %u expected csum %u",
2952 btrfs_ino(inode), start, csum, csum_expected);
2953 memset(kaddr + pgoff, 1, len);
2954 flush_dcache_page(page);
2955 kunmap_atomic(kaddr);
2956 if (csum_expected == 0)
2957 return 0;
2958 return -EIO;
2959}
2960
d352ac68
CM
2961/*
2962 * when reads are done, we need to check csums to verify the data is correct
4a54c8c1
JS
2963 * if there's a match, we allow the bio to finish. If not, the code in
2964 * extent_io.c will try to find good copies for us.
d352ac68 2965 */
facc8a22
MX
2966static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2967 u64 phy_offset, struct page *page,
2968 u64 start, u64 end, int mirror)
07157aac 2969{
4eee4fa4 2970 size_t offset = start - page_offset(page);
07157aac 2971 struct inode *inode = page->mapping->host;
d1310b2e 2972 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
ff79f819 2973 struct btrfs_root *root = BTRFS_I(inode)->root;
d1310b2e 2974
d20f7043
CM
2975 if (PageChecked(page)) {
2976 ClearPageChecked(page);
dc380aea 2977 return 0;
d20f7043 2978 }
6cbff00f
CH
2979
2980 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
dc380aea 2981 return 0;
17d217fe
YZ
2982
2983 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
9655d298 2984 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
17d217fe
YZ
2985 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2986 GFP_NOFS);
b6cda9bc 2987 return 0;
17d217fe 2988 }
d20f7043 2989
facc8a22 2990 phy_offset >>= inode->i_sb->s_blocksize_bits;
dc380aea
MX
2991 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
2992 start, (size_t)(end - start + 1));
07157aac 2993}
b888db2b 2994
24bbcf04
YZ
2995struct delayed_iput {
2996 struct list_head list;
2997 struct inode *inode;
2998};
2999
79787eaa
JM
3000/* JDM: If this is fs-wide, why can't we add a pointer to
3001 * btrfs_inode instead and avoid the allocation? */
24bbcf04
YZ
3002void btrfs_add_delayed_iput(struct inode *inode)
3003{
3004 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3005 struct delayed_iput *delayed;
3006
3007 if (atomic_add_unless(&inode->i_count, -1, 1))
3008 return;
3009
3010 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
3011 delayed->inode = inode;
3012
3013 spin_lock(&fs_info->delayed_iput_lock);
3014 list_add_tail(&delayed->list, &fs_info->delayed_iputs);
3015 spin_unlock(&fs_info->delayed_iput_lock);
3016}
3017
3018void btrfs_run_delayed_iputs(struct btrfs_root *root)
3019{
3020 LIST_HEAD(list);
3021 struct btrfs_fs_info *fs_info = root->fs_info;
3022 struct delayed_iput *delayed;
3023 int empty;
3024
3025 spin_lock(&fs_info->delayed_iput_lock);
3026 empty = list_empty(&fs_info->delayed_iputs);
3027 spin_unlock(&fs_info->delayed_iput_lock);
3028 if (empty)
3029 return;
3030
24bbcf04
YZ
3031 spin_lock(&fs_info->delayed_iput_lock);
3032 list_splice_init(&fs_info->delayed_iputs, &list);
3033 spin_unlock(&fs_info->delayed_iput_lock);
3034
3035 while (!list_empty(&list)) {
3036 delayed = list_entry(list.next, struct delayed_iput, list);
3037 list_del(&delayed->list);
3038 iput(delayed->inode);
3039 kfree(delayed);
3040 }
24bbcf04
YZ
3041}
3042
d68fc57b 3043/*
42b2aa86 3044 * This is called in transaction commit time. If there are no orphan
d68fc57b
YZ
3045 * files in the subvolume, it removes orphan item and frees block_rsv
3046 * structure.
3047 */
3048void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3049 struct btrfs_root *root)
3050{
90290e19 3051 struct btrfs_block_rsv *block_rsv;
d68fc57b
YZ
3052 int ret;
3053
8a35d95f 3054 if (atomic_read(&root->orphan_inodes) ||
d68fc57b
YZ
3055 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3056 return;
3057
90290e19 3058 spin_lock(&root->orphan_lock);
8a35d95f 3059 if (atomic_read(&root->orphan_inodes)) {
90290e19
JB
3060 spin_unlock(&root->orphan_lock);
3061 return;
3062 }
3063
3064 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3065 spin_unlock(&root->orphan_lock);
3066 return;
3067 }
3068
3069 block_rsv = root->orphan_block_rsv;
3070 root->orphan_block_rsv = NULL;
3071 spin_unlock(&root->orphan_lock);
3072
27cdeb70 3073 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
d68fc57b
YZ
3074 btrfs_root_refs(&root->root_item) > 0) {
3075 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
3076 root->root_key.objectid);
4ef31a45
JB
3077 if (ret)
3078 btrfs_abort_transaction(trans, root, ret);
3079 else
27cdeb70
MX
3080 clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3081 &root->state);
d68fc57b
YZ
3082 }
3083
90290e19
JB
3084 if (block_rsv) {
3085 WARN_ON(block_rsv->size > 0);
3086 btrfs_free_block_rsv(root, block_rsv);
d68fc57b
YZ
3087 }
3088}
3089
7b128766
JB
3090/*
3091 * This creates an orphan entry for the given inode in case something goes
3092 * wrong in the middle of an unlink/truncate.
d68fc57b
YZ
3093 *
3094 * NOTE: caller of this function should reserve 5 units of metadata for
3095 * this function.
7b128766
JB
3096 */
3097int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3098{
3099 struct btrfs_root *root = BTRFS_I(inode)->root;
d68fc57b
YZ
3100 struct btrfs_block_rsv *block_rsv = NULL;
3101 int reserve = 0;
3102 int insert = 0;
3103 int ret;
7b128766 3104
d68fc57b 3105 if (!root->orphan_block_rsv) {
66d8f3dd 3106 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
b532402e
TI
3107 if (!block_rsv)
3108 return -ENOMEM;
d68fc57b 3109 }
7b128766 3110
d68fc57b
YZ
3111 spin_lock(&root->orphan_lock);
3112 if (!root->orphan_block_rsv) {
3113 root->orphan_block_rsv = block_rsv;
3114 } else if (block_rsv) {
3115 btrfs_free_block_rsv(root, block_rsv);
3116 block_rsv = NULL;
7b128766 3117 }
7b128766 3118
8a35d95f
JB
3119 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3120 &BTRFS_I(inode)->runtime_flags)) {
d68fc57b
YZ
3121#if 0
3122 /*
3123 * For proper ENOSPC handling, we should do orphan
3124 * cleanup when mounting. But this introduces backward
3125 * compatibility issue.
3126 */
3127 if (!xchg(&root->orphan_item_inserted, 1))
3128 insert = 2;
3129 else
3130 insert = 1;
3131#endif
3132 insert = 1;
321f0e70 3133 atomic_inc(&root->orphan_inodes);
7b128766
JB
3134 }
3135
72ac3c0d
JB
3136 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3137 &BTRFS_I(inode)->runtime_flags))
d68fc57b 3138 reserve = 1;
d68fc57b 3139 spin_unlock(&root->orphan_lock);
7b128766 3140
d68fc57b
YZ
3141 /* grab metadata reservation from transaction handle */
3142 if (reserve) {
3143 ret = btrfs_orphan_reserve_metadata(trans, inode);
79787eaa 3144 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
d68fc57b 3145 }
7b128766 3146
d68fc57b
YZ
3147 /* insert an orphan item to track this unlinked/truncated file */
3148 if (insert >= 1) {
33345d01 3149 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
4ef31a45 3150 if (ret) {
703c88e0 3151 atomic_dec(&root->orphan_inodes);
4ef31a45
JB
3152 if (reserve) {
3153 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3154 &BTRFS_I(inode)->runtime_flags);
3155 btrfs_orphan_release_metadata(inode);
3156 }
3157 if (ret != -EEXIST) {
e8e7cff6
JB
3158 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3159 &BTRFS_I(inode)->runtime_flags);
4ef31a45
JB
3160 btrfs_abort_transaction(trans, root, ret);
3161 return ret;
3162 }
79787eaa
JM
3163 }
3164 ret = 0;
d68fc57b
YZ
3165 }
3166
3167 /* insert an orphan item to track subvolume contains orphan files */
3168 if (insert >= 2) {
3169 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3170 root->root_key.objectid);
79787eaa
JM
3171 if (ret && ret != -EEXIST) {
3172 btrfs_abort_transaction(trans, root, ret);
3173 return ret;
3174 }
d68fc57b
YZ
3175 }
3176 return 0;
7b128766
JB
3177}
3178
3179/*
3180 * We have done the truncate/delete so we can go ahead and remove the orphan
3181 * item for this particular inode.
3182 */
48a3b636
ES
3183static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3184 struct inode *inode)
7b128766
JB
3185{
3186 struct btrfs_root *root = BTRFS_I(inode)->root;
d68fc57b
YZ
3187 int delete_item = 0;
3188 int release_rsv = 0;
7b128766
JB
3189 int ret = 0;
3190
d68fc57b 3191 spin_lock(&root->orphan_lock);
8a35d95f
JB
3192 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3193 &BTRFS_I(inode)->runtime_flags))
d68fc57b 3194 delete_item = 1;
7b128766 3195
72ac3c0d
JB
3196 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3197 &BTRFS_I(inode)->runtime_flags))
d68fc57b 3198 release_rsv = 1;
d68fc57b 3199 spin_unlock(&root->orphan_lock);
7b128766 3200
703c88e0 3201 if (delete_item) {
8a35d95f 3202 atomic_dec(&root->orphan_inodes);
703c88e0
FDBM
3203 if (trans)
3204 ret = btrfs_del_orphan_item(trans, root,
3205 btrfs_ino(inode));
8a35d95f 3206 }
7b128766 3207
703c88e0
FDBM
3208 if (release_rsv)
3209 btrfs_orphan_release_metadata(inode);
3210
4ef31a45 3211 return ret;
7b128766
JB
3212}
3213
3214/*
3215 * this cleans up any orphans that may be left on the list from the last use
3216 * of this root.
3217 */
66b4ffd1 3218int btrfs_orphan_cleanup(struct btrfs_root *root)
7b128766
JB
3219{
3220 struct btrfs_path *path;
3221 struct extent_buffer *leaf;
7b128766
JB
3222 struct btrfs_key key, found_key;
3223 struct btrfs_trans_handle *trans;
3224 struct inode *inode;
8f6d7f4f 3225 u64 last_objectid = 0;
7b128766
JB
3226 int ret = 0, nr_unlink = 0, nr_truncate = 0;
3227
d68fc57b 3228 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
66b4ffd1 3229 return 0;
c71bf099
YZ
3230
3231 path = btrfs_alloc_path();
66b4ffd1
JB
3232 if (!path) {
3233 ret = -ENOMEM;
3234 goto out;
3235 }
7b128766
JB
3236 path->reada = -1;
3237
3238 key.objectid = BTRFS_ORPHAN_OBJECTID;
962a298f 3239 key.type = BTRFS_ORPHAN_ITEM_KEY;
7b128766
JB
3240 key.offset = (u64)-1;
3241
7b128766
JB
3242 while (1) {
3243 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
66b4ffd1
JB
3244 if (ret < 0)
3245 goto out;
7b128766
JB
3246
3247 /*
3248 * if ret == 0 means we found what we were searching for, which
25985edc 3249 * is weird, but possible, so only screw with path if we didn't
7b128766
JB
3250 * find the key and see if we have stuff that matches
3251 */
3252 if (ret > 0) {
66b4ffd1 3253 ret = 0;
7b128766
JB
3254 if (path->slots[0] == 0)
3255 break;
3256 path->slots[0]--;
3257 }
3258
3259 /* pull out the item */
3260 leaf = path->nodes[0];
7b128766
JB
3261 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3262
3263 /* make sure the item matches what we want */
3264 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3265 break;
962a298f 3266 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
7b128766
JB
3267 break;
3268
3269 /* release the path since we're done with it */
b3b4aa74 3270 btrfs_release_path(path);
7b128766
JB
3271
3272 /*
3273 * this is where we are basically btrfs_lookup, without the
3274 * crossing root thing. we store the inode number in the
3275 * offset of the orphan item.
3276 */
8f6d7f4f
JB
3277
3278 if (found_key.offset == last_objectid) {
c2cf52eb
SK
3279 btrfs_err(root->fs_info,
3280 "Error removing orphan entry, stopping orphan cleanup");
8f6d7f4f
JB
3281 ret = -EINVAL;
3282 goto out;
3283 }
3284
3285 last_objectid = found_key.offset;
3286
5d4f98a2
YZ
3287 found_key.objectid = found_key.offset;
3288 found_key.type = BTRFS_INODE_ITEM_KEY;
3289 found_key.offset = 0;
73f73415 3290 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
8c6ffba0 3291 ret = PTR_ERR_OR_ZERO(inode);
a8c9e576 3292 if (ret && ret != -ESTALE)
66b4ffd1 3293 goto out;
7b128766 3294
f8e9e0b0
AJ
3295 if (ret == -ESTALE && root == root->fs_info->tree_root) {
3296 struct btrfs_root *dead_root;
3297 struct btrfs_fs_info *fs_info = root->fs_info;
3298 int is_dead_root = 0;
3299
3300 /*
3301 * this is an orphan in the tree root. Currently these
3302 * could come from 2 sources:
3303 * a) a snapshot deletion in progress
3304 * b) a free space cache inode
3305 * We need to distinguish those two, as the snapshot
3306 * orphan must not get deleted.
3307 * find_dead_roots already ran before us, so if this
3308 * is a snapshot deletion, we should find the root
3309 * in the dead_roots list
3310 */
3311 spin_lock(&fs_info->trans_lock);
3312 list_for_each_entry(dead_root, &fs_info->dead_roots,
3313 root_list) {
3314 if (dead_root->root_key.objectid ==
3315 found_key.objectid) {
3316 is_dead_root = 1;
3317 break;
3318 }
3319 }
3320 spin_unlock(&fs_info->trans_lock);
3321 if (is_dead_root) {
3322 /* prevent this orphan from being found again */
3323 key.offset = found_key.objectid - 1;
3324 continue;
3325 }
3326 }
7b128766 3327 /*
a8c9e576
JB
3328 * Inode is already gone but the orphan item is still there,
3329 * kill the orphan item.
7b128766 3330 */
a8c9e576
JB
3331 if (ret == -ESTALE) {
3332 trans = btrfs_start_transaction(root, 1);
66b4ffd1
JB
3333 if (IS_ERR(trans)) {
3334 ret = PTR_ERR(trans);
3335 goto out;
3336 }
c2cf52eb
SK
3337 btrfs_debug(root->fs_info, "auto deleting %Lu",
3338 found_key.objectid);
a8c9e576
JB
3339 ret = btrfs_del_orphan_item(trans, root,
3340 found_key.objectid);
5b21f2ed 3341 btrfs_end_transaction(trans, root);
4ef31a45
JB
3342 if (ret)
3343 goto out;
7b128766
JB
3344 continue;
3345 }
3346
a8c9e576
JB
3347 /*
3348 * add this inode to the orphan list so btrfs_orphan_del does
3349 * the proper thing when we hit it
3350 */
8a35d95f
JB
3351 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3352 &BTRFS_I(inode)->runtime_flags);
925396ec 3353 atomic_inc(&root->orphan_inodes);
a8c9e576 3354
7b128766
JB
3355 /* if we have links, this was a truncate, lets do that */
3356 if (inode->i_nlink) {
fae7f21c 3357 if (WARN_ON(!S_ISREG(inode->i_mode))) {
a41ad394
JB
3358 iput(inode);
3359 continue;
3360 }
7b128766 3361 nr_truncate++;
f3fe820c
JB
3362
3363 /* 1 for the orphan item deletion. */
3364 trans = btrfs_start_transaction(root, 1);
3365 if (IS_ERR(trans)) {
c69b26b0 3366 iput(inode);
f3fe820c
JB
3367 ret = PTR_ERR(trans);
3368 goto out;
3369 }
3370 ret = btrfs_orphan_add(trans, inode);
3371 btrfs_end_transaction(trans, root);
c69b26b0
JB
3372 if (ret) {
3373 iput(inode);
f3fe820c 3374 goto out;
c69b26b0 3375 }
f3fe820c 3376
66b4ffd1 3377 ret = btrfs_truncate(inode);
4a7d0f68
JB
3378 if (ret)
3379 btrfs_orphan_del(NULL, inode);
7b128766
JB
3380 } else {
3381 nr_unlink++;
3382 }
3383
3384 /* this will do delete_inode and everything for us */
3385 iput(inode);
66b4ffd1
JB
3386 if (ret)
3387 goto out;
7b128766 3388 }
3254c876
MX
3389 /* release the path since we're done with it */
3390 btrfs_release_path(path);
3391
d68fc57b
YZ
3392 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3393
3394 if (root->orphan_block_rsv)
3395 btrfs_block_rsv_release(root, root->orphan_block_rsv,
3396 (u64)-1);
3397
27cdeb70
MX
3398 if (root->orphan_block_rsv ||
3399 test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
7a7eaa40 3400 trans = btrfs_join_transaction(root);
66b4ffd1
JB
3401 if (!IS_ERR(trans))
3402 btrfs_end_transaction(trans, root);
d68fc57b 3403 }
7b128766
JB
3404
3405 if (nr_unlink)
4884b476 3406 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
7b128766 3407 if (nr_truncate)
4884b476 3408 btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
66b4ffd1
JB
3409
3410out:
3411 if (ret)
c2cf52eb
SK
3412 btrfs_crit(root->fs_info,
3413 "could not do orphan cleanup %d", ret);
66b4ffd1
JB
3414 btrfs_free_path(path);
3415 return ret;
7b128766
JB
3416}
3417
46a53cca
CM
3418/*
3419 * very simple check to peek ahead in the leaf looking for xattrs. If we
3420 * don't find any xattrs, we know there can't be any acls.
3421 *
3422 * slot is the slot the inode is in, objectid is the objectid of the inode
3423 */
3424static noinline int acls_after_inode_item(struct extent_buffer *leaf,
63541927
FDBM
3425 int slot, u64 objectid,
3426 int *first_xattr_slot)
46a53cca
CM
3427{
3428 u32 nritems = btrfs_header_nritems(leaf);
3429 struct btrfs_key found_key;
f23b5a59
JB
3430 static u64 xattr_access = 0;
3431 static u64 xattr_default = 0;
46a53cca
CM
3432 int scanned = 0;
3433
f23b5a59
JB
3434 if (!xattr_access) {
3435 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3436 strlen(POSIX_ACL_XATTR_ACCESS));
3437 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3438 strlen(POSIX_ACL_XATTR_DEFAULT));
3439 }
3440
46a53cca 3441 slot++;
63541927 3442 *first_xattr_slot = -1;
46a53cca
CM
3443 while (slot < nritems) {
3444 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3445
3446 /* we found a different objectid, there must not be acls */
3447 if (found_key.objectid != objectid)
3448 return 0;
3449
3450 /* we found an xattr, assume we've got an acl */
f23b5a59 3451 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
63541927
FDBM
3452 if (*first_xattr_slot == -1)
3453 *first_xattr_slot = slot;
f23b5a59
JB
3454 if (found_key.offset == xattr_access ||
3455 found_key.offset == xattr_default)
3456 return 1;
3457 }
46a53cca
CM
3458
3459 /*
3460 * we found a key greater than an xattr key, there can't
3461 * be any acls later on
3462 */
3463 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3464 return 0;
3465
3466 slot++;
3467 scanned++;
3468
3469 /*
3470 * it goes inode, inode backrefs, xattrs, extents,
3471 * so if there are a ton of hard links to an inode there can
3472 * be a lot of backrefs. Don't waste time searching too hard,
3473 * this is just an optimization
3474 */
3475 if (scanned >= 8)
3476 break;
3477 }
3478 /* we hit the end of the leaf before we found an xattr or
3479 * something larger than an xattr. We have to assume the inode
3480 * has acls
3481 */
63541927
FDBM
3482 if (*first_xattr_slot == -1)
3483 *first_xattr_slot = slot;
46a53cca
CM
3484 return 1;
3485}
3486
d352ac68
CM
3487/*
3488 * read an inode from the btree into the in-memory inode
3489 */
5d4f98a2 3490static void btrfs_read_locked_inode(struct inode *inode)
39279cc3
CM
3491{
3492 struct btrfs_path *path;
5f39d397 3493 struct extent_buffer *leaf;
39279cc3 3494 struct btrfs_inode_item *inode_item;
0b86a832 3495 struct btrfs_timespec *tspec;
39279cc3
CM
3496 struct btrfs_root *root = BTRFS_I(inode)->root;
3497 struct btrfs_key location;
67de1176 3498 unsigned long ptr;
46a53cca 3499 int maybe_acls;
618e21d5 3500 u32 rdev;
39279cc3 3501 int ret;
2f7e33d4 3502 bool filled = false;
63541927 3503 int first_xattr_slot;
2f7e33d4
MX
3504
3505 ret = btrfs_fill_inode(inode, &rdev);
3506 if (!ret)
3507 filled = true;
39279cc3
CM
3508
3509 path = btrfs_alloc_path();
1748f843
MF
3510 if (!path)
3511 goto make_bad;
3512
39279cc3 3513 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
dc17ff8f 3514
39279cc3 3515 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
5f39d397 3516 if (ret)
39279cc3 3517 goto make_bad;
39279cc3 3518
5f39d397 3519 leaf = path->nodes[0];
2f7e33d4
MX
3520
3521 if (filled)
67de1176 3522 goto cache_index;
2f7e33d4 3523
5f39d397
CM
3524 inode_item = btrfs_item_ptr(leaf, path->slots[0],
3525 struct btrfs_inode_item);
5f39d397 3526 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
bfe86848 3527 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
2f2f43d3
EB
3528 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3529 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
dbe674a9 3530 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
5f39d397
CM
3531
3532 tspec = btrfs_inode_atime(inode_item);
3533 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3534 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3535
3536 tspec = btrfs_inode_mtime(inode_item);
3537 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3538 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3539
3540 tspec = btrfs_inode_ctime(inode_item);
3541 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3542 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3543
a76a3cd4 3544 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
e02119d5 3545 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
5dc562c5
JB
3546 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3547
3548 /*
3549 * If we were modified in the current generation and evicted from memory
3550 * and then re-read we need to do a full sync since we don't have any
3551 * idea about which extents were modified before we were evicted from
3552 * cache.
3553 */
3554 if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3555 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3556 &BTRFS_I(inode)->runtime_flags);
3557
0c4d2d95 3558 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
e02119d5 3559 inode->i_generation = BTRFS_I(inode)->generation;
618e21d5 3560 inode->i_rdev = 0;
5f39d397
CM
3561 rdev = btrfs_inode_rdev(leaf, inode_item);
3562
aec7477b 3563 BTRFS_I(inode)->index_cnt = (u64)-1;
d2fb3437 3564 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
67de1176
MX
3565
3566cache_index:
3567 path->slots[0]++;
3568 if (inode->i_nlink != 1 ||
3569 path->slots[0] >= btrfs_header_nritems(leaf))
3570 goto cache_acl;
3571
3572 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3573 if (location.objectid != btrfs_ino(inode))
3574 goto cache_acl;
3575
3576 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3577 if (location.type == BTRFS_INODE_REF_KEY) {
3578 struct btrfs_inode_ref *ref;
3579
3580 ref = (struct btrfs_inode_ref *)ptr;
3581 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3582 } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3583 struct btrfs_inode_extref *extref;
3584
3585 extref = (struct btrfs_inode_extref *)ptr;
3586 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3587 extref);
3588 }
2f7e33d4 3589cache_acl:
46a53cca
CM
3590 /*
3591 * try to precache a NULL acl entry for files that don't have
3592 * any xattrs or acls
3593 */
33345d01 3594 maybe_acls = acls_after_inode_item(leaf, path->slots[0],
63541927
FDBM
3595 btrfs_ino(inode), &first_xattr_slot);
3596 if (first_xattr_slot != -1) {
3597 path->slots[0] = first_xattr_slot;
3598 ret = btrfs_load_inode_props(inode, path);
3599 if (ret)
3600 btrfs_err(root->fs_info,
351fd353 3601 "error loading props for ino %llu (root %llu): %d",
63541927
FDBM
3602 btrfs_ino(inode),
3603 root->root_key.objectid, ret);
3604 }
3605 btrfs_free_path(path);
3606
72c04902
AV
3607 if (!maybe_acls)
3608 cache_no_acl(inode);
46a53cca 3609
39279cc3 3610 switch (inode->i_mode & S_IFMT) {
39279cc3
CM
3611 case S_IFREG:
3612 inode->i_mapping->a_ops = &btrfs_aops;
04160088 3613 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
d1310b2e 3614 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
39279cc3
CM
3615 inode->i_fop = &btrfs_file_operations;
3616 inode->i_op = &btrfs_file_inode_operations;
3617 break;
3618 case S_IFDIR:
3619 inode->i_fop = &btrfs_dir_file_operations;
3620 if (root == root->fs_info->tree_root)
3621 inode->i_op = &btrfs_dir_ro_inode_operations;
3622 else
3623 inode->i_op = &btrfs_dir_inode_operations;
3624 break;
3625 case S_IFLNK:
3626 inode->i_op = &btrfs_symlink_inode_operations;
3627 inode->i_mapping->a_ops = &btrfs_symlink_aops;
04160088 3628 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
39279cc3 3629 break;
618e21d5 3630 default:
0279b4cd 3631 inode->i_op = &btrfs_special_inode_operations;
618e21d5
JB
3632 init_special_inode(inode, inode->i_mode, rdev);
3633 break;
39279cc3 3634 }
6cbff00f
CH
3635
3636 btrfs_update_iflags(inode);
39279cc3
CM
3637 return;
3638
3639make_bad:
39279cc3 3640 btrfs_free_path(path);
39279cc3
CM
3641 make_bad_inode(inode);
3642}
3643
d352ac68
CM
3644/*
3645 * given a leaf and an inode, copy the inode fields into the leaf
3646 */
e02119d5
CM
3647static void fill_inode_item(struct btrfs_trans_handle *trans,
3648 struct extent_buffer *leaf,
5f39d397 3649 struct btrfs_inode_item *item,
39279cc3
CM
3650 struct inode *inode)
3651{
51fab693
LB
3652 struct btrfs_map_token token;
3653
3654 btrfs_init_map_token(&token);
5f39d397 3655
51fab693
LB
3656 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3657 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3658 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3659 &token);
3660 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3661 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
5f39d397 3662
51fab693
LB
3663 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3664 inode->i_atime.tv_sec, &token);
3665 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3666 inode->i_atime.tv_nsec, &token);
5f39d397 3667
51fab693
LB
3668 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3669 inode->i_mtime.tv_sec, &token);
3670 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3671 inode->i_mtime.tv_nsec, &token);
5f39d397 3672
51fab693
LB
3673 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3674 inode->i_ctime.tv_sec, &token);
3675 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3676 inode->i_ctime.tv_nsec, &token);
5f39d397 3677
51fab693
LB
3678 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3679 &token);
3680 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3681 &token);
3682 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3683 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3684 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3685 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3686 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
39279cc3
CM
3687}
3688
d352ac68
CM
3689/*
3690 * copy everything in the in-memory inode into the btree.
3691 */
2115133f 3692static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
d397712b 3693 struct btrfs_root *root, struct inode *inode)
39279cc3
CM
3694{
3695 struct btrfs_inode_item *inode_item;
3696 struct btrfs_path *path;
5f39d397 3697 struct extent_buffer *leaf;
39279cc3
CM
3698 int ret;
3699
3700 path = btrfs_alloc_path();
16cdcec7
MX
3701 if (!path)
3702 return -ENOMEM;
3703
b9473439 3704 path->leave_spinning = 1;
16cdcec7
MX
3705 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3706 1);
39279cc3
CM
3707 if (ret) {
3708 if (ret > 0)
3709 ret = -ENOENT;
3710 goto failed;
3711 }
3712
5f39d397
CM
3713 leaf = path->nodes[0];
3714 inode_item = btrfs_item_ptr(leaf, path->slots[0],
16cdcec7 3715 struct btrfs_inode_item);
39279cc3 3716
e02119d5 3717 fill_inode_item(trans, leaf, inode_item, inode);
5f39d397 3718 btrfs_mark_buffer_dirty(leaf);
15ee9bc7 3719 btrfs_set_inode_last_trans(trans, inode);
39279cc3
CM
3720 ret = 0;
3721failed:
39279cc3
CM
3722 btrfs_free_path(path);
3723 return ret;
3724}
3725
2115133f
CM
3726/*
3727 * copy everything in the in-memory inode into the btree.
3728 */
3729noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3730 struct btrfs_root *root, struct inode *inode)
3731{
3732 int ret;
3733
3734 /*
3735 * If the inode is a free space inode, we can deadlock during commit
3736 * if we put it into the delayed code.
3737 *
3738 * The data relocation inode should also be directly updated
3739 * without delay
3740 */
83eea1f1 3741 if (!btrfs_is_free_space_inode(inode)
1d52c78a
JB
3742 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3743 && !root->fs_info->log_root_recovering) {
8ea05e3a
AB
3744 btrfs_update_root_times(trans, root);
3745
2115133f
CM
3746 ret = btrfs_delayed_update_inode(trans, root, inode);
3747 if (!ret)
3748 btrfs_set_inode_last_trans(trans, inode);
3749 return ret;
3750 }
3751
3752 return btrfs_update_inode_item(trans, root, inode);
3753}
3754
be6aef60
JB
3755noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3756 struct btrfs_root *root,
3757 struct inode *inode)
2115133f
CM
3758{
3759 int ret;
3760
3761 ret = btrfs_update_inode(trans, root, inode);
3762 if (ret == -ENOSPC)
3763 return btrfs_update_inode_item(trans, root, inode);
3764 return ret;
3765}
3766
d352ac68
CM
3767/*
3768 * unlink helper that gets used here in inode.c and in the tree logging
3769 * recovery code. It remove a link in a directory with a given name, and
3770 * also drops the back refs in the inode to the directory
3771 */
92986796
AV
3772static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3773 struct btrfs_root *root,
3774 struct inode *dir, struct inode *inode,
3775 const char *name, int name_len)
39279cc3
CM
3776{
3777 struct btrfs_path *path;
39279cc3 3778 int ret = 0;
5f39d397 3779 struct extent_buffer *leaf;
39279cc3 3780 struct btrfs_dir_item *di;
5f39d397 3781 struct btrfs_key key;
aec7477b 3782 u64 index;
33345d01
LZ
3783 u64 ino = btrfs_ino(inode);
3784 u64 dir_ino = btrfs_ino(dir);
39279cc3
CM
3785
3786 path = btrfs_alloc_path();
54aa1f4d
CM
3787 if (!path) {
3788 ret = -ENOMEM;
554233a6 3789 goto out;
54aa1f4d
CM
3790 }
3791
b9473439 3792 path->leave_spinning = 1;
33345d01 3793 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
39279cc3
CM
3794 name, name_len, -1);
3795 if (IS_ERR(di)) {
3796 ret = PTR_ERR(di);
3797 goto err;
3798 }
3799 if (!di) {
3800 ret = -ENOENT;
3801 goto err;
3802 }
5f39d397
CM
3803 leaf = path->nodes[0];
3804 btrfs_dir_item_key_to_cpu(leaf, di, &key);
39279cc3 3805 ret = btrfs_delete_one_dir_name(trans, root, path, di);
54aa1f4d
CM
3806 if (ret)
3807 goto err;
b3b4aa74 3808 btrfs_release_path(path);
39279cc3 3809
67de1176
MX
3810 /*
3811 * If we don't have dir index, we have to get it by looking up
3812 * the inode ref, since we get the inode ref, remove it directly,
3813 * it is unnecessary to do delayed deletion.
3814 *
3815 * But if we have dir index, needn't search inode ref to get it.
3816 * Since the inode ref is close to the inode item, it is better
3817 * that we delay to delete it, and just do this deletion when
3818 * we update the inode item.
3819 */
3820 if (BTRFS_I(inode)->dir_index) {
3821 ret = btrfs_delayed_delete_inode_ref(inode);
3822 if (!ret) {
3823 index = BTRFS_I(inode)->dir_index;
3824 goto skip_backref;
3825 }
3826 }
3827
33345d01
LZ
3828 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3829 dir_ino, &index);
aec7477b 3830 if (ret) {
c2cf52eb
SK
3831 btrfs_info(root->fs_info,
3832 "failed to delete reference to %.*s, inode %llu parent %llu",
c1c9ff7c 3833 name_len, name, ino, dir_ino);
79787eaa 3834 btrfs_abort_transaction(trans, root, ret);
aec7477b
JB
3835 goto err;
3836 }
67de1176 3837skip_backref:
16cdcec7 3838 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
79787eaa
JM
3839 if (ret) {
3840 btrfs_abort_transaction(trans, root, ret);
39279cc3 3841 goto err;
79787eaa 3842 }
39279cc3 3843
e02119d5 3844 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
33345d01 3845 inode, dir_ino);
79787eaa
JM
3846 if (ret != 0 && ret != -ENOENT) {
3847 btrfs_abort_transaction(trans, root, ret);
3848 goto err;
3849 }
e02119d5
CM
3850
3851 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3852 dir, index);
6418c961
CM
3853 if (ret == -ENOENT)
3854 ret = 0;
d4e3991b
ZB
3855 else if (ret)
3856 btrfs_abort_transaction(trans, root, ret);
39279cc3
CM
3857err:
3858 btrfs_free_path(path);
e02119d5
CM
3859 if (ret)
3860 goto out;
3861
3862 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
0c4d2d95
JB
3863 inode_inc_iversion(inode);
3864 inode_inc_iversion(dir);
e02119d5 3865 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
b9959295 3866 ret = btrfs_update_inode(trans, root, dir);
e02119d5 3867out:
39279cc3
CM
3868 return ret;
3869}
3870
92986796
AV
3871int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3872 struct btrfs_root *root,
3873 struct inode *dir, struct inode *inode,
3874 const char *name, int name_len)
3875{
3876 int ret;
3877 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
3878 if (!ret) {
8b558c5f 3879 drop_nlink(inode);
92986796
AV
3880 ret = btrfs_update_inode(trans, root, inode);
3881 }
3882 return ret;
3883}
39279cc3 3884
a22285a6
YZ
3885/*
3886 * helper to start transaction for unlink and rmdir.
3887 *
d52be818
JB
3888 * unlink and rmdir are special in btrfs, they do not always free space, so
3889 * if we cannot make our reservations the normal way try and see if there is
3890 * plenty of slack room in the global reserve to migrate, otherwise we cannot
3891 * allow the unlink to occur.
a22285a6 3892 */
d52be818 3893static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4df27c4d 3894{
39279cc3 3895 struct btrfs_trans_handle *trans;
a22285a6 3896 struct btrfs_root *root = BTRFS_I(dir)->root;
4df27c4d
YZ
3897 int ret;
3898
e70bea5f
JB
3899 /*
3900 * 1 for the possible orphan item
3901 * 1 for the dir item
3902 * 1 for the dir index
3903 * 1 for the inode ref
e70bea5f
JB
3904 * 1 for the inode
3905 */
6e137ed3 3906 trans = btrfs_start_transaction(root, 5);
a22285a6
YZ
3907 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3908 return trans;
4df27c4d 3909
d52be818
JB
3910 if (PTR_ERR(trans) == -ENOSPC) {
3911 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4df27c4d 3912
d52be818
JB
3913 trans = btrfs_start_transaction(root, 0);
3914 if (IS_ERR(trans))
3915 return trans;
3916 ret = btrfs_cond_migrate_bytes(root->fs_info,
3917 &root->fs_info->trans_block_rsv,
3918 num_bytes, 5);
3919 if (ret) {
3920 btrfs_end_transaction(trans, root);
3921 return ERR_PTR(ret);
a22285a6 3922 }
5a77d76c 3923 trans->block_rsv = &root->fs_info->trans_block_rsv;
d52be818 3924 trans->bytes_reserved = num_bytes;
a22285a6 3925 }
d52be818 3926 return trans;
a22285a6
YZ
3927}
3928
3929static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3930{
3931 struct btrfs_root *root = BTRFS_I(dir)->root;
3932 struct btrfs_trans_handle *trans;
3933 struct inode *inode = dentry->d_inode;
3934 int ret;
a22285a6 3935
d52be818 3936 trans = __unlink_start_trans(dir);
a22285a6
YZ
3937 if (IS_ERR(trans))
3938 return PTR_ERR(trans);
5f39d397 3939
12fcfd22
CM
3940 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3941
e02119d5
CM
3942 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3943 dentry->d_name.name, dentry->d_name.len);
b532402e
TI
3944 if (ret)
3945 goto out;
7b128766 3946
a22285a6 3947 if (inode->i_nlink == 0) {
7b128766 3948 ret = btrfs_orphan_add(trans, inode);
b532402e
TI
3949 if (ret)
3950 goto out;
a22285a6 3951 }
7b128766 3952
b532402e 3953out:
d52be818 3954 btrfs_end_transaction(trans, root);
b53d3f5d 3955 btrfs_btree_balance_dirty(root);
39279cc3
CM
3956 return ret;
3957}
3958
4df27c4d
YZ
3959int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3960 struct btrfs_root *root,
3961 struct inode *dir, u64 objectid,
3962 const char *name, int name_len)
3963{
3964 struct btrfs_path *path;
3965 struct extent_buffer *leaf;
3966 struct btrfs_dir_item *di;
3967 struct btrfs_key key;
3968 u64 index;
3969 int ret;
33345d01 3970 u64 dir_ino = btrfs_ino(dir);
4df27c4d
YZ
3971
3972 path = btrfs_alloc_path();
3973 if (!path)
3974 return -ENOMEM;
3975
33345d01 3976 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4df27c4d 3977 name, name_len, -1);
79787eaa
JM
3978 if (IS_ERR_OR_NULL(di)) {
3979 if (!di)
3980 ret = -ENOENT;
3981 else
3982 ret = PTR_ERR(di);
3983 goto out;
3984 }
4df27c4d
YZ
3985
3986 leaf = path->nodes[0];
3987 btrfs_dir_item_key_to_cpu(leaf, di, &key);
3988 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3989 ret = btrfs_delete_one_dir_name(trans, root, path, di);
79787eaa
JM
3990 if (ret) {
3991 btrfs_abort_transaction(trans, root, ret);
3992 goto out;
3993 }
b3b4aa74 3994 btrfs_release_path(path);
4df27c4d
YZ
3995
3996 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3997 objectid, root->root_key.objectid,
33345d01 3998 dir_ino, &index, name, name_len);
4df27c4d 3999 if (ret < 0) {
79787eaa
JM
4000 if (ret != -ENOENT) {
4001 btrfs_abort_transaction(trans, root, ret);
4002 goto out;
4003 }
33345d01 4004 di = btrfs_search_dir_index_item(root, path, dir_ino,
4df27c4d 4005 name, name_len);
79787eaa
JM
4006 if (IS_ERR_OR_NULL(di)) {
4007 if (!di)
4008 ret = -ENOENT;
4009 else
4010 ret = PTR_ERR(di);
4011 btrfs_abort_transaction(trans, root, ret);
4012 goto out;
4013 }
4df27c4d
YZ
4014
4015 leaf = path->nodes[0];
4016 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
b3b4aa74 4017 btrfs_release_path(path);
4df27c4d
YZ
4018 index = key.offset;
4019 }
945d8962 4020 btrfs_release_path(path);
4df27c4d 4021
16cdcec7 4022 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
79787eaa
JM
4023 if (ret) {
4024 btrfs_abort_transaction(trans, root, ret);
4025 goto out;
4026 }
4df27c4d
YZ
4027
4028 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
0c4d2d95 4029 inode_inc_iversion(dir);
4df27c4d 4030 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
5a24e84c 4031 ret = btrfs_update_inode_fallback(trans, root, dir);
79787eaa
JM
4032 if (ret)
4033 btrfs_abort_transaction(trans, root, ret);
4034out:
71d7aed0 4035 btrfs_free_path(path);
79787eaa 4036 return ret;
4df27c4d
YZ
4037}
4038
39279cc3
CM
4039static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4040{
4041 struct inode *inode = dentry->d_inode;
1832a6d5 4042 int err = 0;
39279cc3 4043 struct btrfs_root *root = BTRFS_I(dir)->root;
39279cc3 4044 struct btrfs_trans_handle *trans;
39279cc3 4045
b3ae244e 4046 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
134d4512 4047 return -ENOTEMPTY;
b3ae244e
DS
4048 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
4049 return -EPERM;
134d4512 4050
d52be818 4051 trans = __unlink_start_trans(dir);
a22285a6 4052 if (IS_ERR(trans))
5df6a9f6 4053 return PTR_ERR(trans);
5df6a9f6 4054
33345d01 4055 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4df27c4d
YZ
4056 err = btrfs_unlink_subvol(trans, root, dir,
4057 BTRFS_I(inode)->location.objectid,
4058 dentry->d_name.name,
4059 dentry->d_name.len);
4060 goto out;
4061 }
4062
7b128766
JB
4063 err = btrfs_orphan_add(trans, inode);
4064 if (err)
4df27c4d 4065 goto out;
7b128766 4066
39279cc3 4067 /* now the directory is empty */
e02119d5
CM
4068 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
4069 dentry->d_name.name, dentry->d_name.len);
d397712b 4070 if (!err)
dbe674a9 4071 btrfs_i_size_write(inode, 0);
4df27c4d 4072out:
d52be818 4073 btrfs_end_transaction(trans, root);
b53d3f5d 4074 btrfs_btree_balance_dirty(root);
3954401f 4075
39279cc3
CM
4076 return err;
4077}
4078
39279cc3
CM
4079/*
4080 * this can truncate away extent items, csum items and directory items.
4081 * It starts at a high offset and removes keys until it can't find
d352ac68 4082 * any higher than new_size
39279cc3
CM
4083 *
4084 * csum items that cross the new i_size are truncated to the new size
4085 * as well.
7b128766
JB
4086 *
4087 * min_type is the minimum key type to truncate down to. If set to 0, this
4088 * will kill all the items on this inode, including the INODE_ITEM_KEY.
39279cc3 4089 */
8082510e
YZ
4090int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4091 struct btrfs_root *root,
4092 struct inode *inode,
4093 u64 new_size, u32 min_type)
39279cc3 4094{
39279cc3 4095 struct btrfs_path *path;
5f39d397 4096 struct extent_buffer *leaf;
39279cc3 4097 struct btrfs_file_extent_item *fi;
8082510e
YZ
4098 struct btrfs_key key;
4099 struct btrfs_key found_key;
39279cc3 4100 u64 extent_start = 0;
db94535d 4101 u64 extent_num_bytes = 0;
5d4f98a2 4102 u64 extent_offset = 0;
39279cc3 4103 u64 item_end = 0;
7f4f6e0a 4104 u64 last_size = (u64)-1;
8082510e 4105 u32 found_type = (u8)-1;
39279cc3
CM
4106 int found_extent;
4107 int del_item;
85e21bac
CM
4108 int pending_del_nr = 0;
4109 int pending_del_slot = 0;
179e29e4 4110 int extent_type = -1;
8082510e
YZ
4111 int ret;
4112 int err = 0;
33345d01 4113 u64 ino = btrfs_ino(inode);
8082510e
YZ
4114
4115 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
39279cc3 4116
0eb0e19c
MF
4117 path = btrfs_alloc_path();
4118 if (!path)
4119 return -ENOMEM;
4120 path->reada = -1;
4121
5dc562c5
JB
4122 /*
4123 * We want to drop from the next block forward in case this new size is
4124 * not block aligned since we will be keeping the last block of the
4125 * extent just the way it is.
4126 */
27cdeb70
MX
4127 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4128 root == root->fs_info->tree_root)
fda2832f
QW
4129 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4130 root->sectorsize), (u64)-1, 0);
8082510e 4131
16cdcec7
MX
4132 /*
4133 * This function is also used to drop the items in the log tree before
4134 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4135 * it is used to drop the loged items. So we shouldn't kill the delayed
4136 * items.
4137 */
4138 if (min_type == 0 && root == BTRFS_I(inode)->root)
4139 btrfs_kill_delayed_inode_items(inode);
4140
33345d01 4141 key.objectid = ino;
39279cc3 4142 key.offset = (u64)-1;
5f39d397
CM
4143 key.type = (u8)-1;
4144
85e21bac 4145search_again:
b9473439 4146 path->leave_spinning = 1;
85e21bac 4147 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8082510e
YZ
4148 if (ret < 0) {
4149 err = ret;
4150 goto out;
4151 }
d397712b 4152
85e21bac 4153 if (ret > 0) {
e02119d5
CM
4154 /* there are no items in the tree for us to truncate, we're
4155 * done
4156 */
8082510e
YZ
4157 if (path->slots[0] == 0)
4158 goto out;
85e21bac
CM
4159 path->slots[0]--;
4160 }
4161
d397712b 4162 while (1) {
39279cc3 4163 fi = NULL;
5f39d397
CM
4164 leaf = path->nodes[0];
4165 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
962a298f 4166 found_type = found_key.type;
39279cc3 4167
33345d01 4168 if (found_key.objectid != ino)
39279cc3 4169 break;
5f39d397 4170
85e21bac 4171 if (found_type < min_type)
39279cc3
CM
4172 break;
4173
5f39d397 4174 item_end = found_key.offset;
39279cc3 4175 if (found_type == BTRFS_EXTENT_DATA_KEY) {
5f39d397 4176 fi = btrfs_item_ptr(leaf, path->slots[0],
39279cc3 4177 struct btrfs_file_extent_item);
179e29e4
CM
4178 extent_type = btrfs_file_extent_type(leaf, fi);
4179 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
5f39d397 4180 item_end +=
db94535d 4181 btrfs_file_extent_num_bytes(leaf, fi);
179e29e4 4182 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
179e29e4 4183 item_end += btrfs_file_extent_inline_len(leaf,
514ac8ad 4184 path->slots[0], fi);
39279cc3 4185 }
008630c1 4186 item_end--;
39279cc3 4187 }
8082510e
YZ
4188 if (found_type > min_type) {
4189 del_item = 1;
4190 } else {
4191 if (item_end < new_size)
b888db2b 4192 break;
8082510e
YZ
4193 if (found_key.offset >= new_size)
4194 del_item = 1;
4195 else
4196 del_item = 0;
39279cc3 4197 }
39279cc3 4198 found_extent = 0;
39279cc3 4199 /* FIXME, shrink the extent if the ref count is only 1 */
179e29e4
CM
4200 if (found_type != BTRFS_EXTENT_DATA_KEY)
4201 goto delete;
4202
7f4f6e0a
JB
4203 if (del_item)
4204 last_size = found_key.offset;
4205 else
4206 last_size = new_size;
4207
179e29e4 4208 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
39279cc3 4209 u64 num_dec;
db94535d 4210 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
f70a9a6b 4211 if (!del_item) {
db94535d
CM
4212 u64 orig_num_bytes =
4213 btrfs_file_extent_num_bytes(leaf, fi);
fda2832f
QW
4214 extent_num_bytes = ALIGN(new_size -
4215 found_key.offset,
4216 root->sectorsize);
db94535d
CM
4217 btrfs_set_file_extent_num_bytes(leaf, fi,
4218 extent_num_bytes);
4219 num_dec = (orig_num_bytes -
9069218d 4220 extent_num_bytes);
27cdeb70
MX
4221 if (test_bit(BTRFS_ROOT_REF_COWS,
4222 &root->state) &&
4223 extent_start != 0)
a76a3cd4 4224 inode_sub_bytes(inode, num_dec);
5f39d397 4225 btrfs_mark_buffer_dirty(leaf);
39279cc3 4226 } else {
db94535d
CM
4227 extent_num_bytes =
4228 btrfs_file_extent_disk_num_bytes(leaf,
4229 fi);
5d4f98a2
YZ
4230 extent_offset = found_key.offset -
4231 btrfs_file_extent_offset(leaf, fi);
4232
39279cc3 4233 /* FIXME blocksize != 4096 */
9069218d 4234 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
39279cc3
CM
4235 if (extent_start != 0) {
4236 found_extent = 1;
27cdeb70
MX
4237 if (test_bit(BTRFS_ROOT_REF_COWS,
4238 &root->state))
a76a3cd4 4239 inode_sub_bytes(inode, num_dec);
e02119d5 4240 }
39279cc3 4241 }
9069218d 4242 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
c8b97818
CM
4243 /*
4244 * we can't truncate inline items that have had
4245 * special encodings
4246 */
4247 if (!del_item &&
4248 btrfs_file_extent_compression(leaf, fi) == 0 &&
4249 btrfs_file_extent_encryption(leaf, fi) == 0 &&
4250 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
e02119d5
CM
4251 u32 size = new_size - found_key.offset;
4252
27cdeb70 4253 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
a76a3cd4
YZ
4254 inode_sub_bytes(inode, item_end + 1 -
4255 new_size);
514ac8ad
CM
4256
4257 /*
4258 * update the ram bytes to properly reflect
4259 * the new size of our item
4260 */
4261 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
e02119d5
CM
4262 size =
4263 btrfs_file_extent_calc_inline_size(size);
afe5fea7 4264 btrfs_truncate_item(root, path, size, 1);
27cdeb70
MX
4265 } else if (test_bit(BTRFS_ROOT_REF_COWS,
4266 &root->state)) {
a76a3cd4
YZ
4267 inode_sub_bytes(inode, item_end + 1 -
4268 found_key.offset);
9069218d 4269 }
39279cc3 4270 }
179e29e4 4271delete:
39279cc3 4272 if (del_item) {
85e21bac
CM
4273 if (!pending_del_nr) {
4274 /* no pending yet, add ourselves */
4275 pending_del_slot = path->slots[0];
4276 pending_del_nr = 1;
4277 } else if (pending_del_nr &&
4278 path->slots[0] + 1 == pending_del_slot) {
4279 /* hop on the pending chunk */
4280 pending_del_nr++;
4281 pending_del_slot = path->slots[0];
4282 } else {
d397712b 4283 BUG();
85e21bac 4284 }
39279cc3
CM
4285 } else {
4286 break;
4287 }
27cdeb70
MX
4288 if (found_extent &&
4289 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4290 root == root->fs_info->tree_root)) {
b9473439 4291 btrfs_set_path_blocking(path);
39279cc3 4292 ret = btrfs_free_extent(trans, root, extent_start,
5d4f98a2
YZ
4293 extent_num_bytes, 0,
4294 btrfs_header_owner(leaf),
66d7e7f0 4295 ino, extent_offset, 0);
39279cc3
CM
4296 BUG_ON(ret);
4297 }
85e21bac 4298
8082510e
YZ
4299 if (found_type == BTRFS_INODE_ITEM_KEY)
4300 break;
4301
4302 if (path->slots[0] == 0 ||
4303 path->slots[0] != pending_del_slot) {
8082510e
YZ
4304 if (pending_del_nr) {
4305 ret = btrfs_del_items(trans, root, path,
4306 pending_del_slot,
4307 pending_del_nr);
79787eaa
JM
4308 if (ret) {
4309 btrfs_abort_transaction(trans,
4310 root, ret);
4311 goto error;
4312 }
8082510e
YZ
4313 pending_del_nr = 0;
4314 }
b3b4aa74 4315 btrfs_release_path(path);
85e21bac 4316 goto search_again;
8082510e
YZ
4317 } else {
4318 path->slots[0]--;
85e21bac 4319 }
39279cc3 4320 }
8082510e 4321out:
85e21bac
CM
4322 if (pending_del_nr) {
4323 ret = btrfs_del_items(trans, root, path, pending_del_slot,
4324 pending_del_nr);
79787eaa
JM
4325 if (ret)
4326 btrfs_abort_transaction(trans, root, ret);
85e21bac 4327 }
79787eaa 4328error:
dac5705c
FM
4329 if (last_size != (u64)-1 &&
4330 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7f4f6e0a 4331 btrfs_ordered_update_i_size(inode, last_size, NULL);
39279cc3 4332 btrfs_free_path(path);
8082510e 4333 return err;
39279cc3
CM
4334}
4335
4336/*
2aaa6655
JB
4337 * btrfs_truncate_page - read, zero a chunk and write a page
4338 * @inode - inode that we're zeroing
4339 * @from - the offset to start zeroing
4340 * @len - the length to zero, 0 to zero the entire range respective to the
4341 * offset
4342 * @front - zero up to the offset instead of from the offset on
4343 *
4344 * This will find the page for the "from" offset and cow the page and zero the
4345 * part we want to zero. This is used with truncate and hole punching.
39279cc3 4346 */
2aaa6655
JB
4347int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4348 int front)
39279cc3 4349{
2aaa6655 4350 struct address_space *mapping = inode->i_mapping;
db94535d 4351 struct btrfs_root *root = BTRFS_I(inode)->root;
e6dcd2dc
CM
4352 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4353 struct btrfs_ordered_extent *ordered;
2ac55d41 4354 struct extent_state *cached_state = NULL;
e6dcd2dc 4355 char *kaddr;
db94535d 4356 u32 blocksize = root->sectorsize;
39279cc3
CM
4357 pgoff_t index = from >> PAGE_CACHE_SHIFT;
4358 unsigned offset = from & (PAGE_CACHE_SIZE-1);
4359 struct page *page;
3b16a4e3 4360 gfp_t mask = btrfs_alloc_write_mask(mapping);
39279cc3 4361 int ret = 0;
a52d9a80 4362 u64 page_start;
e6dcd2dc 4363 u64 page_end;
39279cc3 4364
2aaa6655
JB
4365 if ((offset & (blocksize - 1)) == 0 &&
4366 (!len || ((len & (blocksize - 1)) == 0)))
39279cc3 4367 goto out;
0ca1f7ce 4368 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5d5e103a
JB
4369 if (ret)
4370 goto out;
39279cc3 4371
211c17f5 4372again:
3b16a4e3 4373 page = find_or_create_page(mapping, index, mask);
5d5e103a 4374 if (!page) {
0ca1f7ce 4375 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
ac6a2b36 4376 ret = -ENOMEM;
39279cc3 4377 goto out;
5d5e103a 4378 }
e6dcd2dc
CM
4379
4380 page_start = page_offset(page);
4381 page_end = page_start + PAGE_CACHE_SIZE - 1;
4382
39279cc3 4383 if (!PageUptodate(page)) {
9ebefb18 4384 ret = btrfs_readpage(NULL, page);
39279cc3 4385 lock_page(page);
211c17f5
CM
4386 if (page->mapping != mapping) {
4387 unlock_page(page);
4388 page_cache_release(page);
4389 goto again;
4390 }
39279cc3
CM
4391 if (!PageUptodate(page)) {
4392 ret = -EIO;
89642229 4393 goto out_unlock;
39279cc3
CM
4394 }
4395 }
211c17f5 4396 wait_on_page_writeback(page);
e6dcd2dc 4397
d0082371 4398 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
e6dcd2dc
CM
4399 set_page_extent_mapped(page);
4400
4401 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4402 if (ordered) {
2ac55d41
JB
4403 unlock_extent_cached(io_tree, page_start, page_end,
4404 &cached_state, GFP_NOFS);
e6dcd2dc
CM
4405 unlock_page(page);
4406 page_cache_release(page);
eb84ae03 4407 btrfs_start_ordered_extent(inode, ordered, 1);
e6dcd2dc
CM
4408 btrfs_put_ordered_extent(ordered);
4409 goto again;
4410 }
4411
2ac55d41 4412 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
9e8a4a8b
LB
4413 EXTENT_DIRTY | EXTENT_DELALLOC |
4414 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
2ac55d41 4415 0, 0, &cached_state, GFP_NOFS);
5d5e103a 4416
2ac55d41
JB
4417 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4418 &cached_state);
9ed74f2d 4419 if (ret) {
2ac55d41
JB
4420 unlock_extent_cached(io_tree, page_start, page_end,
4421 &cached_state, GFP_NOFS);
9ed74f2d
JB
4422 goto out_unlock;
4423 }
4424
e6dcd2dc 4425 if (offset != PAGE_CACHE_SIZE) {
2aaa6655
JB
4426 if (!len)
4427 len = PAGE_CACHE_SIZE - offset;
e6dcd2dc 4428 kaddr = kmap(page);
2aaa6655
JB
4429 if (front)
4430 memset(kaddr, 0, offset);
4431 else
4432 memset(kaddr + offset, 0, len);
e6dcd2dc
CM
4433 flush_dcache_page(page);
4434 kunmap(page);
4435 }
247e743c 4436 ClearPageChecked(page);
e6dcd2dc 4437 set_page_dirty(page);
2ac55d41
JB
4438 unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4439 GFP_NOFS);
39279cc3 4440
89642229 4441out_unlock:
5d5e103a 4442 if (ret)
0ca1f7ce 4443 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
39279cc3
CM
4444 unlock_page(page);
4445 page_cache_release(page);
4446out:
4447 return ret;
4448}
4449
16e7549f
JB
4450static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4451 u64 offset, u64 len)
4452{
4453 struct btrfs_trans_handle *trans;
4454 int ret;
4455
4456 /*
4457 * Still need to make sure the inode looks like it's been updated so
4458 * that any holes get logged if we fsync.
4459 */
4460 if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4461 BTRFS_I(inode)->last_trans = root->fs_info->generation;
4462 BTRFS_I(inode)->last_sub_trans = root->log_transid;
4463 BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4464 return 0;
4465 }
4466
4467 /*
4468 * 1 - for the one we're dropping
4469 * 1 - for the one we're adding
4470 * 1 - for updating the inode.
4471 */
4472 trans = btrfs_start_transaction(root, 3);
4473 if (IS_ERR(trans))
4474 return PTR_ERR(trans);
4475
4476 ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4477 if (ret) {
4478 btrfs_abort_transaction(trans, root, ret);
4479 btrfs_end_transaction(trans, root);
4480 return ret;
4481 }
4482
4483 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4484 0, 0, len, 0, len, 0, 0, 0);
4485 if (ret)
4486 btrfs_abort_transaction(trans, root, ret);
4487 else
4488 btrfs_update_inode(trans, root, inode);
4489 btrfs_end_transaction(trans, root);
4490 return ret;
4491}
4492
695a0d0d
JB
4493/*
4494 * This function puts in dummy file extents for the area we're creating a hole
4495 * for. So if we are truncating this file to a larger size we need to insert
4496 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4497 * the range between oldsize and size
4498 */
a41ad394 4499int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
39279cc3 4500{
9036c102
YZ
4501 struct btrfs_root *root = BTRFS_I(inode)->root;
4502 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
a22285a6 4503 struct extent_map *em = NULL;
2ac55d41 4504 struct extent_state *cached_state = NULL;
5dc562c5 4505 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
fda2832f
QW
4506 u64 hole_start = ALIGN(oldsize, root->sectorsize);
4507 u64 block_end = ALIGN(size, root->sectorsize);
9036c102
YZ
4508 u64 last_byte;
4509 u64 cur_offset;
4510 u64 hole_size;
9ed74f2d 4511 int err = 0;
39279cc3 4512
a71754fc
JB
4513 /*
4514 * If our size started in the middle of a page we need to zero out the
4515 * rest of the page before we expand the i_size, otherwise we could
4516 * expose stale data.
4517 */
4518 err = btrfs_truncate_page(inode, oldsize, 0, 0);
4519 if (err)
4520 return err;
4521
9036c102
YZ
4522 if (size <= hole_start)
4523 return 0;
4524
9036c102
YZ
4525 while (1) {
4526 struct btrfs_ordered_extent *ordered;
fa7c1494 4527
2ac55d41 4528 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
d0082371 4529 &cached_state);
fa7c1494
MX
4530 ordered = btrfs_lookup_ordered_range(inode, hole_start,
4531 block_end - hole_start);
9036c102
YZ
4532 if (!ordered)
4533 break;
2ac55d41
JB
4534 unlock_extent_cached(io_tree, hole_start, block_end - 1,
4535 &cached_state, GFP_NOFS);
fa7c1494 4536 btrfs_start_ordered_extent(inode, ordered, 1);
9036c102
YZ
4537 btrfs_put_ordered_extent(ordered);
4538 }
39279cc3 4539
9036c102
YZ
4540 cur_offset = hole_start;
4541 while (1) {
4542 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4543 block_end - cur_offset, 0);
79787eaa
JM
4544 if (IS_ERR(em)) {
4545 err = PTR_ERR(em);
f2767956 4546 em = NULL;
79787eaa
JM
4547 break;
4548 }
9036c102 4549 last_byte = min(extent_map_end(em), block_end);
fda2832f 4550 last_byte = ALIGN(last_byte , root->sectorsize);
8082510e 4551 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
5dc562c5 4552 struct extent_map *hole_em;
9036c102 4553 hole_size = last_byte - cur_offset;
9ed74f2d 4554
16e7549f
JB
4555 err = maybe_insert_hole(root, inode, cur_offset,
4556 hole_size);
4557 if (err)
3893e33b 4558 break;
5dc562c5
JB
4559 btrfs_drop_extent_cache(inode, cur_offset,
4560 cur_offset + hole_size - 1, 0);
4561 hole_em = alloc_extent_map();
4562 if (!hole_em) {
4563 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4564 &BTRFS_I(inode)->runtime_flags);
4565 goto next;
4566 }
4567 hole_em->start = cur_offset;
4568 hole_em->len = hole_size;
4569 hole_em->orig_start = cur_offset;
8082510e 4570
5dc562c5
JB
4571 hole_em->block_start = EXTENT_MAP_HOLE;
4572 hole_em->block_len = 0;
b4939680 4573 hole_em->orig_block_len = 0;
cc95bef6 4574 hole_em->ram_bytes = hole_size;
5dc562c5
JB
4575 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4576 hole_em->compress_type = BTRFS_COMPRESS_NONE;
16e7549f 4577 hole_em->generation = root->fs_info->generation;
8082510e 4578
5dc562c5
JB
4579 while (1) {
4580 write_lock(&em_tree->lock);
09a2a8f9 4581 err = add_extent_mapping(em_tree, hole_em, 1);
5dc562c5
JB
4582 write_unlock(&em_tree->lock);
4583 if (err != -EEXIST)
4584 break;
4585 btrfs_drop_extent_cache(inode, cur_offset,
4586 cur_offset +
4587 hole_size - 1, 0);
4588 }
4589 free_extent_map(hole_em);
9036c102 4590 }
16e7549f 4591next:
9036c102 4592 free_extent_map(em);
a22285a6 4593 em = NULL;
9036c102 4594 cur_offset = last_byte;
8082510e 4595 if (cur_offset >= block_end)
9036c102
YZ
4596 break;
4597 }
a22285a6 4598 free_extent_map(em);
2ac55d41
JB
4599 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4600 GFP_NOFS);
9036c102
YZ
4601 return err;
4602}
39279cc3 4603
3972f260 4604static int btrfs_setsize(struct inode *inode, struct iattr *attr)
8082510e 4605{
f4a2f4c5
MX
4606 struct btrfs_root *root = BTRFS_I(inode)->root;
4607 struct btrfs_trans_handle *trans;
a41ad394 4608 loff_t oldsize = i_size_read(inode);
3972f260
ES
4609 loff_t newsize = attr->ia_size;
4610 int mask = attr->ia_valid;
8082510e
YZ
4611 int ret;
4612
3972f260
ES
4613 /*
4614 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4615 * special case where we need to update the times despite not having
4616 * these flags set. For all other operations the VFS set these flags
4617 * explicitly if it wants a timestamp update.
4618 */
dff6efc3
CH
4619 if (newsize != oldsize) {
4620 inode_inc_iversion(inode);
4621 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4622 inode->i_ctime = inode->i_mtime =
4623 current_fs_time(inode->i_sb);
4624 }
3972f260 4625
a41ad394 4626 if (newsize > oldsize) {
7caef267 4627 truncate_pagecache(inode, newsize);
a41ad394 4628 ret = btrfs_cont_expand(inode, oldsize, newsize);
f4a2f4c5 4629 if (ret)
8082510e 4630 return ret;
8082510e 4631
f4a2f4c5
MX
4632 trans = btrfs_start_transaction(root, 1);
4633 if (IS_ERR(trans))
4634 return PTR_ERR(trans);
4635
4636 i_size_write(inode, newsize);
4637 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4638 ret = btrfs_update_inode(trans, root, inode);
7ad85bb7 4639 btrfs_end_transaction(trans, root);
a41ad394 4640 } else {
8082510e 4641
a41ad394
JB
4642 /*
4643 * We're truncating a file that used to have good data down to
4644 * zero. Make sure it gets into the ordered flush list so that
4645 * any new writes get down to disk quickly.
4646 */
4647 if (newsize == 0)
72ac3c0d
JB
4648 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4649 &BTRFS_I(inode)->runtime_flags);
8082510e 4650
f3fe820c
JB
4651 /*
4652 * 1 for the orphan item we're going to add
4653 * 1 for the orphan item deletion.
4654 */
4655 trans = btrfs_start_transaction(root, 2);
4656 if (IS_ERR(trans))
4657 return PTR_ERR(trans);
4658
4659 /*
4660 * We need to do this in case we fail at _any_ point during the
4661 * actual truncate. Once we do the truncate_setsize we could
4662 * invalidate pages which forces any outstanding ordered io to
4663 * be instantly completed which will give us extents that need
4664 * to be truncated. If we fail to get an orphan inode down we
4665 * could have left over extents that were never meant to live,
4666 * so we need to garuntee from this point on that everything
4667 * will be consistent.
4668 */
4669 ret = btrfs_orphan_add(trans, inode);
4670 btrfs_end_transaction(trans, root);
4671 if (ret)
4672 return ret;
4673
a41ad394
JB
4674 /* we don't support swapfiles, so vmtruncate shouldn't fail */
4675 truncate_setsize(inode, newsize);
2e60a51e
MX
4676
4677 /* Disable nonlocked read DIO to avoid the end less truncate */
4678 btrfs_inode_block_unlocked_dio(inode);
4679 inode_dio_wait(inode);
4680 btrfs_inode_resume_unlocked_dio(inode);
4681
a41ad394 4682 ret = btrfs_truncate(inode);
7f4f6e0a
JB
4683 if (ret && inode->i_nlink) {
4684 int err;
4685
4686 /*
4687 * failed to truncate, disk_i_size is only adjusted down
4688 * as we remove extents, so it should represent the true
4689 * size of the inode, so reset the in memory size and
4690 * delete our orphan entry.
4691 */
4692 trans = btrfs_join_transaction(root);
4693 if (IS_ERR(trans)) {
4694 btrfs_orphan_del(NULL, inode);
4695 return ret;
4696 }
4697 i_size_write(inode, BTRFS_I(inode)->disk_i_size);
4698 err = btrfs_orphan_del(trans, inode);
4699 if (err)
4700 btrfs_abort_transaction(trans, root, err);
4701 btrfs_end_transaction(trans, root);
4702 }
8082510e
YZ
4703 }
4704
a41ad394 4705 return ret;
8082510e
YZ
4706}
4707
9036c102
YZ
4708static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4709{
4710 struct inode *inode = dentry->d_inode;
b83cc969 4711 struct btrfs_root *root = BTRFS_I(inode)->root;
9036c102 4712 int err;
39279cc3 4713
b83cc969
LZ
4714 if (btrfs_root_readonly(root))
4715 return -EROFS;
4716
9036c102
YZ
4717 err = inode_change_ok(inode, attr);
4718 if (err)
4719 return err;
2bf5a725 4720
5a3f23d5 4721 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3972f260 4722 err = btrfs_setsize(inode, attr);
8082510e
YZ
4723 if (err)
4724 return err;
39279cc3 4725 }
9036c102 4726
1025774c
CH
4727 if (attr->ia_valid) {
4728 setattr_copy(inode, attr);
0c4d2d95 4729 inode_inc_iversion(inode);
22c44fe6 4730 err = btrfs_dirty_inode(inode);
1025774c 4731
22c44fe6 4732 if (!err && attr->ia_valid & ATTR_MODE)
996a710d 4733 err = posix_acl_chmod(inode, inode->i_mode);
1025774c 4734 }
33268eaf 4735
39279cc3
CM
4736 return err;
4737}
61295eb8 4738
131e404a
FDBM
4739/*
4740 * While truncating the inode pages during eviction, we get the VFS calling
4741 * btrfs_invalidatepage() against each page of the inode. This is slow because
4742 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
4743 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4744 * extent_state structures over and over, wasting lots of time.
4745 *
4746 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4747 * those expensive operations on a per page basis and do only the ordered io
4748 * finishing, while we release here the extent_map and extent_state structures,
4749 * without the excessive merging and splitting.
4750 */
4751static void evict_inode_truncate_pages(struct inode *inode)
4752{
4753 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4754 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4755 struct rb_node *node;
4756
4757 ASSERT(inode->i_state & I_FREEING);
91b0abe3 4758 truncate_inode_pages_final(&inode->i_data);
131e404a
FDBM
4759
4760 write_lock(&map_tree->lock);
4761 while (!RB_EMPTY_ROOT(&map_tree->map)) {
4762 struct extent_map *em;
4763
4764 node = rb_first(&map_tree->map);
4765 em = rb_entry(node, struct extent_map, rb_node);
180589ef
WS
4766 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4767 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
131e404a
FDBM
4768 remove_extent_mapping(map_tree, em);
4769 free_extent_map(em);
7064dd5c
FM
4770 if (need_resched()) {
4771 write_unlock(&map_tree->lock);
4772 cond_resched();
4773 write_lock(&map_tree->lock);
4774 }
131e404a
FDBM
4775 }
4776 write_unlock(&map_tree->lock);
4777
4778 spin_lock(&io_tree->lock);
4779 while (!RB_EMPTY_ROOT(&io_tree->state)) {
4780 struct extent_state *state;
4781 struct extent_state *cached_state = NULL;
4782
4783 node = rb_first(&io_tree->state);
4784 state = rb_entry(node, struct extent_state, rb_node);
4785 atomic_inc(&state->refs);
4786 spin_unlock(&io_tree->lock);
4787
4788 lock_extent_bits(io_tree, state->start, state->end,
4789 0, &cached_state);
4790 clear_extent_bit(io_tree, state->start, state->end,
4791 EXTENT_LOCKED | EXTENT_DIRTY |
4792 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4793 EXTENT_DEFRAG, 1, 1,
4794 &cached_state, GFP_NOFS);
4795 free_extent_state(state);
4796
7064dd5c 4797 cond_resched();
131e404a
FDBM
4798 spin_lock(&io_tree->lock);
4799 }
4800 spin_unlock(&io_tree->lock);
4801}
4802
bd555975 4803void btrfs_evict_inode(struct inode *inode)
39279cc3
CM
4804{
4805 struct btrfs_trans_handle *trans;
4806 struct btrfs_root *root = BTRFS_I(inode)->root;
726c35fa 4807 struct btrfs_block_rsv *rsv, *global_rsv;
07127184 4808 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
39279cc3
CM
4809 int ret;
4810
1abe9b8a 4811 trace_btrfs_inode_evict(inode);
4812
131e404a
FDBM
4813 evict_inode_truncate_pages(inode);
4814
69e9c6c6
SB
4815 if (inode->i_nlink &&
4816 ((btrfs_root_refs(&root->root_item) != 0 &&
4817 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
4818 btrfs_is_free_space_inode(inode)))
bd555975
AV
4819 goto no_delete;
4820
39279cc3 4821 if (is_bad_inode(inode)) {
7b128766 4822 btrfs_orphan_del(NULL, inode);
39279cc3
CM
4823 goto no_delete;
4824 }
bd555975 4825 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4a096752 4826 btrfs_wait_ordered_range(inode, 0, (u64)-1);
5f39d397 4827
f612496b
MX
4828 btrfs_free_io_failure_record(inode, 0, (u64)-1);
4829
c71bf099 4830 if (root->fs_info->log_root_recovering) {
6bf02314 4831 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8a35d95f 4832 &BTRFS_I(inode)->runtime_flags));
c71bf099
YZ
4833 goto no_delete;
4834 }
4835
76dda93c 4836 if (inode->i_nlink > 0) {
69e9c6c6
SB
4837 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
4838 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
76dda93c
YZ
4839 goto no_delete;
4840 }
4841
0e8c36a9
MX
4842 ret = btrfs_commit_inode_delayed_inode(inode);
4843 if (ret) {
4844 btrfs_orphan_del(NULL, inode);
4845 goto no_delete;
4846 }
4847
66d8f3dd 4848 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4289a667
JB
4849 if (!rsv) {
4850 btrfs_orphan_del(NULL, inode);
4851 goto no_delete;
4852 }
4a338542 4853 rsv->size = min_size;
ca7e70f5 4854 rsv->failfast = 1;
726c35fa 4855 global_rsv = &root->fs_info->global_block_rsv;
4289a667 4856
dbe674a9 4857 btrfs_i_size_write(inode, 0);
5f39d397 4858
4289a667 4859 /*
8407aa46
MX
4860 * This is a bit simpler than btrfs_truncate since we've already
4861 * reserved our space for our orphan item in the unlink, so we just
4862 * need to reserve some slack space in case we add bytes and update
4863 * inode item when doing the truncate.
4289a667 4864 */
8082510e 4865 while (1) {
08e007d2
MX
4866 ret = btrfs_block_rsv_refill(root, rsv, min_size,
4867 BTRFS_RESERVE_FLUSH_LIMIT);
726c35fa
JB
4868
4869 /*
4870 * Try and steal from the global reserve since we will
4871 * likely not use this space anyway, we want to try as
4872 * hard as possible to get this to work.
4873 */
4874 if (ret)
4875 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
d68fc57b 4876
d68fc57b 4877 if (ret) {
c2cf52eb
SK
4878 btrfs_warn(root->fs_info,
4879 "Could not get space for a delete, will truncate on mount %d",
4880 ret);
4289a667
JB
4881 btrfs_orphan_del(NULL, inode);
4882 btrfs_free_block_rsv(root, rsv);
4883 goto no_delete;
d68fc57b 4884 }
7b128766 4885
0e8c36a9 4886 trans = btrfs_join_transaction(root);
4289a667
JB
4887 if (IS_ERR(trans)) {
4888 btrfs_orphan_del(NULL, inode);
4889 btrfs_free_block_rsv(root, rsv);
4890 goto no_delete;
d68fc57b 4891 }
7b128766 4892
4289a667
JB
4893 trans->block_rsv = rsv;
4894
d68fc57b 4895 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
ca7e70f5 4896 if (ret != -ENOSPC)
8082510e 4897 break;
85e21bac 4898
8407aa46 4899 trans->block_rsv = &root->fs_info->trans_block_rsv;
8082510e
YZ
4900 btrfs_end_transaction(trans, root);
4901 trans = NULL;
b53d3f5d 4902 btrfs_btree_balance_dirty(root);
8082510e 4903 }
5f39d397 4904
4289a667
JB
4905 btrfs_free_block_rsv(root, rsv);
4906
4ef31a45
JB
4907 /*
4908 * Errors here aren't a big deal, it just means we leave orphan items
4909 * in the tree. They will be cleaned up on the next mount.
4910 */
8082510e 4911 if (ret == 0) {
4289a667 4912 trans->block_rsv = root->orphan_block_rsv;
4ef31a45
JB
4913 btrfs_orphan_del(trans, inode);
4914 } else {
4915 btrfs_orphan_del(NULL, inode);
8082510e 4916 }
54aa1f4d 4917
4289a667 4918 trans->block_rsv = &root->fs_info->trans_block_rsv;
581bb050
LZ
4919 if (!(root == root->fs_info->tree_root ||
4920 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
33345d01 4921 btrfs_return_ino(root, btrfs_ino(inode));
581bb050 4922
54aa1f4d 4923 btrfs_end_transaction(trans, root);
b53d3f5d 4924 btrfs_btree_balance_dirty(root);
39279cc3 4925no_delete:
89042e5a 4926 btrfs_remove_delayed_node(inode);
dbd5768f 4927 clear_inode(inode);
8082510e 4928 return;
39279cc3
CM
4929}
4930
4931/*
4932 * this returns the key found in the dir entry in the location pointer.
4933 * If no dir entries were found, location->objectid is 0.
4934 */
4935static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4936 struct btrfs_key *location)
4937{
4938 const char *name = dentry->d_name.name;
4939 int namelen = dentry->d_name.len;
4940 struct btrfs_dir_item *di;
4941 struct btrfs_path *path;
4942 struct btrfs_root *root = BTRFS_I(dir)->root;
0d9f7f3e 4943 int ret = 0;
39279cc3
CM
4944
4945 path = btrfs_alloc_path();
d8926bb3
MF
4946 if (!path)
4947 return -ENOMEM;
3954401f 4948
33345d01 4949 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
39279cc3 4950 namelen, 0);
0d9f7f3e
Y
4951 if (IS_ERR(di))
4952 ret = PTR_ERR(di);
d397712b 4953
c704005d 4954 if (IS_ERR_OR_NULL(di))
3954401f 4955 goto out_err;
d397712b 4956
5f39d397 4957 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
39279cc3 4958out:
39279cc3
CM
4959 btrfs_free_path(path);
4960 return ret;
3954401f
CM
4961out_err:
4962 location->objectid = 0;
4963 goto out;
39279cc3
CM
4964}
4965
4966/*
4967 * when we hit a tree root in a directory, the btrfs part of the inode
4968 * needs to be changed to reflect the root directory of the tree root. This
4969 * is kind of like crossing a mount point.
4970 */
4971static int fixup_tree_root_location(struct btrfs_root *root,
4df27c4d
YZ
4972 struct inode *dir,
4973 struct dentry *dentry,
4974 struct btrfs_key *location,
4975 struct btrfs_root **sub_root)
39279cc3 4976{
4df27c4d
YZ
4977 struct btrfs_path *path;
4978 struct btrfs_root *new_root;
4979 struct btrfs_root_ref *ref;
4980 struct extent_buffer *leaf;
4981 int ret;
4982 int err = 0;
39279cc3 4983
4df27c4d
YZ
4984 path = btrfs_alloc_path();
4985 if (!path) {
4986 err = -ENOMEM;
4987 goto out;
4988 }
39279cc3 4989
4df27c4d 4990 err = -ENOENT;
75ac2dd9
KN
4991 ret = btrfs_find_item(root->fs_info->tree_root, path,
4992 BTRFS_I(dir)->root->root_key.objectid,
4993 location->objectid, BTRFS_ROOT_REF_KEY, NULL);
4df27c4d
YZ
4994 if (ret) {
4995 if (ret < 0)
4996 err = ret;
4997 goto out;
4998 }
39279cc3 4999
4df27c4d
YZ
5000 leaf = path->nodes[0];
5001 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
33345d01 5002 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4df27c4d
YZ
5003 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5004 goto out;
39279cc3 5005
4df27c4d
YZ
5006 ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5007 (unsigned long)(ref + 1),
5008 dentry->d_name.len);
5009 if (ret)
5010 goto out;
5011
b3b4aa74 5012 btrfs_release_path(path);
4df27c4d
YZ
5013
5014 new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
5015 if (IS_ERR(new_root)) {
5016 err = PTR_ERR(new_root);
5017 goto out;
5018 }
5019
4df27c4d
YZ
5020 *sub_root = new_root;
5021 location->objectid = btrfs_root_dirid(&new_root->root_item);
5022 location->type = BTRFS_INODE_ITEM_KEY;
5023 location->offset = 0;
5024 err = 0;
5025out:
5026 btrfs_free_path(path);
5027 return err;
39279cc3
CM
5028}
5029
5d4f98a2
YZ
5030static void inode_tree_add(struct inode *inode)
5031{
5032 struct btrfs_root *root = BTRFS_I(inode)->root;
5033 struct btrfs_inode *entry;
03e860bd
FNP
5034 struct rb_node **p;
5035 struct rb_node *parent;
cef21937 5036 struct rb_node *new = &BTRFS_I(inode)->rb_node;
33345d01 5037 u64 ino = btrfs_ino(inode);
5d4f98a2 5038
1d3382cb 5039 if (inode_unhashed(inode))
76dda93c 5040 return;
e1409cef 5041 parent = NULL;
5d4f98a2 5042 spin_lock(&root->inode_lock);
e1409cef 5043 p = &root->inode_tree.rb_node;
5d4f98a2
YZ
5044 while (*p) {
5045 parent = *p;
5046 entry = rb_entry(parent, struct btrfs_inode, rb_node);
5047
33345d01 5048 if (ino < btrfs_ino(&entry->vfs_inode))
03e860bd 5049 p = &parent->rb_left;
33345d01 5050 else if (ino > btrfs_ino(&entry->vfs_inode))
03e860bd 5051 p = &parent->rb_right;
5d4f98a2
YZ
5052 else {
5053 WARN_ON(!(entry->vfs_inode.i_state &
a4ffdde6 5054 (I_WILL_FREE | I_FREEING)));
cef21937 5055 rb_replace_node(parent, new, &root->inode_tree);
03e860bd
FNP
5056 RB_CLEAR_NODE(parent);
5057 spin_unlock(&root->inode_lock);
cef21937 5058 return;
5d4f98a2
YZ
5059 }
5060 }
cef21937
FDBM
5061 rb_link_node(new, parent, p);
5062 rb_insert_color(new, &root->inode_tree);
5d4f98a2
YZ
5063 spin_unlock(&root->inode_lock);
5064}
5065
5066static void inode_tree_del(struct inode *inode)
5067{
5068 struct btrfs_root *root = BTRFS_I(inode)->root;
76dda93c 5069 int empty = 0;
5d4f98a2 5070
03e860bd 5071 spin_lock(&root->inode_lock);
5d4f98a2 5072 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5d4f98a2 5073 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5d4f98a2 5074 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
76dda93c 5075 empty = RB_EMPTY_ROOT(&root->inode_tree);
5d4f98a2 5076 }
03e860bd 5077 spin_unlock(&root->inode_lock);
76dda93c 5078
69e9c6c6 5079 if (empty && btrfs_root_refs(&root->root_item) == 0) {
76dda93c
YZ
5080 synchronize_srcu(&root->fs_info->subvol_srcu);
5081 spin_lock(&root->inode_lock);
5082 empty = RB_EMPTY_ROOT(&root->inode_tree);
5083 spin_unlock(&root->inode_lock);
5084 if (empty)
5085 btrfs_add_dead_root(root);
5086 }
5087}
5088
143bede5 5089void btrfs_invalidate_inodes(struct btrfs_root *root)
76dda93c
YZ
5090{
5091 struct rb_node *node;
5092 struct rb_node *prev;
5093 struct btrfs_inode *entry;
5094 struct inode *inode;
5095 u64 objectid = 0;
5096
7813b3db
LB
5097 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
5098 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
76dda93c
YZ
5099
5100 spin_lock(&root->inode_lock);
5101again:
5102 node = root->inode_tree.rb_node;
5103 prev = NULL;
5104 while (node) {
5105 prev = node;
5106 entry = rb_entry(node, struct btrfs_inode, rb_node);
5107
33345d01 5108 if (objectid < btrfs_ino(&entry->vfs_inode))
76dda93c 5109 node = node->rb_left;
33345d01 5110 else if (objectid > btrfs_ino(&entry->vfs_inode))
76dda93c
YZ
5111 node = node->rb_right;
5112 else
5113 break;
5114 }
5115 if (!node) {
5116 while (prev) {
5117 entry = rb_entry(prev, struct btrfs_inode, rb_node);
33345d01 5118 if (objectid <= btrfs_ino(&entry->vfs_inode)) {
76dda93c
YZ
5119 node = prev;
5120 break;
5121 }
5122 prev = rb_next(prev);
5123 }
5124 }
5125 while (node) {
5126 entry = rb_entry(node, struct btrfs_inode, rb_node);
33345d01 5127 objectid = btrfs_ino(&entry->vfs_inode) + 1;
76dda93c
YZ
5128 inode = igrab(&entry->vfs_inode);
5129 if (inode) {
5130 spin_unlock(&root->inode_lock);
5131 if (atomic_read(&inode->i_count) > 1)
5132 d_prune_aliases(inode);
5133 /*
45321ac5 5134 * btrfs_drop_inode will have it removed from
76dda93c
YZ
5135 * the inode cache when its usage count
5136 * hits zero.
5137 */
5138 iput(inode);
5139 cond_resched();
5140 spin_lock(&root->inode_lock);
5141 goto again;
5142 }
5143
5144 if (cond_resched_lock(&root->inode_lock))
5145 goto again;
5146
5147 node = rb_next(node);
5148 }
5149 spin_unlock(&root->inode_lock);
5d4f98a2
YZ
5150}
5151
e02119d5
CM
5152static int btrfs_init_locked_inode(struct inode *inode, void *p)
5153{
5154 struct btrfs_iget_args *args = p;
90d3e592
CM
5155 inode->i_ino = args->location->objectid;
5156 memcpy(&BTRFS_I(inode)->location, args->location,
5157 sizeof(*args->location));
e02119d5 5158 BTRFS_I(inode)->root = args->root;
39279cc3
CM
5159 return 0;
5160}
5161
5162static int btrfs_find_actor(struct inode *inode, void *opaque)
5163{
5164 struct btrfs_iget_args *args = opaque;
90d3e592 5165 return args->location->objectid == BTRFS_I(inode)->location.objectid &&
d397712b 5166 args->root == BTRFS_I(inode)->root;
39279cc3
CM
5167}
5168
5d4f98a2 5169static struct inode *btrfs_iget_locked(struct super_block *s,
90d3e592 5170 struct btrfs_key *location,
5d4f98a2 5171 struct btrfs_root *root)
39279cc3
CM
5172{
5173 struct inode *inode;
5174 struct btrfs_iget_args args;
90d3e592 5175 unsigned long hashval = btrfs_inode_hash(location->objectid, root);
778ba82b 5176
90d3e592 5177 args.location = location;
39279cc3
CM
5178 args.root = root;
5179
778ba82b 5180 inode = iget5_locked(s, hashval, btrfs_find_actor,
39279cc3
CM
5181 btrfs_init_locked_inode,
5182 (void *)&args);
5183 return inode;
5184}
5185
1a54ef8c
BR
5186/* Get an inode object given its location and corresponding root.
5187 * Returns in *is_new if the inode was read from disk
5188 */
5189struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
73f73415 5190 struct btrfs_root *root, int *new)
1a54ef8c
BR
5191{
5192 struct inode *inode;
5193
90d3e592 5194 inode = btrfs_iget_locked(s, location, root);
1a54ef8c 5195 if (!inode)
5d4f98a2 5196 return ERR_PTR(-ENOMEM);
1a54ef8c
BR
5197
5198 if (inode->i_state & I_NEW) {
1a54ef8c 5199 btrfs_read_locked_inode(inode);
1748f843
MF
5200 if (!is_bad_inode(inode)) {
5201 inode_tree_add(inode);
5202 unlock_new_inode(inode);
5203 if (new)
5204 *new = 1;
5205 } else {
e0b6d65b
ST
5206 unlock_new_inode(inode);
5207 iput(inode);
5208 inode = ERR_PTR(-ESTALE);
1748f843
MF
5209 }
5210 }
5211
1a54ef8c
BR
5212 return inode;
5213}
5214
4df27c4d
YZ
5215static struct inode *new_simple_dir(struct super_block *s,
5216 struct btrfs_key *key,
5217 struct btrfs_root *root)
5218{
5219 struct inode *inode = new_inode(s);
5220
5221 if (!inode)
5222 return ERR_PTR(-ENOMEM);
5223
4df27c4d
YZ
5224 BTRFS_I(inode)->root = root;
5225 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
72ac3c0d 5226 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4df27c4d
YZ
5227
5228 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
848cce0d 5229 inode->i_op = &btrfs_dir_ro_inode_operations;
4df27c4d
YZ
5230 inode->i_fop = &simple_dir_operations;
5231 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5232 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5233
5234 return inode;
5235}
5236
3de4586c 5237struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
39279cc3 5238{
d397712b 5239 struct inode *inode;
4df27c4d 5240 struct btrfs_root *root = BTRFS_I(dir)->root;
39279cc3
CM
5241 struct btrfs_root *sub_root = root;
5242 struct btrfs_key location;
76dda93c 5243 int index;
b4aff1f8 5244 int ret = 0;
39279cc3
CM
5245
5246 if (dentry->d_name.len > BTRFS_NAME_LEN)
5247 return ERR_PTR(-ENAMETOOLONG);
5f39d397 5248
39e3c955 5249 ret = btrfs_inode_by_name(dir, dentry, &location);
39279cc3
CM
5250 if (ret < 0)
5251 return ERR_PTR(ret);
5f39d397 5252
4df27c4d 5253 if (location.objectid == 0)
5662344b 5254 return ERR_PTR(-ENOENT);
4df27c4d
YZ
5255
5256 if (location.type == BTRFS_INODE_ITEM_KEY) {
73f73415 5257 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
4df27c4d
YZ
5258 return inode;
5259 }
5260
5261 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5262
76dda93c 5263 index = srcu_read_lock(&root->fs_info->subvol_srcu);
4df27c4d
YZ
5264 ret = fixup_tree_root_location(root, dir, dentry,
5265 &location, &sub_root);
5266 if (ret < 0) {
5267 if (ret != -ENOENT)
5268 inode = ERR_PTR(ret);
5269 else
5270 inode = new_simple_dir(dir->i_sb, &location, sub_root);
5271 } else {
73f73415 5272 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
39279cc3 5273 }
76dda93c
YZ
5274 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5275
34d19bad 5276 if (!IS_ERR(inode) && root != sub_root) {
c71bf099
YZ
5277 down_read(&root->fs_info->cleanup_work_sem);
5278 if (!(inode->i_sb->s_flags & MS_RDONLY))
66b4ffd1 5279 ret = btrfs_orphan_cleanup(sub_root);
c71bf099 5280 up_read(&root->fs_info->cleanup_work_sem);
01cd3367
JB
5281 if (ret) {
5282 iput(inode);
66b4ffd1 5283 inode = ERR_PTR(ret);
01cd3367 5284 }
c71bf099
YZ
5285 }
5286
3de4586c
CM
5287 return inode;
5288}
5289
fe15ce44 5290static int btrfs_dentry_delete(const struct dentry *dentry)
76dda93c
YZ
5291{
5292 struct btrfs_root *root;
848cce0d 5293 struct inode *inode = dentry->d_inode;
76dda93c 5294
848cce0d
LZ
5295 if (!inode && !IS_ROOT(dentry))
5296 inode = dentry->d_parent->d_inode;
76dda93c 5297
848cce0d
LZ
5298 if (inode) {
5299 root = BTRFS_I(inode)->root;
efefb143
YZ
5300 if (btrfs_root_refs(&root->root_item) == 0)
5301 return 1;
848cce0d
LZ
5302
5303 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5304 return 1;
efefb143 5305 }
76dda93c
YZ
5306 return 0;
5307}
5308
b4aff1f8
JB
5309static void btrfs_dentry_release(struct dentry *dentry)
5310{
944a4515 5311 kfree(dentry->d_fsdata);
b4aff1f8
JB
5312}
5313
3de4586c 5314static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
00cd8dd3 5315 unsigned int flags)
3de4586c 5316{
5662344b 5317 struct inode *inode;
a66e7cc6 5318
5662344b
TI
5319 inode = btrfs_lookup_dentry(dir, dentry);
5320 if (IS_ERR(inode)) {
5321 if (PTR_ERR(inode) == -ENOENT)
5322 inode = NULL;
5323 else
5324 return ERR_CAST(inode);
5325 }
5326
3a0dfa6a 5327 return d_materialise_unique(dentry, inode);
39279cc3
CM
5328}
5329
16cdcec7 5330unsigned char btrfs_filetype_table[] = {
39279cc3
CM
5331 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5332};
5333
9cdda8d3 5334static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
39279cc3 5335{
9cdda8d3 5336 struct inode *inode = file_inode(file);
39279cc3
CM
5337 struct btrfs_root *root = BTRFS_I(inode)->root;
5338 struct btrfs_item *item;
5339 struct btrfs_dir_item *di;
5340 struct btrfs_key key;
5f39d397 5341 struct btrfs_key found_key;
39279cc3 5342 struct btrfs_path *path;
16cdcec7
MX
5343 struct list_head ins_list;
5344 struct list_head del_list;
39279cc3 5345 int ret;
5f39d397 5346 struct extent_buffer *leaf;
39279cc3 5347 int slot;
39279cc3
CM
5348 unsigned char d_type;
5349 int over = 0;
5350 u32 di_cur;
5351 u32 di_total;
5352 u32 di_len;
5353 int key_type = BTRFS_DIR_INDEX_KEY;
5f39d397
CM
5354 char tmp_name[32];
5355 char *name_ptr;
5356 int name_len;
9cdda8d3 5357 int is_curr = 0; /* ctx->pos points to the current index? */
39279cc3
CM
5358
5359 /* FIXME, use a real flag for deciding about the key type */
5360 if (root->fs_info->tree_root == root)
5361 key_type = BTRFS_DIR_ITEM_KEY;
5f39d397 5362
9cdda8d3
AV
5363 if (!dir_emit_dots(file, ctx))
5364 return 0;
5365
49593bfa 5366 path = btrfs_alloc_path();
16cdcec7
MX
5367 if (!path)
5368 return -ENOMEM;
ff5714cc 5369
026fd317 5370 path->reada = 1;
49593bfa 5371
16cdcec7
MX
5372 if (key_type == BTRFS_DIR_INDEX_KEY) {
5373 INIT_LIST_HEAD(&ins_list);
5374 INIT_LIST_HEAD(&del_list);
5375 btrfs_get_delayed_items(inode, &ins_list, &del_list);
5376 }
5377
962a298f 5378 key.type = key_type;
9cdda8d3 5379 key.offset = ctx->pos;
33345d01 5380 key.objectid = btrfs_ino(inode);
5f39d397 5381
39279cc3
CM
5382 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5383 if (ret < 0)
5384 goto err;
49593bfa
DW
5385
5386 while (1) {
5f39d397 5387 leaf = path->nodes[0];
39279cc3 5388 slot = path->slots[0];
b9e03af0
LZ
5389 if (slot >= btrfs_header_nritems(leaf)) {
5390 ret = btrfs_next_leaf(root, path);
5391 if (ret < 0)
5392 goto err;
5393 else if (ret > 0)
5394 break;
5395 continue;
39279cc3 5396 }
3de4586c 5397
dd3cc16b 5398 item = btrfs_item_nr(slot);
5f39d397
CM
5399 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5400
5401 if (found_key.objectid != key.objectid)
39279cc3 5402 break;
962a298f 5403 if (found_key.type != key_type)
39279cc3 5404 break;
9cdda8d3 5405 if (found_key.offset < ctx->pos)
b9e03af0 5406 goto next;
16cdcec7
MX
5407 if (key_type == BTRFS_DIR_INDEX_KEY &&
5408 btrfs_should_delete_dir_index(&del_list,
5409 found_key.offset))
5410 goto next;
5f39d397 5411
9cdda8d3 5412 ctx->pos = found_key.offset;
16cdcec7 5413 is_curr = 1;
49593bfa 5414
39279cc3
CM
5415 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5416 di_cur = 0;
5f39d397 5417 di_total = btrfs_item_size(leaf, item);
49593bfa
DW
5418
5419 while (di_cur < di_total) {
5f39d397
CM
5420 struct btrfs_key location;
5421
22a94d44
JB
5422 if (verify_dir_item(root, leaf, di))
5423 break;
5424
5f39d397 5425 name_len = btrfs_dir_name_len(leaf, di);
49593bfa 5426 if (name_len <= sizeof(tmp_name)) {
5f39d397
CM
5427 name_ptr = tmp_name;
5428 } else {
5429 name_ptr = kmalloc(name_len, GFP_NOFS);
49593bfa
DW
5430 if (!name_ptr) {
5431 ret = -ENOMEM;
5432 goto err;
5433 }
5f39d397
CM
5434 }
5435 read_extent_buffer(leaf, name_ptr,
5436 (unsigned long)(di + 1), name_len);
5437
5438 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5439 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3de4586c 5440
fede766f 5441
3de4586c 5442 /* is this a reference to our own snapshot? If so
8c9c2bf7
AJ
5443 * skip it.
5444 *
5445 * In contrast to old kernels, we insert the snapshot's
5446 * dir item and dir index after it has been created, so
5447 * we won't find a reference to our own snapshot. We
5448 * still keep the following code for backward
5449 * compatibility.
3de4586c
CM
5450 */
5451 if (location.type == BTRFS_ROOT_ITEM_KEY &&
5452 location.objectid == root->root_key.objectid) {
5453 over = 0;
5454 goto skip;
5455 }
9cdda8d3
AV
5456 over = !dir_emit(ctx, name_ptr, name_len,
5457 location.objectid, d_type);
5f39d397 5458
3de4586c 5459skip:
5f39d397
CM
5460 if (name_ptr != tmp_name)
5461 kfree(name_ptr);
5462
39279cc3
CM
5463 if (over)
5464 goto nopos;
5103e947 5465 di_len = btrfs_dir_name_len(leaf, di) +
49593bfa 5466 btrfs_dir_data_len(leaf, di) + sizeof(*di);
39279cc3
CM
5467 di_cur += di_len;
5468 di = (struct btrfs_dir_item *)((char *)di + di_len);
5469 }
b9e03af0
LZ
5470next:
5471 path->slots[0]++;
39279cc3 5472 }
49593bfa 5473
16cdcec7
MX
5474 if (key_type == BTRFS_DIR_INDEX_KEY) {
5475 if (is_curr)
9cdda8d3
AV
5476 ctx->pos++;
5477 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
16cdcec7
MX
5478 if (ret)
5479 goto nopos;
5480 }
5481
49593bfa 5482 /* Reached end of directory/root. Bump pos past the last item. */
db62efbb
ZB
5483 ctx->pos++;
5484
5485 /*
5486 * Stop new entries from being returned after we return the last
5487 * entry.
5488 *
5489 * New directory entries are assigned a strictly increasing
5490 * offset. This means that new entries created during readdir
5491 * are *guaranteed* to be seen in the future by that readdir.
5492 * This has broken buggy programs which operate on names as
5493 * they're returned by readdir. Until we re-use freed offsets
5494 * we have this hack to stop new entries from being returned
5495 * under the assumption that they'll never reach this huge
5496 * offset.
5497 *
5498 * This is being careful not to overflow 32bit loff_t unless the
5499 * last entry requires it because doing so has broken 32bit apps
5500 * in the past.
5501 */
5502 if (key_type == BTRFS_DIR_INDEX_KEY) {
5503 if (ctx->pos >= INT_MAX)
5504 ctx->pos = LLONG_MAX;
5505 else
5506 ctx->pos = INT_MAX;
5507 }
39279cc3
CM
5508nopos:
5509 ret = 0;
5510err:
16cdcec7
MX
5511 if (key_type == BTRFS_DIR_INDEX_KEY)
5512 btrfs_put_delayed_items(&ins_list, &del_list);
39279cc3 5513 btrfs_free_path(path);
39279cc3
CM
5514 return ret;
5515}
5516
a9185b41 5517int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
39279cc3
CM
5518{
5519 struct btrfs_root *root = BTRFS_I(inode)->root;
5520 struct btrfs_trans_handle *trans;
5521 int ret = 0;
0af3d00b 5522 bool nolock = false;
39279cc3 5523
72ac3c0d 5524 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4ca8b41e
CM
5525 return 0;
5526
83eea1f1 5527 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
82d5902d 5528 nolock = true;
0af3d00b 5529
a9185b41 5530 if (wbc->sync_mode == WB_SYNC_ALL) {
0af3d00b 5531 if (nolock)
7a7eaa40 5532 trans = btrfs_join_transaction_nolock(root);
0af3d00b 5533 else
7a7eaa40 5534 trans = btrfs_join_transaction(root);
3612b495
TI
5535 if (IS_ERR(trans))
5536 return PTR_ERR(trans);
a698d075 5537 ret = btrfs_commit_transaction(trans, root);
39279cc3
CM
5538 }
5539 return ret;
5540}
5541
5542/*
54aa1f4d 5543 * This is somewhat expensive, updating the tree every time the
39279cc3
CM
5544 * inode changes. But, it is most likely to find the inode in cache.
5545 * FIXME, needs more benchmarking...there are no reasons other than performance
5546 * to keep or drop this code.
5547 */
48a3b636 5548static int btrfs_dirty_inode(struct inode *inode)
39279cc3
CM
5549{
5550 struct btrfs_root *root = BTRFS_I(inode)->root;
5551 struct btrfs_trans_handle *trans;
8929ecfa
YZ
5552 int ret;
5553
72ac3c0d 5554 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
22c44fe6 5555 return 0;
39279cc3 5556
7a7eaa40 5557 trans = btrfs_join_transaction(root);
22c44fe6
JB
5558 if (IS_ERR(trans))
5559 return PTR_ERR(trans);
8929ecfa
YZ
5560
5561 ret = btrfs_update_inode(trans, root, inode);
94b60442
CM
5562 if (ret && ret == -ENOSPC) {
5563 /* whoops, lets try again with the full transaction */
5564 btrfs_end_transaction(trans, root);
5565 trans = btrfs_start_transaction(root, 1);
22c44fe6
JB
5566 if (IS_ERR(trans))
5567 return PTR_ERR(trans);
8929ecfa 5568
94b60442 5569 ret = btrfs_update_inode(trans, root, inode);
94b60442 5570 }
39279cc3 5571 btrfs_end_transaction(trans, root);
16cdcec7
MX
5572 if (BTRFS_I(inode)->delayed_node)
5573 btrfs_balance_delayed_items(root);
22c44fe6
JB
5574
5575 return ret;
5576}
5577
5578/*
5579 * This is a copy of file_update_time. We need this so we can return error on
5580 * ENOSPC for updating the inode in the case of file write and mmap writes.
5581 */
e41f941a
JB
5582static int btrfs_update_time(struct inode *inode, struct timespec *now,
5583 int flags)
22c44fe6 5584{
2bc55652
AB
5585 struct btrfs_root *root = BTRFS_I(inode)->root;
5586
5587 if (btrfs_root_readonly(root))
5588 return -EROFS;
5589
e41f941a 5590 if (flags & S_VERSION)
22c44fe6 5591 inode_inc_iversion(inode);
e41f941a
JB
5592 if (flags & S_CTIME)
5593 inode->i_ctime = *now;
5594 if (flags & S_MTIME)
5595 inode->i_mtime = *now;
5596 if (flags & S_ATIME)
5597 inode->i_atime = *now;
5598 return btrfs_dirty_inode(inode);
39279cc3
CM
5599}
5600
d352ac68
CM
5601/*
5602 * find the highest existing sequence number in a directory
5603 * and then set the in-memory index_cnt variable to reflect
5604 * free sequence numbers
5605 */
aec7477b
JB
5606static int btrfs_set_inode_index_count(struct inode *inode)
5607{
5608 struct btrfs_root *root = BTRFS_I(inode)->root;
5609 struct btrfs_key key, found_key;
5610 struct btrfs_path *path;
5611 struct extent_buffer *leaf;
5612 int ret;
5613
33345d01 5614 key.objectid = btrfs_ino(inode);
962a298f 5615 key.type = BTRFS_DIR_INDEX_KEY;
aec7477b
JB
5616 key.offset = (u64)-1;
5617
5618 path = btrfs_alloc_path();
5619 if (!path)
5620 return -ENOMEM;
5621
5622 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5623 if (ret < 0)
5624 goto out;
5625 /* FIXME: we should be able to handle this */
5626 if (ret == 0)
5627 goto out;
5628 ret = 0;
5629
5630 /*
5631 * MAGIC NUMBER EXPLANATION:
5632 * since we search a directory based on f_pos we have to start at 2
5633 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
5634 * else has to start at 2
5635 */
5636 if (path->slots[0] == 0) {
5637 BTRFS_I(inode)->index_cnt = 2;
5638 goto out;
5639 }
5640
5641 path->slots[0]--;
5642
5643 leaf = path->nodes[0];
5644 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5645
33345d01 5646 if (found_key.objectid != btrfs_ino(inode) ||
962a298f 5647 found_key.type != BTRFS_DIR_INDEX_KEY) {
aec7477b
JB
5648 BTRFS_I(inode)->index_cnt = 2;
5649 goto out;
5650 }
5651
5652 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5653out:
5654 btrfs_free_path(path);
5655 return ret;
5656}
5657
d352ac68
CM
5658/*
5659 * helper to find a free sequence number in a given directory. This current
5660 * code is very simple, later versions will do smarter things in the btree
5661 */
3de4586c 5662int btrfs_set_inode_index(struct inode *dir, u64 *index)
aec7477b
JB
5663{
5664 int ret = 0;
5665
5666 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
16cdcec7
MX
5667 ret = btrfs_inode_delayed_dir_index_count(dir);
5668 if (ret) {
5669 ret = btrfs_set_inode_index_count(dir);
5670 if (ret)
5671 return ret;
5672 }
aec7477b
JB
5673 }
5674
00e4e6b3 5675 *index = BTRFS_I(dir)->index_cnt;
aec7477b
JB
5676 BTRFS_I(dir)->index_cnt++;
5677
5678 return ret;
5679}
5680
b0d5d10f
CM
5681static int btrfs_insert_inode_locked(struct inode *inode)
5682{
5683 struct btrfs_iget_args args;
5684 args.location = &BTRFS_I(inode)->location;
5685 args.root = BTRFS_I(inode)->root;
5686
5687 return insert_inode_locked4(inode,
5688 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5689 btrfs_find_actor, &args);
5690}
5691
39279cc3
CM
5692static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5693 struct btrfs_root *root,
aec7477b 5694 struct inode *dir,
9c58309d 5695 const char *name, int name_len,
175a4eb7
AV
5696 u64 ref_objectid, u64 objectid,
5697 umode_t mode, u64 *index)
39279cc3
CM
5698{
5699 struct inode *inode;
5f39d397 5700 struct btrfs_inode_item *inode_item;
39279cc3 5701 struct btrfs_key *location;
5f39d397 5702 struct btrfs_path *path;
9c58309d
CM
5703 struct btrfs_inode_ref *ref;
5704 struct btrfs_key key[2];
5705 u32 sizes[2];
ef3b9af5 5706 int nitems = name ? 2 : 1;
9c58309d 5707 unsigned long ptr;
39279cc3 5708 int ret;
39279cc3 5709
5f39d397 5710 path = btrfs_alloc_path();
d8926bb3
MF
5711 if (!path)
5712 return ERR_PTR(-ENOMEM);
5f39d397 5713
39279cc3 5714 inode = new_inode(root->fs_info->sb);
8fb27640
YS
5715 if (!inode) {
5716 btrfs_free_path(path);
39279cc3 5717 return ERR_PTR(-ENOMEM);
8fb27640 5718 }
39279cc3 5719
5762b5c9
FM
5720 /*
5721 * O_TMPFILE, set link count to 0, so that after this point,
5722 * we fill in an inode item with the correct link count.
5723 */
5724 if (!name)
5725 set_nlink(inode, 0);
5726
581bb050
LZ
5727 /*
5728 * we have to initialize this early, so we can reclaim the inode
5729 * number if we fail afterwards in this function.
5730 */
5731 inode->i_ino = objectid;
5732
ef3b9af5 5733 if (dir && name) {
1abe9b8a 5734 trace_btrfs_inode_request(dir);
5735
3de4586c 5736 ret = btrfs_set_inode_index(dir, index);
09771430 5737 if (ret) {
8fb27640 5738 btrfs_free_path(path);
09771430 5739 iput(inode);
aec7477b 5740 return ERR_PTR(ret);
09771430 5741 }
ef3b9af5
FM
5742 } else if (dir) {
5743 *index = 0;
aec7477b
JB
5744 }
5745 /*
5746 * index_cnt is ignored for everything but a dir,
5747 * btrfs_get_inode_index_count has an explanation for the magic
5748 * number
5749 */
5750 BTRFS_I(inode)->index_cnt = 2;
67de1176 5751 BTRFS_I(inode)->dir_index = *index;
39279cc3 5752 BTRFS_I(inode)->root = root;
e02119d5 5753 BTRFS_I(inode)->generation = trans->transid;
76195853 5754 inode->i_generation = BTRFS_I(inode)->generation;
b888db2b 5755
5dc562c5
JB
5756 /*
5757 * We could have gotten an inode number from somebody who was fsynced
5758 * and then removed in this same transaction, so let's just set full
5759 * sync since it will be a full sync anyway and this will blow away the
5760 * old info in the log.
5761 */
5762 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5763
9c58309d 5764 key[0].objectid = objectid;
962a298f 5765 key[0].type = BTRFS_INODE_ITEM_KEY;
9c58309d
CM
5766 key[0].offset = 0;
5767
9c58309d 5768 sizes[0] = sizeof(struct btrfs_inode_item);
ef3b9af5
FM
5769
5770 if (name) {
5771 /*
5772 * Start new inodes with an inode_ref. This is slightly more
5773 * efficient for small numbers of hard links since they will
5774 * be packed into one item. Extended refs will kick in if we
5775 * add more hard links than can fit in the ref item.
5776 */
5777 key[1].objectid = objectid;
962a298f 5778 key[1].type = BTRFS_INODE_REF_KEY;
ef3b9af5
FM
5779 key[1].offset = ref_objectid;
5780
5781 sizes[1] = name_len + sizeof(*ref);
5782 }
9c58309d 5783
b0d5d10f
CM
5784 location = &BTRFS_I(inode)->location;
5785 location->objectid = objectid;
5786 location->offset = 0;
962a298f 5787 location->type = BTRFS_INODE_ITEM_KEY;
b0d5d10f
CM
5788
5789 ret = btrfs_insert_inode_locked(inode);
5790 if (ret < 0)
5791 goto fail;
5792
b9473439 5793 path->leave_spinning = 1;
ef3b9af5 5794 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
9c58309d 5795 if (ret != 0)
b0d5d10f 5796 goto fail_unlock;
5f39d397 5797
ecc11fab 5798 inode_init_owner(inode, dir, mode);
a76a3cd4 5799 inode_set_bytes(inode, 0);
39279cc3 5800 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5f39d397
CM
5801 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5802 struct btrfs_inode_item);
293f7e07
LZ
5803 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
5804 sizeof(*inode_item));
e02119d5 5805 fill_inode_item(trans, path->nodes[0], inode_item, inode);
9c58309d 5806
ef3b9af5
FM
5807 if (name) {
5808 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5809 struct btrfs_inode_ref);
5810 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5811 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5812 ptr = (unsigned long)(ref + 1);
5813 write_extent_buffer(path->nodes[0], name, ptr, name_len);
5814 }
9c58309d 5815
5f39d397
CM
5816 btrfs_mark_buffer_dirty(path->nodes[0]);
5817 btrfs_free_path(path);
5818
6cbff00f
CH
5819 btrfs_inherit_iflags(inode, dir);
5820
569254b0 5821 if (S_ISREG(mode)) {
94272164
CM
5822 if (btrfs_test_opt(root, NODATASUM))
5823 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
213490b3 5824 if (btrfs_test_opt(root, NODATACOW))
f2bdf9a8
JB
5825 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5826 BTRFS_INODE_NODATASUM;
94272164
CM
5827 }
5828
5d4f98a2 5829 inode_tree_add(inode);
1abe9b8a 5830
5831 trace_btrfs_inode_new(inode);
1973f0fa 5832 btrfs_set_inode_last_trans(trans, inode);
1abe9b8a 5833
8ea05e3a
AB
5834 btrfs_update_root_times(trans, root);
5835
63541927
FDBM
5836 ret = btrfs_inode_inherit_props(trans, inode, dir);
5837 if (ret)
5838 btrfs_err(root->fs_info,
5839 "error inheriting props for ino %llu (root %llu): %d",
5840 btrfs_ino(inode), root->root_key.objectid, ret);
5841
39279cc3 5842 return inode;
b0d5d10f
CM
5843
5844fail_unlock:
5845 unlock_new_inode(inode);
5f39d397 5846fail:
ef3b9af5 5847 if (dir && name)
aec7477b 5848 BTRFS_I(dir)->index_cnt--;
5f39d397 5849 btrfs_free_path(path);
09771430 5850 iput(inode);
5f39d397 5851 return ERR_PTR(ret);
39279cc3
CM
5852}
5853
5854static inline u8 btrfs_inode_type(struct inode *inode)
5855{
5856 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5857}
5858
d352ac68
CM
5859/*
5860 * utility function to add 'inode' into 'parent_inode' with
5861 * a give name and a given sequence number.
5862 * if 'add_backref' is true, also insert a backref from the
5863 * inode to the parent directory.
5864 */
e02119d5
CM
5865int btrfs_add_link(struct btrfs_trans_handle *trans,
5866 struct inode *parent_inode, struct inode *inode,
5867 const char *name, int name_len, int add_backref, u64 index)
39279cc3 5868{
4df27c4d 5869 int ret = 0;
39279cc3 5870 struct btrfs_key key;
e02119d5 5871 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
33345d01
LZ
5872 u64 ino = btrfs_ino(inode);
5873 u64 parent_ino = btrfs_ino(parent_inode);
5f39d397 5874
33345d01 5875 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4df27c4d
YZ
5876 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5877 } else {
33345d01 5878 key.objectid = ino;
962a298f 5879 key.type = BTRFS_INODE_ITEM_KEY;
4df27c4d
YZ
5880 key.offset = 0;
5881 }
5882
33345d01 5883 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4df27c4d
YZ
5884 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
5885 key.objectid, root->root_key.objectid,
33345d01 5886 parent_ino, index, name, name_len);
4df27c4d 5887 } else if (add_backref) {
33345d01
LZ
5888 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
5889 parent_ino, index);
4df27c4d 5890 }
39279cc3 5891
79787eaa
JM
5892 /* Nothing to clean up yet */
5893 if (ret)
5894 return ret;
4df27c4d 5895
79787eaa
JM
5896 ret = btrfs_insert_dir_item(trans, root, name, name_len,
5897 parent_inode, &key,
5898 btrfs_inode_type(inode), index);
9c52057c 5899 if (ret == -EEXIST || ret == -EOVERFLOW)
79787eaa
JM
5900 goto fail_dir_item;
5901 else if (ret) {
5902 btrfs_abort_transaction(trans, root, ret);
5903 return ret;
39279cc3 5904 }
79787eaa
JM
5905
5906 btrfs_i_size_write(parent_inode, parent_inode->i_size +
5907 name_len * 2);
0c4d2d95 5908 inode_inc_iversion(parent_inode);
79787eaa
JM
5909 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
5910 ret = btrfs_update_inode(trans, root, parent_inode);
5911 if (ret)
5912 btrfs_abort_transaction(trans, root, ret);
39279cc3 5913 return ret;
fe66a05a
CM
5914
5915fail_dir_item:
5916 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5917 u64 local_index;
5918 int err;
5919 err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
5920 key.objectid, root->root_key.objectid,
5921 parent_ino, &local_index, name, name_len);
5922
5923 } else if (add_backref) {
5924 u64 local_index;
5925 int err;
5926
5927 err = btrfs_del_inode_ref(trans, root, name, name_len,
5928 ino, parent_ino, &local_index);
5929 }
5930 return ret;
39279cc3
CM
5931}
5932
5933static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
a1b075d2
JB
5934 struct inode *dir, struct dentry *dentry,
5935 struct inode *inode, int backref, u64 index)
39279cc3 5936{
a1b075d2
JB
5937 int err = btrfs_add_link(trans, dir, inode,
5938 dentry->d_name.name, dentry->d_name.len,
5939 backref, index);
39279cc3
CM
5940 if (err > 0)
5941 err = -EEXIST;
5942 return err;
5943}
5944
618e21d5 5945static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
1a67aafb 5946 umode_t mode, dev_t rdev)
618e21d5
JB
5947{
5948 struct btrfs_trans_handle *trans;
5949 struct btrfs_root *root = BTRFS_I(dir)->root;
1832a6d5 5950 struct inode *inode = NULL;
618e21d5
JB
5951 int err;
5952 int drop_inode = 0;
5953 u64 objectid;
00e4e6b3 5954 u64 index = 0;
618e21d5
JB
5955
5956 if (!new_valid_dev(rdev))
5957 return -EINVAL;
5958
9ed74f2d
JB
5959 /*
5960 * 2 for inode item and ref
5961 * 2 for dir items
5962 * 1 for xattr if selinux is on
5963 */
a22285a6
YZ
5964 trans = btrfs_start_transaction(root, 5);
5965 if (IS_ERR(trans))
5966 return PTR_ERR(trans);
1832a6d5 5967
581bb050
LZ
5968 err = btrfs_find_free_ino(root, &objectid);
5969 if (err)
5970 goto out_unlock;
5971
aec7477b 5972 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
33345d01 5973 dentry->d_name.len, btrfs_ino(dir), objectid,
d82a6f1d 5974 mode, &index);
7cf96da3
TI
5975 if (IS_ERR(inode)) {
5976 err = PTR_ERR(inode);
618e21d5 5977 goto out_unlock;
7cf96da3 5978 }
618e21d5 5979
ad19db71
CS
5980 /*
5981 * If the active LSM wants to access the inode during
5982 * d_instantiate it needs these. Smack checks to see
5983 * if the filesystem supports xattrs by looking at the
5984 * ops vector.
5985 */
ad19db71 5986 inode->i_op = &btrfs_special_inode_operations;
b0d5d10f
CM
5987 init_special_inode(inode, inode->i_mode, rdev);
5988
5989 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
618e21d5 5990 if (err)
b0d5d10f
CM
5991 goto out_unlock_inode;
5992
5993 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5994 if (err) {
5995 goto out_unlock_inode;
5996 } else {
1b4ab1bb 5997 btrfs_update_inode(trans, root, inode);
b0d5d10f 5998 unlock_new_inode(inode);
08c422c2 5999 d_instantiate(dentry, inode);
618e21d5 6000 }
b0d5d10f 6001
618e21d5 6002out_unlock:
7ad85bb7 6003 btrfs_end_transaction(trans, root);
c581afc8 6004 btrfs_balance_delayed_items(root);
b53d3f5d 6005 btrfs_btree_balance_dirty(root);
618e21d5
JB
6006 if (drop_inode) {
6007 inode_dec_link_count(inode);
6008 iput(inode);
6009 }
618e21d5 6010 return err;
b0d5d10f
CM
6011
6012out_unlock_inode:
6013 drop_inode = 1;
6014 unlock_new_inode(inode);
6015 goto out_unlock;
6016
618e21d5
JB
6017}
6018
39279cc3 6019static int btrfs_create(struct inode *dir, struct dentry *dentry,
ebfc3b49 6020 umode_t mode, bool excl)
39279cc3
CM
6021{
6022 struct btrfs_trans_handle *trans;
6023 struct btrfs_root *root = BTRFS_I(dir)->root;
1832a6d5 6024 struct inode *inode = NULL;
43baa579 6025 int drop_inode_on_err = 0;
a22285a6 6026 int err;
39279cc3 6027 u64 objectid;
00e4e6b3 6028 u64 index = 0;
39279cc3 6029
9ed74f2d
JB
6030 /*
6031 * 2 for inode item and ref
6032 * 2 for dir items
6033 * 1 for xattr if selinux is on
6034 */
a22285a6
YZ
6035 trans = btrfs_start_transaction(root, 5);
6036 if (IS_ERR(trans))
6037 return PTR_ERR(trans);
9ed74f2d 6038
581bb050
LZ
6039 err = btrfs_find_free_ino(root, &objectid);
6040 if (err)
6041 goto out_unlock;
6042
aec7477b 6043 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
33345d01 6044 dentry->d_name.len, btrfs_ino(dir), objectid,
d82a6f1d 6045 mode, &index);
7cf96da3
TI
6046 if (IS_ERR(inode)) {
6047 err = PTR_ERR(inode);
39279cc3 6048 goto out_unlock;
7cf96da3 6049 }
43baa579 6050 drop_inode_on_err = 1;
ad19db71
CS
6051 /*
6052 * If the active LSM wants to access the inode during
6053 * d_instantiate it needs these. Smack checks to see
6054 * if the filesystem supports xattrs by looking at the
6055 * ops vector.
6056 */
6057 inode->i_fop = &btrfs_file_operations;
6058 inode->i_op = &btrfs_file_inode_operations;
b0d5d10f
CM
6059 inode->i_mapping->a_ops = &btrfs_aops;
6060 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6061
6062 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6063 if (err)
6064 goto out_unlock_inode;
6065
6066 err = btrfs_update_inode(trans, root, inode);
6067 if (err)
6068 goto out_unlock_inode;
ad19db71 6069
a1b075d2 6070 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
39279cc3 6071 if (err)
b0d5d10f 6072 goto out_unlock_inode;
43baa579 6073
43baa579 6074 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
b0d5d10f 6075 unlock_new_inode(inode);
43baa579
FB
6076 d_instantiate(dentry, inode);
6077
39279cc3 6078out_unlock:
7ad85bb7 6079 btrfs_end_transaction(trans, root);
43baa579 6080 if (err && drop_inode_on_err) {
39279cc3
CM
6081 inode_dec_link_count(inode);
6082 iput(inode);
6083 }
c581afc8 6084 btrfs_balance_delayed_items(root);
b53d3f5d 6085 btrfs_btree_balance_dirty(root);
39279cc3 6086 return err;
b0d5d10f
CM
6087
6088out_unlock_inode:
6089 unlock_new_inode(inode);
6090 goto out_unlock;
6091
39279cc3
CM
6092}
6093
6094static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6095 struct dentry *dentry)
6096{
6097 struct btrfs_trans_handle *trans;
6098 struct btrfs_root *root = BTRFS_I(dir)->root;
6099 struct inode *inode = old_dentry->d_inode;
00e4e6b3 6100 u64 index;
39279cc3
CM
6101 int err;
6102 int drop_inode = 0;
6103
4a8be425
TH
6104 /* do not allow sys_link's with other subvols of the same device */
6105 if (root->objectid != BTRFS_I(inode)->root->objectid)
3ab3564f 6106 return -EXDEV;
4a8be425 6107
f186373f 6108 if (inode->i_nlink >= BTRFS_LINK_MAX)
c055e99e 6109 return -EMLINK;
4a8be425 6110
3de4586c 6111 err = btrfs_set_inode_index(dir, &index);
aec7477b
JB
6112 if (err)
6113 goto fail;
6114
a22285a6 6115 /*
7e6b6465 6116 * 2 items for inode and inode ref
a22285a6 6117 * 2 items for dir items
7e6b6465 6118 * 1 item for parent inode
a22285a6 6119 */
7e6b6465 6120 trans = btrfs_start_transaction(root, 5);
a22285a6
YZ
6121 if (IS_ERR(trans)) {
6122 err = PTR_ERR(trans);
6123 goto fail;
6124 }
5f39d397 6125
67de1176
MX
6126 /* There are several dir indexes for this inode, clear the cache. */
6127 BTRFS_I(inode)->dir_index = 0ULL;
8b558c5f 6128 inc_nlink(inode);
0c4d2d95 6129 inode_inc_iversion(inode);
3153495d 6130 inode->i_ctime = CURRENT_TIME;
7de9c6ee 6131 ihold(inode);
e9976151 6132 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
aec7477b 6133
a1b075d2 6134 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5f39d397 6135
a5719521 6136 if (err) {
54aa1f4d 6137 drop_inode = 1;
a5719521 6138 } else {
10d9f309 6139 struct dentry *parent = dentry->d_parent;
a5719521 6140 err = btrfs_update_inode(trans, root, inode);
79787eaa
JM
6141 if (err)
6142 goto fail;
ef3b9af5
FM
6143 if (inode->i_nlink == 1) {
6144 /*
6145 * If new hard link count is 1, it's a file created
6146 * with open(2) O_TMPFILE flag.
6147 */
6148 err = btrfs_orphan_del(trans, inode);
6149 if (err)
6150 goto fail;
6151 }
08c422c2 6152 d_instantiate(dentry, inode);
6a912213 6153 btrfs_log_new_name(trans, inode, NULL, parent);
a5719521 6154 }
39279cc3 6155
7ad85bb7 6156 btrfs_end_transaction(trans, root);
c581afc8 6157 btrfs_balance_delayed_items(root);
1832a6d5 6158fail:
39279cc3
CM
6159 if (drop_inode) {
6160 inode_dec_link_count(inode);
6161 iput(inode);
6162 }
b53d3f5d 6163 btrfs_btree_balance_dirty(root);
39279cc3
CM
6164 return err;
6165}
6166
18bb1db3 6167static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
39279cc3 6168{
b9d86667 6169 struct inode *inode = NULL;
39279cc3
CM
6170 struct btrfs_trans_handle *trans;
6171 struct btrfs_root *root = BTRFS_I(dir)->root;
6172 int err = 0;
6173 int drop_on_err = 0;
b9d86667 6174 u64 objectid = 0;
00e4e6b3 6175 u64 index = 0;
39279cc3 6176
9ed74f2d
JB
6177 /*
6178 * 2 items for inode and ref
6179 * 2 items for dir items
6180 * 1 for xattr if selinux is on
6181 */
a22285a6
YZ
6182 trans = btrfs_start_transaction(root, 5);
6183 if (IS_ERR(trans))
6184 return PTR_ERR(trans);
39279cc3 6185
581bb050
LZ
6186 err = btrfs_find_free_ino(root, &objectid);
6187 if (err)
6188 goto out_fail;
6189
aec7477b 6190 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
33345d01 6191 dentry->d_name.len, btrfs_ino(dir), objectid,
d82a6f1d 6192 S_IFDIR | mode, &index);
39279cc3
CM
6193 if (IS_ERR(inode)) {
6194 err = PTR_ERR(inode);
6195 goto out_fail;
6196 }
5f39d397 6197
39279cc3 6198 drop_on_err = 1;
b0d5d10f
CM
6199 /* these must be set before we unlock the inode */
6200 inode->i_op = &btrfs_dir_inode_operations;
6201 inode->i_fop = &btrfs_dir_file_operations;
33268eaf 6202
2a7dba39 6203 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
33268eaf 6204 if (err)
b0d5d10f 6205 goto out_fail_inode;
39279cc3 6206
dbe674a9 6207 btrfs_i_size_write(inode, 0);
39279cc3
CM
6208 err = btrfs_update_inode(trans, root, inode);
6209 if (err)
b0d5d10f 6210 goto out_fail_inode;
5f39d397 6211
a1b075d2
JB
6212 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
6213 dentry->d_name.len, 0, index);
39279cc3 6214 if (err)
b0d5d10f 6215 goto out_fail_inode;
5f39d397 6216
39279cc3 6217 d_instantiate(dentry, inode);
b0d5d10f
CM
6218 /*
6219 * mkdir is special. We're unlocking after we call d_instantiate
6220 * to avoid a race with nfsd calling d_instantiate.
6221 */
6222 unlock_new_inode(inode);
39279cc3 6223 drop_on_err = 0;
39279cc3
CM
6224
6225out_fail:
7ad85bb7 6226 btrfs_end_transaction(trans, root);
39279cc3
CM
6227 if (drop_on_err)
6228 iput(inode);
c581afc8 6229 btrfs_balance_delayed_items(root);
b53d3f5d 6230 btrfs_btree_balance_dirty(root);
39279cc3 6231 return err;
b0d5d10f
CM
6232
6233out_fail_inode:
6234 unlock_new_inode(inode);
6235 goto out_fail;
39279cc3
CM
6236}
6237
e6c4efd8
QW
6238/* Find next extent map of a given extent map, caller needs to ensure locks */
6239static struct extent_map *next_extent_map(struct extent_map *em)
6240{
6241 struct rb_node *next;
6242
6243 next = rb_next(&em->rb_node);
6244 if (!next)
6245 return NULL;
6246 return container_of(next, struct extent_map, rb_node);
6247}
6248
6249static struct extent_map *prev_extent_map(struct extent_map *em)
6250{
6251 struct rb_node *prev;
6252
6253 prev = rb_prev(&em->rb_node);
6254 if (!prev)
6255 return NULL;
6256 return container_of(prev, struct extent_map, rb_node);
6257}
6258
d352ac68 6259/* helper for btfs_get_extent. Given an existing extent in the tree,
e6c4efd8 6260 * the existing extent is the nearest extent to map_start,
d352ac68 6261 * and an extent that you want to insert, deal with overlap and insert
e6c4efd8 6262 * the best fitted new extent into the tree.
d352ac68 6263 */
3b951516
CM
6264static int merge_extent_mapping(struct extent_map_tree *em_tree,
6265 struct extent_map *existing,
e6dcd2dc 6266 struct extent_map *em,
51f395ad 6267 u64 map_start)
3b951516 6268{
e6c4efd8
QW
6269 struct extent_map *prev;
6270 struct extent_map *next;
6271 u64 start;
6272 u64 end;
3b951516 6273 u64 start_diff;
3b951516 6274
e6dcd2dc 6275 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
e6c4efd8
QW
6276
6277 if (existing->start > map_start) {
6278 next = existing;
6279 prev = prev_extent_map(next);
6280 } else {
6281 prev = existing;
6282 next = next_extent_map(prev);
6283 }
6284
6285 start = prev ? extent_map_end(prev) : em->start;
6286 start = max_t(u64, start, em->start);
6287 end = next ? next->start : extent_map_end(em);
6288 end = min_t(u64, end, extent_map_end(em));
6289 start_diff = start - em->start;
6290 em->start = start;
6291 em->len = end - start;
c8b97818
CM
6292 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6293 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
e6dcd2dc 6294 em->block_start += start_diff;
c8b97818
CM
6295 em->block_len -= start_diff;
6296 }
09a2a8f9 6297 return add_extent_mapping(em_tree, em, 0);
3b951516
CM
6298}
6299
c8b97818
CM
6300static noinline int uncompress_inline(struct btrfs_path *path,
6301 struct inode *inode, struct page *page,
6302 size_t pg_offset, u64 extent_offset,
6303 struct btrfs_file_extent_item *item)
6304{
6305 int ret;
6306 struct extent_buffer *leaf = path->nodes[0];
6307 char *tmp;
6308 size_t max_size;
6309 unsigned long inline_size;
6310 unsigned long ptr;
261507a0 6311 int compress_type;
c8b97818
CM
6312
6313 WARN_ON(pg_offset != 0);
261507a0 6314 compress_type = btrfs_file_extent_compression(leaf, item);
c8b97818
CM
6315 max_size = btrfs_file_extent_ram_bytes(leaf, item);
6316 inline_size = btrfs_file_extent_inline_item_len(leaf,
dd3cc16b 6317 btrfs_item_nr(path->slots[0]));
c8b97818 6318 tmp = kmalloc(inline_size, GFP_NOFS);
8d413713
TI
6319 if (!tmp)
6320 return -ENOMEM;
c8b97818
CM
6321 ptr = btrfs_file_extent_inline_start(item);
6322
6323 read_extent_buffer(leaf, tmp, ptr, inline_size);
6324
5b050f04 6325 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
261507a0
LZ
6326 ret = btrfs_decompress(compress_type, tmp, page,
6327 extent_offset, inline_size, max_size);
c8b97818 6328 kfree(tmp);
166ae5a4 6329 return ret;
c8b97818
CM
6330}
6331
d352ac68
CM
6332/*
6333 * a bit scary, this does extent mapping from logical file offset to the disk.
d397712b
CM
6334 * the ugly parts come from merging extents from the disk with the in-ram
6335 * representation. This gets more complex because of the data=ordered code,
d352ac68
CM
6336 * where the in-ram extents might be locked pending data=ordered completion.
6337 *
6338 * This also copies inline extents directly into the page.
6339 */
d397712b 6340
a52d9a80 6341struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
70dec807 6342 size_t pg_offset, u64 start, u64 len,
a52d9a80
CM
6343 int create)
6344{
6345 int ret;
6346 int err = 0;
a52d9a80
CM
6347 u64 extent_start = 0;
6348 u64 extent_end = 0;
33345d01 6349 u64 objectid = btrfs_ino(inode);
a52d9a80 6350 u32 found_type;
f421950f 6351 struct btrfs_path *path = NULL;
a52d9a80
CM
6352 struct btrfs_root *root = BTRFS_I(inode)->root;
6353 struct btrfs_file_extent_item *item;
5f39d397
CM
6354 struct extent_buffer *leaf;
6355 struct btrfs_key found_key;
a52d9a80
CM
6356 struct extent_map *em = NULL;
6357 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
d1310b2e 6358 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
a52d9a80 6359 struct btrfs_trans_handle *trans = NULL;
7ffbb598 6360 const bool new_inline = !page || create;
a52d9a80 6361
a52d9a80 6362again:
890871be 6363 read_lock(&em_tree->lock);
d1310b2e 6364 em = lookup_extent_mapping(em_tree, start, len);
a061fc8d
CM
6365 if (em)
6366 em->bdev = root->fs_info->fs_devices->latest_bdev;
890871be 6367 read_unlock(&em_tree->lock);
d1310b2e 6368
a52d9a80 6369 if (em) {
e1c4b745
CM
6370 if (em->start > start || em->start + em->len <= start)
6371 free_extent_map(em);
6372 else if (em->block_start == EXTENT_MAP_INLINE && page)
70dec807
CM
6373 free_extent_map(em);
6374 else
6375 goto out;
a52d9a80 6376 }
172ddd60 6377 em = alloc_extent_map();
a52d9a80 6378 if (!em) {
d1310b2e
CM
6379 err = -ENOMEM;
6380 goto out;
a52d9a80 6381 }
e6dcd2dc 6382 em->bdev = root->fs_info->fs_devices->latest_bdev;
d1310b2e 6383 em->start = EXTENT_MAP_HOLE;
445a6944 6384 em->orig_start = EXTENT_MAP_HOLE;
d1310b2e 6385 em->len = (u64)-1;
c8b97818 6386 em->block_len = (u64)-1;
f421950f
CM
6387
6388 if (!path) {
6389 path = btrfs_alloc_path();
026fd317
JB
6390 if (!path) {
6391 err = -ENOMEM;
6392 goto out;
6393 }
6394 /*
6395 * Chances are we'll be called again, so go ahead and do
6396 * readahead
6397 */
6398 path->reada = 1;
f421950f
CM
6399 }
6400
179e29e4
CM
6401 ret = btrfs_lookup_file_extent(trans, root, path,
6402 objectid, start, trans != NULL);
a52d9a80
CM
6403 if (ret < 0) {
6404 err = ret;
6405 goto out;
6406 }
6407
6408 if (ret != 0) {
6409 if (path->slots[0] == 0)
6410 goto not_found;
6411 path->slots[0]--;
6412 }
6413
5f39d397
CM
6414 leaf = path->nodes[0];
6415 item = btrfs_item_ptr(leaf, path->slots[0],
a52d9a80 6416 struct btrfs_file_extent_item);
a52d9a80 6417 /* are we inside the extent that was found? */
5f39d397 6418 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
962a298f 6419 found_type = found_key.type;
5f39d397 6420 if (found_key.objectid != objectid ||
a52d9a80 6421 found_type != BTRFS_EXTENT_DATA_KEY) {
25a50341
JB
6422 /*
6423 * If we backup past the first extent we want to move forward
6424 * and see if there is an extent in front of us, otherwise we'll
6425 * say there is a hole for our whole search range which can
6426 * cause problems.
6427 */
6428 extent_end = start;
6429 goto next;
a52d9a80
CM
6430 }
6431
5f39d397
CM
6432 found_type = btrfs_file_extent_type(leaf, item);
6433 extent_start = found_key.offset;
d899e052
YZ
6434 if (found_type == BTRFS_FILE_EXTENT_REG ||
6435 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
a52d9a80 6436 extent_end = extent_start +
db94535d 6437 btrfs_file_extent_num_bytes(leaf, item);
9036c102
YZ
6438 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6439 size_t size;
514ac8ad 6440 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
fda2832f 6441 extent_end = ALIGN(extent_start + size, root->sectorsize);
9036c102 6442 }
25a50341 6443next:
9036c102
YZ
6444 if (start >= extent_end) {
6445 path->slots[0]++;
6446 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6447 ret = btrfs_next_leaf(root, path);
6448 if (ret < 0) {
6449 err = ret;
6450 goto out;
a52d9a80 6451 }
9036c102
YZ
6452 if (ret > 0)
6453 goto not_found;
6454 leaf = path->nodes[0];
a52d9a80 6455 }
9036c102
YZ
6456 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6457 if (found_key.objectid != objectid ||
6458 found_key.type != BTRFS_EXTENT_DATA_KEY)
6459 goto not_found;
6460 if (start + len <= found_key.offset)
6461 goto not_found;
e2eca69d
WS
6462 if (start > found_key.offset)
6463 goto next;
9036c102 6464 em->start = start;
70c8a91c 6465 em->orig_start = start;
9036c102
YZ
6466 em->len = found_key.offset - start;
6467 goto not_found_em;
6468 }
6469
7ffbb598
FM
6470 btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
6471
d899e052
YZ
6472 if (found_type == BTRFS_FILE_EXTENT_REG ||
6473 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
a52d9a80
CM
6474 goto insert;
6475 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5f39d397 6476 unsigned long ptr;
a52d9a80 6477 char *map;
3326d1b0
CM
6478 size_t size;
6479 size_t extent_offset;
6480 size_t copy_size;
a52d9a80 6481
7ffbb598 6482 if (new_inline)
689f9346 6483 goto out;
5f39d397 6484
514ac8ad 6485 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
9036c102 6486 extent_offset = page_offset(page) + pg_offset - extent_start;
70dec807 6487 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
3326d1b0 6488 size - extent_offset);
3326d1b0 6489 em->start = extent_start + extent_offset;
fda2832f 6490 em->len = ALIGN(copy_size, root->sectorsize);
b4939680 6491 em->orig_block_len = em->len;
70c8a91c 6492 em->orig_start = em->start;
689f9346 6493 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
179e29e4 6494 if (create == 0 && !PageUptodate(page)) {
261507a0
LZ
6495 if (btrfs_file_extent_compression(leaf, item) !=
6496 BTRFS_COMPRESS_NONE) {
c8b97818
CM
6497 ret = uncompress_inline(path, inode, page,
6498 pg_offset,
6499 extent_offset, item);
166ae5a4
ZB
6500 if (ret) {
6501 err = ret;
6502 goto out;
6503 }
c8b97818
CM
6504 } else {
6505 map = kmap(page);
6506 read_extent_buffer(leaf, map + pg_offset, ptr,
6507 copy_size);
93c82d57
CM
6508 if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6509 memset(map + pg_offset + copy_size, 0,
6510 PAGE_CACHE_SIZE - pg_offset -
6511 copy_size);
6512 }
c8b97818
CM
6513 kunmap(page);
6514 }
179e29e4
CM
6515 flush_dcache_page(page);
6516 } else if (create && PageUptodate(page)) {
6bf7e080 6517 BUG();
179e29e4
CM
6518 if (!trans) {
6519 kunmap(page);
6520 free_extent_map(em);
6521 em = NULL;
ff5714cc 6522
b3b4aa74 6523 btrfs_release_path(path);
7a7eaa40 6524 trans = btrfs_join_transaction(root);
ff5714cc 6525
3612b495
TI
6526 if (IS_ERR(trans))
6527 return ERR_CAST(trans);
179e29e4
CM
6528 goto again;
6529 }
c8b97818 6530 map = kmap(page);
70dec807 6531 write_extent_buffer(leaf, map + pg_offset, ptr,
179e29e4 6532 copy_size);
c8b97818 6533 kunmap(page);
179e29e4 6534 btrfs_mark_buffer_dirty(leaf);
a52d9a80 6535 }
d1310b2e 6536 set_extent_uptodate(io_tree, em->start,
507903b8 6537 extent_map_end(em) - 1, NULL, GFP_NOFS);
a52d9a80 6538 goto insert;
a52d9a80
CM
6539 }
6540not_found:
6541 em->start = start;
70c8a91c 6542 em->orig_start = start;
d1310b2e 6543 em->len = len;
a52d9a80 6544not_found_em:
5f39d397 6545 em->block_start = EXTENT_MAP_HOLE;
9036c102 6546 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
a52d9a80 6547insert:
b3b4aa74 6548 btrfs_release_path(path);
d1310b2e 6549 if (em->start > start || extent_map_end(em) <= start) {
c2cf52eb 6550 btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
c1c9ff7c 6551 em->start, em->len, start, len);
a52d9a80
CM
6552 err = -EIO;
6553 goto out;
6554 }
d1310b2e
CM
6555
6556 err = 0;
890871be 6557 write_lock(&em_tree->lock);
09a2a8f9 6558 ret = add_extent_mapping(em_tree, em, 0);
3b951516
CM
6559 /* it is possible that someone inserted the extent into the tree
6560 * while we had the lock dropped. It is also possible that
6561 * an overlapping map exists in the tree
6562 */
a52d9a80 6563 if (ret == -EEXIST) {
3b951516 6564 struct extent_map *existing;
e6dcd2dc
CM
6565
6566 ret = 0;
6567
e6c4efd8
QW
6568 existing = search_extent_mapping(em_tree, start, len);
6569 /*
6570 * existing will always be non-NULL, since there must be
6571 * extent causing the -EEXIST.
6572 */
6573 if (start >= extent_map_end(existing) ||
32be3a1a 6574 start <= existing->start) {
e6c4efd8
QW
6575 /*
6576 * The existing extent map is the one nearest to
6577 * the [start, start + len) range which overlaps
6578 */
6579 err = merge_extent_mapping(em_tree, existing,
6580 em, start);
e1c4b745 6581 free_extent_map(existing);
e6c4efd8 6582 if (err) {
3b951516
CM
6583 free_extent_map(em);
6584 em = NULL;
6585 }
6586 } else {
6587 free_extent_map(em);
6588 em = existing;
e6dcd2dc 6589 err = 0;
a52d9a80 6590 }
a52d9a80 6591 }
890871be 6592 write_unlock(&em_tree->lock);
a52d9a80 6593out:
1abe9b8a 6594
4cd8587c 6595 trace_btrfs_get_extent(root, em);
1abe9b8a 6596
f421950f
CM
6597 if (path)
6598 btrfs_free_path(path);
a52d9a80
CM
6599 if (trans) {
6600 ret = btrfs_end_transaction(trans, root);
d397712b 6601 if (!err)
a52d9a80
CM
6602 err = ret;
6603 }
a52d9a80
CM
6604 if (err) {
6605 free_extent_map(em);
a52d9a80
CM
6606 return ERR_PTR(err);
6607 }
79787eaa 6608 BUG_ON(!em); /* Error is always set */
a52d9a80
CM
6609 return em;
6610}
6611
ec29ed5b
CM
6612struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6613 size_t pg_offset, u64 start, u64 len,
6614 int create)
6615{
6616 struct extent_map *em;
6617 struct extent_map *hole_em = NULL;
6618 u64 range_start = start;
6619 u64 end;
6620 u64 found;
6621 u64 found_end;
6622 int err = 0;
6623
6624 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6625 if (IS_ERR(em))
6626 return em;
6627 if (em) {
6628 /*
f9e4fb53
LB
6629 * if our em maps to
6630 * - a hole or
6631 * - a pre-alloc extent,
6632 * there might actually be delalloc bytes behind it.
ec29ed5b 6633 */
f9e4fb53
LB
6634 if (em->block_start != EXTENT_MAP_HOLE &&
6635 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
ec29ed5b
CM
6636 return em;
6637 else
6638 hole_em = em;
6639 }
6640
6641 /* check to see if we've wrapped (len == -1 or similar) */
6642 end = start + len;
6643 if (end < start)
6644 end = (u64)-1;
6645 else
6646 end -= 1;
6647
6648 em = NULL;
6649
6650 /* ok, we didn't find anything, lets look for delalloc */
6651 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6652 end, len, EXTENT_DELALLOC, 1);
6653 found_end = range_start + found;
6654 if (found_end < range_start)
6655 found_end = (u64)-1;
6656
6657 /*
6658 * we didn't find anything useful, return
6659 * the original results from get_extent()
6660 */
6661 if (range_start > end || found_end <= start) {
6662 em = hole_em;
6663 hole_em = NULL;
6664 goto out;
6665 }
6666
6667 /* adjust the range_start to make sure it doesn't
6668 * go backwards from the start they passed in
6669 */
67871254 6670 range_start = max(start, range_start);
ec29ed5b
CM
6671 found = found_end - range_start;
6672
6673 if (found > 0) {
6674 u64 hole_start = start;
6675 u64 hole_len = len;
6676
172ddd60 6677 em = alloc_extent_map();
ec29ed5b
CM
6678 if (!em) {
6679 err = -ENOMEM;
6680 goto out;
6681 }
6682 /*
6683 * when btrfs_get_extent can't find anything it
6684 * returns one huge hole
6685 *
6686 * make sure what it found really fits our range, and
6687 * adjust to make sure it is based on the start from
6688 * the caller
6689 */
6690 if (hole_em) {
6691 u64 calc_end = extent_map_end(hole_em);
6692
6693 if (calc_end <= start || (hole_em->start > end)) {
6694 free_extent_map(hole_em);
6695 hole_em = NULL;
6696 } else {
6697 hole_start = max(hole_em->start, start);
6698 hole_len = calc_end - hole_start;
6699 }
6700 }
6701 em->bdev = NULL;
6702 if (hole_em && range_start > hole_start) {
6703 /* our hole starts before our delalloc, so we
6704 * have to return just the parts of the hole
6705 * that go until the delalloc starts
6706 */
6707 em->len = min(hole_len,
6708 range_start - hole_start);
6709 em->start = hole_start;
6710 em->orig_start = hole_start;
6711 /*
6712 * don't adjust block start at all,
6713 * it is fixed at EXTENT_MAP_HOLE
6714 */
6715 em->block_start = hole_em->block_start;
6716 em->block_len = hole_len;
f9e4fb53
LB
6717 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6718 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
ec29ed5b
CM
6719 } else {
6720 em->start = range_start;
6721 em->len = found;
6722 em->orig_start = range_start;
6723 em->block_start = EXTENT_MAP_DELALLOC;
6724 em->block_len = found;
6725 }
6726 } else if (hole_em) {
6727 return hole_em;
6728 }
6729out:
6730
6731 free_extent_map(hole_em);
6732 if (err) {
6733 free_extent_map(em);
6734 return ERR_PTR(err);
6735 }
6736 return em;
6737}
6738
4b46fce2
JB
6739static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6740 u64 start, u64 len)
6741{
6742 struct btrfs_root *root = BTRFS_I(inode)->root;
70c8a91c 6743 struct extent_map *em;
4b46fce2
JB
6744 struct btrfs_key ins;
6745 u64 alloc_hint;
6746 int ret;
4b46fce2 6747
4b46fce2 6748 alloc_hint = get_extent_allocation_hint(inode, start, len);
00361589 6749 ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
e570fd27 6750 alloc_hint, &ins, 1, 1);
00361589
JB
6751 if (ret)
6752 return ERR_PTR(ret);
4b46fce2 6753
70c8a91c 6754 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
cc95bef6 6755 ins.offset, ins.offset, ins.offset, 0);
00361589 6756 if (IS_ERR(em)) {
e570fd27 6757 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
00361589
JB
6758 return em;
6759 }
4b46fce2
JB
6760
6761 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6762 ins.offset, ins.offset, 0);
6763 if (ret) {
e570fd27 6764 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
00361589
JB
6765 free_extent_map(em);
6766 return ERR_PTR(ret);
4b46fce2 6767 }
00361589 6768
4b46fce2
JB
6769 return em;
6770}
6771
46bfbb5c
CM
6772/*
6773 * returns 1 when the nocow is safe, < 1 on error, 0 if the
6774 * block must be cow'd
6775 */
00361589 6776noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7ee9e440
JB
6777 u64 *orig_start, u64 *orig_block_len,
6778 u64 *ram_bytes)
46bfbb5c 6779{
00361589 6780 struct btrfs_trans_handle *trans;
46bfbb5c
CM
6781 struct btrfs_path *path;
6782 int ret;
6783 struct extent_buffer *leaf;
6784 struct btrfs_root *root = BTRFS_I(inode)->root;
7b2b7085 6785 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
46bfbb5c
CM
6786 struct btrfs_file_extent_item *fi;
6787 struct btrfs_key key;
6788 u64 disk_bytenr;
6789 u64 backref_offset;
6790 u64 extent_end;
6791 u64 num_bytes;
6792 int slot;
6793 int found_type;
7ee9e440 6794 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
e77751aa 6795
46bfbb5c
CM
6796 path = btrfs_alloc_path();
6797 if (!path)
6798 return -ENOMEM;
6799
00361589 6800 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
46bfbb5c
CM
6801 offset, 0);
6802 if (ret < 0)
6803 goto out;
6804
6805 slot = path->slots[0];
6806 if (ret == 1) {
6807 if (slot == 0) {
6808 /* can't find the item, must cow */
6809 ret = 0;
6810 goto out;
6811 }
6812 slot--;
6813 }
6814 ret = 0;
6815 leaf = path->nodes[0];
6816 btrfs_item_key_to_cpu(leaf, &key, slot);
33345d01 6817 if (key.objectid != btrfs_ino(inode) ||
46bfbb5c
CM
6818 key.type != BTRFS_EXTENT_DATA_KEY) {
6819 /* not our file or wrong item type, must cow */
6820 goto out;
6821 }
6822
6823 if (key.offset > offset) {
6824 /* Wrong offset, must cow */
6825 goto out;
6826 }
6827
6828 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6829 found_type = btrfs_file_extent_type(leaf, fi);
6830 if (found_type != BTRFS_FILE_EXTENT_REG &&
6831 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
6832 /* not a regular extent, must cow */
6833 goto out;
6834 }
7ee9e440
JB
6835
6836 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6837 goto out;
6838
e77751aa
MX
6839 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6840 if (extent_end <= offset)
6841 goto out;
6842
46bfbb5c 6843 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7ee9e440
JB
6844 if (disk_bytenr == 0)
6845 goto out;
6846
6847 if (btrfs_file_extent_compression(leaf, fi) ||
6848 btrfs_file_extent_encryption(leaf, fi) ||
6849 btrfs_file_extent_other_encoding(leaf, fi))
6850 goto out;
6851
46bfbb5c
CM
6852 backref_offset = btrfs_file_extent_offset(leaf, fi);
6853
7ee9e440
JB
6854 if (orig_start) {
6855 *orig_start = key.offset - backref_offset;
6856 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6857 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6858 }
eb384b55 6859
46bfbb5c
CM
6860 if (btrfs_extent_readonly(root, disk_bytenr))
6861 goto out;
7b2b7085
MX
6862
6863 num_bytes = min(offset + *len, extent_end) - offset;
6864 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6865 u64 range_end;
6866
6867 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6868 ret = test_range_bit(io_tree, offset, range_end,
6869 EXTENT_DELALLOC, 0, NULL);
6870 if (ret) {
6871 ret = -EAGAIN;
6872 goto out;
6873 }
6874 }
6875
1bda19eb 6876 btrfs_release_path(path);
46bfbb5c
CM
6877
6878 /*
6879 * look for other files referencing this extent, if we
6880 * find any we must cow
6881 */
00361589
JB
6882 trans = btrfs_join_transaction(root);
6883 if (IS_ERR(trans)) {
6884 ret = 0;
46bfbb5c 6885 goto out;
00361589
JB
6886 }
6887
6888 ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
6889 key.offset - backref_offset, disk_bytenr);
6890 btrfs_end_transaction(trans, root);
6891 if (ret) {
6892 ret = 0;
6893 goto out;
6894 }
46bfbb5c
CM
6895
6896 /*
6897 * adjust disk_bytenr and num_bytes to cover just the bytes
6898 * in this extent we are about to write. If there
6899 * are any csums in that range we have to cow in order
6900 * to keep the csums correct
6901 */
6902 disk_bytenr += backref_offset;
6903 disk_bytenr += offset - key.offset;
46bfbb5c
CM
6904 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6905 goto out;
6906 /*
6907 * all of the above have passed, it is safe to overwrite this extent
6908 * without cow
6909 */
eb384b55 6910 *len = num_bytes;
46bfbb5c
CM
6911 ret = 1;
6912out:
6913 btrfs_free_path(path);
6914 return ret;
6915}
6916
fc4adbff
AG
6917bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
6918{
6919 struct radix_tree_root *root = &inode->i_mapping->page_tree;
6920 int found = false;
6921 void **pagep = NULL;
6922 struct page *page = NULL;
6923 int start_idx;
6924 int end_idx;
6925
6926 start_idx = start >> PAGE_CACHE_SHIFT;
6927
6928 /*
6929 * end is the last byte in the last page. end == start is legal
6930 */
6931 end_idx = end >> PAGE_CACHE_SHIFT;
6932
6933 rcu_read_lock();
6934
6935 /* Most of the code in this while loop is lifted from
6936 * find_get_page. It's been modified to begin searching from a
6937 * page and return just the first page found in that range. If the
6938 * found idx is less than or equal to the end idx then we know that
6939 * a page exists. If no pages are found or if those pages are
6940 * outside of the range then we're fine (yay!) */
6941 while (page == NULL &&
6942 radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
6943 page = radix_tree_deref_slot(pagep);
6944 if (unlikely(!page))
6945 break;
6946
6947 if (radix_tree_exception(page)) {
809f9016
FM
6948 if (radix_tree_deref_retry(page)) {
6949 page = NULL;
fc4adbff 6950 continue;
809f9016 6951 }
fc4adbff
AG
6952 /*
6953 * Otherwise, shmem/tmpfs must be storing a swap entry
6954 * here as an exceptional entry: so return it without
6955 * attempting to raise page count.
6956 */
6fdef6d4 6957 page = NULL;
fc4adbff
AG
6958 break; /* TODO: Is this relevant for this use case? */
6959 }
6960
91405151
FM
6961 if (!page_cache_get_speculative(page)) {
6962 page = NULL;
fc4adbff 6963 continue;
91405151 6964 }
fc4adbff
AG
6965
6966 /*
6967 * Has the page moved?
6968 * This is part of the lockless pagecache protocol. See
6969 * include/linux/pagemap.h for details.
6970 */
6971 if (unlikely(page != *pagep)) {
6972 page_cache_release(page);
6973 page = NULL;
6974 }
6975 }
6976
6977 if (page) {
6978 if (page->index <= end_idx)
6979 found = true;
6980 page_cache_release(page);
6981 }
6982
6983 rcu_read_unlock();
6984 return found;
6985}
6986
eb838e73
JB
6987static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6988 struct extent_state **cached_state, int writing)
6989{
6990 struct btrfs_ordered_extent *ordered;
6991 int ret = 0;
6992
6993 while (1) {
6994 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6995 0, cached_state);
6996 /*
6997 * We're concerned with the entire range that we're going to be
6998 * doing DIO to, so we need to make sure theres no ordered
6999 * extents in this range.
7000 */
7001 ordered = btrfs_lookup_ordered_range(inode, lockstart,
7002 lockend - lockstart + 1);
7003
7004 /*
7005 * We need to make sure there are no buffered pages in this
7006 * range either, we could have raced between the invalidate in
7007 * generic_file_direct_write and locking the extent. The
7008 * invalidate needs to happen so that reads after a write do not
7009 * get stale data.
7010 */
fc4adbff
AG
7011 if (!ordered &&
7012 (!writing ||
7013 !btrfs_page_exists_in_range(inode, lockstart, lockend)))
eb838e73
JB
7014 break;
7015
7016 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7017 cached_state, GFP_NOFS);
7018
7019 if (ordered) {
7020 btrfs_start_ordered_extent(inode, ordered, 1);
7021 btrfs_put_ordered_extent(ordered);
7022 } else {
7023 /* Screw you mmap */
7024 ret = filemap_write_and_wait_range(inode->i_mapping,
7025 lockstart,
7026 lockend);
7027 if (ret)
7028 break;
7029
7030 /*
7031 * If we found a page that couldn't be invalidated just
7032 * fall back to buffered.
7033 */
7034 ret = invalidate_inode_pages2_range(inode->i_mapping,
7035 lockstart >> PAGE_CACHE_SHIFT,
7036 lockend >> PAGE_CACHE_SHIFT);
7037 if (ret)
7038 break;
7039 }
7040
7041 cond_resched();
7042 }
7043
7044 return ret;
7045}
7046
69ffb543
JB
7047static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
7048 u64 len, u64 orig_start,
7049 u64 block_start, u64 block_len,
cc95bef6
JB
7050 u64 orig_block_len, u64 ram_bytes,
7051 int type)
69ffb543
JB
7052{
7053 struct extent_map_tree *em_tree;
7054 struct extent_map *em;
7055 struct btrfs_root *root = BTRFS_I(inode)->root;
7056 int ret;
7057
7058 em_tree = &BTRFS_I(inode)->extent_tree;
7059 em = alloc_extent_map();
7060 if (!em)
7061 return ERR_PTR(-ENOMEM);
7062
7063 em->start = start;
7064 em->orig_start = orig_start;
2ab28f32
JB
7065 em->mod_start = start;
7066 em->mod_len = len;
69ffb543
JB
7067 em->len = len;
7068 em->block_len = block_len;
7069 em->block_start = block_start;
7070 em->bdev = root->fs_info->fs_devices->latest_bdev;
b4939680 7071 em->orig_block_len = orig_block_len;
cc95bef6 7072 em->ram_bytes = ram_bytes;
70c8a91c 7073 em->generation = -1;
69ffb543
JB
7074 set_bit(EXTENT_FLAG_PINNED, &em->flags);
7075 if (type == BTRFS_ORDERED_PREALLOC)
b11e234d 7076 set_bit(EXTENT_FLAG_FILLING, &em->flags);
69ffb543
JB
7077
7078 do {
7079 btrfs_drop_extent_cache(inode, em->start,
7080 em->start + em->len - 1, 0);
7081 write_lock(&em_tree->lock);
09a2a8f9 7082 ret = add_extent_mapping(em_tree, em, 1);
69ffb543
JB
7083 write_unlock(&em_tree->lock);
7084 } while (ret == -EEXIST);
7085
7086 if (ret) {
7087 free_extent_map(em);
7088 return ERR_PTR(ret);
7089 }
7090
7091 return em;
7092}
7093
7094
4b46fce2
JB
7095static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7096 struct buffer_head *bh_result, int create)
7097{
7098 struct extent_map *em;
7099 struct btrfs_root *root = BTRFS_I(inode)->root;
eb838e73 7100 struct extent_state *cached_state = NULL;
4b46fce2 7101 u64 start = iblock << inode->i_blkbits;
eb838e73 7102 u64 lockstart, lockend;
4b46fce2 7103 u64 len = bh_result->b_size;
eb838e73 7104 int unlock_bits = EXTENT_LOCKED;
0934856d 7105 int ret = 0;
eb838e73 7106
172a5049 7107 if (create)
eb838e73 7108 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
172a5049 7109 else
c329861d 7110 len = min_t(u64, len, root->sectorsize);
eb838e73 7111
c329861d
JB
7112 lockstart = start;
7113 lockend = start + len - 1;
7114
eb838e73
JB
7115 /*
7116 * If this errors out it's because we couldn't invalidate pagecache for
7117 * this range and we need to fallback to buffered.
7118 */
7119 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
7120 return -ENOTBLK;
7121
4b46fce2 7122 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
eb838e73
JB
7123 if (IS_ERR(em)) {
7124 ret = PTR_ERR(em);
7125 goto unlock_err;
7126 }
4b46fce2
JB
7127
7128 /*
7129 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7130 * io. INLINE is special, and we could probably kludge it in here, but
7131 * it's still buffered so for safety lets just fall back to the generic
7132 * buffered path.
7133 *
7134 * For COMPRESSED we _have_ to read the entire extent in so we can
7135 * decompress it, so there will be buffering required no matter what we
7136 * do, so go ahead and fallback to buffered.
7137 *
7138 * We return -ENOTBLK because thats what makes DIO go ahead and go back
7139 * to buffered IO. Don't blame me, this is the price we pay for using
7140 * the generic code.
7141 */
7142 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7143 em->block_start == EXTENT_MAP_INLINE) {
7144 free_extent_map(em);
eb838e73
JB
7145 ret = -ENOTBLK;
7146 goto unlock_err;
4b46fce2
JB
7147 }
7148
7149 /* Just a good old fashioned hole, return */
7150 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7151 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7152 free_extent_map(em);
eb838e73 7153 goto unlock_err;
4b46fce2
JB
7154 }
7155
7156 /*
7157 * We don't allocate a new extent in the following cases
7158 *
7159 * 1) The inode is marked as NODATACOW. In this case we'll just use the
7160 * existing extent.
7161 * 2) The extent is marked as PREALLOC. We're good to go here and can
7162 * just use the extent.
7163 *
7164 */
46bfbb5c 7165 if (!create) {
eb838e73
JB
7166 len = min(len, em->len - (start - em->start));
7167 lockstart = start + len;
7168 goto unlock;
46bfbb5c 7169 }
4b46fce2
JB
7170
7171 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7172 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7173 em->block_start != EXTENT_MAP_HOLE)) {
4b46fce2
JB
7174 int type;
7175 int ret;
eb384b55 7176 u64 block_start, orig_start, orig_block_len, ram_bytes;
4b46fce2
JB
7177
7178 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7179 type = BTRFS_ORDERED_PREALLOC;
7180 else
7181 type = BTRFS_ORDERED_NOCOW;
46bfbb5c 7182 len = min(len, em->len - (start - em->start));
4b46fce2 7183 block_start = em->block_start + (start - em->start);
46bfbb5c 7184
00361589 7185 if (can_nocow_extent(inode, start, &len, &orig_start,
7ee9e440 7186 &orig_block_len, &ram_bytes) == 1) {
69ffb543
JB
7187 if (type == BTRFS_ORDERED_PREALLOC) {
7188 free_extent_map(em);
7189 em = create_pinned_em(inode, start, len,
7190 orig_start,
b4939680 7191 block_start, len,
cc95bef6
JB
7192 orig_block_len,
7193 ram_bytes, type);
555e1286
FM
7194 if (IS_ERR(em)) {
7195 ret = PTR_ERR(em);
69ffb543 7196 goto unlock_err;
555e1286 7197 }
69ffb543
JB
7198 }
7199
46bfbb5c
CM
7200 ret = btrfs_add_ordered_extent_dio(inode, start,
7201 block_start, len, len, type);
46bfbb5c
CM
7202 if (ret) {
7203 free_extent_map(em);
eb838e73 7204 goto unlock_err;
46bfbb5c
CM
7205 }
7206 goto unlock;
4b46fce2 7207 }
4b46fce2 7208 }
00361589 7209
46bfbb5c
CM
7210 /*
7211 * this will cow the extent, reset the len in case we changed
7212 * it above
7213 */
7214 len = bh_result->b_size;
70c8a91c
JB
7215 free_extent_map(em);
7216 em = btrfs_new_extent_direct(inode, start, len);
eb838e73
JB
7217 if (IS_ERR(em)) {
7218 ret = PTR_ERR(em);
7219 goto unlock_err;
7220 }
46bfbb5c
CM
7221 len = min(len, em->len - (start - em->start));
7222unlock:
4b46fce2
JB
7223 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7224 inode->i_blkbits;
46bfbb5c 7225 bh_result->b_size = len;
4b46fce2
JB
7226 bh_result->b_bdev = em->bdev;
7227 set_buffer_mapped(bh_result);
c3473e83
JB
7228 if (create) {
7229 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7230 set_buffer_new(bh_result);
7231
7232 /*
7233 * Need to update the i_size under the extent lock so buffered
7234 * readers will get the updated i_size when we unlock.
7235 */
7236 if (start + len > i_size_read(inode))
7237 i_size_write(inode, start + len);
0934856d 7238
172a5049
MX
7239 spin_lock(&BTRFS_I(inode)->lock);
7240 BTRFS_I(inode)->outstanding_extents++;
7241 spin_unlock(&BTRFS_I(inode)->lock);
7242
0934856d
MX
7243 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7244 lockstart + len - 1, EXTENT_DELALLOC, NULL,
7245 &cached_state, GFP_NOFS);
7246 BUG_ON(ret);
c3473e83 7247 }
4b46fce2 7248
eb838e73
JB
7249 /*
7250 * In the case of write we need to clear and unlock the entire range,
7251 * in the case of read we need to unlock only the end area that we
7252 * aren't using if there is any left over space.
7253 */
24c03fa5 7254 if (lockstart < lockend) {
0934856d
MX
7255 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7256 lockend, unlock_bits, 1, 0,
7257 &cached_state, GFP_NOFS);
24c03fa5 7258 } else {
eb838e73 7259 free_extent_state(cached_state);
24c03fa5 7260 }
eb838e73 7261
4b46fce2
JB
7262 free_extent_map(em);
7263
7264 return 0;
eb838e73
JB
7265
7266unlock_err:
eb838e73
JB
7267 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7268 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7269 return ret;
4b46fce2
JB
7270}
7271
8b110e39
MX
7272static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7273 int rw, int mirror_num)
7274{
7275 struct btrfs_root *root = BTRFS_I(inode)->root;
7276 int ret;
7277
7278 BUG_ON(rw & REQ_WRITE);
7279
7280 bio_get(bio);
7281
7282 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7283 BTRFS_WQ_ENDIO_DIO_REPAIR);
7284 if (ret)
7285 goto err;
7286
7287 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
7288err:
7289 bio_put(bio);
7290 return ret;
7291}
7292
7293static int btrfs_check_dio_repairable(struct inode *inode,
7294 struct bio *failed_bio,
7295 struct io_failure_record *failrec,
7296 int failed_mirror)
7297{
7298 int num_copies;
7299
7300 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
7301 failrec->logical, failrec->len);
7302 if (num_copies == 1) {
7303 /*
7304 * we only have a single copy of the data, so don't bother with
7305 * all the retry and error correction code that follows. no
7306 * matter what the error is, it is very likely to persist.
7307 */
7308 pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
7309 num_copies, failrec->this_mirror, failed_mirror);
7310 return 0;
7311 }
7312
7313 failrec->failed_mirror = failed_mirror;
7314 failrec->this_mirror++;
7315 if (failrec->this_mirror == failed_mirror)
7316 failrec->this_mirror++;
7317
7318 if (failrec->this_mirror > num_copies) {
7319 pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
7320 num_copies, failrec->this_mirror, failed_mirror);
7321 return 0;
7322 }
7323
7324 return 1;
7325}
7326
7327static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7328 struct page *page, u64 start, u64 end,
7329 int failed_mirror, bio_end_io_t *repair_endio,
7330 void *repair_arg)
7331{
7332 struct io_failure_record *failrec;
7333 struct bio *bio;
7334 int isector;
7335 int read_mode;
7336 int ret;
7337
7338 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
7339
7340 ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7341 if (ret)
7342 return ret;
7343
7344 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7345 failed_mirror);
7346 if (!ret) {
7347 free_io_failure(inode, failrec);
7348 return -EIO;
7349 }
7350
7351 if (failed_bio->bi_vcnt > 1)
7352 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
7353 else
7354 read_mode = READ_SYNC;
7355
7356 isector = start - btrfs_io_bio(failed_bio)->logical;
7357 isector >>= inode->i_sb->s_blocksize_bits;
7358 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7359 0, isector, repair_endio, repair_arg);
7360 if (!bio) {
7361 free_io_failure(inode, failrec);
7362 return -EIO;
7363 }
7364
7365 btrfs_debug(BTRFS_I(inode)->root->fs_info,
7366 "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
7367 read_mode, failrec->this_mirror, failrec->in_validation);
7368
7369 ret = submit_dio_repair_bio(inode, bio, read_mode,
7370 failrec->this_mirror);
7371 if (ret) {
7372 free_io_failure(inode, failrec);
7373 bio_put(bio);
7374 }
7375
7376 return ret;
7377}
7378
7379struct btrfs_retry_complete {
7380 struct completion done;
7381 struct inode *inode;
7382 u64 start;
7383 int uptodate;
7384};
7385
7386static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
7387{
7388 struct btrfs_retry_complete *done = bio->bi_private;
7389 struct bio_vec *bvec;
7390 int i;
7391
7392 if (err)
7393 goto end;
7394
7395 done->uptodate = 1;
7396 bio_for_each_segment_all(bvec, bio, i)
7397 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
7398end:
7399 complete(&done->done);
7400 bio_put(bio);
7401}
7402
7403static int __btrfs_correct_data_nocsum(struct inode *inode,
7404 struct btrfs_io_bio *io_bio)
4b46fce2 7405{
2c30c71b 7406 struct bio_vec *bvec;
8b110e39 7407 struct btrfs_retry_complete done;
4b46fce2 7408 u64 start;
2c30c71b 7409 int i;
c1dc0896 7410 int ret;
4b46fce2 7411
8b110e39
MX
7412 start = io_bio->logical;
7413 done.inode = inode;
7414
7415 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7416try_again:
7417 done.uptodate = 0;
7418 done.start = start;
7419 init_completion(&done.done);
7420
7421 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7422 start + bvec->bv_len - 1,
7423 io_bio->mirror_num,
7424 btrfs_retry_endio_nocsum, &done);
7425 if (ret)
7426 return ret;
7427
7428 wait_for_completion(&done.done);
7429
7430 if (!done.uptodate) {
7431 /* We might have another mirror, so try again */
7432 goto try_again;
7433 }
7434
7435 start += bvec->bv_len;
7436 }
7437
7438 return 0;
7439}
7440
7441static void btrfs_retry_endio(struct bio *bio, int err)
7442{
7443 struct btrfs_retry_complete *done = bio->bi_private;
7444 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7445 struct bio_vec *bvec;
7446 int uptodate;
7447 int ret;
7448 int i;
7449
7450 if (err)
7451 goto end;
7452
7453 uptodate = 1;
7454 bio_for_each_segment_all(bvec, bio, i) {
7455 ret = __readpage_endio_check(done->inode, io_bio, i,
7456 bvec->bv_page, 0,
7457 done->start, bvec->bv_len);
7458 if (!ret)
7459 clean_io_failure(done->inode, done->start,
7460 bvec->bv_page, 0);
7461 else
7462 uptodate = 0;
7463 }
7464
7465 done->uptodate = uptodate;
7466end:
7467 complete(&done->done);
7468 bio_put(bio);
7469}
7470
7471static int __btrfs_subio_endio_read(struct inode *inode,
7472 struct btrfs_io_bio *io_bio, int err)
7473{
7474 struct bio_vec *bvec;
7475 struct btrfs_retry_complete done;
7476 u64 start;
7477 u64 offset = 0;
7478 int i;
7479 int ret;
dc380aea 7480
8b110e39 7481 err = 0;
c1dc0896 7482 start = io_bio->logical;
8b110e39
MX
7483 done.inode = inode;
7484
c1dc0896 7485 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
dc380aea
MX
7486 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
7487 0, start, bvec->bv_len);
8b110e39
MX
7488 if (likely(!ret))
7489 goto next;
7490try_again:
7491 done.uptodate = 0;
7492 done.start = start;
7493 init_completion(&done.done);
7494
7495 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7496 start + bvec->bv_len - 1,
7497 io_bio->mirror_num,
7498 btrfs_retry_endio, &done);
7499 if (ret) {
7500 err = ret;
7501 goto next;
7502 }
7503
7504 wait_for_completion(&done.done);
7505
7506 if (!done.uptodate) {
7507 /* We might have another mirror, so try again */
7508 goto try_again;
7509 }
7510next:
7511 offset += bvec->bv_len;
4b46fce2 7512 start += bvec->bv_len;
2c30c71b 7513 }
c1dc0896
MX
7514
7515 return err;
7516}
7517
8b110e39
MX
7518static int btrfs_subio_endio_read(struct inode *inode,
7519 struct btrfs_io_bio *io_bio, int err)
7520{
7521 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7522
7523 if (skip_csum) {
7524 if (unlikely(err))
7525 return __btrfs_correct_data_nocsum(inode, io_bio);
7526 else
7527 return 0;
7528 } else {
7529 return __btrfs_subio_endio_read(inode, io_bio, err);
7530 }
7531}
7532
c1dc0896
MX
7533static void btrfs_endio_direct_read(struct bio *bio, int err)
7534{
7535 struct btrfs_dio_private *dip = bio->bi_private;
7536 struct inode *inode = dip->inode;
7537 struct bio *dio_bio;
7538 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7539
8b110e39
MX
7540 if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
7541 err = btrfs_subio_endio_read(inode, io_bio, err);
c1dc0896 7542
4b46fce2 7543 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
d0082371 7544 dip->logical_offset + dip->bytes - 1);
9be3395b 7545 dio_bio = dip->dio_bio;
4b46fce2 7546
4b46fce2 7547 kfree(dip);
c0da7aa1
JB
7548
7549 /* If we had a csum failure make sure to clear the uptodate flag */
7550 if (err)
9be3395b
CM
7551 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7552 dio_end_io(dio_bio, err);
23ea8e5a
MX
7553
7554 if (io_bio->end_io)
7555 io_bio->end_io(io_bio, err);
9be3395b 7556 bio_put(bio);
4b46fce2
JB
7557}
7558
7559static void btrfs_endio_direct_write(struct bio *bio, int err)
7560{
7561 struct btrfs_dio_private *dip = bio->bi_private;
7562 struct inode *inode = dip->inode;
7563 struct btrfs_root *root = BTRFS_I(inode)->root;
4b46fce2 7564 struct btrfs_ordered_extent *ordered = NULL;
163cf09c
CM
7565 u64 ordered_offset = dip->logical_offset;
7566 u64 ordered_bytes = dip->bytes;
9be3395b 7567 struct bio *dio_bio;
4b46fce2
JB
7568 int ret;
7569
7570 if (err)
7571 goto out_done;
163cf09c
CM
7572again:
7573 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
7574 &ordered_offset,
5fd02043 7575 ordered_bytes, !err);
4b46fce2 7576 if (!ret)
163cf09c 7577 goto out_test;
4b46fce2 7578
9e0af237
LB
7579 btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
7580 finish_ordered_fn, NULL, NULL);
fccb5d86
QW
7581 btrfs_queue_work(root->fs_info->endio_write_workers,
7582 &ordered->work);
163cf09c
CM
7583out_test:
7584 /*
7585 * our bio might span multiple ordered extents. If we haven't
7586 * completed the accounting for the whole dio, go back and try again
7587 */
7588 if (ordered_offset < dip->logical_offset + dip->bytes) {
7589 ordered_bytes = dip->logical_offset + dip->bytes -
7590 ordered_offset;
5fd02043 7591 ordered = NULL;
163cf09c
CM
7592 goto again;
7593 }
4b46fce2 7594out_done:
9be3395b 7595 dio_bio = dip->dio_bio;
4b46fce2 7596
4b46fce2 7597 kfree(dip);
c0da7aa1
JB
7598
7599 /* If we had an error make sure to clear the uptodate flag */
7600 if (err)
9be3395b
CM
7601 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7602 dio_end_io(dio_bio, err);
7603 bio_put(bio);
4b46fce2
JB
7604}
7605
eaf25d93
CM
7606static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
7607 struct bio *bio, int mirror_num,
7608 unsigned long bio_flags, u64 offset)
7609{
7610 int ret;
7611 struct btrfs_root *root = BTRFS_I(inode)->root;
7612 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
79787eaa 7613 BUG_ON(ret); /* -ENOMEM */
eaf25d93
CM
7614 return 0;
7615}
7616
e65e1535
MX
7617static void btrfs_end_dio_bio(struct bio *bio, int err)
7618{
7619 struct btrfs_dio_private *dip = bio->bi_private;
7620
8b110e39
MX
7621 if (err)
7622 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
7623 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7624 btrfs_ino(dip->inode), bio->bi_rw,
7625 (unsigned long long)bio->bi_iter.bi_sector,
7626 bio->bi_iter.bi_size, err);
7627
7628 if (dip->subio_endio)
7629 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
c1dc0896
MX
7630
7631 if (err) {
e65e1535
MX
7632 dip->errors = 1;
7633
7634 /*
7635 * before atomic variable goto zero, we must make sure
7636 * dip->errors is perceived to be set.
7637 */
4e857c58 7638 smp_mb__before_atomic();
e65e1535
MX
7639 }
7640
7641 /* if there are more bios still pending for this dio, just exit */
7642 if (!atomic_dec_and_test(&dip->pending_bios))
7643 goto out;
7644
9be3395b 7645 if (dip->errors) {
e65e1535 7646 bio_io_error(dip->orig_bio);
9be3395b
CM
7647 } else {
7648 set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
e65e1535
MX
7649 bio_endio(dip->orig_bio, 0);
7650 }
7651out:
7652 bio_put(bio);
7653}
7654
7655static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7656 u64 first_sector, gfp_t gfp_flags)
7657{
7658 int nr_vecs = bio_get_nr_vecs(bdev);
7659 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7660}
7661
c1dc0896
MX
7662static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
7663 struct inode *inode,
7664 struct btrfs_dio_private *dip,
7665 struct bio *bio,
7666 u64 file_offset)
7667{
7668 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7669 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
7670 int ret;
7671
7672 /*
7673 * We load all the csum data we need when we submit
7674 * the first bio to reduce the csum tree search and
7675 * contention.
7676 */
7677 if (dip->logical_offset == file_offset) {
7678 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
7679 file_offset);
7680 if (ret)
7681 return ret;
7682 }
7683
7684 if (bio == dip->orig_bio)
7685 return 0;
7686
7687 file_offset -= dip->logical_offset;
7688 file_offset >>= inode->i_sb->s_blocksize_bits;
7689 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
7690
7691 return 0;
7692}
7693
e65e1535
MX
7694static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7695 int rw, u64 file_offset, int skip_sum,
c329861d 7696 int async_submit)
e65e1535 7697{
facc8a22 7698 struct btrfs_dio_private *dip = bio->bi_private;
e65e1535
MX
7699 int write = rw & REQ_WRITE;
7700 struct btrfs_root *root = BTRFS_I(inode)->root;
7701 int ret;
7702
b812ce28
JB
7703 if (async_submit)
7704 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
7705
e65e1535 7706 bio_get(bio);
5fd02043
JB
7707
7708 if (!write) {
bfebd8b5
DS
7709 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7710 BTRFS_WQ_ENDIO_DATA);
5fd02043
JB
7711 if (ret)
7712 goto err;
7713 }
e65e1535 7714
1ae39938
JB
7715 if (skip_sum)
7716 goto map;
7717
7718 if (write && async_submit) {
e65e1535
MX
7719 ret = btrfs_wq_submit_bio(root->fs_info,
7720 inode, rw, bio, 0, 0,
7721 file_offset,
7722 __btrfs_submit_bio_start_direct_io,
7723 __btrfs_submit_bio_done);
7724 goto err;
1ae39938
JB
7725 } else if (write) {
7726 /*
7727 * If we aren't doing async submit, calculate the csum of the
7728 * bio now.
7729 */
7730 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
7731 if (ret)
7732 goto err;
23ea8e5a 7733 } else {
c1dc0896
MX
7734 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
7735 file_offset);
c2db1073
TI
7736 if (ret)
7737 goto err;
7738 }
1ae39938
JB
7739map:
7740 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
e65e1535
MX
7741err:
7742 bio_put(bio);
7743 return ret;
7744}
7745
7746static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7747 int skip_sum)
7748{
7749 struct inode *inode = dip->inode;
7750 struct btrfs_root *root = BTRFS_I(inode)->root;
e65e1535
MX
7751 struct bio *bio;
7752 struct bio *orig_bio = dip->orig_bio;
7753 struct bio_vec *bvec = orig_bio->bi_io_vec;
4f024f37 7754 u64 start_sector = orig_bio->bi_iter.bi_sector;
e65e1535
MX
7755 u64 file_offset = dip->logical_offset;
7756 u64 submit_len = 0;
7757 u64 map_length;
7758 int nr_pages = 0;
23ea8e5a 7759 int ret;
1ae39938 7760 int async_submit = 0;
e65e1535 7761
4f024f37 7762 map_length = orig_bio->bi_iter.bi_size;
53b381b3 7763 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
e65e1535 7764 &map_length, NULL, 0);
7a5c3c9b 7765 if (ret)
e65e1535 7766 return -EIO;
facc8a22 7767
4f024f37 7768 if (map_length >= orig_bio->bi_iter.bi_size) {
02f57c7a 7769 bio = orig_bio;
c1dc0896 7770 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
02f57c7a
JB
7771 goto submit;
7772 }
7773
53b381b3
DW
7774 /* async crcs make it difficult to collect full stripe writes. */
7775 if (btrfs_get_alloc_profile(root, 1) &
7776 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7777 async_submit = 0;
7778 else
7779 async_submit = 1;
7780
02f57c7a
JB
7781 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
7782 if (!bio)
7783 return -ENOMEM;
7a5c3c9b 7784
02f57c7a
JB
7785 bio->bi_private = dip;
7786 bio->bi_end_io = btrfs_end_dio_bio;
c1dc0896 7787 btrfs_io_bio(bio)->logical = file_offset;
02f57c7a
JB
7788 atomic_inc(&dip->pending_bios);
7789
e65e1535 7790 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
ee39b432 7791 if (map_length < submit_len + bvec->bv_len ||
e65e1535 7792 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
ee39b432 7793 bvec->bv_offset) < bvec->bv_len) {
e65e1535
MX
7794 /*
7795 * inc the count before we submit the bio so
7796 * we know the end IO handler won't happen before
7797 * we inc the count. Otherwise, the dip might get freed
7798 * before we're done setting it up
7799 */
7800 atomic_inc(&dip->pending_bios);
7801 ret = __btrfs_submit_dio_bio(bio, inode, rw,
7802 file_offset, skip_sum,
c329861d 7803 async_submit);
e65e1535
MX
7804 if (ret) {
7805 bio_put(bio);
7806 atomic_dec(&dip->pending_bios);
7807 goto out_err;
7808 }
7809
e65e1535
MX
7810 start_sector += submit_len >> 9;
7811 file_offset += submit_len;
7812
7813 submit_len = 0;
7814 nr_pages = 0;
7815
7816 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
7817 start_sector, GFP_NOFS);
7818 if (!bio)
7819 goto out_err;
7820 bio->bi_private = dip;
7821 bio->bi_end_io = btrfs_end_dio_bio;
c1dc0896 7822 btrfs_io_bio(bio)->logical = file_offset;
e65e1535 7823
4f024f37 7824 map_length = orig_bio->bi_iter.bi_size;
53b381b3 7825 ret = btrfs_map_block(root->fs_info, rw,
3ec706c8 7826 start_sector << 9,
e65e1535
MX
7827 &map_length, NULL, 0);
7828 if (ret) {
7829 bio_put(bio);
7830 goto out_err;
7831 }
7832 } else {
7833 submit_len += bvec->bv_len;
67871254 7834 nr_pages++;
e65e1535
MX
7835 bvec++;
7836 }
7837 }
7838
02f57c7a 7839submit:
e65e1535 7840 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
c329861d 7841 async_submit);
e65e1535
MX
7842 if (!ret)
7843 return 0;
7844
7845 bio_put(bio);
7846out_err:
7847 dip->errors = 1;
7848 /*
7849 * before atomic variable goto zero, we must
7850 * make sure dip->errors is perceived to be set.
7851 */
4e857c58 7852 smp_mb__before_atomic();
e65e1535
MX
7853 if (atomic_dec_and_test(&dip->pending_bios))
7854 bio_io_error(dip->orig_bio);
7855
7856 /* bio_end_io() will handle error, so we needn't return it */
7857 return 0;
7858}
7859
9be3395b
CM
7860static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7861 struct inode *inode, loff_t file_offset)
4b46fce2
JB
7862{
7863 struct btrfs_root *root = BTRFS_I(inode)->root;
7864 struct btrfs_dio_private *dip;
9be3395b 7865 struct bio *io_bio;
23ea8e5a 7866 struct btrfs_io_bio *btrfs_bio;
4b46fce2 7867 int skip_sum;
7b6d91da 7868 int write = rw & REQ_WRITE;
4b46fce2
JB
7869 int ret = 0;
7870
7871 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7872
9be3395b 7873 io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
9be3395b
CM
7874 if (!io_bio) {
7875 ret = -ENOMEM;
7876 goto free_ordered;
7877 }
7878
c1dc0896 7879 dip = kzalloc(sizeof(*dip), GFP_NOFS);
4b46fce2
JB
7880 if (!dip) {
7881 ret = -ENOMEM;
9be3395b 7882 goto free_io_bio;
4b46fce2 7883 }
4b46fce2 7884
9be3395b 7885 dip->private = dio_bio->bi_private;
4b46fce2
JB
7886 dip->inode = inode;
7887 dip->logical_offset = file_offset;
4f024f37
KO
7888 dip->bytes = dio_bio->bi_iter.bi_size;
7889 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
9be3395b 7890 io_bio->bi_private = dip;
9be3395b
CM
7891 dip->orig_bio = io_bio;
7892 dip->dio_bio = dio_bio;
e65e1535 7893 atomic_set(&dip->pending_bios, 0);
c1dc0896
MX
7894 btrfs_bio = btrfs_io_bio(io_bio);
7895 btrfs_bio->logical = file_offset;
4b46fce2 7896
c1dc0896 7897 if (write) {
9be3395b 7898 io_bio->bi_end_io = btrfs_endio_direct_write;
c1dc0896 7899 } else {
9be3395b 7900 io_bio->bi_end_io = btrfs_endio_direct_read;
c1dc0896
MX
7901 dip->subio_endio = btrfs_subio_endio_read;
7902 }
4b46fce2 7903
e65e1535
MX
7904 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7905 if (!ret)
eaf25d93 7906 return;
9be3395b 7907
23ea8e5a
MX
7908 if (btrfs_bio->end_io)
7909 btrfs_bio->end_io(btrfs_bio, ret);
9be3395b
CM
7910free_io_bio:
7911 bio_put(io_bio);
7912
4b46fce2
JB
7913free_ordered:
7914 /*
7915 * If this is a write, we need to clean up the reserved space and kill
7916 * the ordered extent.
7917 */
7918 if (write) {
7919 struct btrfs_ordered_extent *ordered;
955256f2 7920 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
4b46fce2
JB
7921 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7922 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7923 btrfs_free_reserved_extent(root, ordered->start,
e570fd27 7924 ordered->disk_len, 1);
4b46fce2
JB
7925 btrfs_put_ordered_extent(ordered);
7926 btrfs_put_ordered_extent(ordered);
7927 }
9be3395b 7928 bio_endio(dio_bio, ret);
4b46fce2
JB
7929}
7930
5a5f79b5 7931static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
28060d5d 7932 const struct iov_iter *iter, loff_t offset)
5a5f79b5
CM
7933{
7934 int seg;
a1b75f7d 7935 int i;
5a5f79b5
CM
7936 unsigned blocksize_mask = root->sectorsize - 1;
7937 ssize_t retval = -EINVAL;
5a5f79b5
CM
7938
7939 if (offset & blocksize_mask)
7940 goto out;
7941
28060d5d
AV
7942 if (iov_iter_alignment(iter) & blocksize_mask)
7943 goto out;
a1b75f7d 7944
28060d5d
AV
7945 /* If this is a write we don't need to check anymore */
7946 if (rw & WRITE)
7947 return 0;
7948 /*
7949 * Check to make sure we don't have duplicate iov_base's in this
7950 * iovec, if so return EINVAL, otherwise we'll get csum errors
7951 * when reading back.
7952 */
7953 for (seg = 0; seg < iter->nr_segs; seg++) {
7954 for (i = seg + 1; i < iter->nr_segs; i++) {
7955 if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
a1b75f7d
JB
7956 goto out;
7957 }
5a5f79b5
CM
7958 }
7959 retval = 0;
7960out:
7961 return retval;
7962}
eb838e73 7963
16432985 7964static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
d8d3d94b 7965 struct iov_iter *iter, loff_t offset)
16432985 7966{
4b46fce2
JB
7967 struct file *file = iocb->ki_filp;
7968 struct inode *inode = file->f_mapping->host;
0934856d 7969 size_t count = 0;
2e60a51e 7970 int flags = 0;
38851cc1
MX
7971 bool wakeup = true;
7972 bool relock = false;
0934856d 7973 ssize_t ret;
4b46fce2 7974
28060d5d 7975 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
5a5f79b5 7976 return 0;
3f7c579c 7977
38851cc1 7978 atomic_inc(&inode->i_dio_count);
4e857c58 7979 smp_mb__after_atomic();
38851cc1 7980
0e267c44 7981 /*
41bd9ca4
MX
7982 * The generic stuff only does filemap_write_and_wait_range, which
7983 * isn't enough if we've written compressed pages to this area, so
7984 * we need to flush the dirty pages again to make absolutely sure
7985 * that any outstanding dirty pages are on disk.
0e267c44 7986 */
a6cbcd4a 7987 count = iov_iter_count(iter);
41bd9ca4
MX
7988 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7989 &BTRFS_I(inode)->runtime_flags))
9a025a08
WS
7990 filemap_fdatawrite_range(inode->i_mapping, offset,
7991 offset + count - 1);
0e267c44 7992
0934856d 7993 if (rw & WRITE) {
38851cc1
MX
7994 /*
7995 * If the write DIO is beyond the EOF, we need update
7996 * the isize, but it is protected by i_mutex. So we can
7997 * not unlock the i_mutex at this case.
7998 */
7999 if (offset + count <= inode->i_size) {
8000 mutex_unlock(&inode->i_mutex);
8001 relock = true;
8002 }
0934856d
MX
8003 ret = btrfs_delalloc_reserve_space(inode, count);
8004 if (ret)
38851cc1 8005 goto out;
ee39b432
DS
8006 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8007 &BTRFS_I(inode)->runtime_flags)) {
38851cc1
MX
8008 inode_dio_done(inode);
8009 flags = DIO_LOCKING | DIO_SKIP_HOLES;
8010 wakeup = false;
0934856d
MX
8011 }
8012
8013 ret = __blockdev_direct_IO(rw, iocb, inode,
8014 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
31b14039 8015 iter, offset, btrfs_get_blocks_direct, NULL,
2e60a51e 8016 btrfs_submit_direct, flags);
0934856d
MX
8017 if (rw & WRITE) {
8018 if (ret < 0 && ret != -EIOCBQUEUED)
8019 btrfs_delalloc_release_space(inode, count);
172a5049 8020 else if (ret >= 0 && (size_t)ret < count)
0934856d
MX
8021 btrfs_delalloc_release_space(inode,
8022 count - (size_t)ret);
172a5049
MX
8023 else
8024 btrfs_delalloc_release_metadata(inode, 0);
0934856d 8025 }
38851cc1 8026out:
2e60a51e
MX
8027 if (wakeup)
8028 inode_dio_done(inode);
38851cc1
MX
8029 if (relock)
8030 mutex_lock(&inode->i_mutex);
0934856d
MX
8031
8032 return ret;
16432985
CM
8033}
8034
05dadc09
TI
8035#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
8036
1506fcc8
YS
8037static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8038 __u64 start, __u64 len)
8039{
05dadc09
TI
8040 int ret;
8041
8042 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8043 if (ret)
8044 return ret;
8045
ec29ed5b 8046 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
1506fcc8
YS
8047}
8048
a52d9a80 8049int btrfs_readpage(struct file *file, struct page *page)
9ebefb18 8050{
d1310b2e
CM
8051 struct extent_io_tree *tree;
8052 tree = &BTRFS_I(page->mapping->host)->io_tree;
8ddc7d9c 8053 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
9ebefb18 8054}
1832a6d5 8055
a52d9a80 8056static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
39279cc3 8057{
d1310b2e 8058 struct extent_io_tree *tree;
b888db2b
CM
8059
8060
8061 if (current->flags & PF_MEMALLOC) {
8062 redirty_page_for_writepage(wbc, page);
8063 unlock_page(page);
8064 return 0;
8065 }
d1310b2e 8066 tree = &BTRFS_I(page->mapping->host)->io_tree;
a52d9a80 8067 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
9ebefb18
CM
8068}
8069
48a3b636
ES
8070static int btrfs_writepages(struct address_space *mapping,
8071 struct writeback_control *wbc)
b293f02e 8072{
d1310b2e 8073 struct extent_io_tree *tree;
771ed689 8074
d1310b2e 8075 tree = &BTRFS_I(mapping->host)->io_tree;
b293f02e
CM
8076 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
8077}
8078
3ab2fb5a
CM
8079static int
8080btrfs_readpages(struct file *file, struct address_space *mapping,
8081 struct list_head *pages, unsigned nr_pages)
8082{
d1310b2e
CM
8083 struct extent_io_tree *tree;
8084 tree = &BTRFS_I(mapping->host)->io_tree;
3ab2fb5a
CM
8085 return extent_readpages(tree, mapping, pages, nr_pages,
8086 btrfs_get_extent);
8087}
e6dcd2dc 8088static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
9ebefb18 8089{
d1310b2e
CM
8090 struct extent_io_tree *tree;
8091 struct extent_map_tree *map;
a52d9a80 8092 int ret;
8c2383c3 8093
d1310b2e
CM
8094 tree = &BTRFS_I(page->mapping->host)->io_tree;
8095 map = &BTRFS_I(page->mapping->host)->extent_tree;
70dec807 8096 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
a52d9a80
CM
8097 if (ret == 1) {
8098 ClearPagePrivate(page);
8099 set_page_private(page, 0);
8100 page_cache_release(page);
39279cc3 8101 }
a52d9a80 8102 return ret;
39279cc3
CM
8103}
8104
e6dcd2dc
CM
8105static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8106{
98509cfc
CM
8107 if (PageWriteback(page) || PageDirty(page))
8108 return 0;
b335b003 8109 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
e6dcd2dc
CM
8110}
8111
d47992f8
LC
8112static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8113 unsigned int length)
39279cc3 8114{
5fd02043 8115 struct inode *inode = page->mapping->host;
d1310b2e 8116 struct extent_io_tree *tree;
e6dcd2dc 8117 struct btrfs_ordered_extent *ordered;
2ac55d41 8118 struct extent_state *cached_state = NULL;
e6dcd2dc
CM
8119 u64 page_start = page_offset(page);
8120 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
131e404a 8121 int inode_evicting = inode->i_state & I_FREEING;
39279cc3 8122
8b62b72b
CM
8123 /*
8124 * we have the page locked, so new writeback can't start,
8125 * and the dirty bit won't be cleared while we are here.
8126 *
8127 * Wait for IO on this page so that we can safely clear
8128 * the PagePrivate2 bit and do ordered accounting
8129 */
e6dcd2dc 8130 wait_on_page_writeback(page);
8b62b72b 8131
5fd02043 8132 tree = &BTRFS_I(inode)->io_tree;
e6dcd2dc
CM
8133 if (offset) {
8134 btrfs_releasepage(page, GFP_NOFS);
8135 return;
8136 }
131e404a
FDBM
8137
8138 if (!inode_evicting)
8139 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
8140 ordered = btrfs_lookup_ordered_extent(inode, page_start);
e6dcd2dc 8141 if (ordered) {
eb84ae03
CM
8142 /*
8143 * IO on this page will never be started, so we need
8144 * to account for any ordered extents now
8145 */
131e404a
FDBM
8146 if (!inode_evicting)
8147 clear_extent_bit(tree, page_start, page_end,
8148 EXTENT_DIRTY | EXTENT_DELALLOC |
8149 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8150 EXTENT_DEFRAG, 1, 0, &cached_state,
8151 GFP_NOFS);
8b62b72b
CM
8152 /*
8153 * whoever cleared the private bit is responsible
8154 * for the finish_ordered_io
8155 */
77cef2ec
JB
8156 if (TestClearPagePrivate2(page)) {
8157 struct btrfs_ordered_inode_tree *tree;
8158 u64 new_len;
8159
8160 tree = &BTRFS_I(inode)->ordered_tree;
8161
8162 spin_lock_irq(&tree->lock);
8163 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8164 new_len = page_start - ordered->file_offset;
8165 if (new_len < ordered->truncated_len)
8166 ordered->truncated_len = new_len;
8167 spin_unlock_irq(&tree->lock);
8168
8169 if (btrfs_dec_test_ordered_pending(inode, &ordered,
8170 page_start,
8171 PAGE_CACHE_SIZE, 1))
8172 btrfs_finish_ordered_io(ordered);
8b62b72b 8173 }
e6dcd2dc 8174 btrfs_put_ordered_extent(ordered);
131e404a
FDBM
8175 if (!inode_evicting) {
8176 cached_state = NULL;
8177 lock_extent_bits(tree, page_start, page_end, 0,
8178 &cached_state);
8179 }
8180 }
8181
8182 if (!inode_evicting) {
8183 clear_extent_bit(tree, page_start, page_end,
8184 EXTENT_LOCKED | EXTENT_DIRTY |
8185 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8186 EXTENT_DEFRAG, 1, 1,
8187 &cached_state, GFP_NOFS);
8188
8189 __btrfs_releasepage(page, GFP_NOFS);
e6dcd2dc 8190 }
e6dcd2dc 8191
4a096752 8192 ClearPageChecked(page);
9ad6b7bc 8193 if (PagePrivate(page)) {
9ad6b7bc
CM
8194 ClearPagePrivate(page);
8195 set_page_private(page, 0);
8196 page_cache_release(page);
8197 }
39279cc3
CM
8198}
8199
9ebefb18
CM
8200/*
8201 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8202 * called from a page fault handler when a page is first dirtied. Hence we must
8203 * be careful to check for EOF conditions here. We set the page up correctly
8204 * for a written page which means we get ENOSPC checking when writing into
8205 * holes and correct delalloc and unwritten extent mapping on filesystems that
8206 * support these features.
8207 *
8208 * We are not allowed to take the i_mutex here so we have to play games to
8209 * protect against truncate races as the page could now be beyond EOF. Because
8210 * vmtruncate() writes the inode size before removing pages, once we have the
8211 * page lock we can determine safely if the page is beyond EOF. If it is not
8212 * beyond EOF, then the page is guaranteed safe against truncation until we
8213 * unlock the page.
8214 */
c2ec175c 8215int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
9ebefb18 8216{
c2ec175c 8217 struct page *page = vmf->page;
496ad9aa 8218 struct inode *inode = file_inode(vma->vm_file);
1832a6d5 8219 struct btrfs_root *root = BTRFS_I(inode)->root;
e6dcd2dc
CM
8220 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8221 struct btrfs_ordered_extent *ordered;
2ac55d41 8222 struct extent_state *cached_state = NULL;
e6dcd2dc
CM
8223 char *kaddr;
8224 unsigned long zero_start;
9ebefb18 8225 loff_t size;
1832a6d5 8226 int ret;
9998eb70 8227 int reserved = 0;
a52d9a80 8228 u64 page_start;
e6dcd2dc 8229 u64 page_end;
9ebefb18 8230
b2b5ef5c 8231 sb_start_pagefault(inode->i_sb);
0ca1f7ce 8232 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
9998eb70 8233 if (!ret) {
e41f941a 8234 ret = file_update_time(vma->vm_file);
9998eb70
CM
8235 reserved = 1;
8236 }
56a76f82
NP
8237 if (ret) {
8238 if (ret == -ENOMEM)
8239 ret = VM_FAULT_OOM;
8240 else /* -ENOSPC, -EIO, etc */
8241 ret = VM_FAULT_SIGBUS;
9998eb70
CM
8242 if (reserved)
8243 goto out;
8244 goto out_noreserve;
56a76f82 8245 }
1832a6d5 8246
56a76f82 8247 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
e6dcd2dc 8248again:
9ebefb18 8249 lock_page(page);
9ebefb18 8250 size = i_size_read(inode);
e6dcd2dc
CM
8251 page_start = page_offset(page);
8252 page_end = page_start + PAGE_CACHE_SIZE - 1;
a52d9a80 8253
9ebefb18 8254 if ((page->mapping != inode->i_mapping) ||
e6dcd2dc 8255 (page_start >= size)) {
9ebefb18
CM
8256 /* page got truncated out from underneath us */
8257 goto out_unlock;
8258 }
e6dcd2dc
CM
8259 wait_on_page_writeback(page);
8260
d0082371 8261 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
e6dcd2dc
CM
8262 set_page_extent_mapped(page);
8263
eb84ae03
CM
8264 /*
8265 * we can't set the delalloc bits if there are pending ordered
8266 * extents. Drop our locks and wait for them to finish
8267 */
e6dcd2dc
CM
8268 ordered = btrfs_lookup_ordered_extent(inode, page_start);
8269 if (ordered) {
2ac55d41
JB
8270 unlock_extent_cached(io_tree, page_start, page_end,
8271 &cached_state, GFP_NOFS);
e6dcd2dc 8272 unlock_page(page);
eb84ae03 8273 btrfs_start_ordered_extent(inode, ordered, 1);
e6dcd2dc
CM
8274 btrfs_put_ordered_extent(ordered);
8275 goto again;
8276 }
8277
fbf19087
JB
8278 /*
8279 * XXX - page_mkwrite gets called every time the page is dirtied, even
8280 * if it was already dirty, so for space accounting reasons we need to
8281 * clear any delalloc bits for the range we are fixing to save. There
8282 * is probably a better way to do this, but for now keep consistent with
8283 * prepare_pages in the normal write path.
8284 */
2ac55d41 8285 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
9e8a4a8b
LB
8286 EXTENT_DIRTY | EXTENT_DELALLOC |
8287 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
2ac55d41 8288 0, 0, &cached_state, GFP_NOFS);
fbf19087 8289
2ac55d41
JB
8290 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
8291 &cached_state);
9ed74f2d 8292 if (ret) {
2ac55d41
JB
8293 unlock_extent_cached(io_tree, page_start, page_end,
8294 &cached_state, GFP_NOFS);
9ed74f2d
JB
8295 ret = VM_FAULT_SIGBUS;
8296 goto out_unlock;
8297 }
e6dcd2dc 8298 ret = 0;
9ebefb18
CM
8299
8300 /* page is wholly or partially inside EOF */
a52d9a80 8301 if (page_start + PAGE_CACHE_SIZE > size)
e6dcd2dc 8302 zero_start = size & ~PAGE_CACHE_MASK;
9ebefb18 8303 else
e6dcd2dc 8304 zero_start = PAGE_CACHE_SIZE;
9ebefb18 8305
e6dcd2dc
CM
8306 if (zero_start != PAGE_CACHE_SIZE) {
8307 kaddr = kmap(page);
8308 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
8309 flush_dcache_page(page);
8310 kunmap(page);
8311 }
247e743c 8312 ClearPageChecked(page);
e6dcd2dc 8313 set_page_dirty(page);
50a9b214 8314 SetPageUptodate(page);
5a3f23d5 8315
257c62e1
CM
8316 BTRFS_I(inode)->last_trans = root->fs_info->generation;
8317 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
46d8bc34 8318 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
257c62e1 8319
2ac55d41 8320 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
9ebefb18
CM
8321
8322out_unlock:
b2b5ef5c
JK
8323 if (!ret) {
8324 sb_end_pagefault(inode->i_sb);
50a9b214 8325 return VM_FAULT_LOCKED;
b2b5ef5c 8326 }
9ebefb18 8327 unlock_page(page);
1832a6d5 8328out:
ec39e180 8329 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
9998eb70 8330out_noreserve:
b2b5ef5c 8331 sb_end_pagefault(inode->i_sb);
9ebefb18
CM
8332 return ret;
8333}
8334
a41ad394 8335static int btrfs_truncate(struct inode *inode)
39279cc3
CM
8336{
8337 struct btrfs_root *root = BTRFS_I(inode)->root;
fcb80c2a 8338 struct btrfs_block_rsv *rsv;
a71754fc 8339 int ret = 0;
3893e33b 8340 int err = 0;
39279cc3 8341 struct btrfs_trans_handle *trans;
dbe674a9 8342 u64 mask = root->sectorsize - 1;
07127184 8343 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
39279cc3 8344
0ef8b726
JB
8345 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
8346 (u64)-1);
8347 if (ret)
8348 return ret;
39279cc3 8349
fcb80c2a
JB
8350 /*
8351 * Yes ladies and gentelment, this is indeed ugly. The fact is we have
8352 * 3 things going on here
8353 *
8354 * 1) We need to reserve space for our orphan item and the space to
8355 * delete our orphan item. Lord knows we don't want to have a dangling
8356 * orphan item because we didn't reserve space to remove it.
8357 *
8358 * 2) We need to reserve space to update our inode.
8359 *
8360 * 3) We need to have something to cache all the space that is going to
8361 * be free'd up by the truncate operation, but also have some slack
8362 * space reserved in case it uses space during the truncate (thank you
8363 * very much snapshotting).
8364 *
8365 * And we need these to all be seperate. The fact is we can use alot of
8366 * space doing the truncate, and we have no earthly idea how much space
8367 * we will use, so we need the truncate reservation to be seperate so it
8368 * doesn't end up using space reserved for updating the inode or
8369 * removing the orphan item. We also need to be able to stop the
8370 * transaction and start a new one, which means we need to be able to
8371 * update the inode several times, and we have no idea of knowing how
8372 * many times that will be, so we can't just reserve 1 item for the
8373 * entirety of the opration, so that has to be done seperately as well.
8374 * Then there is the orphan item, which does indeed need to be held on
8375 * to for the whole operation, and we need nobody to touch this reserved
8376 * space except the orphan code.
8377 *
8378 * So that leaves us with
8379 *
8380 * 1) root->orphan_block_rsv - for the orphan deletion.
8381 * 2) rsv - for the truncate reservation, which we will steal from the
8382 * transaction reservation.
8383 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
8384 * updating the inode.
8385 */
66d8f3dd 8386 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
fcb80c2a
JB
8387 if (!rsv)
8388 return -ENOMEM;
4a338542 8389 rsv->size = min_size;
ca7e70f5 8390 rsv->failfast = 1;
f0cd846e 8391
907cbceb 8392 /*
07127184 8393 * 1 for the truncate slack space
907cbceb
JB
8394 * 1 for updating the inode.
8395 */
f3fe820c 8396 trans = btrfs_start_transaction(root, 2);
fcb80c2a
JB
8397 if (IS_ERR(trans)) {
8398 err = PTR_ERR(trans);
8399 goto out;
8400 }
f0cd846e 8401
907cbceb
JB
8402 /* Migrate the slack space for the truncate to our reserve */
8403 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
8404 min_size);
fcb80c2a 8405 BUG_ON(ret);
f0cd846e 8406
5dc562c5
JB
8407 /*
8408 * So if we truncate and then write and fsync we normally would just
8409 * write the extents that changed, which is a problem if we need to
8410 * first truncate that entire inode. So set this flag so we write out
8411 * all of the extents in the inode to the sync log so we're completely
8412 * safe.
8413 */
8414 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
ca7e70f5 8415 trans->block_rsv = rsv;
907cbceb 8416
8082510e
YZ
8417 while (1) {
8418 ret = btrfs_truncate_inode_items(trans, root, inode,
8419 inode->i_size,
8420 BTRFS_EXTENT_DATA_KEY);
ca7e70f5 8421 if (ret != -ENOSPC) {
3893e33b 8422 err = ret;
8082510e 8423 break;
3893e33b 8424 }
39279cc3 8425
fcb80c2a 8426 trans->block_rsv = &root->fs_info->trans_block_rsv;
8082510e 8427 ret = btrfs_update_inode(trans, root, inode);
3893e33b
JB
8428 if (ret) {
8429 err = ret;
8430 break;
8431 }
ca7e70f5 8432
8082510e 8433 btrfs_end_transaction(trans, root);
b53d3f5d 8434 btrfs_btree_balance_dirty(root);
ca7e70f5
JB
8435
8436 trans = btrfs_start_transaction(root, 2);
8437 if (IS_ERR(trans)) {
8438 ret = err = PTR_ERR(trans);
8439 trans = NULL;
8440 break;
8441 }
8442
8443 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
8444 rsv, min_size);
8445 BUG_ON(ret); /* shouldn't happen */
8446 trans->block_rsv = rsv;
8082510e
YZ
8447 }
8448
8449 if (ret == 0 && inode->i_nlink > 0) {
fcb80c2a 8450 trans->block_rsv = root->orphan_block_rsv;
8082510e 8451 ret = btrfs_orphan_del(trans, inode);
3893e33b
JB
8452 if (ret)
8453 err = ret;
8082510e
YZ
8454 }
8455
917c16b2
CM
8456 if (trans) {
8457 trans->block_rsv = &root->fs_info->trans_block_rsv;
8458 ret = btrfs_update_inode(trans, root, inode);
8459 if (ret && !err)
8460 err = ret;
7b128766 8461
7ad85bb7 8462 ret = btrfs_end_transaction(trans, root);
b53d3f5d 8463 btrfs_btree_balance_dirty(root);
917c16b2 8464 }
fcb80c2a
JB
8465
8466out:
8467 btrfs_free_block_rsv(root, rsv);
8468
3893e33b
JB
8469 if (ret && !err)
8470 err = ret;
a41ad394 8471
3893e33b 8472 return err;
39279cc3
CM
8473}
8474
d352ac68
CM
8475/*
8476 * create a new subvolume directory/inode (helper for the ioctl).
8477 */
d2fb3437 8478int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
63541927
FDBM
8479 struct btrfs_root *new_root,
8480 struct btrfs_root *parent_root,
8481 u64 new_dirid)
39279cc3 8482{
39279cc3 8483 struct inode *inode;
76dda93c 8484 int err;
00e4e6b3 8485 u64 index = 0;
39279cc3 8486
12fc9d09
FA
8487 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
8488 new_dirid, new_dirid,
8489 S_IFDIR | (~current_umask() & S_IRWXUGO),
8490 &index);
54aa1f4d 8491 if (IS_ERR(inode))
f46b5a66 8492 return PTR_ERR(inode);
39279cc3
CM
8493 inode->i_op = &btrfs_dir_inode_operations;
8494 inode->i_fop = &btrfs_dir_file_operations;
8495
bfe86848 8496 set_nlink(inode, 1);
dbe674a9 8497 btrfs_i_size_write(inode, 0);
b0d5d10f 8498 unlock_new_inode(inode);
3b96362c 8499
63541927
FDBM
8500 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8501 if (err)
8502 btrfs_err(new_root->fs_info,
351fd353 8503 "error inheriting subvolume %llu properties: %d",
63541927
FDBM
8504 new_root->root_key.objectid, err);
8505
76dda93c 8506 err = btrfs_update_inode(trans, new_root, inode);
cb8e7090 8507
76dda93c 8508 iput(inode);
ce598979 8509 return err;
39279cc3
CM
8510}
8511
39279cc3
CM
8512struct inode *btrfs_alloc_inode(struct super_block *sb)
8513{
8514 struct btrfs_inode *ei;
2ead6ae7 8515 struct inode *inode;
39279cc3
CM
8516
8517 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
8518 if (!ei)
8519 return NULL;
2ead6ae7
YZ
8520
8521 ei->root = NULL;
2ead6ae7 8522 ei->generation = 0;
15ee9bc7 8523 ei->last_trans = 0;
257c62e1 8524 ei->last_sub_trans = 0;
e02119d5 8525 ei->logged_trans = 0;
2ead6ae7 8526 ei->delalloc_bytes = 0;
47059d93 8527 ei->defrag_bytes = 0;
2ead6ae7
YZ
8528 ei->disk_i_size = 0;
8529 ei->flags = 0;
7709cde3 8530 ei->csum_bytes = 0;
2ead6ae7 8531 ei->index_cnt = (u64)-1;
67de1176 8532 ei->dir_index = 0;
2ead6ae7 8533 ei->last_unlink_trans = 0;
46d8bc34 8534 ei->last_log_commit = 0;
2ead6ae7 8535
9e0baf60
JB
8536 spin_lock_init(&ei->lock);
8537 ei->outstanding_extents = 0;
8538 ei->reserved_extents = 0;
2ead6ae7 8539
72ac3c0d 8540 ei->runtime_flags = 0;
261507a0 8541 ei->force_compress = BTRFS_COMPRESS_NONE;
2ead6ae7 8542
16cdcec7
MX
8543 ei->delayed_node = NULL;
8544
2ead6ae7 8545 inode = &ei->vfs_inode;
a8067e02 8546 extent_map_tree_init(&ei->extent_tree);
f993c883
DS
8547 extent_io_tree_init(&ei->io_tree, &inode->i_data);
8548 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
0b32f4bb
JB
8549 ei->io_tree.track_uptodate = 1;
8550 ei->io_failure_tree.track_uptodate = 1;
b812ce28 8551 atomic_set(&ei->sync_writers, 0);
2ead6ae7 8552 mutex_init(&ei->log_mutex);
f248679e 8553 mutex_init(&ei->delalloc_mutex);
e6dcd2dc 8554 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
2ead6ae7 8555 INIT_LIST_HEAD(&ei->delalloc_inodes);
2ead6ae7
YZ
8556 RB_CLEAR_NODE(&ei->rb_node);
8557
8558 return inode;
39279cc3
CM
8559}
8560
aaedb55b
JB
8561#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8562void btrfs_test_destroy_inode(struct inode *inode)
8563{
8564 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8565 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8566}
8567#endif
8568
fa0d7e3d
NP
8569static void btrfs_i_callback(struct rcu_head *head)
8570{
8571 struct inode *inode = container_of(head, struct inode, i_rcu);
fa0d7e3d
NP
8572 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8573}
8574
39279cc3
CM
8575void btrfs_destroy_inode(struct inode *inode)
8576{
e6dcd2dc 8577 struct btrfs_ordered_extent *ordered;
5a3f23d5
CM
8578 struct btrfs_root *root = BTRFS_I(inode)->root;
8579
b3d9b7a3 8580 WARN_ON(!hlist_empty(&inode->i_dentry));
39279cc3 8581 WARN_ON(inode->i_data.nrpages);
9e0baf60
JB
8582 WARN_ON(BTRFS_I(inode)->outstanding_extents);
8583 WARN_ON(BTRFS_I(inode)->reserved_extents);
7709cde3
JB
8584 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8585 WARN_ON(BTRFS_I(inode)->csum_bytes);
47059d93 8586 WARN_ON(BTRFS_I(inode)->defrag_bytes);
39279cc3 8587
a6dbd429
JB
8588 /*
8589 * This can happen where we create an inode, but somebody else also
8590 * created the same inode and we need to destroy the one we already
8591 * created.
8592 */
8593 if (!root)
8594 goto free;
8595
8a35d95f
JB
8596 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8597 &BTRFS_I(inode)->runtime_flags)) {
c2cf52eb 8598 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
c1c9ff7c 8599 btrfs_ino(inode));
8a35d95f 8600 atomic_dec(&root->orphan_inodes);
7b128766 8601 }
7b128766 8602
d397712b 8603 while (1) {
e6dcd2dc
CM
8604 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8605 if (!ordered)
8606 break;
8607 else {
c2cf52eb 8608 btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
c1c9ff7c 8609 ordered->file_offset, ordered->len);
e6dcd2dc
CM
8610 btrfs_remove_ordered_extent(inode, ordered);
8611 btrfs_put_ordered_extent(ordered);
8612 btrfs_put_ordered_extent(ordered);
8613 }
8614 }
5d4f98a2 8615 inode_tree_del(inode);
5b21f2ed 8616 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
a6dbd429 8617free:
fa0d7e3d 8618 call_rcu(&inode->i_rcu, btrfs_i_callback);
39279cc3
CM
8619}
8620
45321ac5 8621int btrfs_drop_inode(struct inode *inode)
76dda93c
YZ
8622{
8623 struct btrfs_root *root = BTRFS_I(inode)->root;
45321ac5 8624
6379ef9f
NA
8625 if (root == NULL)
8626 return 1;
8627
fa6ac876 8628 /* the snap/subvol tree is on deleting */
69e9c6c6 8629 if (btrfs_root_refs(&root->root_item) == 0)
45321ac5 8630 return 1;
76dda93c 8631 else
45321ac5 8632 return generic_drop_inode(inode);
76dda93c
YZ
8633}
8634
0ee0fda0 8635static void init_once(void *foo)
39279cc3
CM
8636{
8637 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8638
8639 inode_init_once(&ei->vfs_inode);
8640}
8641
8642void btrfs_destroy_cachep(void)
8643{
8c0a8537
KS
8644 /*
8645 * Make sure all delayed rcu free inodes are flushed before we
8646 * destroy cache.
8647 */
8648 rcu_barrier();
39279cc3
CM
8649 if (btrfs_inode_cachep)
8650 kmem_cache_destroy(btrfs_inode_cachep);
8651 if (btrfs_trans_handle_cachep)
8652 kmem_cache_destroy(btrfs_trans_handle_cachep);
8653 if (btrfs_transaction_cachep)
8654 kmem_cache_destroy(btrfs_transaction_cachep);
39279cc3
CM
8655 if (btrfs_path_cachep)
8656 kmem_cache_destroy(btrfs_path_cachep);
dc89e982
JB
8657 if (btrfs_free_space_cachep)
8658 kmem_cache_destroy(btrfs_free_space_cachep);
8ccf6f19
MX
8659 if (btrfs_delalloc_work_cachep)
8660 kmem_cache_destroy(btrfs_delalloc_work_cachep);
39279cc3
CM
8661}
8662
8663int btrfs_init_cachep(void)
8664{
837e1972 8665 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9601e3f6
CH
8666 sizeof(struct btrfs_inode), 0,
8667 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
39279cc3
CM
8668 if (!btrfs_inode_cachep)
8669 goto fail;
9601e3f6 8670
837e1972 8671 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9601e3f6
CH
8672 sizeof(struct btrfs_trans_handle), 0,
8673 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
39279cc3
CM
8674 if (!btrfs_trans_handle_cachep)
8675 goto fail;
9601e3f6 8676
837e1972 8677 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
9601e3f6
CH
8678 sizeof(struct btrfs_transaction), 0,
8679 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
39279cc3
CM
8680 if (!btrfs_transaction_cachep)
8681 goto fail;
9601e3f6 8682
837e1972 8683 btrfs_path_cachep = kmem_cache_create("btrfs_path",
9601e3f6
CH
8684 sizeof(struct btrfs_path), 0,
8685 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
39279cc3
CM
8686 if (!btrfs_path_cachep)
8687 goto fail;
9601e3f6 8688
837e1972 8689 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
dc89e982
JB
8690 sizeof(struct btrfs_free_space), 0,
8691 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8692 if (!btrfs_free_space_cachep)
8693 goto fail;
8694
8ccf6f19
MX
8695 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
8696 sizeof(struct btrfs_delalloc_work), 0,
8697 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
8698 NULL);
8699 if (!btrfs_delalloc_work_cachep)
8700 goto fail;
8701
39279cc3
CM
8702 return 0;
8703fail:
8704 btrfs_destroy_cachep();
8705 return -ENOMEM;
8706}
8707
8708static int btrfs_getattr(struct vfsmount *mnt,
8709 struct dentry *dentry, struct kstat *stat)
8710{
df0af1a5 8711 u64 delalloc_bytes;
39279cc3 8712 struct inode *inode = dentry->d_inode;
fadc0d8b
DS
8713 u32 blocksize = inode->i_sb->s_blocksize;
8714
39279cc3 8715 generic_fillattr(inode, stat);
0ee5dc67 8716 stat->dev = BTRFS_I(inode)->root->anon_dev;
d6667462 8717 stat->blksize = PAGE_CACHE_SIZE;
df0af1a5
MX
8718
8719 spin_lock(&BTRFS_I(inode)->lock);
8720 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
8721 spin_unlock(&BTRFS_I(inode)->lock);
fadc0d8b 8722 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
df0af1a5 8723 ALIGN(delalloc_bytes, blocksize)) >> 9;
39279cc3
CM
8724 return 0;
8725}
8726
d397712b
CM
8727static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8728 struct inode *new_dir, struct dentry *new_dentry)
39279cc3
CM
8729{
8730 struct btrfs_trans_handle *trans;
8731 struct btrfs_root *root = BTRFS_I(old_dir)->root;
4df27c4d 8732 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
39279cc3
CM
8733 struct inode *new_inode = new_dentry->d_inode;
8734 struct inode *old_inode = old_dentry->d_inode;
8735 struct timespec ctime = CURRENT_TIME;
00e4e6b3 8736 u64 index = 0;
4df27c4d 8737 u64 root_objectid;
39279cc3 8738 int ret;
33345d01 8739 u64 old_ino = btrfs_ino(old_inode);
39279cc3 8740
33345d01 8741 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
f679a840
YZ
8742 return -EPERM;
8743
4df27c4d 8744 /* we only allow rename subvolume link between subvolumes */
33345d01 8745 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
3394e160
CM
8746 return -EXDEV;
8747
33345d01
LZ
8748 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8749 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
39279cc3 8750 return -ENOTEMPTY;
5f39d397 8751
4df27c4d
YZ
8752 if (S_ISDIR(old_inode->i_mode) && new_inode &&
8753 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8754 return -ENOTEMPTY;
9c52057c
CM
8755
8756
8757 /* check for collisions, even if the name isn't there */
4871c158 8758 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9c52057c
CM
8759 new_dentry->d_name.name,
8760 new_dentry->d_name.len);
8761
8762 if (ret) {
8763 if (ret == -EEXIST) {
8764 /* we shouldn't get
8765 * eexist without a new_inode */
fae7f21c 8766 if (WARN_ON(!new_inode)) {
9c52057c
CM
8767 return ret;
8768 }
8769 } else {
8770 /* maybe -EOVERFLOW */
8771 return ret;
8772 }
8773 }
8774 ret = 0;
8775
5a3f23d5 8776 /*
8d875f95
CM
8777 * we're using rename to replace one file with another. Start IO on it
8778 * now so we don't add too much work to the end of the transaction
5a3f23d5 8779 */
8d875f95 8780 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
5a3f23d5
CM
8781 filemap_flush(old_inode->i_mapping);
8782
76dda93c 8783 /* close the racy window with snapshot create/destroy ioctl */
33345d01 8784 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
76dda93c 8785 down_read(&root->fs_info->subvol_sem);
a22285a6
YZ
8786 /*
8787 * We want to reserve the absolute worst case amount of items. So if
8788 * both inodes are subvols and we need to unlink them then that would
8789 * require 4 item modifications, but if they are both normal inodes it
8790 * would require 5 item modifications, so we'll assume their normal
8791 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
8792 * should cover the worst case number of items we'll modify.
8793 */
6e137ed3 8794 trans = btrfs_start_transaction(root, 11);
b44c59a8
JL
8795 if (IS_ERR(trans)) {
8796 ret = PTR_ERR(trans);
8797 goto out_notrans;
8798 }
76dda93c 8799
4df27c4d
YZ
8800 if (dest != root)
8801 btrfs_record_root_in_trans(trans, dest);
5f39d397 8802
a5719521
YZ
8803 ret = btrfs_set_inode_index(new_dir, &index);
8804 if (ret)
8805 goto out_fail;
5a3f23d5 8806
67de1176 8807 BTRFS_I(old_inode)->dir_index = 0ULL;
33345d01 8808 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4df27c4d 8809 /* force full log commit if subvolume involved. */
995946dd 8810 btrfs_set_log_full_commit(root->fs_info, trans);
4df27c4d 8811 } else {
a5719521
YZ
8812 ret = btrfs_insert_inode_ref(trans, dest,
8813 new_dentry->d_name.name,
8814 new_dentry->d_name.len,
33345d01
LZ
8815 old_ino,
8816 btrfs_ino(new_dir), index);
a5719521
YZ
8817 if (ret)
8818 goto out_fail;
4df27c4d
YZ
8819 /*
8820 * this is an ugly little race, but the rename is required
8821 * to make sure that if we crash, the inode is either at the
8822 * old name or the new one. pinning the log transaction lets
8823 * us make sure we don't allow a log commit to come in after
8824 * we unlink the name but before we add the new name back in.
8825 */
8826 btrfs_pin_log_trans(root);
8827 }
5a3f23d5 8828
0c4d2d95
JB
8829 inode_inc_iversion(old_dir);
8830 inode_inc_iversion(new_dir);
8831 inode_inc_iversion(old_inode);
39279cc3
CM
8832 old_dir->i_ctime = old_dir->i_mtime = ctime;
8833 new_dir->i_ctime = new_dir->i_mtime = ctime;
8834 old_inode->i_ctime = ctime;
5f39d397 8835
12fcfd22
CM
8836 if (old_dentry->d_parent != new_dentry->d_parent)
8837 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8838
33345d01 8839 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4df27c4d
YZ
8840 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8841 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8842 old_dentry->d_name.name,
8843 old_dentry->d_name.len);
8844 } else {
92986796
AV
8845 ret = __btrfs_unlink_inode(trans, root, old_dir,
8846 old_dentry->d_inode,
8847 old_dentry->d_name.name,
8848 old_dentry->d_name.len);
8849 if (!ret)
8850 ret = btrfs_update_inode(trans, root, old_inode);
4df27c4d 8851 }
79787eaa
JM
8852 if (ret) {
8853 btrfs_abort_transaction(trans, root, ret);
8854 goto out_fail;
8855 }
39279cc3
CM
8856
8857 if (new_inode) {
0c4d2d95 8858 inode_inc_iversion(new_inode);
39279cc3 8859 new_inode->i_ctime = CURRENT_TIME;
33345d01 8860 if (unlikely(btrfs_ino(new_inode) ==
4df27c4d
YZ
8861 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8862 root_objectid = BTRFS_I(new_inode)->location.objectid;
8863 ret = btrfs_unlink_subvol(trans, dest, new_dir,
8864 root_objectid,
8865 new_dentry->d_name.name,
8866 new_dentry->d_name.len);
8867 BUG_ON(new_inode->i_nlink == 0);
8868 } else {
8869 ret = btrfs_unlink_inode(trans, dest, new_dir,
8870 new_dentry->d_inode,
8871 new_dentry->d_name.name,
8872 new_dentry->d_name.len);
8873 }
4ef31a45 8874 if (!ret && new_inode->i_nlink == 0)
e02119d5 8875 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
79787eaa
JM
8876 if (ret) {
8877 btrfs_abort_transaction(trans, root, ret);
8878 goto out_fail;
8879 }
39279cc3 8880 }
aec7477b 8881
4df27c4d
YZ
8882 ret = btrfs_add_link(trans, new_dir, old_inode,
8883 new_dentry->d_name.name,
a5719521 8884 new_dentry->d_name.len, 0, index);
79787eaa
JM
8885 if (ret) {
8886 btrfs_abort_transaction(trans, root, ret);
8887 goto out_fail;
8888 }
39279cc3 8889
67de1176
MX
8890 if (old_inode->i_nlink == 1)
8891 BTRFS_I(old_inode)->dir_index = index;
8892
33345d01 8893 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
10d9f309 8894 struct dentry *parent = new_dentry->d_parent;
6a912213 8895 btrfs_log_new_name(trans, old_inode, old_dir, parent);
4df27c4d
YZ
8896 btrfs_end_log_trans(root);
8897 }
39279cc3 8898out_fail:
7ad85bb7 8899 btrfs_end_transaction(trans, root);
b44c59a8 8900out_notrans:
33345d01 8901 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
76dda93c 8902 up_read(&root->fs_info->subvol_sem);
9ed74f2d 8903
39279cc3
CM
8904 return ret;
8905}
8906
80ace85c
MS
8907static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
8908 struct inode *new_dir, struct dentry *new_dentry,
8909 unsigned int flags)
8910{
8911 if (flags & ~RENAME_NOREPLACE)
8912 return -EINVAL;
8913
8914 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
8915}
8916
8ccf6f19
MX
8917static void btrfs_run_delalloc_work(struct btrfs_work *work)
8918{
8919 struct btrfs_delalloc_work *delalloc_work;
9f23e289 8920 struct inode *inode;
8ccf6f19
MX
8921
8922 delalloc_work = container_of(work, struct btrfs_delalloc_work,
8923 work);
9f23e289
JB
8924 inode = delalloc_work->inode;
8925 if (delalloc_work->wait) {
8926 btrfs_wait_ordered_range(inode, 0, (u64)-1);
8927 } else {
8928 filemap_flush(inode->i_mapping);
8929 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8930 &BTRFS_I(inode)->runtime_flags))
8931 filemap_flush(inode->i_mapping);
8932 }
8ccf6f19
MX
8933
8934 if (delalloc_work->delay_iput)
9f23e289 8935 btrfs_add_delayed_iput(inode);
8ccf6f19 8936 else
9f23e289 8937 iput(inode);
8ccf6f19
MX
8938 complete(&delalloc_work->completion);
8939}
8940
8941struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8942 int wait, int delay_iput)
8943{
8944 struct btrfs_delalloc_work *work;
8945
8946 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
8947 if (!work)
8948 return NULL;
8949
8950 init_completion(&work->completion);
8951 INIT_LIST_HEAD(&work->list);
8952 work->inode = inode;
8953 work->wait = wait;
8954 work->delay_iput = delay_iput;
9e0af237
LB
8955 WARN_ON_ONCE(!inode);
8956 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
8957 btrfs_run_delalloc_work, NULL, NULL);
8ccf6f19
MX
8958
8959 return work;
8960}
8961
8962void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8963{
8964 wait_for_completion(&work->completion);
8965 kmem_cache_free(btrfs_delalloc_work_cachep, work);
8966}
8967
d352ac68
CM
8968/*
8969 * some fairly slow code that needs optimization. This walks the list
8970 * of all the inodes with pending delalloc and forces them to disk.
8971 */
6c255e67
MX
8972static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8973 int nr)
ea8c2819 8974{
ea8c2819 8975 struct btrfs_inode *binode;
5b21f2ed 8976 struct inode *inode;
8ccf6f19
MX
8977 struct btrfs_delalloc_work *work, *next;
8978 struct list_head works;
1eafa6c7 8979 struct list_head splice;
8ccf6f19 8980 int ret = 0;
ea8c2819 8981
8ccf6f19 8982 INIT_LIST_HEAD(&works);
1eafa6c7 8983 INIT_LIST_HEAD(&splice);
63607cc8 8984
573bfb72 8985 mutex_lock(&root->delalloc_mutex);
eb73c1b7
MX
8986 spin_lock(&root->delalloc_lock);
8987 list_splice_init(&root->delalloc_inodes, &splice);
1eafa6c7
MX
8988 while (!list_empty(&splice)) {
8989 binode = list_entry(splice.next, struct btrfs_inode,
ea8c2819 8990 delalloc_inodes);
1eafa6c7 8991
eb73c1b7
MX
8992 list_move_tail(&binode->delalloc_inodes,
8993 &root->delalloc_inodes);
5b21f2ed 8994 inode = igrab(&binode->vfs_inode);
df0af1a5 8995 if (!inode) {
eb73c1b7 8996 cond_resched_lock(&root->delalloc_lock);
1eafa6c7 8997 continue;
df0af1a5 8998 }
eb73c1b7 8999 spin_unlock(&root->delalloc_lock);
1eafa6c7
MX
9000
9001 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
5d99a998 9002 if (!work) {
f4ab9ea7
JB
9003 if (delay_iput)
9004 btrfs_add_delayed_iput(inode);
9005 else
9006 iput(inode);
1eafa6c7 9007 ret = -ENOMEM;
a1ecaabb 9008 goto out;
5b21f2ed 9009 }
1eafa6c7 9010 list_add_tail(&work->list, &works);
a44903ab
QW
9011 btrfs_queue_work(root->fs_info->flush_workers,
9012 &work->work);
6c255e67
MX
9013 ret++;
9014 if (nr != -1 && ret >= nr)
a1ecaabb 9015 goto out;
5b21f2ed 9016 cond_resched();
eb73c1b7 9017 spin_lock(&root->delalloc_lock);
ea8c2819 9018 }
eb73c1b7 9019 spin_unlock(&root->delalloc_lock);
8c8bee1d 9020
a1ecaabb 9021out:
eb73c1b7
MX
9022 list_for_each_entry_safe(work, next, &works, list) {
9023 list_del_init(&work->list);
9024 btrfs_wait_and_free_delalloc_work(work);
9025 }
9026
9027 if (!list_empty_careful(&splice)) {
9028 spin_lock(&root->delalloc_lock);
9029 list_splice_tail(&splice, &root->delalloc_inodes);
9030 spin_unlock(&root->delalloc_lock);
9031 }
573bfb72 9032 mutex_unlock(&root->delalloc_mutex);
eb73c1b7
MX
9033 return ret;
9034}
1eafa6c7 9035
eb73c1b7
MX
9036int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
9037{
9038 int ret;
1eafa6c7 9039
2c21b4d7 9040 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
eb73c1b7
MX
9041 return -EROFS;
9042
6c255e67
MX
9043 ret = __start_delalloc_inodes(root, delay_iput, -1);
9044 if (ret > 0)
9045 ret = 0;
eb73c1b7
MX
9046 /*
9047 * the filemap_flush will queue IO into the worker threads, but
8c8bee1d
CM
9048 * we have to make sure the IO is actually started and that
9049 * ordered extents get created before we return
9050 */
9051 atomic_inc(&root->fs_info->async_submit_draining);
d397712b 9052 while (atomic_read(&root->fs_info->nr_async_submits) ||
771ed689 9053 atomic_read(&root->fs_info->async_delalloc_pages)) {
8c8bee1d 9054 wait_event(root->fs_info->async_submit_wait,
771ed689
CM
9055 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
9056 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8c8bee1d
CM
9057 }
9058 atomic_dec(&root->fs_info->async_submit_draining);
eb73c1b7
MX
9059 return ret;
9060}
9061
6c255e67
MX
9062int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
9063 int nr)
eb73c1b7
MX
9064{
9065 struct btrfs_root *root;
9066 struct list_head splice;
9067 int ret;
9068
2c21b4d7 9069 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
eb73c1b7
MX
9070 return -EROFS;
9071
9072 INIT_LIST_HEAD(&splice);
9073
573bfb72 9074 mutex_lock(&fs_info->delalloc_root_mutex);
eb73c1b7
MX
9075 spin_lock(&fs_info->delalloc_root_lock);
9076 list_splice_init(&fs_info->delalloc_roots, &splice);
6c255e67 9077 while (!list_empty(&splice) && nr) {
eb73c1b7
MX
9078 root = list_first_entry(&splice, struct btrfs_root,
9079 delalloc_root);
9080 root = btrfs_grab_fs_root(root);
9081 BUG_ON(!root);
9082 list_move_tail(&root->delalloc_root,
9083 &fs_info->delalloc_roots);
9084 spin_unlock(&fs_info->delalloc_root_lock);
9085
6c255e67 9086 ret = __start_delalloc_inodes(root, delay_iput, nr);
eb73c1b7 9087 btrfs_put_fs_root(root);
6c255e67 9088 if (ret < 0)
eb73c1b7
MX
9089 goto out;
9090
6c255e67
MX
9091 if (nr != -1) {
9092 nr -= ret;
9093 WARN_ON(nr < 0);
9094 }
eb73c1b7 9095 spin_lock(&fs_info->delalloc_root_lock);
8ccf6f19 9096 }
eb73c1b7 9097 spin_unlock(&fs_info->delalloc_root_lock);
1eafa6c7 9098
6c255e67 9099 ret = 0;
eb73c1b7
MX
9100 atomic_inc(&fs_info->async_submit_draining);
9101 while (atomic_read(&fs_info->nr_async_submits) ||
9102 atomic_read(&fs_info->async_delalloc_pages)) {
9103 wait_event(fs_info->async_submit_wait,
9104 (atomic_read(&fs_info->nr_async_submits) == 0 &&
9105 atomic_read(&fs_info->async_delalloc_pages) == 0));
9106 }
9107 atomic_dec(&fs_info->async_submit_draining);
eb73c1b7 9108out:
1eafa6c7 9109 if (!list_empty_careful(&splice)) {
eb73c1b7
MX
9110 spin_lock(&fs_info->delalloc_root_lock);
9111 list_splice_tail(&splice, &fs_info->delalloc_roots);
9112 spin_unlock(&fs_info->delalloc_root_lock);
1eafa6c7 9113 }
573bfb72 9114 mutex_unlock(&fs_info->delalloc_root_mutex);
8ccf6f19 9115 return ret;
ea8c2819
CM
9116}
9117
39279cc3
CM
9118static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9119 const char *symname)
9120{
9121 struct btrfs_trans_handle *trans;
9122 struct btrfs_root *root = BTRFS_I(dir)->root;
9123 struct btrfs_path *path;
9124 struct btrfs_key key;
1832a6d5 9125 struct inode *inode = NULL;
39279cc3
CM
9126 int err;
9127 int drop_inode = 0;
9128 u64 objectid;
67871254 9129 u64 index = 0;
39279cc3
CM
9130 int name_len;
9131 int datasize;
5f39d397 9132 unsigned long ptr;
39279cc3 9133 struct btrfs_file_extent_item *ei;
5f39d397 9134 struct extent_buffer *leaf;
39279cc3 9135
f06becc4 9136 name_len = strlen(symname);
39279cc3
CM
9137 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
9138 return -ENAMETOOLONG;
1832a6d5 9139
9ed74f2d
JB
9140 /*
9141 * 2 items for inode item and ref
9142 * 2 items for dir items
9143 * 1 item for xattr if selinux is on
9144 */
a22285a6
YZ
9145 trans = btrfs_start_transaction(root, 5);
9146 if (IS_ERR(trans))
9147 return PTR_ERR(trans);
1832a6d5 9148
581bb050
LZ
9149 err = btrfs_find_free_ino(root, &objectid);
9150 if (err)
9151 goto out_unlock;
9152
aec7477b 9153 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
33345d01 9154 dentry->d_name.len, btrfs_ino(dir), objectid,
d82a6f1d 9155 S_IFLNK|S_IRWXUGO, &index);
7cf96da3
TI
9156 if (IS_ERR(inode)) {
9157 err = PTR_ERR(inode);
39279cc3 9158 goto out_unlock;
7cf96da3 9159 }
39279cc3 9160
ad19db71
CS
9161 /*
9162 * If the active LSM wants to access the inode during
9163 * d_instantiate it needs these. Smack checks to see
9164 * if the filesystem supports xattrs by looking at the
9165 * ops vector.
9166 */
9167 inode->i_fop = &btrfs_file_operations;
9168 inode->i_op = &btrfs_file_inode_operations;
b0d5d10f
CM
9169 inode->i_mapping->a_ops = &btrfs_aops;
9170 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9171 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9172
9173 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
9174 if (err)
9175 goto out_unlock_inode;
ad19db71 9176
a1b075d2 9177 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
39279cc3 9178 if (err)
b0d5d10f 9179 goto out_unlock_inode;
39279cc3
CM
9180
9181 path = btrfs_alloc_path();
d8926bb3
MF
9182 if (!path) {
9183 err = -ENOMEM;
b0d5d10f 9184 goto out_unlock_inode;
d8926bb3 9185 }
33345d01 9186 key.objectid = btrfs_ino(inode);
39279cc3 9187 key.offset = 0;
962a298f 9188 key.type = BTRFS_EXTENT_DATA_KEY;
39279cc3
CM
9189 datasize = btrfs_file_extent_calc_inline_size(name_len);
9190 err = btrfs_insert_empty_item(trans, root, path, &key,
9191 datasize);
54aa1f4d 9192 if (err) {
b0839166 9193 btrfs_free_path(path);
b0d5d10f 9194 goto out_unlock_inode;
54aa1f4d 9195 }
5f39d397
CM
9196 leaf = path->nodes[0];
9197 ei = btrfs_item_ptr(leaf, path->slots[0],
9198 struct btrfs_file_extent_item);
9199 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9200 btrfs_set_file_extent_type(leaf, ei,
39279cc3 9201 BTRFS_FILE_EXTENT_INLINE);
c8b97818
CM
9202 btrfs_set_file_extent_encryption(leaf, ei, 0);
9203 btrfs_set_file_extent_compression(leaf, ei, 0);
9204 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9205 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9206
39279cc3 9207 ptr = btrfs_file_extent_inline_start(ei);
5f39d397
CM
9208 write_extent_buffer(leaf, symname, ptr, name_len);
9209 btrfs_mark_buffer_dirty(leaf);
39279cc3 9210 btrfs_free_path(path);
5f39d397 9211
39279cc3
CM
9212 inode->i_op = &btrfs_symlink_inode_operations;
9213 inode->i_mapping->a_ops = &btrfs_symlink_aops;
04160088 9214 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
d899e052 9215 inode_set_bytes(inode, name_len);
f06becc4 9216 btrfs_i_size_write(inode, name_len);
54aa1f4d 9217 err = btrfs_update_inode(trans, root, inode);
b0d5d10f 9218 if (err) {
54aa1f4d 9219 drop_inode = 1;
b0d5d10f
CM
9220 goto out_unlock_inode;
9221 }
9222
9223 unlock_new_inode(inode);
9224 d_instantiate(dentry, inode);
39279cc3
CM
9225
9226out_unlock:
7ad85bb7 9227 btrfs_end_transaction(trans, root);
39279cc3
CM
9228 if (drop_inode) {
9229 inode_dec_link_count(inode);
9230 iput(inode);
9231 }
b53d3f5d 9232 btrfs_btree_balance_dirty(root);
39279cc3 9233 return err;
b0d5d10f
CM
9234
9235out_unlock_inode:
9236 drop_inode = 1;
9237 unlock_new_inode(inode);
9238 goto out_unlock;
39279cc3 9239}
16432985 9240
0af3d00b
JB
9241static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9242 u64 start, u64 num_bytes, u64 min_size,
9243 loff_t actual_len, u64 *alloc_hint,
9244 struct btrfs_trans_handle *trans)
d899e052 9245{
5dc562c5
JB
9246 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
9247 struct extent_map *em;
d899e052
YZ
9248 struct btrfs_root *root = BTRFS_I(inode)->root;
9249 struct btrfs_key ins;
d899e052 9250 u64 cur_offset = start;
55a61d1d 9251 u64 i_size;
154ea289 9252 u64 cur_bytes;
d899e052 9253 int ret = 0;
0af3d00b 9254 bool own_trans = true;
d899e052 9255
0af3d00b
JB
9256 if (trans)
9257 own_trans = false;
d899e052 9258 while (num_bytes > 0) {
0af3d00b
JB
9259 if (own_trans) {
9260 trans = btrfs_start_transaction(root, 3);
9261 if (IS_ERR(trans)) {
9262 ret = PTR_ERR(trans);
9263 break;
9264 }
5a303d5d
YZ
9265 }
9266
154ea289
CM
9267 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
9268 cur_bytes = max(cur_bytes, min_size);
00361589 9269 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
e570fd27 9270 *alloc_hint, &ins, 1, 0);
5a303d5d 9271 if (ret) {
0af3d00b
JB
9272 if (own_trans)
9273 btrfs_end_transaction(trans, root);
a22285a6 9274 break;
d899e052 9275 }
5a303d5d 9276
d899e052
YZ
9277 ret = insert_reserved_file_extent(trans, inode,
9278 cur_offset, ins.objectid,
9279 ins.offset, ins.offset,
920bbbfb 9280 ins.offset, 0, 0, 0,
d899e052 9281 BTRFS_FILE_EXTENT_PREALLOC);
79787eaa 9282 if (ret) {
857cc2fc 9283 btrfs_free_reserved_extent(root, ins.objectid,
e570fd27 9284 ins.offset, 0);
79787eaa
JM
9285 btrfs_abort_transaction(trans, root, ret);
9286 if (own_trans)
9287 btrfs_end_transaction(trans, root);
9288 break;
9289 }
a1ed835e
CM
9290 btrfs_drop_extent_cache(inode, cur_offset,
9291 cur_offset + ins.offset -1, 0);
5a303d5d 9292
5dc562c5
JB
9293 em = alloc_extent_map();
9294 if (!em) {
9295 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
9296 &BTRFS_I(inode)->runtime_flags);
9297 goto next;
9298 }
9299
9300 em->start = cur_offset;
9301 em->orig_start = cur_offset;
9302 em->len = ins.offset;
9303 em->block_start = ins.objectid;
9304 em->block_len = ins.offset;
b4939680 9305 em->orig_block_len = ins.offset;
cc95bef6 9306 em->ram_bytes = ins.offset;
5dc562c5
JB
9307 em->bdev = root->fs_info->fs_devices->latest_bdev;
9308 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9309 em->generation = trans->transid;
9310
9311 while (1) {
9312 write_lock(&em_tree->lock);
09a2a8f9 9313 ret = add_extent_mapping(em_tree, em, 1);
5dc562c5
JB
9314 write_unlock(&em_tree->lock);
9315 if (ret != -EEXIST)
9316 break;
9317 btrfs_drop_extent_cache(inode, cur_offset,
9318 cur_offset + ins.offset - 1,
9319 0);
9320 }
9321 free_extent_map(em);
9322next:
d899e052
YZ
9323 num_bytes -= ins.offset;
9324 cur_offset += ins.offset;
efa56464 9325 *alloc_hint = ins.objectid + ins.offset;
5a303d5d 9326
0c4d2d95 9327 inode_inc_iversion(inode);
d899e052 9328 inode->i_ctime = CURRENT_TIME;
6cbff00f 9329 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
d899e052 9330 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
efa56464
YZ
9331 (actual_len > inode->i_size) &&
9332 (cur_offset > inode->i_size)) {
d1ea6a61 9333 if (cur_offset > actual_len)
55a61d1d 9334 i_size = actual_len;
d1ea6a61 9335 else
55a61d1d
JB
9336 i_size = cur_offset;
9337 i_size_write(inode, i_size);
9338 btrfs_ordered_update_i_size(inode, i_size, NULL);
5a303d5d
YZ
9339 }
9340
d899e052 9341 ret = btrfs_update_inode(trans, root, inode);
79787eaa
JM
9342
9343 if (ret) {
9344 btrfs_abort_transaction(trans, root, ret);
9345 if (own_trans)
9346 btrfs_end_transaction(trans, root);
9347 break;
9348 }
d899e052 9349
0af3d00b
JB
9350 if (own_trans)
9351 btrfs_end_transaction(trans, root);
5a303d5d 9352 }
d899e052
YZ
9353 return ret;
9354}
9355
0af3d00b
JB
9356int btrfs_prealloc_file_range(struct inode *inode, int mode,
9357 u64 start, u64 num_bytes, u64 min_size,
9358 loff_t actual_len, u64 *alloc_hint)
9359{
9360 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9361 min_size, actual_len, alloc_hint,
9362 NULL);
9363}
9364
9365int btrfs_prealloc_file_range_trans(struct inode *inode,
9366 struct btrfs_trans_handle *trans, int mode,
9367 u64 start, u64 num_bytes, u64 min_size,
9368 loff_t actual_len, u64 *alloc_hint)
9369{
9370 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9371 min_size, actual_len, alloc_hint, trans);
9372}
9373
e6dcd2dc
CM
9374static int btrfs_set_page_dirty(struct page *page)
9375{
e6dcd2dc
CM
9376 return __set_page_dirty_nobuffers(page);
9377}
9378
10556cb2 9379static int btrfs_permission(struct inode *inode, int mask)
fdebe2bd 9380{
b83cc969 9381 struct btrfs_root *root = BTRFS_I(inode)->root;
cb6db4e5 9382 umode_t mode = inode->i_mode;
b83cc969 9383
cb6db4e5
JM
9384 if (mask & MAY_WRITE &&
9385 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9386 if (btrfs_root_readonly(root))
9387 return -EROFS;
9388 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9389 return -EACCES;
9390 }
2830ba7f 9391 return generic_permission(inode, mask);
fdebe2bd 9392}
39279cc3 9393
ef3b9af5
FM
9394static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9395{
9396 struct btrfs_trans_handle *trans;
9397 struct btrfs_root *root = BTRFS_I(dir)->root;
9398 struct inode *inode = NULL;
9399 u64 objectid;
9400 u64 index;
9401 int ret = 0;
9402
9403 /*
9404 * 5 units required for adding orphan entry
9405 */
9406 trans = btrfs_start_transaction(root, 5);
9407 if (IS_ERR(trans))
9408 return PTR_ERR(trans);
9409
9410 ret = btrfs_find_free_ino(root, &objectid);
9411 if (ret)
9412 goto out;
9413
9414 inode = btrfs_new_inode(trans, root, dir, NULL, 0,
9415 btrfs_ino(dir), objectid, mode, &index);
9416 if (IS_ERR(inode)) {
9417 ret = PTR_ERR(inode);
9418 inode = NULL;
9419 goto out;
9420 }
9421
ef3b9af5
FM
9422 inode->i_fop = &btrfs_file_operations;
9423 inode->i_op = &btrfs_file_inode_operations;
9424
9425 inode->i_mapping->a_ops = &btrfs_aops;
9426 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9427 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9428
b0d5d10f
CM
9429 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9430 if (ret)
9431 goto out_inode;
9432
9433 ret = btrfs_update_inode(trans, root, inode);
9434 if (ret)
9435 goto out_inode;
ef3b9af5
FM
9436 ret = btrfs_orphan_add(trans, inode);
9437 if (ret)
b0d5d10f 9438 goto out_inode;
ef3b9af5 9439
5762b5c9
FM
9440 /*
9441 * We set number of links to 0 in btrfs_new_inode(), and here we set
9442 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
9443 * through:
9444 *
9445 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9446 */
9447 set_nlink(inode, 1);
b0d5d10f 9448 unlock_new_inode(inode);
ef3b9af5
FM
9449 d_tmpfile(dentry, inode);
9450 mark_inode_dirty(inode);
9451
9452out:
9453 btrfs_end_transaction(trans, root);
9454 if (ret)
9455 iput(inode);
9456 btrfs_balance_delayed_items(root);
9457 btrfs_btree_balance_dirty(root);
ef3b9af5 9458 return ret;
b0d5d10f
CM
9459
9460out_inode:
9461 unlock_new_inode(inode);
9462 goto out;
9463
ef3b9af5
FM
9464}
9465
6e1d5dcc 9466static const struct inode_operations btrfs_dir_inode_operations = {
3394e160 9467 .getattr = btrfs_getattr,
39279cc3
CM
9468 .lookup = btrfs_lookup,
9469 .create = btrfs_create,
9470 .unlink = btrfs_unlink,
9471 .link = btrfs_link,
9472 .mkdir = btrfs_mkdir,
9473 .rmdir = btrfs_rmdir,
80ace85c 9474 .rename2 = btrfs_rename2,
39279cc3
CM
9475 .symlink = btrfs_symlink,
9476 .setattr = btrfs_setattr,
618e21d5 9477 .mknod = btrfs_mknod,
95819c05
CH
9478 .setxattr = btrfs_setxattr,
9479 .getxattr = btrfs_getxattr,
5103e947 9480 .listxattr = btrfs_listxattr,
95819c05 9481 .removexattr = btrfs_removexattr,
fdebe2bd 9482 .permission = btrfs_permission,
4e34e719 9483 .get_acl = btrfs_get_acl,
996a710d 9484 .set_acl = btrfs_set_acl,
93fd63c2 9485 .update_time = btrfs_update_time,
ef3b9af5 9486 .tmpfile = btrfs_tmpfile,
39279cc3 9487};
6e1d5dcc 9488static const struct inode_operations btrfs_dir_ro_inode_operations = {
39279cc3 9489 .lookup = btrfs_lookup,
fdebe2bd 9490 .permission = btrfs_permission,
4e34e719 9491 .get_acl = btrfs_get_acl,
996a710d 9492 .set_acl = btrfs_set_acl,
93fd63c2 9493 .update_time = btrfs_update_time,
39279cc3 9494};
76dda93c 9495
828c0950 9496static const struct file_operations btrfs_dir_file_operations = {
39279cc3
CM
9497 .llseek = generic_file_llseek,
9498 .read = generic_read_dir,
9cdda8d3 9499 .iterate = btrfs_real_readdir,
34287aa3 9500 .unlocked_ioctl = btrfs_ioctl,
39279cc3 9501#ifdef CONFIG_COMPAT
34287aa3 9502 .compat_ioctl = btrfs_ioctl,
39279cc3 9503#endif
6bf13c0c 9504 .release = btrfs_release_file,
e02119d5 9505 .fsync = btrfs_sync_file,
39279cc3
CM
9506};
9507
d1310b2e 9508static struct extent_io_ops btrfs_extent_io_ops = {
07157aac 9509 .fill_delalloc = run_delalloc_range,
065631f6 9510 .submit_bio_hook = btrfs_submit_bio_hook,
239b14b3 9511 .merge_bio_hook = btrfs_merge_bio_hook,
07157aac 9512 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
e6dcd2dc 9513 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
247e743c 9514 .writepage_start_hook = btrfs_writepage_start_hook,
b0c68f8b
CM
9515 .set_bit_hook = btrfs_set_bit_hook,
9516 .clear_bit_hook = btrfs_clear_bit_hook,
9ed74f2d
JB
9517 .merge_extent_hook = btrfs_merge_extent_hook,
9518 .split_extent_hook = btrfs_split_extent_hook,
07157aac
CM
9519};
9520
35054394
CM
9521/*
9522 * btrfs doesn't support the bmap operation because swapfiles
9523 * use bmap to make a mapping of extents in the file. They assume
9524 * these extents won't change over the life of the file and they
9525 * use the bmap result to do IO directly to the drive.
9526 *
9527 * the btrfs bmap call would return logical addresses that aren't
9528 * suitable for IO and they also will change frequently as COW
9529 * operations happen. So, swapfile + btrfs == corruption.
9530 *
9531 * For now we're avoiding this by dropping bmap.
9532 */
7f09410b 9533static const struct address_space_operations btrfs_aops = {
39279cc3
CM
9534 .readpage = btrfs_readpage,
9535 .writepage = btrfs_writepage,
b293f02e 9536 .writepages = btrfs_writepages,
3ab2fb5a 9537 .readpages = btrfs_readpages,
16432985 9538 .direct_IO = btrfs_direct_IO,
a52d9a80
CM
9539 .invalidatepage = btrfs_invalidatepage,
9540 .releasepage = btrfs_releasepage,
e6dcd2dc 9541 .set_page_dirty = btrfs_set_page_dirty,
465fdd97 9542 .error_remove_page = generic_error_remove_page,
39279cc3
CM
9543};
9544
7f09410b 9545static const struct address_space_operations btrfs_symlink_aops = {
39279cc3
CM
9546 .readpage = btrfs_readpage,
9547 .writepage = btrfs_writepage,
2bf5a725
CM
9548 .invalidatepage = btrfs_invalidatepage,
9549 .releasepage = btrfs_releasepage,
39279cc3
CM
9550};
9551
6e1d5dcc 9552static const struct inode_operations btrfs_file_inode_operations = {
39279cc3
CM
9553 .getattr = btrfs_getattr,
9554 .setattr = btrfs_setattr,
95819c05
CH
9555 .setxattr = btrfs_setxattr,
9556 .getxattr = btrfs_getxattr,
5103e947 9557 .listxattr = btrfs_listxattr,
95819c05 9558 .removexattr = btrfs_removexattr,
fdebe2bd 9559 .permission = btrfs_permission,
1506fcc8 9560 .fiemap = btrfs_fiemap,
4e34e719 9561 .get_acl = btrfs_get_acl,
996a710d 9562 .set_acl = btrfs_set_acl,
e41f941a 9563 .update_time = btrfs_update_time,
39279cc3 9564};
6e1d5dcc 9565static const struct inode_operations btrfs_special_inode_operations = {
618e21d5
JB
9566 .getattr = btrfs_getattr,
9567 .setattr = btrfs_setattr,
fdebe2bd 9568 .permission = btrfs_permission,
95819c05
CH
9569 .setxattr = btrfs_setxattr,
9570 .getxattr = btrfs_getxattr,
33268eaf 9571 .listxattr = btrfs_listxattr,
95819c05 9572 .removexattr = btrfs_removexattr,
4e34e719 9573 .get_acl = btrfs_get_acl,
996a710d 9574 .set_acl = btrfs_set_acl,
e41f941a 9575 .update_time = btrfs_update_time,
618e21d5 9576};
6e1d5dcc 9577static const struct inode_operations btrfs_symlink_inode_operations = {
39279cc3
CM
9578 .readlink = generic_readlink,
9579 .follow_link = page_follow_link_light,
9580 .put_link = page_put_link,
f209561a 9581 .getattr = btrfs_getattr,
22c44fe6 9582 .setattr = btrfs_setattr,
fdebe2bd 9583 .permission = btrfs_permission,
0279b4cd
JO
9584 .setxattr = btrfs_setxattr,
9585 .getxattr = btrfs_getxattr,
9586 .listxattr = btrfs_listxattr,
9587 .removexattr = btrfs_removexattr,
e41f941a 9588 .update_time = btrfs_update_time,
39279cc3 9589};
76dda93c 9590
82d339d9 9591const struct dentry_operations btrfs_dentry_operations = {
76dda93c 9592 .d_delete = btrfs_dentry_delete,
b4aff1f8 9593 .d_release = btrfs_dentry_release,
76dda93c 9594};