2 * Copyright (C) 2007 Oracle. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
20 #include <linux/pagemap.h>
21 #include <linux/highmem.h>
22 #include <linux/time.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/smp_lock.h>
26 #include <linux/backing-dev.h>
27 #include <linux/mpage.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/statfs.h>
31 #include <linux/compat.h>
32 #include <linux/version.h>
35 #include "transaction.h"
36 #include "btrfs_inode.h"
38 #include "print-tree.h"
/*
 * btrfs_copy_from_user - copy user data into an array of prepared pages.
 *
 * Walks prepared_pages[] for at most num_pages entries while write_bytes
 * is still positive. The first page is written starting at the in-page
 * offset of pos; the loop increment resets offset to 0 so every later page
 * is written from its start. Returns -EFAULT when page_fault is non-zero
 * at the end, 0 otherwise.
 *
 * NOTE(review): the embedded original line numbers skip, so several lines
 * of this function (braces, local declarations, the remaining arguments
 * of the __copy_from_user call, and whatever follows the dcache flush) are
 * not visible here. Only the visible statements are described.
 */
41 static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
42 struct page **prepared_pages,
43 const char __user * buf)
/* byte offset of pos inside its page */
47 int offset = pos & (PAGE_CACHE_SIZE - 1);
49 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
/* never copy past the end of the current page or past write_bytes */
50 size_t count = min_t(size_t,
51 PAGE_CACHE_SIZE - offset, write_bytes);
52 struct page *page = prepared_pages[i];
/* the return value of fault_in_pages_readable is discarded here */
53 fault_in_pages_readable(buf, count);
55 /* Copy data from userspace to the current page */
57 page_fault = __copy_from_user(page_address(page) + offset,
59 /* Flush processor's dcache for this page */
60 flush_dcache_page(page);
68 return page_fault ? -EFAULT : 0;
/*
 * btrfs_drop_pages - release an array of pages.
 *
 * For each of the num_pages entries the visible code unlocks the page,
 * marks it accessed and drops one page cache reference.
 *
 * NOTE(review): original lines 75-76 are not visible; any per-entry guard
 * that may sit there (for example a NULL check) cannot be confirmed.
 */
71 static void btrfs_drop_pages(struct page **pages, size_t num_pages)
74 for (i = 0; i < num_pages; i++) {
77 unlock_page(pages[i]);
78 mark_page_accessed(pages[i]);
79 page_cache_release(pages[i]);
/*
 * insert_inline_extent - store file data directly inside a file extent item.
 *
 * Visible flow:
 *  - allocate a path and build an EXTENT_DATA key for inode->i_ino;
 *  - search the tree for that key and inspect the item that was found
 *    (owner inode, key type, inline extent type);
 *  - when an existing inline item ends before offset + size, re-search and
 *    grow it with btrfs_extend_item by the missing number of bytes, zeroing
 *    any gap between the old end and offset;
 *  - otherwise release the path and insert a new empty item sized with
 *    btrfs_file_extent_calc_inline_size, stamping generation and type;
 *  - copy the page contents into the leaf and mark the leaf dirty.
 *
 * NOTE(review): the embedded original line numbers skip, so the parameter
 * list tail, several declarations, all goto/else/brace lines and the page
 * copy loop header are not visible. The branch structure above is inferred
 * from statement order only and should be confirmed against the full file.
 */
83 static int insert_inline_extent(struct btrfs_trans_handle *trans,
84 struct btrfs_root *root, struct inode *inode,
85 u64 offset, size_t size,
86 struct page **pages, size_t page_offset,
90 struct btrfs_path *path;
91 struct extent_buffer *leaf;
94 struct btrfs_file_extent_item *ei;
102 path = btrfs_alloc_path();
106 btrfs_set_trans_block_group(trans, inode);
108 key.objectid = inode->i_ino;
110 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
112 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
118 struct btrfs_key found_key;
120 if (path->slots[0] == 0)
124 leaf = path->nodes[0];
125 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
/* the found item must belong to this inode and be an extent data item */
127 if (found_key.objectid != inode->i_ino)
130 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
132 ei = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_file_extent_item);
135 if (btrfs_file_extent_type(leaf, ei) !=
136 BTRFS_FILE_EXTENT_INLINE) {
/* adopt the key of the item that was actually found */
139 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
146 leaf = path->nodes[0];
147 ei = btrfs_item_ptr(leaf, path->slots[0],
148 struct btrfs_file_extent_item);
/* diagnostic output when the item is not an inline extent */
150 if (btrfs_file_extent_type(leaf, ei) !=
151 BTRFS_FILE_EXTENT_INLINE) {
153 btrfs_print_leaf(root, leaf);
154 printk("found wasn't inline offset %Lu inode %lu\n",
155 offset, inode->i_ino);
/* found_end is the file offset just past the existing inline data */
158 found_size = btrfs_file_extent_inline_len(leaf,
159 btrfs_item_nr(leaf, path->slots[0]));
160 found_end = key.offset + found_size;
/* existing item is too short: search again, then extend it by the shortfall */
162 if (found_end < offset + size) {
163 btrfs_release_path(root, path);
164 ret = btrfs_search_slot(trans, root, &key, path,
165 offset + size - found_end, 1);
168 ret = btrfs_extend_item(trans, root, path,
169 offset + size - found_end);
174 leaf = path->nodes[0];
175 ei = btrfs_item_ptr(leaf, path->slots[0],
176 struct btrfs_file_extent_item);
/* zero the bytes between the old end of the inline data and offset */
178 if (found_end < offset) {
179 ptr = btrfs_file_extent_inline_start(ei) + found_size;
180 memset_extent_buffer(leaf, 0, ptr, offset - found_end);
/* insert a fresh inline item covering key.offset up to offset + size */
184 btrfs_release_path(root, path);
185 datasize = offset + size - key.offset;
186 datasize = btrfs_file_extent_calc_inline_size(datasize);
187 ret = btrfs_insert_empty_item(trans, root, path, &key,
191 printk("got bad ret %d\n", ret);
194 leaf = path->nodes[0];
195 ei = btrfs_item_ptr(leaf, path->slots[0],
196 struct btrfs_file_extent_item);
197 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
198 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
/* destination inside the item for the byte at file position offset */
200 ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
/* copy at most the rest of the current page into the leaf */
206 kaddr = kmap_atomic(page, KM_USER0);
207 cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
208 write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
209 kunmap_atomic(kaddr, KM_USER0);
/* diagnostic output when the page index reaches num_pages */
213 if (i >= num_pages) {
214 printk("i %d num_pages %d\n", i, num_pages);
218 btrfs_mark_buffer_dirty(leaf);
220 btrfs_free_path(path);
/*
 * dirty_and_release_pages - record a completed copy of write_bytes bytes
 * at pos in the extent tree and inode.
 *
 * Visible flow:
 *  - round the written range out to sectorsize boundaries;
 *  - take snap_sem for reading, lock the extent range, take fs_mutex and
 *    start a transaction (released in the reverse order at the end);
 *  - account num_bytes in inode->i_blocks and mark the range uptodate;
 *  - when the write starts beyond the current file size, drop extents and
 *    insert a file extent for the hole;
 *  - depending on file size and end position, either mark the range
 *    delalloc or drop the old extents and insert an inline extent;
 *  - grow i_size and update the inode when the write extended the file.
 *
 * NOTE(review): the trans argument is overwritten by the result of
 * btrfs_start_transaction below; the visible caller passes NULL.
 * NOTE(review): the embedded original line numbers skip, so the parameter
 * list tail, several declarations, error checks, else/brace lines and the
 * return statement are not visible. Branch structure is inferred from
 * statement order and should be confirmed against the full file.
 */
224 static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
225 struct btrfs_root *root,
234 struct inode *inode = file->f_path.dentry->d_inode;
235 struct extent_map *em;
236 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
240 u64 end_of_last_block;
241 u64 end_pos = pos + write_bytes;
243 loff_t isize = i_size_read(inode);
245 em = alloc_extent_map(GFP_NOFS);
249 em->bdev = inode->i_sb->s_bdev;
/* start_pos: pos rounded down to a sectorsize boundary */
251 start_pos = pos & ~((u64)root->sectorsize - 1);
/* num_bytes: length from start_pos to the end of the write, rounded up */
252 num_bytes = (write_bytes + pos - start_pos +
253 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
255 down_read(&BTRFS_I(inode)->root->snap_sem);
256 end_of_last_block = start_pos + num_bytes - 1;
258 lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
259 mutex_lock(&root->fs_info->fs_mutex);
260 trans = btrfs_start_transaction(root, 1);
265 btrfs_set_trans_block_group(trans, inode);
/* i_blocks is kept in units of 512 bytes (shift by 9) */
266 inode->i_blocks += num_bytes >> 9;
/* debugging aid: an inclusive end offset is not expected to be 4096-aligned */
269 if ((end_of_last_block & 4095) == 0) {
270 printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
272 set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS);
274 /* FIXME...EIEIO, ENOSPC and more */
276 /* insert any holes we need to create */
/* NOTE(review): reads inode->i_size directly while isize above came from i_size_read */
277 if (inode->i_size < start_pos) {
278 u64 last_pos_in_file;
280 u64 mask = root->sectorsize - 1;
281 last_pos_in_file = (isize + mask) & ~mask;
282 hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
284 if (last_pos_in_file < start_pos) {
285 err = btrfs_drop_extents(trans, root, inode,
287 last_pos_in_file + hole_size,
293 err = btrfs_insert_file_extent(trans, root,
303 * either allocate an extent for the new bytes or setup the key
304 * to show we are doing inline data in the extent
306 inline_size = end_pos;
/* file or write end too large for inline storage: use the delalloc path */
307 if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
308 inline_size > 32768 ||
309 inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
312 for (i = 0; i < num_pages; i++) {
313 struct page *p = pages[i];
317 last_end = (u64)(pages[num_pages -1]->index) <<
319 last_end += PAGE_CACHE_SIZE - 1;
320 set_extent_delalloc(em_tree, start_pos, end_of_last_block,
324 /* step one, delete the existing extents in this range */
325 aligned_end = (pos + write_bytes + root->sectorsize - 1) &
326 ~((u64)root->sectorsize - 1);
327 err = btrfs_drop_extents(trans, root, inode, start_pos,
328 aligned_end, aligned_end, &hint_byte);
331 err = insert_inline_extent(trans, root, inode, start_pos,
332 end_pos - start_pos, pages, 0,
/* the write extended the file: publish the new size and update the inode */
336 if (end_pos > isize) {
337 i_size_write(inode, end_pos);
338 btrfs_update_inode(trans, root, inode);
/* unwind in reverse order of acquisition */
341 err = btrfs_end_transaction(trans, root);
343 mutex_unlock(&root->fs_info->fs_mutex);
344 unlock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
346 up_read(&BTRFS_I(inode)->root->snap_sem);
/*
 * btrfs_drop_extent_cache - remove cached extent mappings for an inode.
 *
 * Looks up a mapping in the inode extent map tree for the range start..end
 * and removes it from the tree.
 *
 * NOTE(review): the loop construct, the termination test, the reference
 * drops and the return statement are on original lines that are not
 * visible here.
 */
350 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
352 struct extent_map *em;
353 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
356 em = lookup_extent_mapping(em_tree, start, end);
359 remove_extent_mapping(em_tree, em);
362 /* once for the tree*/
369 * this is very complex, but the basic idea is to drop all extents
370 * in the range start - end. hint_byte is filled in with a block number
371 * that would be a good hint to the block allocator for this file.
373 * If an extent intersects the range but is not entirely inside the range
374 * it is either truncated or split. Anything entirely inside the range
375 * is deleted from the tree.
/*
 * btrfs_drop_extents - remove file extent items overlapping start..end.
 *
 * Visible flow:
 *  - drop cached extent mappings for start..end - 1 and allocate a path;
 *  - look up the file extent item at the current search position and read
 *    its key; stop once the key is at or past end or belongs to another
 *    inode;
 *  - compute extent_end from the item (num_bytes for regular extents,
 *    inline length for inline extents);
 *  - when the item extends past end, keep a copy of it in old so that the
 *    tail can be re-inserted later (the bookend);
 *  - when the item starts before start, shorten it; an inline item may be
 *    truncated to inline_end; otherwise the whole item is deleted and its
 *    disk extent freed;
 *  - finally create the bookend item for the part beyond end.
 *
 * NOTE(review): the embedded original line numbers skip heavily in this
 * function. Loop constructs, flag assignments (bookend, keep,
 * found_extent, found_inline), goto/else/brace lines and the return are
 * not visible, so the description above is inferred from statement order
 * and must be confirmed against the full file.
 */
377 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
378 struct btrfs_root *root, struct inode *inode,
379 u64 start, u64 end, u64 inline_end, u64 *hint_byte)
382 struct btrfs_key key;
383 struct extent_buffer *leaf;
385 struct btrfs_file_extent_item *extent;
388 struct btrfs_file_extent_item old;
389 struct btrfs_path *path;
390 u64 search_start = start;
/* end is passed as an inclusive bound to the cache drop, hence end - 1 */
397 btrfs_drop_extent_cache(inode, start, end - 1);
399 path = btrfs_alloc_path();
404 btrfs_release_path(root, path);
405 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
410 if (path->slots[0] == 0) {
422 leaf = path->nodes[0];
423 slot = path->slots[0];
425 btrfs_item_key_to_cpu(leaf, &key, slot);
/* item is past the range or belongs to another inode */
426 if (key.offset >= end || key.objectid != inode->i_ino) {
429 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY) {
433 search_start = key.offset;
/* work out where the found extent ends in file offsets */
436 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
437 extent = btrfs_item_ptr(leaf, slot,
438 struct btrfs_file_extent_item);
439 found_type = btrfs_file_extent_type(leaf, extent);
440 if (found_type == BTRFS_FILE_EXTENT_REG) {
441 extent_end = key.offset +
442 btrfs_file_extent_num_bytes(leaf, extent);
444 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
445 struct btrfs_item *item;
446 item = btrfs_item_nr(leaf, slot);
448 extent_end = key.offset +
449 btrfs_file_extent_inline_len(leaf, item);
452 extent_end = search_start;
455 /* we found nothing we can drop */
456 if ((!found_extent && !found_inline) ||
457 search_start >= extent_end) {
460 nritems = btrfs_header_nritems(leaf);
/* last slot of the leaf: advance to the next leaf */
461 if (slot >= nritems - 1) {
462 nextret = btrfs_next_leaf(root, path);
/* next search position, rounded up to sectorsize in one of the two cases */
473 u64 mask = root->sectorsize - 1;
474 search_start = (extent_end + mask) & ~mask;
476 search_start = extent_end;
/* the dropped range lies entirely inside one inline extent */
478 if (end <= extent_end && start >= key.offset && found_inline) {
479 *hint_byte = EXTENT_MAP_INLINE;
/* the extent continues past end: remember it for the bookend */
481 if (end < extent_end && end >= key.offset) {
484 btrfs_file_extent_disk_bytenr(leaf, extent);
486 btrfs_file_extent_disk_num_bytes(leaf,
488 read_extent_buffer(leaf, &old,
489 (unsigned long)extent,
/* a disk_bytenr of 0 is treated as having no disk extent to reference */
491 if (disk_bytenr != 0) {
492 ret = btrfs_inc_extent_ref(trans, root,
493 disk_bytenr, disk_num_bytes);
498 if (found_inline && start <= key.offset &&
499 inline_end < extent_end)
502 /* truncate existing extent */
503 if (start > key.offset) {
507 WARN_ON(start & (root->sectorsize - 1));
509 new_num = start - key.offset;
510 old_num = btrfs_file_extent_num_bytes(leaf,
513 btrfs_file_extent_disk_bytenr(leaf,
515 if (btrfs_file_extent_disk_bytenr(leaf,
518 (old_num - new_num) >> 9;
520 btrfs_set_file_extent_num_bytes(leaf, extent,
522 btrfs_mark_buffer_dirty(leaf);
/* inline item straddling inline_end: shrink it to end at inline_end */
523 } else if (end > extent_end &&
524 key.offset < inline_end &&
525 inline_end < extent_end) {
527 new_size = btrfs_file_extent_calc_inline_size(
528 inline_end - key.offset);
529 btrfs_truncate_item(trans, root, path,
533 /* delete the entire extent */
536 u64 disk_num_bytes = 0;
537 u64 extent_num_bytes = 0;
540 btrfs_file_extent_disk_bytenr(leaf,
543 btrfs_file_extent_disk_num_bytes(leaf,
546 btrfs_file_extent_num_bytes(leaf, extent);
548 btrfs_file_extent_disk_bytenr(leaf,
551 ret = btrfs_del_item(trans, root, path);
552 /* TODO update progress marker and return */
554 btrfs_release_path(root, path);
/* give back the block accounting and free the disk extent */
556 if (found_extent && disk_bytenr != 0) {
557 inode->i_blocks -= extent_num_bytes >> 9;
558 ret = btrfs_free_extent(trans, root,
564 if (!bookend && search_start >= end) {
/* inline bookend: keep only the bytes after inline_end */
571 if (bookend && found_inline && start <= key.offset &&
572 inline_end < extent_end) {
574 new_size = btrfs_file_extent_calc_inline_size(
575 extent_end - inline_end);
576 btrfs_truncate_item(trans, root, path, new_size, 0);
578 /* create bookend, splitting the extent in two */
579 if (bookend && found_extent) {
580 struct btrfs_key ins;
581 ins.objectid = inode->i_ino;
583 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
584 btrfs_release_path(root, path);
585 ret = btrfs_insert_empty_item(trans, root, path, &ins,
588 leaf = path->nodes[0];
590 btrfs_print_leaf(root, leaf);
591 printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.type, ins.offset, start, end, key.offset, extent_end, keep);
/* start from the saved copy of the original item, then adjust it */
594 extent = btrfs_item_ptr(leaf, path->slots[0],
595 struct btrfs_file_extent_item);
596 write_extent_buffer(leaf, &old,
597 (unsigned long)extent, sizeof(old));
/* the bookend begins end - key.offset bytes further into the disk extent */
599 btrfs_set_file_extent_offset(leaf, extent,
600 le64_to_cpu(old.offset) + end - key.offset);
601 WARN_ON(le64_to_cpu(old.num_bytes) <
603 btrfs_set_file_extent_num_bytes(leaf, extent,
605 btrfs_set_file_extent_type(leaf, extent,
606 BTRFS_FILE_EXTENT_REG);
608 btrfs_mark_buffer_dirty(path->nodes[0]);
609 if (le64_to_cpu(old.disk_bytenr) != 0) {
611 btrfs_file_extent_num_bytes(leaf,
619 btrfs_free_path(path);
624 * this gets pages into the page cache and locks them down
/*
 * prepare_pages - obtain num_pages page cache pages starting at the page
 * containing pos and get them ready to receive written data.
 *
 * The pages array is zeroed first. Each page is then obtained with
 * grab_cache_page, has its dirty state cancelled, is waited on for
 * writeback and is flagged with set_page_extent_mapped. The WARN_ON fires
 * if a page is not locked at that point.
 *
 * NOTE(review): original lines 647-650 (directly after grab_cache_page)
 * and the rest of the parameter list, declarations and return are not
 * visible, so failure handling cannot be confirmed from this listing.
 */
626 static int prepare_pages(struct btrfs_root *root,
631 unsigned long first_index,
632 unsigned long last_index,
/* page cache index of the first page touched by the write */
636 unsigned long index = pos >> PAGE_CACHE_SHIFT;
637 struct inode *inode = file->f_path.dentry->d_inode;
641 start_pos = pos & ~((u64)root->sectorsize - 1);
643 memset(pages, 0, num_pages * sizeof(struct page *));
645 for (i = 0; i < num_pages; i++) {
646 pages[i] = grab_cache_page(inode->i_mapping, index + i);
651 cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
652 wait_on_page_writeback(pages[i]);
653 set_page_extent_mapped(pages[i]);
654 WARN_ON(!PageLocked(pages[i]));
/*
 * btrfs_file_write - write handler installed in btrfs_file_operations.
 *
 * Visible flow:
 *  - size a page pointer array (nrptrs entries, at most one page worth of
 *    pointers) and run the generic write checks, suid removal and time
 *    update;
 *  - under i_mutex, read in the first and last page of the range when the
 *    range does not start or end on a page boundary, keeping a reference
 *    to each in pinned[];
 *  - repeatedly prepare up to nrptrs pages, copy user data into them,
 *    record the write with dirty_and_release_pages, drop the pages and
 *    advance the counters;
 *  - for O_SYNC files or sync inodes, flush the written range;
 *  - return num_written when it is non-zero, otherwise err.
 *
 * NOTE(review): the embedded original line numbers skip, so several
 * declarations, error checks, goto/label lines, the loop header and the
 * pos/buf advancement are not visible. Confirm against the full file.
 */
659 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
660 size_t count, loff_t *ppos)
664 ssize_t num_written = 0;
667 struct inode *inode = file->f_path.dentry->d_inode;
668 struct btrfs_root *root = BTRFS_I(inode)->root;
669 struct page **pages = NULL;
671 struct page *pinned[2];
672 unsigned long first_index;
673 unsigned long last_index;
/* pages needed for count bytes, capped at the pointers that fit in one page */
675 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
676 PAGE_CACHE_SIZE / (sizeof(struct page *)));
679 if (file->f_flags & O_DIRECT)
685 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
686 current->backing_dev_info = inode->i_mapping->backing_dev_info;
687 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
692 err = remove_suid(file->f_path.dentry);
695 file_update_time(file);
697 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
699 mutex_lock(&inode->i_mutex);
700 first_index = pos >> PAGE_CACHE_SHIFT;
701 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
704 * there are lots of better ways to do this, but this code
705 * makes sure the first and last page in the file range are
706 * up to date and ready for cow
/*
 * NOTE(review): the result of grab_cache_page is passed to PageUptodate
 * on the very next original line (709 to 710, and 719 to 720 below), so
 * no NULL check can sit between them.
 */
708 if ((pos & (PAGE_CACHE_SIZE - 1))) {
709 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
710 if (!PageUptodate(pinned[0])) {
711 ret = btrfs_readpage(NULL, pinned[0]);
713 wait_on_page_locked(pinned[0]);
715 unlock_page(pinned[0]);
718 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
719 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
720 if (!PageUptodate(pinned[1])) {
721 ret = btrfs_readpage(NULL, pinned[1]);
723 wait_on_page_locked(pinned[1]);
725 unlock_page(pinned[1]);
/* per-iteration sizing: bytes and pages handled in this pass */
730 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
731 size_t write_bytes = min(count, nrptrs *
732 (size_t)PAGE_CACHE_SIZE -
734 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
737 WARN_ON(num_pages > nrptrs);
/*
 * NOTE(review): pages is declared as struct page **, so sizeof(pages) is
 * the size of one pointer and this clears only the first array slot.
 * prepare_pages zeroes num_pages entries itself, which likely hides this.
 */
738 memset(pages, 0, sizeof(pages));
739 ret = prepare_pages(root, file, pages, num_pages,
740 pos, first_index, last_index,
745 ret = btrfs_copy_from_user(pos, num_pages,
746 write_bytes, pages, buf);
748 btrfs_drop_pages(pages, num_pages);
/* the transaction argument is NULL; the callee starts its own transaction */
752 ret = dirty_and_release_pages(NULL, root, file, pages,
753 num_pages, pos, write_bytes);
754 btrfs_drop_pages(pages, num_pages);
759 count -= write_bytes;
761 num_written += write_bytes;
763 balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
764 btrfs_btree_balance_dirty(root, 1);
767 mutex_unlock(&inode->i_mutex);
/* drop the references taken on the first and last page above */
771 page_cache_release(pinned[0]);
773 page_cache_release(pinned[1]);
/* synchronous writes: push the written range out before returning */
776 if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
777 err = sync_page_range(inode, inode->i_mapping,
778 start_pos, num_written);
782 current->backing_dev_info = NULL;
783 return num_written ? num_written : err;
/*
 * btrfs_sync_file - fsync handler installed in btrfs_file_operations.
 *
 * Under fs_mutex, checks the inode last_trans value. When it is zero, or
 * not newer than last_trans_committed (in which case it is reset to 0
 * under trans_mutex), there is a visible early-out test. Otherwise a
 * transaction is started and committed.
 *
 * NOTE(review): the final return yields positive EIO when ret is greater
 * than zero, whereas the other error return visible in this file uses a
 * negative errno (-EFAULT). This probably should be -EIO; confirm.
 * NOTE(review): the declaration of ret, the goto/label lines and the
 * braces are on original lines that are not visible here.
 */
786 static int btrfs_sync_file(struct file *file,
787 struct dentry *dentry, int datasync)
789 struct inode *inode = dentry->d_inode;
790 struct btrfs_root *root = BTRFS_I(inode)->root;
792 struct btrfs_trans_handle *trans;
795 * check the transaction that last modified this inode
796 * and see if its already been committed
798 mutex_lock(&root->fs_info->fs_mutex);
799 if (!BTRFS_I(inode)->last_trans)
801 mutex_lock(&root->fs_info->trans_mutex);
802 if (BTRFS_I(inode)->last_trans <=
803 root->fs_info->last_trans_committed) {
804 BTRFS_I(inode)->last_trans = 0;
805 mutex_unlock(&root->fs_info->trans_mutex);
808 mutex_unlock(&root->fs_info->trans_mutex);
811 * ok we haven't committed the transaction yet, lets do a commit
813 trans = btrfs_start_transaction(root, 1);
818 ret = btrfs_commit_transaction(trans, root);
820 mutex_unlock(&root->fs_info->fs_mutex);
821 return ret > 0 ? EIO : ret;
/*
 * VM operations installed on mappings of btrfs files by btrfs_file_mmap.
 * Kernels older than 2.6.23 get the filemap nopage and populate handlers.
 * NOTE(review): the preprocessor lines that separate .fault from the
 * older handlers are not visible here; presumably .fault is the newer
 * kernel alternative. Writable faults go through btrfs_page_mkwrite.
 */
824 static struct vm_operations_struct btrfs_file_vm_ops = {
825 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
826 .nopage = filemap_nopage,
827 .populate = filemap_populate,
829 .fault = filemap_fault,
831 .page_mkwrite = btrfs_page_mkwrite,
/*
 * btrfs_file_mmap - mmap handler installed in btrfs_file_operations.
 * Points the new mapping at btrfs_file_vm_ops.
 * NOTE(review): the remaining body lines and the return are not visible.
 */
834 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
836 vma->vm_ops = &btrfs_file_vm_ops;
/*
 * File operations table for btrfs files. Seek, read and open use the
 * generic helpers; write, mmap and fsync use the btrfs handlers defined
 * in this file; both ioctl entry points go to btrfs_ioctl.
 * NOTE(review): original line 850 and the closing lines are not visible;
 * presumably .compat_ioctl is guarded by a preprocessor conditional.
 */
841 struct file_operations btrfs_file_operations = {
842 .llseek = generic_file_llseek,
843 .read = do_sync_read,
844 .aio_read = generic_file_aio_read,
845 .write = btrfs_file_write,
846 .mmap = btrfs_file_mmap,
847 .open = generic_file_open,
848 .fsync = btrfs_sync_file,
849 .unlocked_ioctl = btrfs_ioctl,
851 .compat_ioctl = btrfs_ioctl,