Btrfs: change how subvolumes are organized
[linux-2.6-block.git] / fs / btrfs / ioctl.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/fsnotify.h>
25 #include <linux/pagemap.h>
26 #include <linux/highmem.h>
27 #include <linux/time.h>
28 #include <linux/init.h>
29 #include <linux/string.h>
30 #include <linux/backing-dev.h>
31 #include <linux/mount.h>
32 #include <linux/mpage.h>
33 #include <linux/namei.h>
34 #include <linux/swap.h>
35 #include <linux/writeback.h>
36 #include <linux/statfs.h>
37 #include <linux/compat.h>
38 #include <linux/bit_spinlock.h>
39 #include <linux/security.h>
40 #include <linux/xattr.h>
41 #include <linux/vmalloc.h>
42 #include "compat.h"
43 #include "ctree.h"
44 #include "disk-io.h"
45 #include "transaction.h"
46 #include "btrfs_inode.h"
47 #include "ioctl.h"
48 #include "print-tree.h"
49 #include "volumes.h"
50 #include "locking.h"
51
52 /* Mask out flags that are inappropriate for the given type of inode. */
53 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
54 {
55         if (S_ISDIR(mode))
56                 return flags;
57         else if (S_ISREG(mode))
58                 return flags & ~FS_DIRSYNC_FL;
59         else
60                 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
61 }
62
63 /*
64  * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
65  */
66 static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
67 {
68         unsigned int iflags = 0;
69
70         if (flags & BTRFS_INODE_SYNC)
71                 iflags |= FS_SYNC_FL;
72         if (flags & BTRFS_INODE_IMMUTABLE)
73                 iflags |= FS_IMMUTABLE_FL;
74         if (flags & BTRFS_INODE_APPEND)
75                 iflags |= FS_APPEND_FL;
76         if (flags & BTRFS_INODE_NODUMP)
77                 iflags |= FS_NODUMP_FL;
78         if (flags & BTRFS_INODE_NOATIME)
79                 iflags |= FS_NOATIME_FL;
80         if (flags & BTRFS_INODE_DIRSYNC)
81                 iflags |= FS_DIRSYNC_FL;
82
83         return iflags;
84 }
85
86 /*
87  * Update inode->i_flags based on the btrfs internal flags.
88  */
89 void btrfs_update_iflags(struct inode *inode)
90 {
91         struct btrfs_inode *ip = BTRFS_I(inode);
92
93         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
94
95         if (ip->flags & BTRFS_INODE_SYNC)
96                 inode->i_flags |= S_SYNC;
97         if (ip->flags & BTRFS_INODE_IMMUTABLE)
98                 inode->i_flags |= S_IMMUTABLE;
99         if (ip->flags & BTRFS_INODE_APPEND)
100                 inode->i_flags |= S_APPEND;
101         if (ip->flags & BTRFS_INODE_NOATIME)
102                 inode->i_flags |= S_NOATIME;
103         if (ip->flags & BTRFS_INODE_DIRSYNC)
104                 inode->i_flags |= S_DIRSYNC;
105 }
106
107 /*
108  * Inherit flags from the parent inode.
109  *
110  * Unlike extN we don't have any flags we don't want to inherit currently.
111  */
112 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
113 {
114         unsigned int flags;
115
116         if (!dir)
117                 return;
118
119         flags = BTRFS_I(dir)->flags;
120
121         if (S_ISREG(inode->i_mode))
122                 flags &= ~BTRFS_INODE_DIRSYNC;
123         else if (!S_ISDIR(inode->i_mode))
124                 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
125
126         BTRFS_I(inode)->flags = flags;
127         btrfs_update_iflags(inode);
128 }
129
130 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
131 {
132         struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
133         unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
134
135         if (copy_to_user(arg, &flags, sizeof(flags)))
136                 return -EFAULT;
137         return 0;
138 }
139
140 static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
141 {
142         struct inode *inode = file->f_path.dentry->d_inode;
143         struct btrfs_inode *ip = BTRFS_I(inode);
144         struct btrfs_root *root = ip->root;
145         struct btrfs_trans_handle *trans;
146         unsigned int flags, oldflags;
147         int ret;
148
149         if (copy_from_user(&flags, arg, sizeof(flags)))
150                 return -EFAULT;
151
152         if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
153                       FS_NOATIME_FL | FS_NODUMP_FL | \
154                       FS_SYNC_FL | FS_DIRSYNC_FL))
155                 return -EOPNOTSUPP;
156
157         if (!is_owner_or_cap(inode))
158                 return -EACCES;
159
160         mutex_lock(&inode->i_mutex);
161
162         flags = btrfs_mask_flags(inode->i_mode, flags);
163         oldflags = btrfs_flags_to_ioctl(ip->flags);
164         if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
165                 if (!capable(CAP_LINUX_IMMUTABLE)) {
166                         ret = -EPERM;
167                         goto out_unlock;
168                 }
169         }
170
171         ret = mnt_want_write(file->f_path.mnt);
172         if (ret)
173                 goto out_unlock;
174
175         if (flags & FS_SYNC_FL)
176                 ip->flags |= BTRFS_INODE_SYNC;
177         else
178                 ip->flags &= ~BTRFS_INODE_SYNC;
179         if (flags & FS_IMMUTABLE_FL)
180                 ip->flags |= BTRFS_INODE_IMMUTABLE;
181         else
182                 ip->flags &= ~BTRFS_INODE_IMMUTABLE;
183         if (flags & FS_APPEND_FL)
184                 ip->flags |= BTRFS_INODE_APPEND;
185         else
186                 ip->flags &= ~BTRFS_INODE_APPEND;
187         if (flags & FS_NODUMP_FL)
188                 ip->flags |= BTRFS_INODE_NODUMP;
189         else
190                 ip->flags &= ~BTRFS_INODE_NODUMP;
191         if (flags & FS_NOATIME_FL)
192                 ip->flags |= BTRFS_INODE_NOATIME;
193         else
194                 ip->flags &= ~BTRFS_INODE_NOATIME;
195         if (flags & FS_DIRSYNC_FL)
196                 ip->flags |= BTRFS_INODE_DIRSYNC;
197         else
198                 ip->flags &= ~BTRFS_INODE_DIRSYNC;
199
200
201         trans = btrfs_join_transaction(root, 1);
202         BUG_ON(!trans);
203
204         ret = btrfs_update_inode(trans, root, inode);
205         BUG_ON(ret);
206
207         btrfs_update_iflags(inode);
208         inode->i_ctime = CURRENT_TIME;
209         btrfs_end_transaction(trans, root);
210
211         mnt_drop_write(file->f_path.mnt);
212  out_unlock:
213         mutex_unlock(&inode->i_mutex);
214         return 0;
215 }
216
217 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
218 {
219         struct inode *inode = file->f_path.dentry->d_inode;
220
221         return put_user(inode->i_generation, arg);
222 }
223
224 static noinline int create_subvol(struct btrfs_root *root,
225                                   struct dentry *dentry,
226                                   char *name, int namelen)
227 {
228         struct btrfs_trans_handle *trans;
229         struct btrfs_key key;
230         struct btrfs_root_item root_item;
231         struct btrfs_inode_item *inode_item;
232         struct extent_buffer *leaf;
233         struct btrfs_root *new_root = root;
234         struct inode *dir;
235         int ret;
236         int err;
237         u64 objectid;
238         u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
239         u64 index = 0;
240         unsigned long nr = 1;
241
242         ret = btrfs_check_metadata_free_space(root);
243         if (ret)
244                 goto fail_commit;
245
246         trans = btrfs_start_transaction(root, 1);
247         BUG_ON(!trans);
248
249         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
250                                        0, &objectid);
251         if (ret)
252                 goto fail;
253
254         leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
255                                       0, objectid, NULL, 0, 0, 0);
256         if (IS_ERR(leaf)) {
257                 ret = PTR_ERR(leaf);
258                 goto fail;
259         }
260
261         memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
262         btrfs_set_header_bytenr(leaf, leaf->start);
263         btrfs_set_header_generation(leaf, trans->transid);
264         btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
265         btrfs_set_header_owner(leaf, objectid);
266
267         write_extent_buffer(leaf, root->fs_info->fsid,
268                             (unsigned long)btrfs_header_fsid(leaf),
269                             BTRFS_FSID_SIZE);
270         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
271                             (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
272                             BTRFS_UUID_SIZE);
273         btrfs_mark_buffer_dirty(leaf);
274
275         inode_item = &root_item.inode;
276         memset(inode_item, 0, sizeof(*inode_item));
277         inode_item->generation = cpu_to_le64(1);
278         inode_item->size = cpu_to_le64(3);
279         inode_item->nlink = cpu_to_le32(1);
280         inode_item->nbytes = cpu_to_le64(root->leafsize);
281         inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
282
283         btrfs_set_root_bytenr(&root_item, leaf->start);
284         btrfs_set_root_generation(&root_item, trans->transid);
285         btrfs_set_root_level(&root_item, 0);
286         btrfs_set_root_refs(&root_item, 1);
287         btrfs_set_root_used(&root_item, 0);
288         btrfs_set_root_last_snapshot(&root_item, 0);
289
290         memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
291         root_item.drop_level = 0;
292
293         btrfs_tree_unlock(leaf);
294         free_extent_buffer(leaf);
295         leaf = NULL;
296
297         btrfs_set_root_dirid(&root_item, new_dirid);
298
299         key.objectid = objectid;
300         key.offset = 0;
301         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
302         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
303                                 &root_item);
304         if (ret)
305                 goto fail;
306
307         /*
308          * insert the directory item
309          */
310         key.offset = (u64)-1;
311         dir = dentry->d_parent->d_inode;
312         ret = btrfs_set_inode_index(dir, &index);
313         BUG_ON(ret);
314
315         ret = btrfs_insert_dir_item(trans, root,
316                                     name, namelen, dir->i_ino, &key,
317                                     BTRFS_FT_DIR, index);
318         if (ret)
319                 goto fail;
320
321         btrfs_i_size_write(dir, dir->i_size + namelen * 2);
322         ret = btrfs_update_inode(trans, root, dir);
323         BUG_ON(ret);
324
325         ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
326                                  objectid, root->root_key.objectid,
327                                  dir->i_ino, index, name, namelen);
328         BUG_ON(ret);
329
330         ret = btrfs_commit_transaction(trans, root);
331         if (ret)
332                 goto fail_commit;
333
334         new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
335         BUG_ON(!new_root);
336
337         trans = btrfs_start_transaction(new_root, 1);
338         BUG_ON(!trans);
339
340         ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
341                                        BTRFS_I(dir)->block_group);
342         if (ret)
343                 goto fail;
344
345 fail:
346         nr = trans->blocks_used;
347         err = btrfs_commit_transaction(trans, new_root);
348         if (err && !ret)
349                 ret = err;
350 fail_commit:
351         btrfs_btree_balance_dirty(root, nr);
352         return ret;
353 }
354
355 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
356                            char *name, int namelen)
357 {
358         struct btrfs_pending_snapshot *pending_snapshot;
359         struct btrfs_trans_handle *trans;
360         int ret = 0;
361         int err;
362         unsigned long nr = 0;
363
364         if (!root->ref_cows)
365                 return -EINVAL;
366
367         ret = btrfs_check_metadata_free_space(root);
368         if (ret)
369                 goto fail_unlock;
370
371         pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
372         if (!pending_snapshot) {
373                 ret = -ENOMEM;
374                 goto fail_unlock;
375         }
376         pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
377         if (!pending_snapshot->name) {
378                 ret = -ENOMEM;
379                 kfree(pending_snapshot);
380                 goto fail_unlock;
381         }
382         memcpy(pending_snapshot->name, name, namelen);
383         pending_snapshot->name[namelen] = '\0';
384         pending_snapshot->dentry = dentry;
385         trans = btrfs_start_transaction(root, 1);
386         BUG_ON(!trans);
387         pending_snapshot->root = root;
388         list_add(&pending_snapshot->list,
389                  &trans->transaction->pending_snapshots);
390         err = btrfs_commit_transaction(trans, root);
391
392 fail_unlock:
393         btrfs_btree_balance_dirty(root, nr);
394         return ret;
395 }
396
397 /* copy of may_create in fs/namei.c() */
398 static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
399 {
400         if (child->d_inode)
401                 return -EEXIST;
402         if (IS_DEADDIR(dir))
403                 return -ENOENT;
404         return inode_permission(dir, MAY_WRITE | MAY_EXEC);
405 }
406
407 /*
408  * Create a new subvolume below @parent.  This is largely modeled after
409  * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
410  * inside this filesystem so it's quite a bit simpler.
411  */
412 static noinline int btrfs_mksubvol(struct path *parent, char *name,
413                                    int mode, int namelen,
414                                    struct btrfs_root *snap_src)
415 {
416         struct dentry *dentry;
417         int error;
418
419         mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
420
421         dentry = lookup_one_len(name, parent->dentry, namelen);
422         error = PTR_ERR(dentry);
423         if (IS_ERR(dentry))
424                 goto out_unlock;
425
426         error = -EEXIST;
427         if (dentry->d_inode)
428                 goto out_dput;
429
430         if (!IS_POSIXACL(parent->dentry->d_inode))
431                 mode &= ~current_umask();
432
433         error = mnt_want_write(parent->mnt);
434         if (error)
435                 goto out_dput;
436
437         error = btrfs_may_create(parent->dentry->d_inode, dentry);
438         if (error)
439                 goto out_drop_write;
440
441         /*
442          * Actually perform the low-level subvolume creation after all
443          * this VFS fuzz.
444          *
445          * Eventually we want to pass in an inode under which we create this
446          * subvolume, but for now all are under the filesystem root.
447          *
448          * Also we should pass on the mode eventually to allow creating new
449          * subvolume with specific mode bits.
450          */
451         if (snap_src) {
452                 struct dentry *dir = dentry->d_parent;
453                 struct dentry *test = dir->d_parent;
454                 struct btrfs_path *path = btrfs_alloc_path();
455                 int ret;
456                 u64 test_oid;
457                 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
458
459                 test_oid = snap_src->root_key.objectid;
460
461                 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
462                                           path, parent_oid, test_oid);
463                 if (ret == 0)
464                         goto create;
465                 btrfs_release_path(snap_src->fs_info->tree_root, path);
466
467                 /* we need to make sure we aren't creating a directory loop
468                  * by taking a snapshot of something that has our current
469                  * subvol in its directory tree.  So, this loops through
470                  * the dentries and checks the forward refs for each subvolume
471                  * to see if is references the subvolume where we are
472                  * placing this new snapshot.
473                  */
474                 while (1) {
475                         if (!test ||
476                             dir == snap_src->fs_info->sb->s_root ||
477                             test == snap_src->fs_info->sb->s_root ||
478                             test->d_inode->i_sb != snap_src->fs_info->sb) {
479                                 break;
480                         }
481                         if (S_ISLNK(test->d_inode->i_mode)) {
482                                 printk(KERN_INFO "Btrfs symlink in snapshot "
483                                        "path, failed\n");
484                                 error = -EMLINK;
485                                 btrfs_free_path(path);
486                                 goto out_drop_write;
487                         }
488                         test_oid =
489                                 BTRFS_I(test->d_inode)->root->root_key.objectid;
490                         ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
491                                   path, test_oid, parent_oid);
492                         if (ret == 0) {
493                                 printk(KERN_INFO "Btrfs snapshot creation "
494                                        "failed, looping\n");
495                                 error = -EMLINK;
496                                 btrfs_free_path(path);
497                                 goto out_drop_write;
498                         }
499                         btrfs_release_path(snap_src->fs_info->tree_root, path);
500                         test = test->d_parent;
501                 }
502 create:
503                 btrfs_free_path(path);
504                 error = create_snapshot(snap_src, dentry, name, namelen);
505         } else {
506                 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
507                                       dentry, name, namelen);
508         }
509         if (error)
510                 goto out_drop_write;
511
512         fsnotify_mkdir(parent->dentry->d_inode, dentry);
513 out_drop_write:
514         mnt_drop_write(parent->mnt);
515 out_dput:
516         dput(dentry);
517 out_unlock:
518         mutex_unlock(&parent->dentry->d_inode->i_mutex);
519         return error;
520 }
521
522
523 static int btrfs_defrag_file(struct file *file)
524 {
525         struct inode *inode = fdentry(file)->d_inode;
526         struct btrfs_root *root = BTRFS_I(inode)->root;
527         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
528         struct btrfs_ordered_extent *ordered;
529         struct page *page;
530         unsigned long last_index;
531         unsigned long ra_pages = root->fs_info->bdi.ra_pages;
532         unsigned long total_read = 0;
533         u64 page_start;
534         u64 page_end;
535         unsigned long i;
536         int ret;
537
538         ret = btrfs_check_data_free_space(root, inode, inode->i_size);
539         if (ret)
540                 return -ENOSPC;
541
542         mutex_lock(&inode->i_mutex);
543         last_index = inode->i_size >> PAGE_CACHE_SHIFT;
544         for (i = 0; i <= last_index; i++) {
545                 if (total_read % ra_pages == 0) {
546                         btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
547                                        min(last_index, i + ra_pages - 1));
548                 }
549                 total_read++;
550 again:
551                 page = grab_cache_page(inode->i_mapping, i);
552                 if (!page)
553                         goto out_unlock;
554                 if (!PageUptodate(page)) {
555                         btrfs_readpage(NULL, page);
556                         lock_page(page);
557                         if (!PageUptodate(page)) {
558                                 unlock_page(page);
559                                 page_cache_release(page);
560                                 goto out_unlock;
561                         }
562                 }
563
564                 wait_on_page_writeback(page);
565
566                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
567                 page_end = page_start + PAGE_CACHE_SIZE - 1;
568                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
569
570                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
571                 if (ordered) {
572                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
573                         unlock_page(page);
574                         page_cache_release(page);
575                         btrfs_start_ordered_extent(inode, ordered, 1);
576                         btrfs_put_ordered_extent(ordered);
577                         goto again;
578                 }
579                 set_page_extent_mapped(page);
580
581                 /*
582                  * this makes sure page_mkwrite is called on the
583                  * page if it is dirtied again later
584                  */
585                 clear_page_dirty_for_io(page);
586
587                 btrfs_set_extent_delalloc(inode, page_start, page_end);
588                 set_page_dirty(page);
589                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
590                 unlock_page(page);
591                 page_cache_release(page);
592                 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
593         }
594
595 out_unlock:
596         mutex_unlock(&inode->i_mutex);
597         return 0;
598 }
599
600 static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
601 {
602         u64 new_size;
603         u64 old_size;
604         u64 devid = 1;
605         struct btrfs_ioctl_vol_args *vol_args;
606         struct btrfs_trans_handle *trans;
607         struct btrfs_device *device = NULL;
608         char *sizestr;
609         char *devstr = NULL;
610         int ret = 0;
611         int namelen;
612         int mod = 0;
613
614         if (root->fs_info->sb->s_flags & MS_RDONLY)
615                 return -EROFS;
616
617         if (!capable(CAP_SYS_ADMIN))
618                 return -EPERM;
619
620         vol_args = memdup_user(arg, sizeof(*vol_args));
621         if (IS_ERR(vol_args))
622                 return PTR_ERR(vol_args);
623
624         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
625         namelen = strlen(vol_args->name);
626
627         mutex_lock(&root->fs_info->volume_mutex);
628         sizestr = vol_args->name;
629         devstr = strchr(sizestr, ':');
630         if (devstr) {
631                 char *end;
632                 sizestr = devstr + 1;
633                 *devstr = '\0';
634                 devstr = vol_args->name;
635                 devid = simple_strtoull(devstr, &end, 10);
636                 printk(KERN_INFO "resizing devid %llu\n",
637                        (unsigned long long)devid);
638         }
639         device = btrfs_find_device(root, devid, NULL, NULL);
640         if (!device) {
641                 printk(KERN_INFO "resizer unable to find device %llu\n",
642                        (unsigned long long)devid);
643                 ret = -EINVAL;
644                 goto out_unlock;
645         }
646         if (!strcmp(sizestr, "max"))
647                 new_size = device->bdev->bd_inode->i_size;
648         else {
649                 if (sizestr[0] == '-') {
650                         mod = -1;
651                         sizestr++;
652                 } else if (sizestr[0] == '+') {
653                         mod = 1;
654                         sizestr++;
655                 }
656                 new_size = btrfs_parse_size(sizestr);
657                 if (new_size == 0) {
658                         ret = -EINVAL;
659                         goto out_unlock;
660                 }
661         }
662
663         old_size = device->total_bytes;
664
665         if (mod < 0) {
666                 if (new_size > old_size) {
667                         ret = -EINVAL;
668                         goto out_unlock;
669                 }
670                 new_size = old_size - new_size;
671         } else if (mod > 0) {
672                 new_size = old_size + new_size;
673         }
674
675         if (new_size < 256 * 1024 * 1024) {
676                 ret = -EINVAL;
677                 goto out_unlock;
678         }
679         if (new_size > device->bdev->bd_inode->i_size) {
680                 ret = -EFBIG;
681                 goto out_unlock;
682         }
683
684         do_div(new_size, root->sectorsize);
685         new_size *= root->sectorsize;
686
687         printk(KERN_INFO "new size for %s is %llu\n",
688                 device->name, (unsigned long long)new_size);
689
690         if (new_size > old_size) {
691                 trans = btrfs_start_transaction(root, 1);
692                 ret = btrfs_grow_device(trans, device, new_size);
693                 btrfs_commit_transaction(trans, root);
694         } else {
695                 ret = btrfs_shrink_device(device, new_size);
696         }
697
698 out_unlock:
699         mutex_unlock(&root->fs_info->volume_mutex);
700         kfree(vol_args);
701         return ret;
702 }
703
704 static noinline int btrfs_ioctl_snap_create(struct file *file,
705                                             void __user *arg, int subvol)
706 {
707         struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
708         struct btrfs_ioctl_vol_args *vol_args;
709         struct btrfs_dir_item *di;
710         struct btrfs_path *path;
711         struct file *src_file;
712         u64 root_dirid;
713         int namelen;
714         int ret = 0;
715
716         if (root->fs_info->sb->s_flags & MS_RDONLY)
717                 return -EROFS;
718
719         vol_args = memdup_user(arg, sizeof(*vol_args));
720         if (IS_ERR(vol_args))
721                 return PTR_ERR(vol_args);
722
723         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
724         namelen = strlen(vol_args->name);
725         if (strchr(vol_args->name, '/')) {
726                 ret = -EINVAL;
727                 goto out;
728         }
729
730         path = btrfs_alloc_path();
731         if (!path) {
732                 ret = -ENOMEM;
733                 goto out;
734         }
735
736         root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
737         di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
738                             path, root_dirid,
739                             vol_args->name, namelen, 0);
740         btrfs_free_path(path);
741
742         if (di && !IS_ERR(di)) {
743                 ret = -EEXIST;
744                 goto out;
745         }
746
747         if (IS_ERR(di)) {
748                 ret = PTR_ERR(di);
749                 goto out;
750         }
751
752         if (subvol) {
753                 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
754                                      file->f_path.dentry->d_inode->i_mode,
755                                      namelen, NULL);
756         } else {
757                 struct inode *src_inode;
758                 src_file = fget(vol_args->fd);
759                 if (!src_file) {
760                         ret = -EINVAL;
761                         goto out;
762                 }
763
764                 src_inode = src_file->f_path.dentry->d_inode;
765                 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
766                         printk(KERN_INFO "btrfs: Snapshot src from "
767                                "another FS\n");
768                         ret = -EINVAL;
769                         fput(src_file);
770                         goto out;
771                 }
772                 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
773                              file->f_path.dentry->d_inode->i_mode,
774                              namelen, BTRFS_I(src_inode)->root);
775                 fput(src_file);
776         }
777
778 out:
779         kfree(vol_args);
780         return ret;
781 }
782
783 static int btrfs_ioctl_defrag(struct file *file)
784 {
785         struct inode *inode = fdentry(file)->d_inode;
786         struct btrfs_root *root = BTRFS_I(inode)->root;
787         int ret;
788
789         ret = mnt_want_write(file->f_path.mnt);
790         if (ret)
791                 return ret;
792
793         switch (inode->i_mode & S_IFMT) {
794         case S_IFDIR:
795                 if (!capable(CAP_SYS_ADMIN)) {
796                         ret = -EPERM;
797                         goto out;
798                 }
799                 btrfs_defrag_root(root, 0);
800                 btrfs_defrag_root(root->fs_info->extent_root, 0);
801                 break;
802         case S_IFREG:
803                 if (!(file->f_mode & FMODE_WRITE)) {
804                         ret = -EINVAL;
805                         goto out;
806                 }
807                 btrfs_defrag_file(file);
808                 break;
809         }
810 out:
811         mnt_drop_write(file->f_path.mnt);
812         return ret;
813 }
814
815 static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
816 {
817         struct btrfs_ioctl_vol_args *vol_args;
818         int ret;
819
820         if (!capable(CAP_SYS_ADMIN))
821                 return -EPERM;
822
823         vol_args = memdup_user(arg, sizeof(*vol_args));
824         if (IS_ERR(vol_args))
825                 return PTR_ERR(vol_args);
826
827         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
828         ret = btrfs_init_new_device(root, vol_args->name);
829
830         kfree(vol_args);
831         return ret;
832 }
833
834 static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
835 {
836         struct btrfs_ioctl_vol_args *vol_args;
837         int ret;
838
839         if (!capable(CAP_SYS_ADMIN))
840                 return -EPERM;
841
842         if (root->fs_info->sb->s_flags & MS_RDONLY)
843                 return -EROFS;
844
845         vol_args = memdup_user(arg, sizeof(*vol_args));
846         if (IS_ERR(vol_args))
847                 return PTR_ERR(vol_args);
848
849         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
850         ret = btrfs_rm_device(root, vol_args->name);
851
852         kfree(vol_args);
853         return ret;
854 }
855
856 static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
857                 u64 off, u64 olen, u64 destoff)
858 {
859         struct inode *inode = fdentry(file)->d_inode;
860         struct btrfs_root *root = BTRFS_I(inode)->root;
861         struct file *src_file;
862         struct inode *src;
863         struct btrfs_trans_handle *trans;
864         struct btrfs_path *path;
865         struct extent_buffer *leaf;
866         char *buf;
867         struct btrfs_key key;
868         u32 nritems;
869         int slot;
870         int ret;
871         u64 len = olen;
872         u64 bs = root->fs_info->sb->s_blocksize;
873         u64 hint_byte;
874
875         /*
876          * TODO:
877          * - split compressed inline extents.  annoying: we need to
878          *   decompress into destination's address_space (the file offset
879          *   may change, so source mapping won't do), then recompress (or
880          *   otherwise reinsert) a subrange.
881          * - allow ranges within the same file to be cloned (provided
882          *   they don't overlap)?
883          */
884
885         /* the destination must be opened for writing */
886         if (!(file->f_mode & FMODE_WRITE))
887                 return -EINVAL;
888
889         ret = mnt_want_write(file->f_path.mnt);
890         if (ret)
891                 return ret;
892
893         src_file = fget(srcfd);
894         if (!src_file) {
895                 ret = -EBADF;
896                 goto out_drop_write;
897         }
898         src = src_file->f_dentry->d_inode;
899
900         ret = -EINVAL;
901         if (src == inode)
902                 goto out_fput;
903
904         ret = -EISDIR;
905         if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
906                 goto out_fput;
907
908         ret = -EXDEV;
909         if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
910                 goto out_fput;
911
912         ret = -ENOMEM;
913         buf = vmalloc(btrfs_level_size(root, 0));
914         if (!buf)
915                 goto out_fput;
916
917         path = btrfs_alloc_path();
918         if (!path) {
919                 vfree(buf);
920                 goto out_fput;
921         }
922         path->reada = 2;
923
924         if (inode < src) {
925                 mutex_lock(&inode->i_mutex);
926                 mutex_lock(&src->i_mutex);
927         } else {
928                 mutex_lock(&src->i_mutex);
929                 mutex_lock(&inode->i_mutex);
930         }
931
932         /* determine range to clone */
933         ret = -EINVAL;
934         if (off >= src->i_size || off + len > src->i_size)
935                 goto out_unlock;
936         if (len == 0)
937                 olen = len = src->i_size - off;
938         /* if we extend to eof, continue to block boundary */
939         if (off + len == src->i_size)
940                 len = ((src->i_size + bs-1) & ~(bs-1))
941                         - off;
942
943         /* verify the end result is block aligned */
944         if ((off & (bs-1)) ||
945             ((off + len) & (bs-1)))
946                 goto out_unlock;
947
948         /* do any pending delalloc/csum calc on src, one way or
949            another, and lock file content */
950         while (1) {
951                 struct btrfs_ordered_extent *ordered;
952                 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
953                 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
954                 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
955                         break;
956                 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
957                 if (ordered)
958                         btrfs_put_ordered_extent(ordered);
959                 btrfs_wait_ordered_range(src, off, off+len);
960         }
961
962         trans = btrfs_start_transaction(root, 1);
963         BUG_ON(!trans);
964
965         /* punch hole in destination first */
966         btrfs_drop_extents(trans, root, inode, off, off + len,
967                            off + len, 0, &hint_byte, 1);
968
969         /* clone data */
970         key.objectid = src->i_ino;
971         key.type = BTRFS_EXTENT_DATA_KEY;
972         key.offset = 0;
973
974         while (1) {
975                 /*
976                  * note the key will change type as we walk through the
977                  * tree.
978                  */
979                 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
980                 if (ret < 0)
981                         goto out;
982
983                 nritems = btrfs_header_nritems(path->nodes[0]);
984                 if (path->slots[0] >= nritems) {
985                         ret = btrfs_next_leaf(root, path);
986                         if (ret < 0)
987                                 goto out;
988                         if (ret > 0)
989                                 break;
990                         nritems = btrfs_header_nritems(path->nodes[0]);
991                 }
992                 leaf = path->nodes[0];
993                 slot = path->slots[0];
994
995                 btrfs_item_key_to_cpu(leaf, &key, slot);
996                 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
997                     key.objectid != src->i_ino)
998                         break;
999
1000                 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
1001                         struct btrfs_file_extent_item *extent;
1002                         int type;
1003                         u32 size;
1004                         struct btrfs_key new_key;
1005                         u64 disko = 0, diskl = 0;
1006                         u64 datao = 0, datal = 0;
1007                         u8 comp;
1008
1009                         size = btrfs_item_size_nr(leaf, slot);
1010                         read_extent_buffer(leaf, buf,
1011                                            btrfs_item_ptr_offset(leaf, slot),
1012                                            size);
1013
1014                         extent = btrfs_item_ptr(leaf, slot,
1015                                                 struct btrfs_file_extent_item);
1016                         comp = btrfs_file_extent_compression(leaf, extent);
1017                         type = btrfs_file_extent_type(leaf, extent);
1018                         if (type == BTRFS_FILE_EXTENT_REG ||
1019                             type == BTRFS_FILE_EXTENT_PREALLOC) {
1020                                 disko = btrfs_file_extent_disk_bytenr(leaf,
1021                                                                       extent);
1022                                 diskl = btrfs_file_extent_disk_num_bytes(leaf,
1023                                                                  extent);
1024                                 datao = btrfs_file_extent_offset(leaf, extent);
1025                                 datal = btrfs_file_extent_num_bytes(leaf,
1026                                                                     extent);
1027                         } else if (type == BTRFS_FILE_EXTENT_INLINE) {
1028                                 /* take upper bound, may be compressed */
1029                                 datal = btrfs_file_extent_ram_bytes(leaf,
1030                                                                     extent);
1031                         }
1032                         btrfs_release_path(root, path);
1033
1034                         if (key.offset + datal < off ||
1035                             key.offset >= off+len)
1036                                 goto next;
1037
1038                         memcpy(&new_key, &key, sizeof(new_key));
1039                         new_key.objectid = inode->i_ino;
1040                         new_key.offset = key.offset + destoff - off;
1041
1042                         if (type == BTRFS_FILE_EXTENT_REG ||
1043                             type == BTRFS_FILE_EXTENT_PREALLOC) {
1044                                 ret = btrfs_insert_empty_item(trans, root, path,
1045                                                               &new_key, size);
1046                                 if (ret)
1047                                         goto out;
1048
1049                                 leaf = path->nodes[0];
1050                                 slot = path->slots[0];
1051                                 write_extent_buffer(leaf, buf,
1052                                             btrfs_item_ptr_offset(leaf, slot),
1053                                             size);
1054
1055                                 extent = btrfs_item_ptr(leaf, slot,
1056                                                 struct btrfs_file_extent_item);
1057
1058                                 if (off > key.offset) {
1059                                         datao += off - key.offset;
1060                                         datal -= off - key.offset;
1061                                 }
1062                                 if (key.offset + datao + datal + key.offset >
1063                                     off + len)
1064                                         datal = off + len - key.offset - datao;
1065                                 /* disko == 0 means it's a hole */
1066                                 if (!disko)
1067                                         datao = 0;
1068
1069                                 btrfs_set_file_extent_offset(leaf, extent,
1070                                                              datao);
1071                                 btrfs_set_file_extent_num_bytes(leaf, extent,
1072                                                                 datal);
1073                                 if (disko) {
1074                                         inode_add_bytes(inode, datal);
1075                                         ret = btrfs_inc_extent_ref(trans, root,
1076                                                         disko, diskl, 0,
1077                                                         root->root_key.objectid,
1078                                                         inode->i_ino,
1079                                                         new_key.offset - datao);
1080                                         BUG_ON(ret);
1081                                 }
1082                         } else if (type == BTRFS_FILE_EXTENT_INLINE) {
1083                                 u64 skip = 0;
1084                                 u64 trim = 0;
1085                                 if (off > key.offset) {
1086                                         skip = off - key.offset;
1087                                         new_key.offset += skip;
1088                                 }
1089
1090                                 if (key.offset + datal > off+len)
1091                                         trim = key.offset + datal - (off+len);
1092
1093                                 if (comp && (skip || trim)) {
1094                                         ret = -EINVAL;
1095                                         goto out;
1096                                 }
1097                                 size -= skip + trim;
1098                                 datal -= skip + trim;
1099                                 ret = btrfs_insert_empty_item(trans, root, path,
1100                                                               &new_key, size);
1101                                 if (ret)
1102                                         goto out;
1103
1104                                 if (skip) {
1105                                         u32 start =
1106                                           btrfs_file_extent_calc_inline_size(0);
1107                                         memmove(buf+start, buf+start+skip,
1108                                                 datal);
1109                                 }
1110
1111                                 leaf = path->nodes[0];
1112                                 slot = path->slots[0];
1113                                 write_extent_buffer(leaf, buf,
1114                                             btrfs_item_ptr_offset(leaf, slot),
1115                                             size);
1116                                 inode_add_bytes(inode, datal);
1117                         }
1118
1119                         btrfs_mark_buffer_dirty(leaf);
1120                 }
1121
1122 next:
1123                 btrfs_release_path(root, path);
1124                 key.offset++;
1125         }
1126         ret = 0;
1127 out:
1128         btrfs_release_path(root, path);
1129         if (ret == 0) {
1130                 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1131                 if (destoff + olen > inode->i_size)
1132                         btrfs_i_size_write(inode, destoff + olen);
1133                 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1134                 ret = btrfs_update_inode(trans, root, inode);
1135         }
1136         btrfs_end_transaction(trans, root);
1137         unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1138         if (ret)
1139                 vmtruncate(inode, 0);
1140 out_unlock:
1141         mutex_unlock(&src->i_mutex);
1142         mutex_unlock(&inode->i_mutex);
1143         vfree(buf);
1144         btrfs_free_path(path);
1145 out_fput:
1146         fput(src_file);
1147 out_drop_write:
1148         mnt_drop_write(file->f_path.mnt);
1149         return ret;
1150 }
1151
1152 static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1153 {
1154         struct btrfs_ioctl_clone_range_args args;
1155
1156         if (copy_from_user(&args, argp, sizeof(args)))
1157                 return -EFAULT;
1158         return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1159                                  args.src_length, args.dest_offset);
1160 }
1161
1162 /*
1163  * there are many ways the trans_start and trans_end ioctls can lead
1164  * to deadlocks.  They should only be used by applications that
1165  * basically own the machine, and have a very in depth understanding
1166  * of all the possible deadlocks and enospc problems.
1167  */
1168 static long btrfs_ioctl_trans_start(struct file *file)
1169 {
1170         struct inode *inode = fdentry(file)->d_inode;
1171         struct btrfs_root *root = BTRFS_I(inode)->root;
1172         struct btrfs_trans_handle *trans;
1173         int ret = 0;
1174
1175         if (!capable(CAP_SYS_ADMIN))
1176                 return -EPERM;
1177
1178         if (file->private_data) {
1179                 ret = -EINPROGRESS;
1180                 goto out;
1181         }
1182
1183         ret = mnt_want_write(file->f_path.mnt);
1184         if (ret)
1185                 goto out;
1186
1187         mutex_lock(&root->fs_info->trans_mutex);
1188         root->fs_info->open_ioctl_trans++;
1189         mutex_unlock(&root->fs_info->trans_mutex);
1190
1191         trans = btrfs_start_ioctl_transaction(root, 0);
1192         if (trans)
1193                 file->private_data = trans;
1194         else
1195                 ret = -ENOMEM;
1196         /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1197 out:
1198         return ret;
1199 }
1200
1201 /*
1202  * there are many ways the trans_start and trans_end ioctls can lead
1203  * to deadlocks.  They should only be used by applications that
1204  * basically own the machine, and have a very in depth understanding
1205  * of all the possible deadlocks and enospc problems.
1206  */
1207 long btrfs_ioctl_trans_end(struct file *file)
1208 {
1209         struct inode *inode = fdentry(file)->d_inode;
1210         struct btrfs_root *root = BTRFS_I(inode)->root;
1211         struct btrfs_trans_handle *trans;
1212         int ret = 0;
1213
1214         trans = file->private_data;
1215         if (!trans) {
1216                 ret = -EINVAL;
1217                 goto out;
1218         }
1219         btrfs_end_transaction(trans, root);
1220         file->private_data = NULL;
1221
1222         mutex_lock(&root->fs_info->trans_mutex);
1223         root->fs_info->open_ioctl_trans--;
1224         mutex_unlock(&root->fs_info->trans_mutex);
1225
1226         mnt_drop_write(file->f_path.mnt);
1227
1228 out:
1229         return ret;
1230 }
1231
1232 long btrfs_ioctl(struct file *file, unsigned int
1233                 cmd, unsigned long arg)
1234 {
1235         struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1236         void __user *argp = (void __user *)arg;
1237
1238         switch (cmd) {
1239         case FS_IOC_GETFLAGS:
1240                 return btrfs_ioctl_getflags(file, argp);
1241         case FS_IOC_SETFLAGS:
1242                 return btrfs_ioctl_setflags(file, argp);
1243         case FS_IOC_GETVERSION:
1244                 return btrfs_ioctl_getversion(file, argp);
1245         case BTRFS_IOC_SNAP_CREATE:
1246                 return btrfs_ioctl_snap_create(file, argp, 0);
1247         case BTRFS_IOC_SUBVOL_CREATE:
1248                 return btrfs_ioctl_snap_create(file, argp, 1);
1249         case BTRFS_IOC_DEFRAG:
1250                 return btrfs_ioctl_defrag(file);
1251         case BTRFS_IOC_RESIZE:
1252                 return btrfs_ioctl_resize(root, argp);
1253         case BTRFS_IOC_ADD_DEV:
1254                 return btrfs_ioctl_add_dev(root, argp);
1255         case BTRFS_IOC_RM_DEV:
1256                 return btrfs_ioctl_rm_dev(root, argp);
1257         case BTRFS_IOC_BALANCE:
1258                 return btrfs_balance(root->fs_info->dev_root);
1259         case BTRFS_IOC_CLONE:
1260                 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1261         case BTRFS_IOC_CLONE_RANGE:
1262                 return btrfs_ioctl_clone_range(file, argp);
1263         case BTRFS_IOC_TRANS_START:
1264                 return btrfs_ioctl_trans_start(file);
1265         case BTRFS_IOC_TRANS_END:
1266                 return btrfs_ioctl_trans_end(file);
1267         case BTRFS_IOC_SYNC:
1268                 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1269                 return 0;
1270         }
1271
1272         return -ENOTTY;
1273 }