Btrfs: start of block group code
[linux-2.6-block.git] / fs / btrfs / super.c
1 #include <linux/module.h>
2 #include <linux/buffer_head.h>
3 #include <linux/fs.h>
4 #include <linux/pagemap.h>
5 #include <linux/highmem.h>
6 #include <linux/time.h>
7 #include <linux/init.h>
8 #include <linux/string.h>
9 #include <linux/smp_lock.h>
10 #include <linux/backing-dev.h>
11 #include <linux/mpage.h>
12 #include <linux/swap.h>
13 #include <linux/writeback.h>
14 #include <linux/statfs.h>
15 #include "ctree.h"
16 #include "disk-io.h"
17 #include "transaction.h"
18 #include "btrfs_inode.h"
19 #include "ioctl.h"
20
21 void btrfs_fsinfo_release(struct kobject *obj)
22 {
23         struct btrfs_fs_info *fsinfo = container_of(obj,
24                                             struct btrfs_fs_info, kobj);
25         kfree(fsinfo);
26 }
27
28 struct kobj_type btrfs_fsinfo_ktype = {
29         .release = btrfs_fsinfo_release,
30 };
31
32 struct btrfs_iget_args {
33         u64 ino;
34         struct btrfs_root *root;
35 };
36
37 decl_subsys(btrfs, &btrfs_fsinfo_ktype, NULL);
38
39 #define BTRFS_SUPER_MAGIC 0x9123682E
40
41 static struct inode_operations btrfs_dir_inode_operations;
42 static struct inode_operations btrfs_dir_ro_inode_operations;
43 static struct super_operations btrfs_super_ops;
44 static struct file_operations btrfs_dir_file_operations;
45 static struct inode_operations btrfs_file_inode_operations;
46 static struct address_space_operations btrfs_aops;
47 static struct file_operations btrfs_file_operations;
48
49 static void btrfs_read_locked_inode(struct inode *inode)
50 {
51         struct btrfs_path *path;
52         struct btrfs_inode_item *inode_item;
53         struct btrfs_root *root = BTRFS_I(inode)->root;
54         struct btrfs_key location;
55         int ret;
56
57         path = btrfs_alloc_path();
58         BUG_ON(!path);
59         btrfs_init_path(path);
60         mutex_lock(&root->fs_info->fs_mutex);
61
62         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
63         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
64         if (ret) {
65                 btrfs_free_path(path);
66                 goto make_bad;
67         }
68         inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
69                                   path->slots[0],
70                                   struct btrfs_inode_item);
71
72         inode->i_mode = btrfs_inode_mode(inode_item);
73         inode->i_nlink = btrfs_inode_nlink(inode_item);
74         inode->i_uid = btrfs_inode_uid(inode_item);
75         inode->i_gid = btrfs_inode_gid(inode_item);
76         inode->i_size = btrfs_inode_size(inode_item);
77         inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime);
78         inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime);
79         inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime);
80         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime);
81         inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime);
82         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
83         inode->i_blocks = btrfs_inode_nblocks(inode_item);
84         inode->i_generation = btrfs_inode_generation(inode_item);
85
86         btrfs_free_path(path);
87         inode_item = NULL;
88
89         mutex_unlock(&root->fs_info->fs_mutex);
90
91         switch (inode->i_mode & S_IFMT) {
92 #if 0
93         default:
94                 init_special_inode(inode, inode->i_mode,
95                                    btrfs_inode_rdev(inode_item));
96                 break;
97 #endif
98         case S_IFREG:
99                 inode->i_mapping->a_ops = &btrfs_aops;
100                 inode->i_fop = &btrfs_file_operations;
101                 inode->i_op = &btrfs_file_inode_operations;
102                 break;
103         case S_IFDIR:
104                 inode->i_fop = &btrfs_dir_file_operations;
105                 if (root == root->fs_info->tree_root)
106                         inode->i_op = &btrfs_dir_ro_inode_operations;
107                 else
108                         inode->i_op = &btrfs_dir_inode_operations;
109                 break;
110         case S_IFLNK:
111                 // inode->i_op = &page_symlink_inode_operations;
112                 break;
113         }
114         return;
115
116 make_bad:
117         btrfs_release_path(root, path);
118         btrfs_free_path(path);
119         mutex_unlock(&root->fs_info->fs_mutex);
120         make_bad_inode(inode);
121 }
122
123 static void fill_inode_item(struct btrfs_inode_item *item,
124                             struct inode *inode)
125 {
126         btrfs_set_inode_uid(item, inode->i_uid);
127         btrfs_set_inode_gid(item, inode->i_gid);
128         btrfs_set_inode_size(item, inode->i_size);
129         btrfs_set_inode_mode(item, inode->i_mode);
130         btrfs_set_inode_nlink(item, inode->i_nlink);
131         btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
132         btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
133         btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
134         btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
135         btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
136         btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
137         btrfs_set_inode_nblocks(item, inode->i_blocks);
138         btrfs_set_inode_generation(item, inode->i_generation);
139 }
140
141
142 static int btrfs_update_inode(struct btrfs_trans_handle *trans,
143                               struct btrfs_root *root,
144                               struct inode *inode)
145 {
146         struct btrfs_inode_item *inode_item;
147         struct btrfs_path *path;
148         int ret;
149
150         path = btrfs_alloc_path();
151         BUG_ON(!path);
152         btrfs_init_path(path);
153         ret = btrfs_lookup_inode(trans, root, path,
154                                  &BTRFS_I(inode)->location, 1);
155         if (ret) {
156                 if (ret > 0)
157                         ret = -ENOENT;
158                 goto failed;
159         }
160
161         inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
162                                   path->slots[0],
163                                   struct btrfs_inode_item);
164
165         fill_inode_item(inode_item, inode);
166         btrfs_mark_buffer_dirty(path->nodes[0]);
167         ret = 0;
168 failed:
169         btrfs_release_path(root, path);
170         btrfs_free_path(path);
171         return ret;
172 }
173
174
175 static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
176                               struct btrfs_root *root,
177                               struct inode *dir,
178                               struct dentry *dentry)
179 {
180         struct btrfs_path *path;
181         const char *name = dentry->d_name.name;
182         int name_len = dentry->d_name.len;
183         int ret = 0;
184         u64 objectid;
185         struct btrfs_dir_item *di;
186
187         path = btrfs_alloc_path();
188         BUG_ON(!path);
189         btrfs_init_path(path);
190         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
191                                     name, name_len, -1);
192         if (IS_ERR(di)) {
193                 ret = PTR_ERR(di);
194                 goto err;
195         }
196         if (!di) {
197                 ret = -ENOENT;
198                 goto err;
199         }
200         objectid = btrfs_disk_key_objectid(&di->location);
201         ret = btrfs_delete_one_dir_name(trans, root, path, di);
202         BUG_ON(ret);
203         btrfs_release_path(root, path);
204
205         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
206                                          objectid, name, name_len, -1);
207         if (IS_ERR(di)) {
208                 ret = PTR_ERR(di);
209                 goto err;
210         }
211         if (!di) {
212                 ret = -ENOENT;
213                 goto err;
214         }
215         ret = btrfs_delete_one_dir_name(trans, root, path, di);
216         BUG_ON(ret);
217
218         dentry->d_inode->i_ctime = dir->i_ctime;
219 err:
220         btrfs_free_path(path);
221         if (!ret) {
222                 dir->i_size -= name_len * 2;
223                 btrfs_update_inode(trans, root, dir);
224                 drop_nlink(dentry->d_inode);
225                 btrfs_update_inode(trans, root, dentry->d_inode);
226         }
227         return ret;
228 }
229
230 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
231 {
232         struct btrfs_root *root;
233         struct btrfs_trans_handle *trans;
234         int ret;
235
236         root = BTRFS_I(dir)->root;
237         mutex_lock(&root->fs_info->fs_mutex);
238         trans = btrfs_start_transaction(root, 1);
239         ret = btrfs_unlink_trans(trans, root, dir, dentry);
240         btrfs_end_transaction(trans, root);
241         mutex_unlock(&root->fs_info->fs_mutex);
242         return ret;
243 }
244
245 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
246 {
247         struct inode *inode = dentry->d_inode;
248         int err;
249         int ret;
250         struct btrfs_root *root = BTRFS_I(dir)->root;
251         struct btrfs_path *path;
252         struct btrfs_key key;
253         struct btrfs_trans_handle *trans;
254         struct btrfs_key found_key;
255         int found_type;
256         struct btrfs_leaf *leaf;
257         char *goodnames = "..";
258
259         path = btrfs_alloc_path();
260         BUG_ON(!path);
261         btrfs_init_path(path);
262         mutex_lock(&root->fs_info->fs_mutex);
263         trans = btrfs_start_transaction(root, 1);
264         key.objectid = inode->i_ino;
265         key.offset = (u64)-1;
266         key.flags = (u32)-1;
267         while(1) {
268                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
269                 if (ret < 0) {
270                         err = ret;
271                         goto out;
272                 }
273                 BUG_ON(ret == 0);
274                 if (path->slots[0] == 0) {
275                         err = -ENOENT;
276                         goto out;
277                 }
278                 path->slots[0]--;
279                 leaf = btrfs_buffer_leaf(path->nodes[0]);
280                 btrfs_disk_key_to_cpu(&found_key,
281                                       &leaf->items[path->slots[0]].key);
282                 found_type = btrfs_key_type(&found_key);
283                 if (found_key.objectid != inode->i_ino) {
284                         err = -ENOENT;
285                         goto out;
286                 }
287                 if ((found_type != BTRFS_DIR_ITEM_KEY &&
288                      found_type != BTRFS_DIR_INDEX_KEY) ||
289                     (!btrfs_match_dir_item_name(root, path, goodnames, 2) &&
290                     !btrfs_match_dir_item_name(root, path, goodnames, 1))) {
291                         err = -ENOTEMPTY;
292                         goto out;
293                 }
294                 ret = btrfs_del_item(trans, root, path);
295                 BUG_ON(ret);
296
297                 if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1)
298                         break;
299                 btrfs_release_path(root, path);
300         }
301         ret = 0;
302         btrfs_release_path(root, path);
303
304         /* now the directory is empty */
305         err = btrfs_unlink_trans(trans, root, dir, dentry);
306         if (!err) {
307                 inode->i_size = 0;
308         }
309 out:
310         btrfs_release_path(root, path);
311         btrfs_free_path(path);
312         mutex_unlock(&root->fs_info->fs_mutex);
313         ret = btrfs_end_transaction(trans, root);
314         if (ret && !err)
315                 err = ret;
316         return err;
317 }
318
319 static int btrfs_free_inode(struct btrfs_trans_handle *trans,
320                             struct btrfs_root *root,
321                             struct inode *inode)
322 {
323         struct btrfs_path *path;
324         int ret;
325
326         clear_inode(inode);
327
328         path = btrfs_alloc_path();
329         BUG_ON(!path);
330         btrfs_init_path(path);
331         ret = btrfs_lookup_inode(trans, root, path,
332                                  &BTRFS_I(inode)->location, -1);
333         BUG_ON(ret);
334         ret = btrfs_del_item(trans, root, path);
335         BUG_ON(ret);
336         btrfs_free_path(path);
337         return ret;
338 }
339
340 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
341                                    struct btrfs_root *root,
342                                    struct inode *inode)
343 {
344         int ret;
345         struct btrfs_path *path;
346         struct btrfs_key key;
347         struct btrfs_disk_key *found_key;
348         struct btrfs_leaf *leaf;
349         struct btrfs_file_extent_item *fi = NULL;
350         u64 extent_start = 0;
351         u64 extent_num_blocks = 0;
352         int found_extent;
353
354         path = btrfs_alloc_path();
355         BUG_ON(!path);
356         /* FIXME, add redo link to tree so we don't leak on crash */
357         key.objectid = inode->i_ino;
358         key.offset = (u64)-1;
359         key.flags = 0;
360         /*
361          * use BTRFS_CSUM_ITEM_KEY because it is larger than inline keys
362          * or extent data
363          */
364         btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
365         while(1) {
366                 btrfs_init_path(path);
367                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
368                 if (ret < 0) {
369                         goto error;
370                 }
371                 if (ret > 0) {
372                         BUG_ON(path->slots[0] == 0);
373                         path->slots[0]--;
374                 }
375                 leaf = btrfs_buffer_leaf(path->nodes[0]);
376                 found_key = &leaf->items[path->slots[0]].key;
377                 if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
378                         break;
379                 if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
380                     btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
381                         break;
382                 if (btrfs_disk_key_offset(found_key) < inode->i_size)
383                         break;
384                 found_extent = 0;
385                 if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
386                         fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
387                                             path->slots[0],
388                                             struct btrfs_file_extent_item);
389                         if (btrfs_file_extent_type(fi) !=
390                             BTRFS_FILE_EXTENT_INLINE) {
391                                 extent_start =
392                                         btrfs_file_extent_disk_blocknr(fi);
393                                 extent_num_blocks =
394                                         btrfs_file_extent_disk_num_blocks(fi);
395                                 /* FIXME blocksize != 4096 */
396                                 inode->i_blocks -=
397                                         btrfs_file_extent_num_blocks(fi) << 3;
398                                 found_extent = 1;
399                         }
400                 }
401                 ret = btrfs_del_item(trans, root, path);
402                 BUG_ON(ret);
403                 btrfs_release_path(root, path);
404                 if (found_extent) {
405                         ret = btrfs_free_extent(trans, root, extent_start,
406                                                 extent_num_blocks, 0);
407                         BUG_ON(ret);
408                 }
409         }
410         ret = 0;
411 error:
412         btrfs_release_path(root, path);
413         btrfs_free_path(path);
414         return ret;
415 }
416
417 static void btrfs_delete_inode(struct inode *inode)
418 {
419         struct btrfs_trans_handle *trans;
420         struct btrfs_root *root = BTRFS_I(inode)->root;
421         int ret;
422
423         truncate_inode_pages(&inode->i_data, 0);
424         if (is_bad_inode(inode)) {
425                 goto no_delete;
426         }
427         inode->i_size = 0;
428         mutex_lock(&root->fs_info->fs_mutex);
429         trans = btrfs_start_transaction(root, 1);
430         if (S_ISREG(inode->i_mode)) {
431                 ret = btrfs_truncate_in_trans(trans, root, inode);
432                 BUG_ON(ret);
433         }
434         btrfs_free_inode(trans, root, inode);
435         btrfs_end_transaction(trans, root);
436         mutex_unlock(&root->fs_info->fs_mutex);
437         return;
438 no_delete:
439         clear_inode(inode);
440 }
441
442 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
443                                struct btrfs_key *location)
444 {
445         const char *name = dentry->d_name.name;
446         int namelen = dentry->d_name.len;
447         struct btrfs_dir_item *di;
448         struct btrfs_path *path;
449         struct btrfs_root *root = BTRFS_I(dir)->root;
450         int ret;
451
452         path = btrfs_alloc_path();
453         BUG_ON(!path);
454         btrfs_init_path(path);
455         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
456                                     namelen, 0);
457         if (!di || IS_ERR(di)) {
458                 location->objectid = 0;
459                 ret = 0;
460                 goto out;
461         }
462         btrfs_disk_key_to_cpu(location, &di->location);
463 out:
464         btrfs_release_path(root, path);
465         btrfs_free_path(path);
466         return ret;
467 }
468
469 int fixup_tree_root_location(struct btrfs_root *root,
470                              struct btrfs_key *location,
471                              struct btrfs_root **sub_root)
472 {
473         struct btrfs_path *path;
474         struct btrfs_root_item *ri;
475
476         if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
477                 return 0;
478         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
479                 return 0;
480
481         path = btrfs_alloc_path();
482         BUG_ON(!path);
483         mutex_lock(&root->fs_info->fs_mutex);
484
485         *sub_root = btrfs_read_fs_root(root->fs_info, location);
486         if (IS_ERR(*sub_root))
487                 return PTR_ERR(*sub_root);
488
489         ri = &(*sub_root)->root_item;
490         location->objectid = btrfs_root_dirid(ri);
491         location->flags = 0;
492         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
493         location->offset = 0;
494
495         btrfs_free_path(path);
496         mutex_unlock(&root->fs_info->fs_mutex);
497         return 0;
498 }
499
500 int btrfs_init_locked_inode(struct inode *inode, void *p)
501 {
502         struct btrfs_iget_args *args = p;
503         inode->i_ino = args->ino;
504         BTRFS_I(inode)->root = args->root;
505         return 0;
506 }
507
508 int btrfs_find_actor(struct inode *inode, void *opaque)
509 {
510         struct btrfs_iget_args *args = opaque;
511         return (args->ino == inode->i_ino &&
512                 args->root == BTRFS_I(inode)->root);
513 }
514
515 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
516                                 struct btrfs_root *root)
517 {
518         struct inode *inode;
519         struct btrfs_iget_args args;
520         args.ino = objectid;
521         args.root = root;
522
523         inode = iget5_locked(s, objectid, btrfs_find_actor,
524                              btrfs_init_locked_inode,
525                              (void *)&args);
526         return inode;
527 }
528
529 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
530                                    struct nameidata *nd)
531 {
532         struct inode * inode;
533         struct btrfs_inode *bi = BTRFS_I(dir);
534         struct btrfs_root *root = bi->root;
535         struct btrfs_root *sub_root = root;
536         struct btrfs_key location;
537         int ret;
538
539         if (dentry->d_name.len > BTRFS_NAME_LEN)
540                 return ERR_PTR(-ENAMETOOLONG);
541         mutex_lock(&root->fs_info->fs_mutex);
542         ret = btrfs_inode_by_name(dir, dentry, &location);
543         mutex_unlock(&root->fs_info->fs_mutex);
544         if (ret < 0)
545                 return ERR_PTR(ret);
546         inode = NULL;
547         if (location.objectid) {
548                 ret = fixup_tree_root_location(root, &location, &sub_root);
549                 if (ret < 0)
550                         return ERR_PTR(ret);
551                 if (ret > 0)
552                         return ERR_PTR(-ENOENT);
553                 inode = btrfs_iget_locked(dir->i_sb, location.objectid,
554                                           sub_root);
555                 if (!inode)
556                         return ERR_PTR(-EACCES);
557                 if (inode->i_state & I_NEW) {
558                         if (sub_root != root) {
559 printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_root, BTRFS_I(inode)->root);
560                                 igrab(inode);
561                                 sub_root->inode = inode;
562                         }
563                         BTRFS_I(inode)->root = sub_root;
564                         memcpy(&BTRFS_I(inode)->location, &location,
565                                sizeof(location));
566                         btrfs_read_locked_inode(inode);
567                         unlock_new_inode(inode);
568                 }
569         }
570         return d_splice_alias(inode, dentry);
571 }
572
573 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
574 {
575         struct inode *inode = filp->f_path.dentry->d_inode;
576         struct btrfs_root *root = BTRFS_I(inode)->root;
577         struct btrfs_item *item;
578         struct btrfs_dir_item *di;
579         struct btrfs_key key;
580         struct btrfs_path *path;
581         int ret;
582         u32 nritems;
583         struct btrfs_leaf *leaf;
584         int slot;
585         int advance;
586         unsigned char d_type = DT_UNKNOWN;
587         int over = 0;
588         u32 di_cur;
589         u32 di_total;
590         u32 di_len;
591         int key_type = BTRFS_DIR_INDEX_KEY;
592
593         /* FIXME, use a real flag for deciding about the key type */
594         if (root->fs_info->tree_root == root)
595                 key_type = BTRFS_DIR_ITEM_KEY;
596         mutex_lock(&root->fs_info->fs_mutex);
597         key.objectid = inode->i_ino;
598         key.flags = 0;
599         btrfs_set_key_type(&key, key_type);
600         key.offset = filp->f_pos;
601         path = btrfs_alloc_path();
602         btrfs_init_path(path);
603         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
604         if (ret < 0)
605                 goto err;
606         advance = 0;
607         while(1) {
608                 leaf = btrfs_buffer_leaf(path->nodes[0]);
609                 nritems = btrfs_header_nritems(&leaf->header);
610                 slot = path->slots[0];
611                 if (advance || slot >= nritems) {
612                         if (slot >= nritems -1) {
613                                 ret = btrfs_next_leaf(root, path);
614                                 if (ret)
615                                         break;
616                                 leaf = btrfs_buffer_leaf(path->nodes[0]);
617                                 nritems = btrfs_header_nritems(&leaf->header);
618                                 slot = path->slots[0];
619                         } else {
620                                 slot++;
621                                 path->slots[0]++;
622                         }
623                 }
624                 advance = 1;
625                 item = leaf->items + slot;
626                 if (btrfs_disk_key_objectid(&item->key) != key.objectid)
627                         break;
628                 if (btrfs_disk_key_type(&item->key) != key_type)
629                         break;
630                 if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
631                         continue;
632                 filp->f_pos = btrfs_disk_key_offset(&item->key);
633                 advance = 1;
634                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
635                 di_cur = 0;
636                 di_total = btrfs_item_size(leaf->items + slot);
637                 while(di_cur < di_total) {
638                         over = filldir(dirent, (const char *)(di + 1),
639                                        btrfs_dir_name_len(di),
640                                        btrfs_disk_key_offset(&item->key),
641                                        btrfs_disk_key_objectid(&di->location),
642                                        d_type);
643                         if (over)
644                                 goto nopos;
645                         di_len = btrfs_dir_name_len(di) + sizeof(*di);
646                         di_cur += di_len;
647                         di = (struct btrfs_dir_item *)((char *)di + di_len);
648                 }
649         }
650         filp->f_pos++;
651 nopos:
652         ret = 0;
653 err:
654         btrfs_release_path(root, path);
655         btrfs_free_path(path);
656         mutex_unlock(&root->fs_info->fs_mutex);
657         return ret;
658 }
659
660 static void btrfs_put_super (struct super_block * sb)
661 {
662         struct btrfs_root *root = btrfs_sb(sb);
663         int ret;
664
665         ret = close_ctree(root);
666         if (ret) {
667                 printk("close ctree returns %d\n", ret);
668         }
669         sb->s_fs_info = NULL;
670 }
671
672 static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
673 {
674         struct inode * inode;
675         struct dentry * root_dentry;
676         struct btrfs_super_block *disk_super;
677         struct btrfs_root *tree_root;
678         struct btrfs_inode *bi;
679
680         sb->s_maxbytes = MAX_LFS_FILESIZE;
681         sb->s_magic = BTRFS_SUPER_MAGIC;
682         sb->s_op = &btrfs_super_ops;
683         sb->s_time_gran = 1;
684
685         tree_root = open_ctree(sb);
686
687         if (!tree_root) {
688                 printk("btrfs: open_ctree failed\n");
689                 return -EIO;
690         }
691         sb->s_fs_info = tree_root;
692         disk_super = tree_root->fs_info->disk_super;
693         printk("read in super total blocks %Lu root %Lu\n",
694                btrfs_super_total_blocks(disk_super),
695                btrfs_super_root_dir(disk_super));
696
697         inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
698                                   tree_root);
699         bi = BTRFS_I(inode);
700         bi->location.objectid = inode->i_ino;
701         bi->location.offset = 0;
702         bi->location.flags = 0;
703         bi->root = tree_root;
704         btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
705
706         if (!inode)
707                 return -ENOMEM;
708         if (inode->i_state & I_NEW) {
709                 btrfs_read_locked_inode(inode);
710                 unlock_new_inode(inode);
711         }
712
713         root_dentry = d_alloc_root(inode);
714         if (!root_dentry) {
715                 iput(inode);
716                 return -ENOMEM;
717         }
718         sb->s_root = root_dentry;
719
720         return 0;
721 }
722
723 static int btrfs_write_inode(struct inode *inode, int wait)
724 {
725         struct btrfs_root *root = BTRFS_I(inode)->root;
726         struct btrfs_trans_handle *trans;
727         int ret = 0;
728
729         if (wait) {
730                 mutex_lock(&root->fs_info->fs_mutex);
731                 trans = btrfs_start_transaction(root, 1);
732                 ret = btrfs_commit_transaction(trans, root);
733                 mutex_unlock(&root->fs_info->fs_mutex);
734         }
735         return ret;
736 }
737
738 static void btrfs_dirty_inode(struct inode *inode)
739 {
740         struct btrfs_root *root = BTRFS_I(inode)->root;
741         struct btrfs_trans_handle *trans;
742
743         mutex_lock(&root->fs_info->fs_mutex);
744         trans = btrfs_start_transaction(root, 1);
745         btrfs_update_inode(trans, root, inode);
746         btrfs_end_transaction(trans, root);
747         mutex_unlock(&root->fs_info->fs_mutex);
748 }
749
750 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
751                                      struct btrfs_root *root,
752                                      u64 objectid, int mode)
753 {
754         struct inode *inode;
755         struct btrfs_inode_item inode_item;
756         struct btrfs_key *location;
757         int ret;
758
759         inode = new_inode(root->fs_info->sb);
760         if (!inode)
761                 return ERR_PTR(-ENOMEM);
762
763         BTRFS_I(inode)->root = root;
764
765         inode->i_uid = current->fsuid;
766         inode->i_gid = current->fsgid;
767         inode->i_mode = mode;
768         inode->i_ino = objectid;
769         inode->i_blocks = 0;
770         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
771         fill_inode_item(&inode_item, inode);
772         location = &BTRFS_I(inode)->location;
773         location->objectid = objectid;
774         location->flags = 0;
775         location->offset = 0;
776         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
777
778         ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
779         BUG_ON(ret);
780
781         insert_inode_hash(inode);
782         return inode;
783 }
784
785 static int btrfs_add_link(struct btrfs_trans_handle *trans,
786                             struct dentry *dentry, struct inode *inode)
787 {
788         int ret;
789         struct btrfs_key key;
790         struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
791         key.objectid = inode->i_ino;
792         key.flags = 0;
793         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
794         key.offset = 0;
795
796         ret = btrfs_insert_dir_item(trans, root,
797                                     dentry->d_name.name, dentry->d_name.len,
798                                     dentry->d_parent->d_inode->i_ino,
799                                     &key, 0);
800         if (ret == 0) {
801                 dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
802                 ret = btrfs_update_inode(trans, root,
803                                          dentry->d_parent->d_inode);
804         }
805         return ret;
806 }
807
808 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
809                             struct dentry *dentry, struct inode *inode)
810 {
811         int err = btrfs_add_link(trans, dentry, inode);
812         if (!err) {
813                 d_instantiate(dentry, inode);
814                 return 0;
815         }
816         if (err > 0)
817                 err = -EEXIST;
818         return err;
819 }
820
821 static int btrfs_create(struct inode *dir, struct dentry *dentry,
822                         int mode, struct nameidata *nd)
823 {
824         struct btrfs_trans_handle *trans;
825         struct btrfs_root *root = BTRFS_I(dir)->root;
826         struct inode *inode;
827         int err;
828         int drop_inode = 0;
829         u64 objectid;
830
831         mutex_lock(&root->fs_info->fs_mutex);
832         trans = btrfs_start_transaction(root, 1);
833
834         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
835         if (err) {
836                 err = -ENOSPC;
837                 goto out_unlock;
838         }
839
840         inode = btrfs_new_inode(trans, root, objectid, mode);
841         err = PTR_ERR(inode);
842         if (IS_ERR(inode))
843                 goto out_unlock;
844         // FIXME mark the inode dirty
845         err = btrfs_add_nondir(trans, dentry, inode);
846         if (err)
847                 drop_inode = 1;
848         else {
849                 inode->i_mapping->a_ops = &btrfs_aops;
850                 inode->i_fop = &btrfs_file_operations;
851                 inode->i_op = &btrfs_file_inode_operations;
852         }
853         dir->i_sb->s_dirt = 1;
854 out_unlock:
855         btrfs_end_transaction(trans, root);
856         mutex_unlock(&root->fs_info->fs_mutex);
857
858         if (drop_inode) {
859                 inode_dec_link_count(inode);
860                 iput(inode);
861         }
862         return err;
863 }
864
865 static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
866                                 struct btrfs_root *root,
867                                 u64 objectid, u64 dirid)
868 {
869         int ret;
870         char buf[2];
871         struct btrfs_key key;
872
873         buf[0] = '.';
874         buf[1] = '.';
875
876         key.objectid = objectid;
877         key.offset = 0;
878         key.flags = 0;
879         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
880
881         ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
882                                     &key, 1);
883         if (ret)
884                 goto error;
885         key.objectid = dirid;
886         ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
887                                     &key, 1);
888         if (ret)
889                 goto error;
890 error:
891         return ret;
892 }
893
894 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
895 {
896         struct inode *inode;
897         struct btrfs_trans_handle *trans;
898         struct btrfs_root *root = BTRFS_I(dir)->root;
899         int err = 0;
900         int drop_on_err = 0;
901         u64 objectid;
902
903         mutex_lock(&root->fs_info->fs_mutex);
904         trans = btrfs_start_transaction(root, 1);
905         if (IS_ERR(trans)) {
906                 err = PTR_ERR(trans);
907                 goto out_unlock;
908         }
909
910         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
911         if (err) {
912                 err = -ENOSPC;
913                 goto out_unlock;
914         }
915
916         inode = btrfs_new_inode(trans, root, objectid, S_IFDIR | mode);
917         if (IS_ERR(inode)) {
918                 err = PTR_ERR(inode);
919                 goto out_fail;
920         }
921         drop_on_err = 1;
922         inode->i_op = &btrfs_dir_inode_operations;
923         inode->i_fop = &btrfs_dir_file_operations;
924
925         err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
926         if (err)
927                 goto out_fail;
928
929         inode->i_size = 6;
930         err = btrfs_update_inode(trans, root, inode);
931         if (err)
932                 goto out_fail;
933         err = btrfs_add_link(trans, dentry, inode);
934         if (err)
935                 goto out_fail;
936         d_instantiate(dentry, inode);
937         drop_on_err = 0;
938
939 out_fail:
940         btrfs_end_transaction(trans, root);
941 out_unlock:
942         mutex_unlock(&root->fs_info->fs_mutex);
943         if (drop_on_err)
944                 iput(inode);
945         return err;
946 }
947
948 static int btrfs_sync_file(struct file *file,
949                            struct dentry *dentry, int datasync)
950 {
951         struct inode *inode = dentry->d_inode;
952         struct btrfs_root *root = BTRFS_I(inode)->root;
953         int ret;
954         struct btrfs_trans_handle *trans;
955
956         mutex_lock(&root->fs_info->fs_mutex);
957         trans = btrfs_start_transaction(root, 1);
958         if (!trans) {
959                 ret = -ENOMEM;
960                 goto out;
961         }
962         ret = btrfs_commit_transaction(trans, root);
963         mutex_unlock(&root->fs_info->fs_mutex);
964 out:
965         return ret > 0 ? EIO : ret;
966 }
967
968 static int btrfs_sync_fs(struct super_block *sb, int wait)
969 {
970         struct btrfs_trans_handle *trans;
971         struct btrfs_root *root;
972         int ret;
973         root = btrfs_sb(sb);
974
975         sb->s_dirt = 0;
976         if (!wait) {
977                 filemap_flush(root->fs_info->btree_inode->i_mapping);
978                 return 0;
979         }
980         filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
981         mutex_lock(&root->fs_info->fs_mutex);
982         trans = btrfs_start_transaction(root, 1);
983         ret = btrfs_commit_transaction(trans, root);
984         sb->s_dirt = 0;
985         BUG_ON(ret);
986 printk("btrfs sync_fs\n");
987         mutex_unlock(&root->fs_info->fs_mutex);
988         return 0;
989 }
990
991 static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
992                            struct buffer_head *result, int create)
993 {
994         int ret;
995         int err = 0;
996         u64 blocknr;
997         u64 extent_start = 0;
998         u64 extent_end = 0;
999         u64 objectid = inode->i_ino;
1000         u32 found_type;
1001         struct btrfs_path *path;
1002         struct btrfs_root *root = BTRFS_I(inode)->root;
1003         struct btrfs_file_extent_item *item;
1004         struct btrfs_leaf *leaf;
1005         struct btrfs_disk_key *found_key;
1006
1007         path = btrfs_alloc_path();
1008         BUG_ON(!path);
1009         btrfs_init_path(path);
1010         if (create) {
1011                 WARN_ON(1);
1012         }
1013
1014         ret = btrfs_lookup_file_extent(NULL, root, path,
1015                                        inode->i_ino,
1016                                        iblock << inode->i_blkbits, 0);
1017         if (ret < 0) {
1018                 err = ret;
1019                 goto out;
1020         }
1021
1022         if (ret != 0) {
1023                 if (path->slots[0] == 0) {
1024                         btrfs_release_path(root, path);
1025                         goto out;
1026                 }
1027                 path->slots[0]--;
1028         }
1029
1030         item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
1031                               struct btrfs_file_extent_item);
1032         leaf = btrfs_buffer_leaf(path->nodes[0]);
1033         blocknr = btrfs_file_extent_disk_blocknr(item);
1034         blocknr += btrfs_file_extent_offset(item);
1035
1036         /* are we inside the extent that was found? */
1037         found_key = &leaf->items[path->slots[0]].key;
1038         found_type = btrfs_disk_key_type(found_key);
1039         if (btrfs_disk_key_objectid(found_key) != objectid ||
1040             found_type != BTRFS_EXTENT_DATA_KEY) {
1041                 extent_end = 0;
1042                 extent_start = 0;
1043                 btrfs_release_path(root, path);
1044                 goto out;
1045         }
1046         found_type = btrfs_file_extent_type(item);
1047         extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
1048         if (found_type == BTRFS_FILE_EXTENT_REG) {
1049                 extent_start = extent_start >> inode->i_blkbits;
1050                 extent_end = extent_start + btrfs_file_extent_num_blocks(item);
1051                 if (iblock >= extent_start && iblock < extent_end) {
1052                         err = 0;
1053                         btrfs_map_bh_to_logical(root, result, blocknr +
1054                                                 iblock - extent_start);
1055                         goto out;
1056                 }
1057         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
1058                 char *ptr;
1059                 char *map;
1060                 u32 size;
1061                 size = btrfs_file_extent_inline_len(leaf->items +
1062                                                     path->slots[0]);
1063                 extent_end = (extent_start + size) >> inode->i_blkbits;
1064                 extent_start >>= inode->i_blkbits;
1065                 if (iblock < extent_start || iblock > extent_end) {
1066                         goto out;
1067                 }
1068                 ptr = btrfs_file_extent_inline_start(item);
1069                 map = kmap(result->b_page);
1070                 memcpy(map, ptr, size);
1071                 memset(map + size, 0, PAGE_CACHE_SIZE - size);
1072                 flush_dcache_page(result->b_page);
1073                 kunmap(result->b_page);
1074                 set_buffer_uptodate(result);
1075                 SetPageChecked(result->b_page);
1076                 btrfs_map_bh_to_logical(root, result, 0);
1077         }
1078 out:
1079         btrfs_release_path(root, path);
1080         btrfs_free_path(path);
1081         return err;
1082 }
1083
1084 static int btrfs_get_block(struct inode *inode, sector_t iblock,
1085                            struct buffer_head *result, int create)
1086 {
1087         int err;
1088         struct btrfs_root *root = BTRFS_I(inode)->root;
1089         mutex_lock(&root->fs_info->fs_mutex);
1090         err = btrfs_get_block_lock(inode, iblock, result, create);
1091         mutex_unlock(&root->fs_info->fs_mutex);
1092         return err;
1093 }
1094
1095 static int btrfs_prepare_write(struct file *file, struct page *page,
1096                                unsigned from, unsigned to)
1097 {
1098         return nobh_prepare_write(page, from, to, btrfs_get_block);
1099 }
1100
/*
 * write_super hook: performs a waiting btrfs_sync_fs() on @sb and
 * discards its return value.
 */
static void btrfs_write_super(struct super_block *sb)
{
	(void)btrfs_sync_fs(sb, 1);
}
1105
1106 static int btrfs_readpage(struct file *file, struct page *page)
1107 {
1108         return mpage_readpage(page, btrfs_get_block);
1109 }
1110
1111 /*
1112  * While block_write_full_page is writing back the dirty buffers under
1113  * the page lock, whoever dirtied the buffers may decide to clean them
1114  * again at any time.  We handle that by only looking at the buffer
1115  * state inside lock_buffer().
1116  *
1117  * If block_write_full_page() is called for regular writeback
1118  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1119  * locked buffer.   This only can happen if someone has written the buffer
1120  * directly, with submit_bh().  At the address_space level PageWriteback
1121  * prevents this contention from occurring.
1122  */
1123 static int __btrfs_write_full_page(struct inode *inode, struct page *page,
1124                                    struct writeback_control *wbc)
1125 {
1126         int err;
1127         sector_t block;
1128         sector_t last_block;
1129         struct buffer_head *bh, *head;
1130         const unsigned blocksize = 1 << inode->i_blkbits;
1131         int nr_underway = 0;
1132
1133         BUG_ON(!PageLocked(page));
1134
1135         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1136
1137         if (!page_has_buffers(page)) {
1138                 create_empty_buffers(page, blocksize,
1139                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1140         }
1141
1142         /*
1143          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1144          * here, and the (potentially unmapped) buffers may become dirty at
1145          * any time.  If a buffer becomes dirty here after we've inspected it
1146          * then we just miss that fact, and the page stays dirty.
1147          *
1148          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1149          * handle that here by just cleaning them.
1150          */
1151
1152         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1153         head = page_buffers(page);
1154         bh = head;
1155
1156         /*
1157          * Get all the dirty buffers mapped to disk addresses and
1158          * handle any aliases from the underlying blockdev's mapping.
1159          */
1160         do {
1161                 if (block > last_block) {
1162                         /*
1163                          * mapped buffers outside i_size will occur, because
1164                          * this page can be outside i_size when there is a
1165                          * truncate in progress.
1166                          */
1167                         /*
1168                          * The buffer was zeroed by block_write_full_page()
1169                          */
1170                         clear_buffer_dirty(bh);
1171                         set_buffer_uptodate(bh);
1172                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1173                         WARN_ON(bh->b_size != blocksize);
1174                         err = btrfs_get_block(inode, block, bh, 0);
1175                         if (err)
1176                                 goto recover;
1177                         if (buffer_new(bh)) {
1178                                 /* blockdev mappings never come here */
1179                                 clear_buffer_new(bh);
1180                                 unmap_underlying_metadata(bh->b_bdev,
1181                                                         bh->b_blocknr);
1182                         }
1183                 }
1184                 bh = bh->b_this_page;
1185                 block++;
1186         } while (bh != head);
1187
1188         do {
1189                 if (!buffer_mapped(bh))
1190                         continue;
1191                 /*
1192                  * If it's a fully non-blocking write attempt and we cannot
1193                  * lock the buffer then redirty the page.  Note that this can
1194                  * potentially cause a busy-wait loop from pdflush and kswapd
1195                  * activity, but those code paths have their own higher-level
1196                  * throttling.
1197                  */
1198                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1199                         lock_buffer(bh);
1200                 } else if (test_set_buffer_locked(bh)) {
1201                         redirty_page_for_writepage(wbc, page);
1202                         continue;
1203                 }
1204                 if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
1205                         mark_buffer_async_write(bh);
1206                 } else {
1207                         unlock_buffer(bh);
1208                 }
1209         } while ((bh = bh->b_this_page) != head);
1210
1211         /*
1212          * The page and its buffers are protected by PageWriteback(), so we can
1213          * drop the bh refcounts early.
1214          */
1215         BUG_ON(PageWriteback(page));
1216         set_page_writeback(page);
1217
1218         do {
1219                 struct buffer_head *next = bh->b_this_page;
1220                 if (buffer_async_write(bh)) {
1221                         submit_bh(WRITE, bh);
1222                         nr_underway++;
1223                 }
1224                 bh = next;
1225         } while (bh != head);
1226         unlock_page(page);
1227
1228         err = 0;
1229 done:
1230         if (nr_underway == 0) {
1231                 /*
1232                  * The page was marked dirty, but the buffers were
1233                  * clean.  Someone wrote them back by hand with
1234                  * ll_rw_block/submit_bh.  A rare case.
1235                  */
1236                 int uptodate = 1;
1237                 do {
1238                         if (!buffer_uptodate(bh)) {
1239                                 uptodate = 0;
1240                                 break;
1241                         }
1242                         bh = bh->b_this_page;
1243                 } while (bh != head);
1244                 if (uptodate)
1245                         SetPageUptodate(page);
1246                 end_page_writeback(page);
1247                 /*
1248                  * The page and buffer_heads can be released at any time from
1249                  * here on.
1250                  */
1251                 wbc->pages_skipped++;   /* We didn't write this page */
1252         }
1253         return err;
1254
1255 recover:
1256         /*
1257          * ENOSPC, or some other error.  We may already have added some
1258          * blocks to the file, so we need to write these out to avoid
1259          * exposing stale data.
1260          * The page is currently locked and not marked for writeback
1261          */
1262         bh = head;
1263         /* Recovery: lock and submit the mapped buffers */
1264         do {
1265                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1266                         lock_buffer(bh);
1267                         mark_buffer_async_write(bh);
1268                 } else {
1269                         /*
1270                          * The buffer may have been set dirty during
1271                          * attachment to a dirty page.
1272                          */
1273                         clear_buffer_dirty(bh);
1274                 }
1275         } while ((bh = bh->b_this_page) != head);
1276         SetPageError(page);
1277         BUG_ON(PageWriteback(page));
1278         set_page_writeback(page);
1279         do {
1280                 struct buffer_head *next = bh->b_this_page;
1281                 if (buffer_async_write(bh)) {
1282                         clear_buffer_dirty(bh);
1283                         submit_bh(WRITE, bh);
1284                         nr_underway++;
1285                 }
1286                 bh = next;
1287         } while (bh != head);
1288         unlock_page(page);
1289         goto done;
1290 }
1291
1292 /*
1293  * The generic ->writepage function for buffer-backed address_spaces
1294  */
1295 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
1296 {
1297         struct inode * const inode = page->mapping->host;
1298         loff_t i_size = i_size_read(inode);
1299         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
1300         unsigned offset;
1301         void *kaddr;
1302
1303         /* Is the page fully inside i_size? */
1304         if (page->index < end_index)
1305                 return __btrfs_write_full_page(inode, page, wbc);
1306
1307         /* Is the page fully outside i_size? (truncate in progress) */
1308         offset = i_size & (PAGE_CACHE_SIZE-1);
1309         if (page->index >= end_index+1 || !offset) {
1310                 /*
1311                  * The page may have dirty, unmapped buffers.  For example,
1312                  * they may have been added in ext3_writepage().  Make them
1313                  * freeable here, so the page does not leak.
1314                  */
1315                 block_invalidatepage(page, 0);
1316                 unlock_page(page);
1317                 return 0; /* don't care */
1318         }
1319
1320         /*
1321          * The page straddles i_size.  It must be zeroed out on each and every
1322          * writepage invokation because it may be mmapped.  "A file is mapped
1323          * in multiples of the page size.  For a file that is not a multiple of
1324          * the  page size, the remaining memory is zeroed when mapped, and
1325          * writes to that region are not written out to the file."
1326          */
1327         kaddr = kmap_atomic(page, KM_USER0);
1328         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
1329         flush_dcache_page(page);
1330         kunmap_atomic(kaddr, KM_USER0);
1331         return __btrfs_write_full_page(inode, page, wbc);
1332 }
1333
1334 static void btrfs_truncate(struct inode *inode)
1335 {
1336         struct btrfs_root *root = BTRFS_I(inode)->root;
1337         int ret;
1338         struct btrfs_trans_handle *trans;
1339
1340         if (!S_ISREG(inode->i_mode))
1341                 return;
1342         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1343                 return;
1344
1345         nobh_truncate_page(inode->i_mapping, inode->i_size);
1346
1347         /* FIXME, add redo link to tree so we don't leak on crash */
1348         mutex_lock(&root->fs_info->fs_mutex);
1349         trans = btrfs_start_transaction(root, 1);
1350         ret = btrfs_truncate_in_trans(trans, root, inode);
1351         BUG_ON(ret);
1352         ret = btrfs_end_transaction(trans, root);
1353         BUG_ON(ret);
1354         mutex_unlock(&root->fs_info->fs_mutex);
1355         mark_inode_dirty(inode);
1356 }
1357
1358 /*
1359  * Make sure any changes to nobh_commit_write() are reflected in
1360  * nobh_truncate_page(), since it doesn't call commit_write().
1361  */
1362 static int btrfs_commit_write(struct file *file, struct page *page,
1363                               unsigned from, unsigned to)
1364 {
1365         struct inode *inode = page->mapping->host;
1366         struct buffer_head *bh;
1367         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1368
1369         SetPageUptodate(page);
1370         bh = page_buffers(page);
1371         if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1372                 set_page_dirty(page);
1373         }
1374         if (pos > inode->i_size) {
1375                 i_size_write(inode, pos);
1376                 mark_inode_dirty(inode);
1377         }
1378         return 0;
1379 }
1380
1381 static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
1382                                 struct page **prepared_pages,
1383                                 const char __user * buf)
1384 {
1385         long page_fault = 0;
1386         int i;
1387         int offset = pos & (PAGE_CACHE_SIZE - 1);
1388
1389         for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
1390                 size_t count = min_t(size_t,
1391                                      PAGE_CACHE_SIZE - offset, write_bytes);
1392                 struct page *page = prepared_pages[i];
1393                 fault_in_pages_readable(buf, count);
1394
1395                 /* Copy data from userspace to the current page */
1396                 kmap(page);
1397                 page_fault = __copy_from_user(page_address(page) + offset,
1398                                               buf, count);
1399                 /* Flush processor's dcache for this page */
1400                 flush_dcache_page(page);
1401                 kunmap(page);
1402                 buf += count;
1403                 write_bytes -= count;
1404
1405                 if (page_fault)
1406                         break;
1407         }
1408         return page_fault ? -EFAULT : 0;
1409 }
1410
/*
 * Unlock, mark accessed and release each page in @pages.  Processing
 * stops after @num_pages entries or at the first NULL entry, whichever
 * comes first.
 */
static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t idx;

	for (idx = 0; idx < num_pages && pages[idx]; idx++) {
		struct page *page = pages[idx];

		unlock_page(page);
		mark_page_accessed(page);
		page_cache_release(page);
	}
}
1422 static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
1423                                    struct btrfs_root *root,
1424                                    struct file *file,
1425                                    struct page **pages,
1426                                    size_t num_pages,
1427                                    loff_t pos,
1428                                    size_t write_bytes)
1429 {
1430         int i;
1431         int offset;
1432         int err = 0;
1433         int ret;
1434         int this_write;
1435         struct inode *inode = file->f_path.dentry->d_inode;
1436         struct buffer_head *bh;
1437         struct btrfs_file_extent_item *ei;
1438
1439         for (i = 0; i < num_pages; i++) {
1440                 offset = pos & (PAGE_CACHE_SIZE -1);
1441                 this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
1442                 /* FIXME, one block at a time */
1443
1444                 mutex_lock(&root->fs_info->fs_mutex);
1445                 trans = btrfs_start_transaction(root, 1);
1446
1447                 bh = page_buffers(pages[i]);
1448                 if (buffer_mapped(bh) && bh->b_blocknr == 0) {
1449                         struct btrfs_key key;
1450                         struct btrfs_path *path;
1451                         char *ptr;
1452                         u32 datasize;
1453
1454                         path = btrfs_alloc_path();
1455                         BUG_ON(!path);
1456                         key.objectid = inode->i_ino;
1457                         key.offset = pages[i]->index << PAGE_CACHE_SHIFT;
1458                         key.flags = 0;
1459                         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
1460                         BUG_ON(write_bytes >= PAGE_CACHE_SIZE);
1461                         datasize = offset +
1462                                 btrfs_file_extent_calc_inline_size(write_bytes);
1463                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1464                                                       datasize);
1465                         BUG_ON(ret);
1466                         ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
1467                                path->slots[0], struct btrfs_file_extent_item);
1468                         btrfs_set_file_extent_generation(ei, trans->transid);
1469                         btrfs_set_file_extent_type(ei,
1470                                                    BTRFS_FILE_EXTENT_INLINE);
1471                         ptr = btrfs_file_extent_inline_start(ei);
1472                         memcpy(ptr, bh->b_data, offset + write_bytes);
1473                         mark_buffer_dirty(path->nodes[0]);
1474                         btrfs_free_path(path);
1475                 } else {
1476                         btrfs_csum_file_block(trans, root, inode->i_ino,
1477                                       pages[i]->index << PAGE_CACHE_SHIFT,
1478                                       kmap(pages[i]), PAGE_CACHE_SIZE);
1479                         kunmap(pages[i]);
1480                 }
1481                 SetPageChecked(pages[i]);
1482                 ret = btrfs_end_transaction(trans, root);
1483                 BUG_ON(ret);
1484                 mutex_unlock(&root->fs_info->fs_mutex);
1485
1486                 ret = btrfs_commit_write(file, pages[i], offset,
1487                                          offset + this_write);
1488                 pos += this_write;
1489                 if (ret) {
1490                         err = ret;
1491                         goto failed;
1492                 }
1493                 WARN_ON(this_write > write_bytes);
1494                 write_bytes -= this_write;
1495         }
1496 failed:
1497         return err;
1498 }
1499
1500 static int drop_extents(struct btrfs_trans_handle *trans,
1501                           struct btrfs_root *root,
1502                           struct inode *inode,
1503                           u64 start, u64 end)
1504 {
1505         int ret;
1506         struct btrfs_key key;
1507         struct btrfs_leaf *leaf;
1508         int slot;
1509         struct btrfs_file_extent_item *extent;
1510         u64 extent_end = 0;
1511         int keep;
1512         struct btrfs_file_extent_item old;
1513         struct btrfs_path *path;
1514         u64 search_start = start;
1515         int bookend;
1516         int found_type;
1517         int found_extent;
1518         int found_inline;
1519
1520         path = btrfs_alloc_path();
1521         if (!path)
1522                 return -ENOMEM;
1523         while(1) {
1524                 btrfs_release_path(root, path);
1525                 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
1526                                                search_start, -1);
1527                 if (ret < 0)
1528                         goto out;
1529                 if (ret > 0) {
1530                         if (path->slots[0] == 0) {
1531                                 ret = 0;
1532                                 goto out;
1533                         }
1534                         path->slots[0]--;
1535                 }
1536                 keep = 0;
1537                 bookend = 0;
1538                 found_extent = 0;
1539                 found_inline = 0;
1540                 extent = NULL;
1541                 leaf = btrfs_buffer_leaf(path->nodes[0]);
1542                 slot = path->slots[0];
1543                 btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
1544                 if (key.offset >= end || key.objectid != inode->i_ino) {
1545                         ret = 0;
1546                         goto out;
1547                 }
1548                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) {
1549                         ret = 0;
1550                         goto out;
1551                 }
1552                 extent = btrfs_item_ptr(leaf, slot,
1553                                         struct btrfs_file_extent_item);
1554                 found_type = btrfs_file_extent_type(extent);
1555                 if (found_type == BTRFS_FILE_EXTENT_REG) {
1556                         extent_end = key.offset +
1557                                 (btrfs_file_extent_num_blocks(extent) <<
1558                                  inode->i_blkbits);
1559                         found_extent = 1;
1560                 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
1561                         found_inline = 1;
1562                         extent_end = key.offset +
1563                              btrfs_file_extent_inline_len(leaf->items + slot);
1564                 }
1565
1566                 if (!found_extent && !found_inline) {
1567                         ret = 0;
1568                         goto out;
1569                 }
1570
1571                 if (search_start >= extent_end) {
1572                         ret = 0;
1573                         goto out;
1574                 }
1575
1576                 search_start = extent_end;
1577
1578                 if (end < extent_end && end >= key.offset) {
1579                         if (found_extent) {
1580                                 memcpy(&old, extent, sizeof(old));
1581                                 ret = btrfs_inc_extent_ref(trans, root,
1582                                       btrfs_file_extent_disk_blocknr(&old),
1583                                       btrfs_file_extent_disk_num_blocks(&old));
1584                                 BUG_ON(ret);
1585                         }
1586                         WARN_ON(found_inline);
1587                         bookend = 1;
1588                 }
1589
1590                 if (start > key.offset) {
1591                         u64 new_num;
1592                         u64 old_num;
1593                         /* truncate existing extent */
1594                         keep = 1;
1595                         WARN_ON(start & (root->blocksize - 1));
1596                         if (found_extent) {
1597                                 new_num = (start - key.offset) >>
1598                                         inode->i_blkbits;
1599                                 old_num = btrfs_file_extent_num_blocks(extent);
1600                                 inode->i_blocks -= (old_num - new_num) << 3;
1601                                 btrfs_set_file_extent_num_blocks(extent,
1602                                                                  new_num);
1603                                 mark_buffer_dirty(path->nodes[0]);
1604                         } else {
1605                                 WARN_ON(1);
1606                                 /*
1607                                 ret = btrfs_truncate_item(trans, root, path,
1608                                                           start - key.offset);
1609                                 BUG_ON(ret);
1610                                 */
1611                         }
1612                 }
1613                 if (!keep) {
1614                         u64 disk_blocknr = 0;
1615                         u64 disk_num_blocks = 0;
1616                         u64 extent_num_blocks = 0;
1617                         if (found_extent) {
1618                                 disk_blocknr =
1619                                       btrfs_file_extent_disk_blocknr(extent);
1620                                 disk_num_blocks =
1621                                       btrfs_file_extent_disk_num_blocks(extent);
1622                                 extent_num_blocks =
1623                                       btrfs_file_extent_num_blocks(extent);
1624                         }
1625                         ret = btrfs_del_item(trans, root, path);
1626                         BUG_ON(ret);
1627                         btrfs_release_path(root, path);
1628                         if (found_extent) {
1629                                 inode->i_blocks -=
1630                                 btrfs_file_extent_num_blocks(extent) << 3;
1631                                 ret = btrfs_free_extent(trans, root,
1632                                                         disk_blocknr,
1633                                                         disk_num_blocks, 0);
1634                         }
1635
1636                         BUG_ON(ret);
1637                         if (!bookend && search_start >= end) {
1638                                 ret = 0;
1639                                 goto out;
1640                         }
1641                         if (!bookend)
1642                                 continue;
1643                 }
1644                 if (bookend && found_extent) {
1645                         /* create bookend */
1646                         struct btrfs_key ins;
1647                         ins.objectid = inode->i_ino;
1648                         ins.offset = end;
1649                         ins.flags = 0;
1650                         btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
1651
1652                         btrfs_release_path(root, path);
1653                         ret = btrfs_insert_empty_item(trans, root, path, &ins,
1654                                                       sizeof(*extent));
1655                         BUG_ON(ret);
1656                         extent = btrfs_item_ptr(
1657                                     btrfs_buffer_leaf(path->nodes[0]),
1658                                     path->slots[0],
1659                                     struct btrfs_file_extent_item);
1660                         btrfs_set_file_extent_disk_blocknr(extent,
1661                                     btrfs_file_extent_disk_blocknr(&old));
1662                         btrfs_set_file_extent_disk_num_blocks(extent,
1663                                     btrfs_file_extent_disk_num_blocks(&old));
1664
1665                         btrfs_set_file_extent_offset(extent,
1666                                     btrfs_file_extent_offset(&old) +
1667                                     ((end - key.offset) >> inode->i_blkbits));
1668                         WARN_ON(btrfs_file_extent_num_blocks(&old) <
1669                                 (end - key.offset) >> inode->i_blkbits);
1670                         btrfs_set_file_extent_num_blocks(extent,
1671                                     btrfs_file_extent_num_blocks(&old) -
1672                                     ((end - key.offset) >> inode->i_blkbits));
1673
1674                         btrfs_set_file_extent_type(extent,
1675                                                    BTRFS_FILE_EXTENT_REG);
1676                         btrfs_set_file_extent_generation(extent,
1677                                     btrfs_file_extent_generation(&old));
1678                         btrfs_mark_buffer_dirty(path->nodes[0]);
1679                         inode->i_blocks +=
1680                                 btrfs_file_extent_num_blocks(extent) << 3;
1681                         ret = 0;
1682                         goto out;
1683                 }
1684         }
1685 out:
1686         btrfs_free_path(path);
1687         return ret;
1688 }
1689
1690 static int prepare_pages(struct btrfs_root *root,
1691                          struct file *file,
1692                          struct page **pages,
1693                          size_t num_pages,
1694                          loff_t pos,
1695                          unsigned long first_index,
1696                          unsigned long last_index,
1697                          size_t write_bytes,
1698                          u64 alloc_extent_start)
1699 {
1700         int i;
1701         unsigned long index = pos >> PAGE_CACHE_SHIFT;
1702         struct inode *inode = file->f_path.dentry->d_inode;
1703         int offset;
1704         int err = 0;
1705         int this_write;
1706         struct buffer_head *bh;
1707         struct buffer_head *head;
1708         loff_t isize = i_size_read(inode);
1709
1710         memset(pages, 0, num_pages * sizeof(struct page *));
1711
1712         for (i = 0; i < num_pages; i++) {
1713                 pages[i] = grab_cache_page(inode->i_mapping, index + i);
1714                 if (!pages[i]) {
1715                         err = -ENOMEM;
1716                         goto failed_release;
1717                 }
1718                 offset = pos & (PAGE_CACHE_SIZE -1);
1719                 this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
1720                 create_empty_buffers(pages[i], root->fs_info->sb->s_blocksize,
1721                                      (1 << BH_Uptodate));
1722                 head = page_buffers(pages[i]);
1723                 bh = head;
1724                 do {
1725                         err = btrfs_map_bh_to_logical(root, bh,
1726                                                       alloc_extent_start);
1727                         BUG_ON(err);
1728                         if (err)
1729                                 goto failed_truncate;
1730                         bh = bh->b_this_page;
1731                         if (alloc_extent_start)
1732                                 alloc_extent_start++;
1733                 } while (bh != head);
1734                 pos += this_write;
1735                 WARN_ON(this_write > write_bytes);
1736                 write_bytes -= this_write;
1737         }
1738         return 0;
1739
1740 failed_release:
1741         btrfs_drop_pages(pages, num_pages);
1742         return err;
1743
1744 failed_truncate:
1745         btrfs_drop_pages(pages, num_pages);
1746         if (pos > isize)
1747                 vmtruncate(inode, isize);
1748         return err;
1749 }
1750
1751 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1752                                 size_t count, loff_t *ppos)
1753 {
1754         loff_t pos;
1755         size_t num_written = 0;
1756         int err = 0;
1757         int ret = 0;
1758         struct inode *inode = file->f_path.dentry->d_inode;
1759         struct btrfs_root *root = BTRFS_I(inode)->root;
1760         struct page *pages[8];
1761         struct page *pinned[2] = { NULL, NULL };
1762         unsigned long first_index;
1763         unsigned long last_index;
1764         u64 start_pos;
1765         u64 num_blocks;
1766         u64 alloc_extent_start;
1767         struct btrfs_trans_handle *trans;
1768         struct btrfs_key ins;
1769
1770         if (file->f_flags & O_DIRECT)
1771                 return -EINVAL;
1772         pos = *ppos;
1773         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1774         current->backing_dev_info = inode->i_mapping->backing_dev_info;
1775         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1776         if (err)
1777                 goto out;
1778         if (count == 0)
1779                 goto out;
1780         err = remove_suid(file->f_path.dentry);
1781         if (err)
1782                 goto out;
1783         file_update_time(file);
1784
1785         start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
1786         num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
1787                         inode->i_blkbits;
1788
1789         mutex_lock(&inode->i_mutex);
1790         first_index = pos >> PAGE_CACHE_SHIFT;
1791         last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1792
1793         if ((first_index << PAGE_CACHE_SHIFT) < inode->i_size &&
1794             (pos & (PAGE_CACHE_SIZE - 1))) {
1795                 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1796                 if (!PageUptodate(pinned[0])) {
1797                         ret = mpage_readpage(pinned[0], btrfs_get_block);
1798                         BUG_ON(ret);
1799                 } else {
1800                         unlock_page(pinned[0]);
1801                 }
1802         }
1803         if (first_index != last_index &&
1804             (last_index << PAGE_CACHE_SHIFT) < inode->i_size &&
1805             (count & (PAGE_CACHE_SIZE - 1))) {
1806                 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1807                 if (!PageUptodate(pinned[1])) {
1808                         ret = mpage_readpage(pinned[1], btrfs_get_block);
1809                         BUG_ON(ret);
1810                 } else {
1811                         unlock_page(pinned[1]);
1812                 }
1813         }
1814
1815         mutex_lock(&root->fs_info->fs_mutex);
1816         trans = btrfs_start_transaction(root, 1);
1817         if (!trans) {
1818                 err = -ENOMEM;
1819                 mutex_unlock(&root->fs_info->fs_mutex);
1820                 goto out_unlock;
1821         }
1822         /* FIXME blocksize != 4096 */
1823         inode->i_blocks += num_blocks << 3;
1824         if (start_pos < inode->i_size) {
1825                 /* FIXME blocksize != pagesize */
1826                 ret = drop_extents(trans, root, inode,
1827                                    start_pos,
1828                                    (pos + count + root->blocksize -1) &
1829                                    ~((u64)root->blocksize - 1));
1830                 BUG_ON(ret);
1831         }
1832         if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
1833             pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
1834                 ret = btrfs_alloc_extent(trans, root, inode->i_ino,
1835                                          num_blocks, 1, (u64)-1, &ins);
1836                 BUG_ON(ret);
1837                 ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
1838                                        start_pos, ins.objectid, ins.offset);
1839                 BUG_ON(ret);
1840         } else {
1841                 ins.offset = 0;
1842                 ins.objectid = 0;
1843         }
1844         BUG_ON(ret);
1845         alloc_extent_start = ins.objectid;
1846         ret = btrfs_end_transaction(trans, root);
1847         mutex_unlock(&root->fs_info->fs_mutex);
1848
1849         while(count > 0) {
1850                 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1851                 size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
1852                 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1853                                         PAGE_CACHE_SHIFT;
1854
1855                 memset(pages, 0, sizeof(pages));
1856                 ret = prepare_pages(root, file, pages, num_pages,
1857                                     pos, first_index, last_index,
1858                                     write_bytes, alloc_extent_start);
1859                 BUG_ON(ret);
1860
1861                 /* FIXME blocks != pagesize */
1862                 if (alloc_extent_start)
1863                         alloc_extent_start += num_pages;
1864                 ret = btrfs_copy_from_user(pos, num_pages,
1865                                            write_bytes, pages, buf);
1866                 BUG_ON(ret);
1867
1868                 ret = dirty_and_release_pages(NULL, root, file, pages,
1869                                               num_pages, pos, write_bytes);
1870                 BUG_ON(ret);
1871                 btrfs_drop_pages(pages, num_pages);
1872
1873                 buf += write_bytes;
1874                 count -= write_bytes;
1875                 pos += write_bytes;
1876                 num_written += write_bytes;
1877
1878                 balance_dirty_pages_ratelimited(inode->i_mapping);
1879                 cond_resched();
1880         }
1881 out_unlock:
1882         mutex_unlock(&inode->i_mutex);
1883 out:
1884         if (pinned[0])
1885                 page_cache_release(pinned[0]);
1886         if (pinned[1])
1887                 page_cache_release(pinned[1]);
1888         *ppos = pos;
1889         current->backing_dev_info = NULL;
1890         mark_inode_dirty(inode);
1891         return num_written ? num_written : err;
1892 }
1893
1894 static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
1895                         unsigned long offset, unsigned long size)
1896 {
1897         char *kaddr;
1898         unsigned long left, count = desc->count;
1899         struct inode *inode = page->mapping->host;
1900
1901         if (size > count)
1902                 size = count;
1903
1904         if (!PageChecked(page)) {
1905                 /* FIXME, do it per block */
1906                 struct btrfs_root *root = BTRFS_I(inode)->root;
1907
1908                 int ret = btrfs_csum_verify_file_block(root,
1909                                   page->mapping->host->i_ino,
1910                                   page->index << PAGE_CACHE_SHIFT,
1911                                   kmap(page), PAGE_CACHE_SIZE);
1912                 if (ret) {
1913                         printk("failed to verify ino %lu page %lu\n",
1914                                page->mapping->host->i_ino,
1915                                page->index);
1916                         memset(page_address(page), 0, PAGE_CACHE_SIZE);
1917                 }
1918                 SetPageChecked(page);
1919                 kunmap(page);
1920         }
1921         /*
1922          * Faults on the destination of a read are common, so do it before
1923          * taking the kmap.
1924          */
1925         if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1926                 kaddr = kmap_atomic(page, KM_USER0);
1927                 left = __copy_to_user_inatomic(desc->arg.buf,
1928                                                 kaddr + offset, size);
1929                 kunmap_atomic(kaddr, KM_USER0);
1930                 if (left == 0)
1931                         goto success;
1932         }
1933
1934         /* Do it the slow way */
1935         kaddr = kmap(page);
1936         left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1937         kunmap(page);
1938
1939         if (left) {
1940                 size -= left;
1941                 desc->error = -EFAULT;
1942         }
1943 success:
1944         desc->count = count - size;
1945         desc->written += size;
1946         desc->arg.buf += size;
1947         return size;
1948 }
1949
1950 /**
1951  * btrfs_file_aio_read - filesystem read routine
1952  * @iocb:       kernel I/O control block
1953  * @iov:        io vector request
1954  * @nr_segs:    number of segments in the iovec
1955  * @pos:        current file position
1956  */
1957 static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1958                                    unsigned long nr_segs, loff_t pos)
1959 {
1960         struct file *filp = iocb->ki_filp;
1961         ssize_t retval;
1962         unsigned long seg;
1963         size_t count;
1964         loff_t *ppos = &iocb->ki_pos;
1965
1966         count = 0;
1967         for (seg = 0; seg < nr_segs; seg++) {
1968                 const struct iovec *iv = &iov[seg];
1969
1970                 /*
1971                  * If any segment has a negative length, or the cumulative
1972                  * length ever wraps negative then return -EINVAL.
1973                  */
1974                 count += iv->iov_len;
1975                 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1976                         return -EINVAL;
1977                 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1978                         continue;
1979                 if (seg == 0)
1980                         return -EFAULT;
1981                 nr_segs = seg;
1982                 count -= iv->iov_len;   /* This segment is no good */
1983                 break;
1984         }
1985         retval = 0;
1986         if (count) {
1987                 for (seg = 0; seg < nr_segs; seg++) {
1988                         read_descriptor_t desc;
1989
1990                         desc.written = 0;
1991                         desc.arg.buf = iov[seg].iov_base;
1992                         desc.count = iov[seg].iov_len;
1993                         if (desc.count == 0)
1994                                 continue;
1995                         desc.error = 0;
1996                         do_generic_file_read(filp, ppos, &desc,
1997                                              btrfs_read_actor);
1998                         retval += desc.written;
1999                         if (desc.error) {
2000                                 retval = retval ?: desc.error;
2001                                 break;
2002                         }
2003                 }
2004         }
2005         return retval;
2006 }
2007
2008 static int create_subvol(struct btrfs_root *root, char *name, int namelen)
2009 {
2010         struct btrfs_trans_handle *trans;
2011         struct btrfs_key key;
2012         struct btrfs_root_item root_item;
2013         struct btrfs_inode_item *inode_item;
2014         struct buffer_head *subvol;
2015         struct btrfs_leaf *leaf;
2016         struct btrfs_root *new_root;
2017         struct inode *inode;
2018         int ret;
2019         u64 objectid;
2020         u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
2021
2022         mutex_lock(&root->fs_info->fs_mutex);
2023         trans = btrfs_start_transaction(root, 1);
2024         BUG_ON(!trans);
2025
2026         subvol = btrfs_alloc_free_block(trans, root);
2027         if (subvol == NULL)
2028                 return -ENOSPC;
2029         leaf = btrfs_buffer_leaf(subvol);
2030         btrfs_set_header_nritems(&leaf->header, 0);
2031         btrfs_set_header_level(&leaf->header, 0);
2032         btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
2033         btrfs_set_header_generation(&leaf->header, trans->transid);
2034         btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
2035         memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
2036                sizeof(leaf->header.fsid));
2037         mark_buffer_dirty(subvol);
2038
2039         inode_item = &root_item.inode;
2040         memset(inode_item, 0, sizeof(*inode_item));
2041         btrfs_set_inode_generation(inode_item, 1);
2042         btrfs_set_inode_size(inode_item, 3);
2043         btrfs_set_inode_nlink(inode_item, 1);
2044         btrfs_set_inode_nblocks(inode_item, 1);
2045         btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
2046
2047         btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
2048         btrfs_set_root_refs(&root_item, 1);
2049         brelse(subvol);
2050         subvol = NULL;
2051
2052         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
2053                                        0, &objectid);
2054         BUG_ON(ret);
2055
2056         btrfs_set_root_dirid(&root_item, new_dirid);
2057
2058         key.objectid = objectid;
2059         key.offset = 1;
2060         key.flags = 0;
2061         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2062         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
2063                                 &root_item);
2064         BUG_ON(ret);
2065
2066         /*
2067          * insert the directory item
2068          */
2069         key.offset = (u64)-1;
2070         ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
2071                                     name, namelen,
2072                                     root->fs_info->sb->s_root->d_inode->i_ino,
2073                                     &key, 0);
2074         BUG_ON(ret);
2075
2076         ret = btrfs_commit_transaction(trans, root);
2077         BUG_ON(ret);
2078
2079         new_root = btrfs_read_fs_root(root->fs_info, &key);
2080         BUG_ON(!new_root);
2081
2082         trans = btrfs_start_transaction(new_root, 1);
2083         BUG_ON(!trans);
2084
2085         inode = btrfs_new_inode(trans, new_root, new_dirid, S_IFDIR | 0700);
2086         inode->i_op = &btrfs_dir_inode_operations;
2087         inode->i_fop = &btrfs_dir_file_operations;
2088
2089         ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
2090         BUG_ON(ret);
2091
2092         inode->i_nlink = 1;
2093         inode->i_size = 6;
2094         ret = btrfs_update_inode(trans, new_root, inode);
2095         BUG_ON(ret);
2096
2097         ret = btrfs_commit_transaction(trans, new_root);
2098         BUG_ON(ret);
2099
2100         iput(inode);
2101
2102         mutex_unlock(&root->fs_info->fs_mutex);
2103         return 0;
2104 }
2105
2106 static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
2107 {
2108         struct btrfs_trans_handle *trans;
2109         struct btrfs_key key;
2110         struct btrfs_root_item new_root_item;
2111         int ret;
2112         u64 objectid;
2113
2114         if (!root->ref_cows)
2115                 return -EINVAL;
2116
2117         mutex_lock(&root->fs_info->fs_mutex);
2118         trans = btrfs_start_transaction(root, 1);
2119         BUG_ON(!trans);
2120
2121         ret = btrfs_update_inode(trans, root, root->inode);
2122         BUG_ON(ret);
2123
2124         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
2125                                        0, &objectid);
2126         BUG_ON(ret);
2127
2128         memcpy(&new_root_item, &root->root_item,
2129                sizeof(new_root_item));
2130
2131         key.objectid = objectid;
2132         key.offset = 1;
2133         key.flags = 0;
2134         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2135         btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
2136
2137         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
2138                                 &new_root_item);
2139         BUG_ON(ret);
2140
2141         /*
2142          * insert the directory item
2143          */
2144         key.offset = (u64)-1;
2145         ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
2146                                     name, namelen,
2147                                     root->fs_info->sb->s_root->d_inode->i_ino,
2148                                     &key, 0);
2149
2150         BUG_ON(ret);
2151
2152         ret = btrfs_inc_root_ref(trans, root);
2153         BUG_ON(ret);
2154
2155         ret = btrfs_commit_transaction(trans, root);
2156         BUG_ON(ret);
2157         mutex_unlock(&root->fs_info->fs_mutex);
2158         return 0;
2159 }
2160
2161 static int add_disk(struct btrfs_root *root, char *name, int namelen)
2162 {
2163         struct block_device *bdev;
2164         struct btrfs_path *path;
2165         struct super_block *sb = root->fs_info->sb;
2166         struct btrfs_root *dev_root = root->fs_info->dev_root;
2167         struct btrfs_trans_handle *trans;
2168         struct btrfs_device_item *dev_item;
2169         struct btrfs_key key;
2170         u16 item_size;
2171         u64 num_blocks;
2172         u64 new_blocks;
2173         u64 device_id;
2174         int ret;
2175
2176 printk("adding disk %s\n", name);
2177         path = btrfs_alloc_path();
2178         if (!path)
2179                 return -ENOMEM;
2180         num_blocks = btrfs_super_total_blocks(root->fs_info->disk_super);
2181         bdev = open_bdev_excl(name, O_RDWR, sb);
2182         if (IS_ERR(bdev)) {
2183                 ret = PTR_ERR(bdev);
2184 printk("open bdev excl failed ret %d\n", ret);
2185                 goto out_nolock;
2186         }
2187         set_blocksize(bdev, sb->s_blocksize);
2188         new_blocks = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2189         key.objectid = num_blocks;
2190         key.offset = new_blocks;
2191         key.flags = 0;
2192         btrfs_set_key_type(&key, BTRFS_DEV_ITEM_KEY);
2193
2194         mutex_lock(&dev_root->fs_info->fs_mutex);
2195         trans = btrfs_start_transaction(dev_root, 1);
2196         item_size = sizeof(*dev_item) + namelen;
2197 printk("insert empty on %Lu %Lu %u size %d\n", num_blocks, new_blocks, key.flags, item_size);
2198         ret = btrfs_insert_empty_item(trans, dev_root, path, &key, item_size);
2199         if (ret) {
2200 printk("insert failed %d\n", ret);
2201                 close_bdev_excl(bdev);
2202                 if (ret > 0)
2203                         ret = -EEXIST;
2204                 goto out;
2205         }
2206         dev_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
2207                                   path->slots[0], struct btrfs_device_item);
2208         btrfs_set_device_pathlen(dev_item, namelen);
2209         memcpy(dev_item + 1, name, namelen);
2210
2211         device_id = btrfs_super_last_device_id(root->fs_info->disk_super) + 1;
2212         btrfs_set_super_last_device_id(root->fs_info->disk_super, device_id);
2213         btrfs_set_device_id(dev_item, device_id);
2214         mark_buffer_dirty(path->nodes[0]);
2215
2216         ret = btrfs_insert_dev_radix(root, bdev, device_id, num_blocks,
2217                                      new_blocks);
2218
2219         if (!ret) {
2220                 btrfs_set_super_total_blocks(root->fs_info->disk_super,
2221                                              num_blocks + new_blocks);
2222                 i_size_write(root->fs_info->btree_inode,
2223                              (num_blocks + new_blocks) <<
2224                              root->fs_info->btree_inode->i_blkbits);
2225         }
2226
2227 out:
2228         ret = btrfs_commit_transaction(trans, dev_root);
2229         BUG_ON(ret);
2230         mutex_unlock(&root->fs_info->fs_mutex);
2231 out_nolock:
2232         btrfs_free_path(path);
2233
2234         return ret;
2235 }
2236
2237 static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
2238                        cmd, unsigned long arg)
2239 {
2240         struct btrfs_root *root = BTRFS_I(inode)->root;
2241         struct btrfs_ioctl_vol_args vol_args;
2242         int ret = 0;
2243         struct btrfs_dir_item *di;
2244         int namelen;
2245         struct btrfs_path *path;
2246         u64 root_dirid;
2247
2248         switch (cmd) {
2249         case BTRFS_IOC_SNAP_CREATE:
2250                 if (copy_from_user(&vol_args,
2251                                    (struct btrfs_ioctl_vol_args __user *)arg,
2252                                    sizeof(vol_args)))
2253                         return -EFAULT;
2254                 namelen = strlen(vol_args.name);
2255                 if (namelen > BTRFS_VOL_NAME_MAX)
2256                         return -EINVAL;
2257                 path = btrfs_alloc_path();
2258                 if (!path)
2259                         return -ENOMEM;
2260                 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
2261                 mutex_lock(&root->fs_info->fs_mutex);
2262                 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
2263                                     path, root_dirid,
2264                                     vol_args.name, namelen, 0);
2265                 mutex_unlock(&root->fs_info->fs_mutex);
2266                 btrfs_free_path(path);
2267                 if (di && !IS_ERR(di))
2268                         return -EEXIST;
2269
2270                 if (root == root->fs_info->tree_root)
2271                         ret = create_subvol(root, vol_args.name, namelen);
2272                 else
2273                         ret = create_snapshot(root, vol_args.name, namelen);
2274                 WARN_ON(ret);
2275                 break;
2276         case BTRFS_IOC_ADD_DISK:
2277                 if (copy_from_user(&vol_args,
2278                                    (struct btrfs_ioctl_vol_args __user *)arg,
2279                                    sizeof(vol_args)))
2280                         return -EFAULT;
2281                 namelen = strlen(vol_args.name);
2282                 if (namelen > BTRFS_VOL_NAME_MAX)
2283                         return -EINVAL;
2284                 vol_args.name[namelen] = '\0';
2285                 ret = add_disk(root, vol_args.name, namelen);
2286                 break;
2287         default:
2288                 return -ENOTTY;
2289         }
2290         return ret;
2291 }
2292
/*
 * Slab caches set up by init_inodecache() and torn down again by
 * destroy_inodecache().  These four have external linkage.
 */
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_bit_radix_cachep;

/* backs btrfs_alloc_inode() and btrfs_destroy_inode(); private to this file */
static struct kmem_cache *btrfs_inode_cachep;
2298
2299 /*
2300  * Called inside transaction, so use GFP_NOFS
2301  */
2302 static struct inode *btrfs_alloc_inode(struct super_block *sb)
2303 {
2304         struct btrfs_inode *ei;
2305
2306         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
2307         if (!ei)
2308                 return NULL;
2309         return &ei->vfs_inode;
2310 }
2311
2312 static void btrfs_destroy_inode(struct inode *inode)
2313 {
2314         WARN_ON(!list_empty(&inode->i_dentry));
2315         WARN_ON(inode->i_data.nrpages);
2316
2317         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
2318 }
2319
2320 static void init_once(void * foo, struct kmem_cache * cachep,
2321                       unsigned long flags)
2322 {
2323         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
2324
2325         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2326             SLAB_CTOR_CONSTRUCTOR) {
2327                 inode_init_once(&ei->vfs_inode);
2328         }
2329 }
2330
2331 static int init_inodecache(void)
2332 {
2333         btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
2334                                              sizeof(struct btrfs_inode),
2335                                              0, (SLAB_RECLAIM_ACCOUNT|
2336                                                 SLAB_MEM_SPREAD),
2337                                              init_once, NULL);
2338         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
2339                                              sizeof(struct btrfs_trans_handle),
2340                                              0, (SLAB_RECLAIM_ACCOUNT|
2341                                                 SLAB_MEM_SPREAD),
2342                                              NULL, NULL);
2343         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
2344                                              sizeof(struct btrfs_transaction),
2345                                              0, (SLAB_RECLAIM_ACCOUNT|
2346                                                 SLAB_MEM_SPREAD),
2347                                              NULL, NULL);
2348         btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
2349                                              sizeof(struct btrfs_transaction),
2350                                              0, (SLAB_RECLAIM_ACCOUNT|
2351                                                 SLAB_MEM_SPREAD),
2352                                              NULL, NULL);
2353         btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
2354                                              256,
2355                                              0, (SLAB_RECLAIM_ACCOUNT|
2356                                                 SLAB_MEM_SPREAD |
2357                                                 SLAB_DESTROY_BY_RCU),
2358                                              NULL, NULL);
2359         if (btrfs_inode_cachep == NULL || btrfs_trans_handle_cachep == NULL ||
2360             btrfs_transaction_cachep == NULL || btrfs_bit_radix_cachep == NULL)
2361                 return -ENOMEM;
2362         return 0;
2363 }
2364
2365 static void destroy_inodecache(void)
2366 {
2367         kmem_cache_destroy(btrfs_inode_cachep);
2368         kmem_cache_destroy(btrfs_trans_handle_cachep);
2369         kmem_cache_destroy(btrfs_transaction_cachep);
2370         kmem_cache_destroy(btrfs_bit_radix_cachep);
2371         kmem_cache_destroy(btrfs_path_cachep);
2372 }
2373
2374 static int btrfs_get_sb(struct file_system_type *fs_type,
2375         int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2376 {
2377         return get_sb_bdev(fs_type, flags, dev_name, data,
2378                            btrfs_fill_super, mnt);
2379 }
2380
2381
2382 static int btrfs_getattr(struct vfsmount *mnt,
2383                          struct dentry *dentry, struct kstat *stat)
2384 {
2385         struct inode *inode = dentry->d_inode;
2386         generic_fillattr(inode, stat);
2387         stat->blksize = 256 * 1024;
2388         return 0;
2389 }
2390
2391 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2392 {
2393         struct btrfs_root *root = btrfs_sb(dentry->d_sb);
2394         struct btrfs_super_block *disk_super = root->fs_info->disk_super;
2395
2396         buf->f_namelen = BTRFS_NAME_LEN;
2397         buf->f_blocks = btrfs_super_total_blocks(disk_super);
2398         buf->f_bfree = buf->f_blocks - btrfs_super_blocks_used(disk_super);
2399         buf->f_bavail = buf->f_bfree;
2400         buf->f_bsize = dentry->d_sb->s_blocksize;
2401         buf->f_type = BTRFS_SUPER_MAGIC;
2402         return 0;
2403 }
2404
2405 static struct file_system_type btrfs_fs_type = {
2406         .owner          = THIS_MODULE,
2407         .name           = "btrfs",
2408         .get_sb         = btrfs_get_sb,
2409         .kill_sb        = kill_block_super,
2410         .fs_flags       = FS_REQUIRES_DEV,
2411 };
2412
2413 static struct super_operations btrfs_super_ops = {
2414         .delete_inode   = btrfs_delete_inode,
2415         .put_super      = btrfs_put_super,
2416         .read_inode     = btrfs_read_locked_inode,
2417         .write_super    = btrfs_write_super,
2418         .sync_fs        = btrfs_sync_fs,
2419         .write_inode    = btrfs_write_inode,
2420         .dirty_inode    = btrfs_dirty_inode,
2421         .alloc_inode    = btrfs_alloc_inode,
2422         .destroy_inode  = btrfs_destroy_inode,
2423         .statfs         = btrfs_statfs,
2424 };
2425
2426 static struct inode_operations btrfs_dir_inode_operations = {
2427         .lookup         = btrfs_lookup,
2428         .create         = btrfs_create,
2429         .unlink         = btrfs_unlink,
2430         .mkdir          = btrfs_mkdir,
2431         .rmdir          = btrfs_rmdir,
2432 };
2433
2434 static struct inode_operations btrfs_dir_ro_inode_operations = {
2435         .lookup         = btrfs_lookup,
2436 };
2437
2438 static struct file_operations btrfs_dir_file_operations = {
2439         .llseek         = generic_file_llseek,
2440         .read           = generic_read_dir,
2441         .readdir        = btrfs_readdir,
2442         .ioctl          = btrfs_ioctl,
2443 };
2444
2445 static struct address_space_operations btrfs_aops = {
2446         .readpage       = btrfs_readpage,
2447         .writepage      = btrfs_writepage,
2448         .sync_page      = block_sync_page,
2449         .prepare_write  = btrfs_prepare_write,
2450         .commit_write   = btrfs_commit_write,
2451 };
2452
2453 static struct inode_operations btrfs_file_inode_operations = {
2454         .truncate       = btrfs_truncate,
2455         .getattr        = btrfs_getattr,
2456 };
2457
2458 static struct file_operations btrfs_file_operations = {
2459         .llseek         = generic_file_llseek,
2460         .read           = do_sync_read,
2461         .aio_read       = btrfs_file_aio_read,
2462         .write          = btrfs_file_write,
2463         .mmap           = generic_file_mmap,
2464         .open           = generic_file_open,
2465         .ioctl          = btrfs_ioctl,
2466         .fsync          = btrfs_sync_file,
2467 };
2468
2469 static int __init init_btrfs_fs(void)
2470 {
2471         int err;
2472         printk("btrfs loaded!\n");
2473         err = init_inodecache();
2474         if (err)
2475                 return err;
2476         kset_set_kset_s(&btrfs_subsys, fs_subsys);
2477         err = subsystem_register(&btrfs_subsys);
2478         if (err)
2479                 goto out;
2480         return register_filesystem(&btrfs_fs_type);
2481 out:
2482         destroy_inodecache();
2483         return err;
2484 }
2485
2486 static void __exit exit_btrfs_fs(void)
2487 {
2488         destroy_inodecache();
2489         unregister_filesystem(&btrfs_fs_type);
2490         subsystem_unregister(&btrfs_subsys);
2491         printk("btrfs unloaded\n");
2492 }
2493
/*
 * Module boilerplate: name init_btrfs_fs and exit_btrfs_fs as the module's
 * init and exit routines and declare the module license as GPL.
 */
2494 module_init(init_btrfs_fs)
2495 module_exit(exit_btrfs_fs)
2496
2497 MODULE_LICENSE("GPL");