Btrfs: read device stats on mount, write modified ones during commit
[linux-2.6-block.git] / fs / btrfs / volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

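/*
 * fs_uuids is the global registry of filesystems seen by device scan: one
 * btrfs_fs_devices entry per FSID, chained on this list and protected by
 * uuid_mutex.
 */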
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

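/* fs_info->chunk_mutex serializes chunk allocation and device geometry changes */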
static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

void btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

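/*
 * Splice a partially processed run of bios (head..tail) back onto the
 * front of the pending list so they are submitted before anything that
 * was queued in the meantime.
 */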
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return.
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {
                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
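                /* after 32 bios from the regular list we give the sync
                 * list a turn, and after 64 sync bios we give the regular
                 * list a turn, so neither side starves the other
                 */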
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do, and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things: it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

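/*
 * Called for each device found by scan: find or create the in-memory
 * btrfs_fs_devices for the FSID in the super block, then add this device
 * to it (or just refresh the recorded path if the device is already known).
 */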
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        u64 found_transid = btrfs_super_generation(disk_super);
        char *name;

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->dev_stats_valid = 0;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        return -ENOMEM;
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);

                /* init readahead state */
                spin_lock_init(&device->reada_lock);
                device->reada_curr_zone = NULL;
                atomic_set(&device->reada_in_flight, 0);
                device->reada_next = 0;
                INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
                INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

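/*
 * Make a shallow in-memory copy of an fs_devices list (devid, uuid and
 * name only, no open block devices); the copy stands in for the original
 * wherever a private instance is needed, e.g. when sprouting off a seed
 * filesystem.
 */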
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        /* We hold the volume lock, so it is safe to walk the device list. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                device->name = kstrdup(orig_dev->name, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        goto error;
                }

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        return fs_devices;
error:
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;
        struct block_device *latest_bdev = NULL;
        u64 latest_devid = 0;
        u64 latest_transid = 0;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path; it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
                        if (!latest_transid ||
                            device->generation > latest_transid) {
                                latest_devid = device->devid;
                                latest_transid = device->generation;
                                latest_bdev = device->bdev;
                        }
                        continue;
                }

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                kfree(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;

        mutex_unlock(&uuid_mutex);
}

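/*
 * Device teardown happens in two steps: free_device() runs as an RCU
 * callback (atomic context), so it only schedules __free_device(), which
 * may sleep in blkdev_put(), to run from a workqueue.
 */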
static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);

        kfree(device->name);
        kfree(device);
}

static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_device *new_device;

                if (device->bdev)
                        fs_devices->open_devices--;

                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                if (device->can_discard)
                        fs_devices->num_can_discard--;

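                /*
                 * RCU readers may still be walking the device list, so
                 * instead of modifying the entry in place we publish a
                 * shadow copy with bdev cleared and free the original
                 * after a grace period.
                 */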
                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
                BUG_ON(!new_device); /* -ENOMEM */
                memcpy(new_device, device, sizeof(*new_device));
                new_device->name = kstrdup(device->name, GFP_NOFS);
                BUG_ON(device->name && !new_device->name); /* -ENOMEM */
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
                new_device->can_discard = 0;
                list_replace_rcu(&device->dev_list, &new_device->dev_list);

                call_rcu(&device->rcu, free_device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

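/*
 * Open every scanned device of a filesystem, verify that each super block
 * still matches the devid/uuid we scanned, and record the device carrying
 * the highest generation as latest_bdev.
 */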
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                }
                filemap_write_and_wait(bdev->bd_inode->i_mapping);
                invalidate_bdev(bdev);
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q)) {
                        device->can_discard = 1;
                        fs_devices->num_can_discard++;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

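/*
 * Read the super block from a single device path and register the device
 * in the global lists via device_list_add(); used by both device scan
 * and mount.
 */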
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        mutex_lock(&uuid_mutex);
        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
error:
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device on which we search for free space
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of the
 *              max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is larger than what we need,
                         * it must be the max free space that we have found
                         * so far, so max_hole_start must point to the start
                         * of this free space and the length of this free
                         * space is stored in max_hole_size.  Thus, we return
                         * max_hole_start and max_hole_size and go back to
                         * the caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
                btrfs_error(root->fs_info, ret, "Slot search failed");
                goto out;
        }

        if (device->bytes_used > 0) {
                u64 len = btrfs_dev_extent_length(leaf, extent);
                device->bytes_used -= len;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += len;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }
        ret = btrfs_del_item(trans, root, path);
        if (ret) {
                btrfs_error(root->fs_info, ret,
                            "Failed to remove dev extent item");
        }
out:
        btrfs_free_path(path);
        return ret;
}

int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
out:
        btrfs_free_path(path);
        return ret;
}

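/*
 * Find where the next chunk for @objectid would start: search back from
 * (objectid, CHUNK_ITEM, -1) to the last existing chunk and return the
 * offset just past its end, or 0 if there is none.
 */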
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

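/*
 * Device ids are allocated sequentially: the next devid is one past the
 * highest DEV_ITEM key offset in the chunk tree (or 1 on an empty tree).
 */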
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

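/*
 * Delete the DEV_ITEM for @device from the chunk tree, in a transaction
 * of its own.
 */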
static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_device *device;
        struct btrfs_device *next_device;
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
        struct btrfs_fs_devices *cur_devices;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;
        bool clear_super = false;

        mutex_lock(&uuid_mutex);

        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
            root->fs_info->fs_devices->num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
            root->fs_info->fs_devices->num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
                goto out;
        }

        if (strcmp(device_path, "missing") == 0) {
                struct list_head *devices;
                struct btrfs_device *tmp;

                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
                /*
                 * It is safe to read the devices since the volume_mutex
                 * is held.
                 */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
                if (!device) {
                        printk(KERN_ERR "btrfs: no missing devices found to "
                               "remove\n");
                        goto out;
                }
        } else {
                bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                                          root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                }

                set_blocksize(bdev, 4096);
                invalidate_bdev(bdev);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
                device = btrfs_find_device(root, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
                        goto error_brelse;
                }
        }

        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
                ret = -EINVAL;
                goto error_brelse;
        }

        if (device->writeable) {
                lock_chunks(root);
                list_del_init(&device->dev_alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices--;
                clear_super = true;
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;

        spin_lock(&root->fs_info->free_chunk_lock);
        root->fs_info->free_chunk_space = device->total_bytes -
                device->bytes_used;
        spin_unlock(&root->fs_info->free_chunk_lock);

        device->in_fs_metadata = 0;
        btrfs_scrub_cancel_dev(root, device);

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers.
         */

        cur_devices = device->fs_devices;
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);

        device->fs_devices->num_devices--;

        if (device->missing)
                root->fs_info->fs_devices->missing_devices--;

        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
                root->fs_info->sb->s_bdev = next_device->bdev;
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;

        if (device->bdev)
                device->fs_devices->open_devices--;

        call_rcu(&device->rcu, free_device);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

        if (cur_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == cur_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
                fs_devices->seed = cur_devices->seed;
                cur_devices->seed = NULL;
                lock_chunks(root);
                __btrfs_close_devices(cur_devices);
                unlock_chunks(root);
                free_fs_devices(cur_devices);
        }

        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
        if (clear_super) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
        mutex_unlock(&uuid_mutex);
        return ret;
error_undo:
        if (device->writeable) {
                lock_chunks(root);
                list_add(&device->dev_alloc_list,
                         &root->fs_info->fs_devices->alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices++;
        }
        goto error_brelse;
}

1492 /*
1493  * does all the dirty work required for changing the file system's UUID.
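 * The existing devices are moved wholesale onto a freshly allocated
 * "seed" fs_devices, which keeps the old FSID and is chained in via
 * fs_devices->seed, while fs_devices itself gets a new random FSID
 * and empty lists, ready for the device being added.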
1494  */
1495 static int btrfs_prepare_sprout(struct btrfs_root *root)
1496 {
1497         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1498         struct btrfs_fs_devices *old_devices;
1499         struct btrfs_fs_devices *seed_devices;
1500         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1501         struct btrfs_device *device;
1502         u64 super_flags;
1503
1504         BUG_ON(!mutex_is_locked(&uuid_mutex));
1505         if (!fs_devices->seeding)
1506                 return -EINVAL;
1507
1508         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1509         if (!seed_devices)
1510                 return -ENOMEM;
1511
1512         old_devices = clone_fs_devices(fs_devices);
1513         if (IS_ERR(old_devices)) {
1514                 kfree(seed_devices);
1515                 return PTR_ERR(old_devices);
1516         }
1517
1518         list_add(&old_devices->list, &fs_uuids);
1519
1520         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1521         seed_devices->opened = 1;
1522         INIT_LIST_HEAD(&seed_devices->devices);
1523         INIT_LIST_HEAD(&seed_devices->alloc_list);
1524         mutex_init(&seed_devices->device_list_mutex);
1525
1526         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1527         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1528                               synchronize_rcu);
1529         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1530
1531         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1532         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1533                 device->fs_devices = seed_devices;
1534         }
1535
1536         fs_devices->seeding = 0;
1537         fs_devices->num_devices = 0;
1538         fs_devices->open_devices = 0;
1539         fs_devices->seed = seed_devices;
1540
1541         generate_random_uuid(fs_devices->fsid);
1542         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1543         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1544         super_flags = btrfs_super_flags(disk_super) &
1545                       ~BTRFS_SUPER_FLAG_SEEDING;
1546         btrfs_set_super_flags(disk_super, super_flags);
1547
1548         return 0;
1549 }
1550
1551 /*
1552  * store the expected generation for seed devices in device items.
1553  */
1554 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1555                                struct btrfs_root *root)
1556 {
1557         struct btrfs_path *path;
1558         struct extent_buffer *leaf;
1559         struct btrfs_dev_item *dev_item;
1560         struct btrfs_device *device;
1561         struct btrfs_key key;
1562         u8 fs_uuid[BTRFS_UUID_SIZE];
1563         u8 dev_uuid[BTRFS_UUID_SIZE];
1564         u64 devid;
1565         int ret;
1566
1567         path = btrfs_alloc_path();
1568         if (!path)
1569                 return -ENOMEM;
1570
1571         root = root->fs_info->chunk_root;
1572         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1573         key.offset = 0;
1574         key.type = BTRFS_DEV_ITEM_KEY;
1575
1576         while (1) {
1577                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1578                 if (ret < 0)
1579                         goto error;
1580
1581                 leaf = path->nodes[0];
1582 next_slot:
1583                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1584                         ret = btrfs_next_leaf(root, path);
1585                         if (ret > 0)
1586                                 break;
1587                         if (ret < 0)
1588                                 goto error;
1589                         leaf = path->nodes[0];
1590                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1591                         btrfs_release_path(path);
1592                         continue;
1593                 }
1594
1595                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1596                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1597                     key.type != BTRFS_DEV_ITEM_KEY)
1598                         break;
1599
1600                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1601                                           struct btrfs_dev_item);
1602                 devid = btrfs_device_id(leaf, dev_item);
1603                 read_extent_buffer(leaf, dev_uuid,
1604                                    (unsigned long)btrfs_device_uuid(dev_item),
1605                                    BTRFS_UUID_SIZE);
1606                 read_extent_buffer(leaf, fs_uuid,
1607                                    (unsigned long)btrfs_device_fsid(dev_item),
1608                                    BTRFS_UUID_SIZE);
1609                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1610                 BUG_ON(!device); /* Logic error */
1611
1612                 if (device->fs_devices->seeding) {
1613                         btrfs_set_device_generation(leaf, dev_item,
1614                                                     device->generation);
1615                         btrfs_mark_buffer_dirty(leaf);
1616                 }
1617
1618                 path->slots[0]++;
1619                 goto next_slot;
1620         }
1621         ret = 0;
1622 error:
1623         btrfs_free_path(path);
1624         return ret;
1625 }
1626
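/*
 * Add the block device at @device_path to a mounted filesystem.  For
 * a seeding filesystem this is the sprouting path: the superblock is
 * flipped read-write, btrfs_prepare_sprout() swaps in a fresh FSID,
 * and the system chunks are relocated onto the new device once the
 * transaction commits.
 */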
1627 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1628 {
1629         struct request_queue *q;
1630         struct btrfs_trans_handle *trans;
1631         struct btrfs_device *device;
1632         struct block_device *bdev;
1633         struct list_head *devices;
1634         struct super_block *sb = root->fs_info->sb;
1635         u64 total_bytes;
1636         int seeding_dev = 0;
1637         int ret = 0;
1638
1639         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1640                 return -EROFS;
1641
1642         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1643                                   root->fs_info->bdev_holder);
1644         if (IS_ERR(bdev))
1645                 return PTR_ERR(bdev);
1646
1647         if (root->fs_info->fs_devices->seeding) {
1648                 seeding_dev = 1;
1649                 down_write(&sb->s_umount);
1650                 mutex_lock(&uuid_mutex);
1651         }
1652
1653         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1654
1655         devices = &root->fs_info->fs_devices->devices;
1656         /*
1657          * we have the volume lock, so we don't need the extra
1658          * device list mutex while reading the list here.
1659          */
1660         list_for_each_entry(device, devices, dev_list) {
1661                 if (device->bdev == bdev) {
1662                         ret = -EEXIST;
1663                         goto error;
1664                 }
1665         }
1666
1667         device = kzalloc(sizeof(*device), GFP_NOFS);
1668         if (!device) {
1669                 /* we can safely leave the fs_devices entry around */
1670                 ret = -ENOMEM;
1671                 goto error;
1672         }
1673
1674         device->name = kstrdup(device_path, GFP_NOFS);
1675         if (!device->name) {
1676                 kfree(device);
1677                 ret = -ENOMEM;
1678                 goto error;
1679         }
1680
1681         ret = find_next_devid(root, &device->devid);
1682         if (ret) {
1683                 kfree(device->name);
1684                 kfree(device);
1685                 goto error;
1686         }
1687
1688         trans = btrfs_start_transaction(root, 0);
1689         if (IS_ERR(trans)) {
1690                 kfree(device->name);
1691                 kfree(device);
1692                 ret = PTR_ERR(trans);
1693                 goto error;
1694         }
1695
1696         lock_chunks(root);
1697
1698         q = bdev_get_queue(bdev);
1699         if (blk_queue_discard(q))
1700                 device->can_discard = 1;
1701         device->writeable = 1;
1702         device->work.func = pending_bios_fn;
1703         generate_random_uuid(device->uuid);
1704         spin_lock_init(&device->io_lock);
1705         device->generation = trans->transid;
1706         device->io_width = root->sectorsize;
1707         device->io_align = root->sectorsize;
1708         device->sector_size = root->sectorsize;
1709         device->total_bytes = i_size_read(bdev->bd_inode);
1710         device->disk_total_bytes = device->total_bytes;
1711         device->dev_root = root->fs_info->dev_root;
1712         device->bdev = bdev;
1713         device->in_fs_metadata = 1;
1714         device->mode = FMODE_EXCL;
1715         set_blocksize(device->bdev, 4096);
1716
1717         if (seeding_dev) {
1718                 sb->s_flags &= ~MS_RDONLY;
1719                 ret = btrfs_prepare_sprout(root);
1720                 BUG_ON(ret); /* -ENOMEM */
1721         }
1722
1723         device->fs_devices = root->fs_info->fs_devices;
1724
1725         /*
1726          * we don't want write_supers to jump in here with our device
1727          * half set up
1728          */
1729         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1730         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1731         list_add(&device->dev_alloc_list,
1732                  &root->fs_info->fs_devices->alloc_list);
1733         root->fs_info->fs_devices->num_devices++;
1734         root->fs_info->fs_devices->open_devices++;
1735         root->fs_info->fs_devices->rw_devices++;
1736         if (device->can_discard)
1737                 root->fs_info->fs_devices->num_can_discard++;
1738         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1739
1740         spin_lock(&root->fs_info->free_chunk_lock);
1741         root->fs_info->free_chunk_space += device->total_bytes;
1742         spin_unlock(&root->fs_info->free_chunk_lock);
1743
1744         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1745                 root->fs_info->fs_devices->rotating = 1;
1746
1747         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1748         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1749                                     total_bytes + device->total_bytes);
1750
1751         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1752         btrfs_set_super_num_devices(root->fs_info->super_copy,
1753                                     total_bytes + 1);
1754         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1755
1756         if (seeding_dev) {
1757                 ret = init_first_rw_device(trans, root, device);
1758                 if (ret)
1759                         goto error_trans;
1760                 ret = btrfs_finish_sprout(trans, root);
1761                 if (ret)
1762                         goto error_trans;
1763         } else {
1764                 ret = btrfs_add_device(trans, root, device);
1765                 if (ret)
1766                         goto error_trans;
1767         }
1768
1769         /*
1770          * we've got more storage, clear any full flags on the space
1771          * infos
1772          */
1773         btrfs_clear_space_info_full(root->fs_info);
1774
1775         unlock_chunks(root);
1776         ret = btrfs_commit_transaction(trans, root);
1777
1778         if (seeding_dev) {
1779                 mutex_unlock(&uuid_mutex);
1780                 up_write(&sb->s_umount);
1781
1782                 if (ret) /* transaction commit */
1783                         return ret;
1784
1785                 ret = btrfs_relocate_sys_chunks(root);
1786                 if (ret < 0)
1787                         btrfs_error(root->fs_info, ret,
1788                                     "Failed to relocate sys chunks after "
1789                                     "device initialization. This can be fixed "
1790                                     "using the \"btrfs balance\" command.");
1791         }
1792
1793         return ret;
1794
1795 error_trans:
1796         unlock_chunks(root);
1797         btrfs_abort_transaction(trans, root, ret);
1798         btrfs_end_transaction(trans, root);
1799         kfree(device->name);
1800         kfree(device);
1801 error:
1802         blkdev_put(bdev, FMODE_EXCL);
1803         if (seeding_dev) {
1804                 mutex_unlock(&uuid_mutex);
1805                 up_write(&sb->s_umount);
1806         }
1807         return ret;
1808 }
1809
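/*
 * Write a device's in-memory state back to its DEV_ITEM in the chunk
 * tree: the item is looked up by devid and the io/sector geometry
 * plus disk_total_bytes and bytes_used are copied into the leaf.
 * Returns -ENOENT if no matching item exists.
 */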
1810 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1811                                         struct btrfs_device *device)
1812 {
1813         int ret;
1814         struct btrfs_path *path;
1815         struct btrfs_root *root;
1816         struct btrfs_dev_item *dev_item;
1817         struct extent_buffer *leaf;
1818         struct btrfs_key key;
1819
1820         root = device->dev_root->fs_info->chunk_root;
1821
1822         path = btrfs_alloc_path();
1823         if (!path)
1824                 return -ENOMEM;
1825
1826         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1827         key.type = BTRFS_DEV_ITEM_KEY;
1828         key.offset = device->devid;
1829
1830         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1831         if (ret < 0)
1832                 goto out;
1833
1834         if (ret > 0) {
1835                 ret = -ENOENT;
1836                 goto out;
1837         }
1838
1839         leaf = path->nodes[0];
1840         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1841
1842         btrfs_set_device_id(leaf, dev_item, device->devid);
1843         btrfs_set_device_type(leaf, dev_item, device->type);
1844         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1845         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1846         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1847         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1848         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1849         btrfs_mark_buffer_dirty(leaf);
1850
1851 out:
1852         btrfs_free_path(path);
1853         return ret;
1854 }
1855
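/*
 * Grow @device to @new_size with the chunk mutex already held.  The
 * size delta is added to the superblock's total_bytes and to the
 * fs_devices rw byte count, any "full" flags on the space infos are
 * cleared, and the new size is persisted via btrfs_update_device().
 */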
1856 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1857                       struct btrfs_device *device, u64 new_size)
1858 {
1859         struct btrfs_super_block *super_copy =
1860                 device->dev_root->fs_info->super_copy;
1861         u64 old_total = btrfs_super_total_bytes(super_copy);
1862         u64 diff = new_size - device->total_bytes;
1863
1864         if (!device->writeable)
1865                 return -EACCES;
1866         if (new_size <= device->total_bytes)
1867                 return -EINVAL;
1868
1869         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1870         device->fs_devices->total_rw_bytes += diff;
1871
1872         device->total_bytes = new_size;
1873         device->disk_total_bytes = new_size;
1874         btrfs_clear_space_info_full(device->dev_root->fs_info);
1875
1876         return btrfs_update_device(trans, device);
1877 }
1878
1879 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1880                       struct btrfs_device *device, u64 new_size)
1881 {
1882         int ret;
1883         lock_chunks(device->dev_root);
1884         ret = __btrfs_grow_device(trans, device, new_size);
1885         unlock_chunks(device->dev_root);
1886         return ret;
1887 }
1888
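
/*
 * Delete the CHUNK_ITEM for @chunk_offset from the chunk tree.  A
 * failed lookup here indicates a logic error or corruption, so it is
 * reported via btrfs_error() instead of being silently ignored.
 */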
1889 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1890                             struct btrfs_root *root,
1891                             u64 chunk_tree, u64 chunk_objectid,
1892                             u64 chunk_offset)
1893 {
1894         int ret;
1895         struct btrfs_path *path;
1896         struct btrfs_key key;
1897
1898         root = root->fs_info->chunk_root;
1899         path = btrfs_alloc_path();
1900         if (!path)
1901                 return -ENOMEM;
1902
1903         key.objectid = chunk_objectid;
1904         key.offset = chunk_offset;
1905         key.type = BTRFS_CHUNK_ITEM_KEY;
1906
1907         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1908         if (ret < 0)
1909                 goto out;
1910         else if (ret > 0) { /* Logic error or corruption */
1911                 btrfs_error(root->fs_info, -ENOENT,
1912                             "Failed lookup while freeing chunk.");
1913                 ret = -ENOENT;
1914                 goto out;
1915         }
1916
1917         ret = btrfs_del_item(trans, root, path);
1918         if (ret < 0)
1919                 btrfs_error(root->fs_info, ret,
1920                             "Failed to delete chunk item.");
1921 out:
1922         btrfs_free_path(path);
1923         return ret;
1924 }
1925
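
/*
 * Remove a chunk from the superblock's sys_chunk_array.  The array is
 * a packed sequence of (btrfs_disk_key, btrfs_chunk) pairs, each pair
 * variable sized depending on its stripe count.  When the matching
 * entry is found it is memmove()d over, and the cursor deliberately
 * is not advanced, since the next entry has just been shifted into
 * the current position.
 */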
1926 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1927                                u64 chunk_offset)
1928 {
1929         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1930         struct btrfs_disk_key *disk_key;
1931         struct btrfs_chunk *chunk;
1932         u8 *ptr;
1933         int ret = 0;
1934         u32 num_stripes;
1935         u32 array_size;
1936         u32 len = 0;
1937         u32 cur;
1938         struct btrfs_key key;
1939
1940         array_size = btrfs_super_sys_array_size(super_copy);
1941
1942         ptr = super_copy->sys_chunk_array;
1943         cur = 0;
1944
1945         while (cur < array_size) {
1946                 disk_key = (struct btrfs_disk_key *)ptr;
1947                 btrfs_disk_key_to_cpu(&key, disk_key);
1948
1949                 len = sizeof(*disk_key);
1950
1951                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1952                         chunk = (struct btrfs_chunk *)(ptr + len);
1953                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1954                         len += btrfs_chunk_item_size(num_stripes);
1955                 } else {
1956                         ret = -EIO;
1957                         break;
1958                 }
1959                 if (key.objectid == chunk_objectid &&
1960                     key.offset == chunk_offset) {
1961                         memmove(ptr, ptr + len, array_size - (cur + len));
1962                         array_size -= len;
1963                         btrfs_set_super_sys_array_size(super_copy, array_size);
1964                 } else {
1965                         ptr += len;
1966                         cur += len;
1967                 }
1968         }
1969         return ret;
1970 }
1971
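
/*
 * Relocate and delete a single chunk: first move every extent out of
 * the block group, then, under the chunk mutex, drop the device
 * extents, the chunk item, the sys_chunk_array copy (for SYSTEM
 * chunks) and the block group itself, and finally remove the extent
 * mapping.
 */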
1972 static int btrfs_relocate_chunk(struct btrfs_root *root,
1973                          u64 chunk_tree, u64 chunk_objectid,
1974                          u64 chunk_offset)
1975 {
1976         struct extent_map_tree *em_tree;
1977         struct btrfs_root *extent_root;
1978         struct btrfs_trans_handle *trans;
1979         struct extent_map *em;
1980         struct map_lookup *map;
1981         int ret;
1982         int i;
1983
1984         root = root->fs_info->chunk_root;
1985         extent_root = root->fs_info->extent_root;
1986         em_tree = &root->fs_info->mapping_tree.map_tree;
1987
1988         ret = btrfs_can_relocate(extent_root, chunk_offset);
1989         if (ret)
1990                 return -ENOSPC;
1991
1992         /* step one, relocate all the extents inside this chunk */
1993         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1994         if (ret)
1995                 return ret;
1996
1997         trans = btrfs_start_transaction(root, 0);
1998         BUG_ON(IS_ERR(trans));
1999
2000         lock_chunks(root);
2001
2002         /*
2003          * step two, delete the device extents and the
2004          * chunk tree entries
2005          */
2006         read_lock(&em_tree->lock);
2007         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2008         read_unlock(&em_tree->lock);
2009
2010         BUG_ON(!em || em->start > chunk_offset ||
2011                em->start + em->len < chunk_offset);
2012         map = (struct map_lookup *)em->bdev;
2013
2014         for (i = 0; i < map->num_stripes; i++) {
2015                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2016                                             map->stripes[i].physical);
2017                 BUG_ON(ret);
2018
2019                 if (map->stripes[i].dev) {
2020                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2021                         BUG_ON(ret);
2022                 }
2023         }
2024         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2025                                chunk_offset);
2026
2027         BUG_ON(ret);
2028
2029         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2030
2031         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2032                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2033                 BUG_ON(ret);
2034         }
2035
2036         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2037         BUG_ON(ret);
2038
2039         write_lock(&em_tree->lock);
2040         remove_extent_mapping(em_tree, em);
2041         write_unlock(&em_tree->lock);
2042
2043         kfree(map);
2044         em->bdev = NULL;
2045
2046         /* once for the tree */
2047         free_extent_map(em);
2048         /* once for us */
2049         free_extent_map(em);
2050
2051         unlock_chunks(root);
2052         btrfs_end_transaction(trans, root);
2053         return 0;
2054 }
2055
2056 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2057 {
2058         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2059         struct btrfs_path *path;
2060         struct extent_buffer *leaf;
2061         struct btrfs_chunk *chunk;
2062         struct btrfs_key key;
2063         struct btrfs_key found_key;
2064         u64 chunk_tree = chunk_root->root_key.objectid;
2065         u64 chunk_type;
2066         bool retried = false;
2067         int failed = 0;
2068         int ret;
2069
2070         path = btrfs_alloc_path();
2071         if (!path)
2072                 return -ENOMEM;
2073
2074 again:
2075         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2076         key.offset = (u64)-1;
2077         key.type = BTRFS_CHUNK_ITEM_KEY;
2078
2079         while (1) {
2080                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2081                 if (ret < 0)
2082                         goto error;
2083                 BUG_ON(ret == 0); /* Corruption */
2084
2085                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2086                                           key.type);
2087                 if (ret < 0)
2088                         goto error;
2089                 if (ret > 0)
2090                         break;
2091
2092                 leaf = path->nodes[0];
2093                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2094
2095                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2096                                        struct btrfs_chunk);
2097                 chunk_type = btrfs_chunk_type(leaf, chunk);
2098                 btrfs_release_path(path);
2099
2100                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2101                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2102                                                    found_key.objectid,
2103                                                    found_key.offset);
2104                         if (ret == -ENOSPC)
2105                                 failed++;
2106                         else if (ret)
2107                                 BUG();
2108                 }
2109
2110                 if (found_key.offset == 0)
2111                         break;
2112                 key.offset = found_key.offset - 1;
2113         }
2114         ret = 0;
2115         if (failed && !retried) {
2116                 failed = 0;
2117                 retried = true;
2118                 goto again;
2119         } else if (failed && retried) {
2120                 WARN_ON(1);
2121                 ret = -ENOSPC;
2122         }
2123 error:
2124         btrfs_free_path(path);
2125         return ret;
2126 }
2127
2128 static int insert_balance_item(struct btrfs_root *root,
2129                                struct btrfs_balance_control *bctl)
2130 {
2131         struct btrfs_trans_handle *trans;
2132         struct btrfs_balance_item *item;
2133         struct btrfs_disk_balance_args disk_bargs;
2134         struct btrfs_path *path;
2135         struct extent_buffer *leaf;
2136         struct btrfs_key key;
2137         int ret, err;
2138
2139         path = btrfs_alloc_path();
2140         if (!path)
2141                 return -ENOMEM;
2142
2143         trans = btrfs_start_transaction(root, 0);
2144         if (IS_ERR(trans)) {
2145                 btrfs_free_path(path);
2146                 return PTR_ERR(trans);
2147         }
2148
2149         key.objectid = BTRFS_BALANCE_OBJECTID;
2150         key.type = BTRFS_BALANCE_ITEM_KEY;
2151         key.offset = 0;
2152
2153         ret = btrfs_insert_empty_item(trans, root, path, &key,
2154                                       sizeof(*item));
2155         if (ret)
2156                 goto out;
2157
2158         leaf = path->nodes[0];
2159         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2160
2161         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2162
2163         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2164         btrfs_set_balance_data(leaf, item, &disk_bargs);
2165         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2166         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2167         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2168         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2169
2170         btrfs_set_balance_flags(leaf, item, bctl->flags);
2171
2172         btrfs_mark_buffer_dirty(leaf);
2173 out:
2174         btrfs_free_path(path);
2175         err = btrfs_commit_transaction(trans, root);
2176         if (err && !ret)
2177                 ret = err;
2178         return ret;
2179 }
2180
2181 static int del_balance_item(struct btrfs_root *root)
2182 {
2183         struct btrfs_trans_handle *trans;
2184         struct btrfs_path *path;
2185         struct btrfs_key key;
2186         int ret, err;
2187
2188         path = btrfs_alloc_path();
2189         if (!path)
2190                 return -ENOMEM;
2191
2192         trans = btrfs_start_transaction(root, 0);
2193         if (IS_ERR(trans)) {
2194                 btrfs_free_path(path);
2195                 return PTR_ERR(trans);
2196         }
2197
2198         key.objectid = BTRFS_BALANCE_OBJECTID;
2199         key.type = BTRFS_BALANCE_ITEM_KEY;
2200         key.offset = 0;
2201
2202         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2203         if (ret < 0)
2204                 goto out;
2205         if (ret > 0) {
2206                 ret = -ENOENT;
2207                 goto out;
2208         }
2209
2210         ret = btrfs_del_item(trans, root, path);
2211 out:
2212         btrfs_free_path(path);
2213         err = btrfs_commit_transaction(trans, root);
2214         if (err && !ret)
2215                 ret = err;
2216         return ret;
2217 }
2218
2219 /*
2220  * This is a heuristic used to reduce the number of chunks balanced on
2221  * resume after balance was interrupted.
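 * For example, an interrupted convert resumes in soft mode, so chunks
 * that already carry the target profile are not relocated a second
 * time, and a usage threshold of 90% skips chunks that a previous run
 * presumably already balanced and left reasonably full.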
2222  */
2223 static void update_balance_args(struct btrfs_balance_control *bctl)
2224 {
2225         /*
2226          * Turn on soft mode for chunk types that were being converted.
2227          */
2228         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2229                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2230         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2231                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2232         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2233                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2234
2235         /*
2236          * Turn on the usage filter if it is not already in use.  The idea is
2237          * that chunks that we have already balanced should be
2238          * reasonably full.  Don't do it for chunks that are being
2239          * converted - that will keep us from relocating unconverted
2240          * (albeit full) chunks.
2241          */
2242         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2243             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2244                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2245                 bctl->data.usage = 90;
2246         }
2247         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2248             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2249                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2250                 bctl->sys.usage = 90;
2251         }
2252         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2253             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2254                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2255                 bctl->meta.usage = 90;
2256         }
2257 }
2258
2259 /*
2260  * Should be called with both balance and volume mutexes held to
2261  * serialize other volume operations (add_dev/rm_dev/resize) with
2262  * restriper.  Same goes for unset_balance_control.
2263  */
2264 static void set_balance_control(struct btrfs_balance_control *bctl)
2265 {
2266         struct btrfs_fs_info *fs_info = bctl->fs_info;
2267
2268         BUG_ON(fs_info->balance_ctl);
2269
2270         spin_lock(&fs_info->balance_lock);
2271         fs_info->balance_ctl = bctl;
2272         spin_unlock(&fs_info->balance_lock);
2273 }
2274
2275 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2276 {
2277         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2278
2279         BUG_ON(!fs_info->balance_ctl);
2280
2281         spin_lock(&fs_info->balance_lock);
2282         fs_info->balance_ctl = NULL;
2283         spin_unlock(&fs_info->balance_lock);
2284
2285         kfree(bctl);
2286 }
2287
2288 /*
2289  * Balance filters.  Return 1 if chunk should be filtered out
2290  * (should not be balanced).
2291  */
2292 static int chunk_profiles_filter(u64 chunk_type,
2293                                  struct btrfs_balance_args *bargs)
2294 {
2295         chunk_type = chunk_to_extended(chunk_type) &
2296                                 BTRFS_EXTENDED_PROFILE_MASK;
2297
2298         if (bargs->profiles & chunk_type)
2299                 return 0;
2300
2301         return 1;
2302 }
2303
2304 static u64 div_factor_fine(u64 num, int factor)
2305 {
2306         if (factor <= 0)
2307                 return 0;
2308         if (factor >= 100)
2309                 return num;
2310
2311         num *= factor;
2312         do_div(num, 100);
2313         return num;
2314 }
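
/*
 * div_factor_fine() computes num * factor / 100 using do_div().  For
 * example, a 1GiB chunk with usage=90 yields a threshold of
 * 1073741824 * 90 / 100 = 966367641 bytes; chunk_usage_filter() below
 * then filters out (skips balancing) any chunk whose used bytes are
 * at or above that threshold.
 */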
2315
2316 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2317                               struct btrfs_balance_args *bargs)
2318 {
2319         struct btrfs_block_group_cache *cache;
2320         u64 chunk_used, user_thresh;
2321         int ret = 1;
2322
2323         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2324         chunk_used = btrfs_block_group_used(&cache->item);
2325
2326         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2327         if (chunk_used < user_thresh)
2328                 ret = 0;
2329
2330         btrfs_put_block_group(cache);
2331         return ret;
2332 }
2333
2334 static int chunk_devid_filter(struct extent_buffer *leaf,
2335                               struct btrfs_chunk *chunk,
2336                               struct btrfs_balance_args *bargs)
2337 {
2338         struct btrfs_stripe *stripe;
2339         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2340         int i;
2341
2342         for (i = 0; i < num_stripes; i++) {
2343                 stripe = btrfs_stripe_nr(chunk, i);
2344                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2345                         return 0;
2346         }
2347
2348         return 1;
2349 }
2350
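/*
 * The drange filter matches on the physical byte range of the stripes
 * that live on bargs->devid.  On a device, a DUP/RAID1/RAID10 chunk
 * stripe spans chunk_length / (num_stripes / 2) bytes and a
 * RAID0/single stripe spans chunk_length / num_stripes, which is what
 * the factor computation below works out.
 */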
2351 /* [pstart, pend) */
2352 static int chunk_drange_filter(struct extent_buffer *leaf,
2353                                struct btrfs_chunk *chunk,
2354                                u64 chunk_offset,
2355                                struct btrfs_balance_args *bargs)
2356 {
2357         struct btrfs_stripe *stripe;
2358         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2359         u64 stripe_offset;
2360         u64 stripe_length;
2361         int factor;
2362         int i;
2363
2364         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2365                 return 0;
2366
2367         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2368              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2369                 factor = 2;
2370         else
2371                 factor = 1;
2372         factor = num_stripes / factor;
2373
2374         for (i = 0; i < num_stripes; i++) {
2375                 stripe = btrfs_stripe_nr(chunk, i);
2376                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2377                         continue;
2378
2379                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2380                 stripe_length = btrfs_chunk_length(leaf, chunk);
2381                 do_div(stripe_length, factor);
2382
2383                 if (stripe_offset < bargs->pend &&
2384                     stripe_offset + stripe_length > bargs->pstart)
2385                         return 0;
2386         }
2387
2388         return 1;
2389 }
2390
2391 /* [vstart, vend) */
2392 static int chunk_vrange_filter(struct extent_buffer *leaf,
2393                                struct btrfs_chunk *chunk,
2394                                u64 chunk_offset,
2395                                struct btrfs_balance_args *bargs)
2396 {
2397         if (chunk_offset < bargs->vend &&
2398             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2399                 /* at least part of the chunk is inside this vrange */
2400                 return 0;
2401
2402         return 1;
2403 }
2404
2405 static int chunk_soft_convert_filter(u64 chunk_type,
2406                                      struct btrfs_balance_args *bargs)
2407 {
2408         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2409                 return 0;
2410
2411         chunk_type = chunk_to_extended(chunk_type) &
2412                                 BTRFS_EXTENDED_PROFILE_MASK;
2413
2414         if (bargs->target == chunk_type)
2415                 return 1;
2416
2417         return 0;
2418 }
2419
2420 static int should_balance_chunk(struct btrfs_root *root,
2421                                 struct extent_buffer *leaf,
2422                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2423 {
2424         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2425         struct btrfs_balance_args *bargs = NULL;
2426         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2427
2428         /* type filter */
2429         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2430               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2431                 return 0;
2432         }
2433
2434         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2435                 bargs = &bctl->data;
2436         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2437                 bargs = &bctl->sys;
2438         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2439                 bargs = &bctl->meta;
2440
2441         /* profiles filter */
2442         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2443             chunk_profiles_filter(chunk_type, bargs)) {
2444                 return 0;
2445         }
2446
2447         /* usage filter */
2448         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2449             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2450                 return 0;
2451         }
2452
2453         /* devid filter */
2454         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2455             chunk_devid_filter(leaf, chunk, bargs)) {
2456                 return 0;
2457         }
2458
2459         /* drange filter, makes sense only with devid filter */
2460         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2461             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2462                 return 0;
2463         }
2464
2465         /* vrange filter */
2466         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2467             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2468                 return 0;
2469         }
2470
2471         /* soft profile changing mode */
2472         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2473             chunk_soft_convert_filter(chunk_type, bargs)) {
2474                 return 0;
2475         }
2476
2477         return 1;
2478 }
2479
2480 static u64 div_factor(u64 num, int factor)
2481 {
2482         if (factor == 10)
2483                 return num;
2484         num *= factor;
2485         do_div(num, 10);
2486         return num;
2487 }
2488
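
/*
 * Core of the balance operation.  Step one frees a little room (up to
 * 1MB) on every writeable device by shrinking it and growing it right
 * back.  Step two walks the chunk tree from the highest chunk offset
 * down in two passes: a counting pass that fills bctl->stat.expected,
 * then the relocation pass, which additionally honors pause requests;
 * cancellation is checked in both passes, and per-chunk -ENOSPC is
 * tallied rather than treated as fatal.
 */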
2489 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2490 {
2491         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2492         struct btrfs_root *chunk_root = fs_info->chunk_root;
2493         struct btrfs_root *dev_root = fs_info->dev_root;
2494         struct list_head *devices;
2495         struct btrfs_device *device;
2496         u64 old_size;
2497         u64 size_to_free;
2498         struct btrfs_chunk *chunk;
2499         struct btrfs_path *path;
2500         struct btrfs_key key;
2501         struct btrfs_key found_key;
2502         struct btrfs_trans_handle *trans;
2503         struct extent_buffer *leaf;
2504         int slot;
2505         int ret;
2506         int enospc_errors = 0;
2507         bool counting = true;
2508
2509         /* step one make some room on all the devices */
2510         devices = &fs_info->fs_devices->devices;
2511         list_for_each_entry(device, devices, dev_list) {
2512                 old_size = device->total_bytes;
2513                 size_to_free = div_factor(old_size, 1);
2514                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2515                 if (!device->writeable ||
2516                     device->total_bytes - device->bytes_used > size_to_free)
2517                         continue;
2518
2519                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2520                 if (ret == -ENOSPC)
2521                         break;
2522                 BUG_ON(ret);
2523
2524                 trans = btrfs_start_transaction(dev_root, 0);
2525                 BUG_ON(IS_ERR(trans));
2526
2527                 ret = btrfs_grow_device(trans, device, old_size);
2528                 BUG_ON(ret);
2529
2530                 btrfs_end_transaction(trans, dev_root);
2531         }
2532
2533         /* step two, relocate all the chunks */
2534         path = btrfs_alloc_path();
2535         if (!path) {
2536                 ret = -ENOMEM;
2537                 goto error;
2538         }
2539
2540         /* zero out stat counters */
2541         spin_lock(&fs_info->balance_lock);
2542         memset(&bctl->stat, 0, sizeof(bctl->stat));
2543         spin_unlock(&fs_info->balance_lock);
2544 again:
2545         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2546         key.offset = (u64)-1;
2547         key.type = BTRFS_CHUNK_ITEM_KEY;
2548
2549         while (1) {
2550                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2551                     atomic_read(&fs_info->balance_cancel_req)) {
2552                         ret = -ECANCELED;
2553                         goto error;
2554                 }
2555
2556                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2557                 if (ret < 0)
2558                         goto error;
2559
2560                 /*
2561                  * this shouldn't happen, it means the last relocate
2562                  * failed
2563                  */
2564                 if (ret == 0)
2565                         BUG(); /* FIXME break ? */
2566
2567                 ret = btrfs_previous_item(chunk_root, path, 0,
2568                                           BTRFS_CHUNK_ITEM_KEY);
2569                 if (ret) {
2570                         ret = 0;
2571                         break;
2572                 }
2573
2574                 leaf = path->nodes[0];
2575                 slot = path->slots[0];
2576                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2577
2578                 if (found_key.objectid != key.objectid)
2579                         break;
2580
2581                 /* chunk zero is special */
2582                 if (found_key.offset == 0)
2583                         break;
2584
2585                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2586
2587                 if (!counting) {
2588                         spin_lock(&fs_info->balance_lock);
2589                         bctl->stat.considered++;
2590                         spin_unlock(&fs_info->balance_lock);
2591                 }
2592
2593                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2594                                            found_key.offset);
2595                 btrfs_release_path(path);
2596                 if (!ret)
2597                         goto loop;
2598
2599                 if (counting) {
2600                         spin_lock(&fs_info->balance_lock);
2601                         bctl->stat.expected++;
2602                         spin_unlock(&fs_info->balance_lock);
2603                         goto loop;
2604                 }
2605
2606                 ret = btrfs_relocate_chunk(chunk_root,
2607                                            chunk_root->root_key.objectid,
2608                                            found_key.objectid,
2609                                            found_key.offset);
2610                 if (ret && ret != -ENOSPC)
2611                         goto error;
2612                 if (ret == -ENOSPC) {
2613                         enospc_errors++;
2614                 } else {
2615                         spin_lock(&fs_info->balance_lock);
2616                         bctl->stat.completed++;
2617                         spin_unlock(&fs_info->balance_lock);
2618                 }
2619 loop:
2620                 key.offset = found_key.offset - 1;
2621         }
2622
2623         if (counting) {
2624                 btrfs_release_path(path);
2625                 counting = false;
2626                 goto again;
2627         }
2628 error:
2629         btrfs_free_path(path);
2630         if (enospc_errors) {
2631                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2632                        enospc_errors);
2633                 if (!ret)
2634                         ret = -ENOSPC;
2635         }
2636
2637         return ret;
2638 }
2639
2640 /**
2641  * alloc_profile_is_valid - see if a given profile is valid and reduced
2642  * @flags: profile to validate
2643  * @extended: if true @flags is treated as an extended profile
2644  */
2645 static int alloc_profile_is_valid(u64 flags, int extended)
2646 {
2647         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
2648                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
2649
2650         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2651
2652         /* 1) check that all other bits are zeroed */
2653         if (flags & ~mask)
2654                 return 0;
2655
2656         /* 2) see if profile is reduced */
2657         if (flags == 0)
2658                 return !extended; /* "0" is valid for usual profiles */
2659
2660         /* true if exactly one bit set */
2661         return (flags & (flags - 1)) == 0;
2662 }
2663
2664 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2665 {
2666         /* cancel requested || normal exit path */
2667         return atomic_read(&fs_info->balance_cancel_req) ||
2668                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2669                  atomic_read(&fs_info->balance_cancel_req) == 0);
2670 }
2671
2672 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2673 {
2674         int ret;
2675
2676         unset_balance_control(fs_info);
2677         ret = del_balance_item(fs_info->tree_root);
2678         BUG_ON(ret);
2679 }
2680
2681 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2682                                struct btrfs_ioctl_balance_args *bargs);
2683
2684 /*
2685  * Should be called with both balance and volume mutexes held
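 *
 * Validates the requested targets, persists a balance item so an
 * interrupted balance can be resumed, then runs __btrfs_balance()
 * with balance_running elevated and the balance mutex dropped.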
2686  */
2687 int btrfs_balance(struct btrfs_balance_control *bctl,
2688                   struct btrfs_ioctl_balance_args *bargs)
2689 {
2690         struct btrfs_fs_info *fs_info = bctl->fs_info;
2691         u64 allowed;
2692         int mixed = 0;
2693         int ret;
2694
2695         if (btrfs_fs_closing(fs_info) ||
2696             atomic_read(&fs_info->balance_pause_req) ||
2697             atomic_read(&fs_info->balance_cancel_req)) {
2698                 ret = -EINVAL;
2699                 goto out;
2700         }
2701
2702         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2703         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
2704                 mixed = 1;
2705
2706         /*
2707          * In case of mixed groups both data and meta should be picked,
2708          * and identical options should be given for both of them.
2709          */
2710         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
2711         if (mixed && (bctl->flags & allowed)) {
2712                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2713                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2714                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2715                         printk(KERN_ERR "btrfs: with mixed groups data and "
2716                                "metadata balance options must be the same\n");
2717                         ret = -EINVAL;
2718                         goto out;
2719                 }
2720         }
2721
2722         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2723         if (fs_info->fs_devices->num_devices == 1)
2724                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2725         else if (fs_info->fs_devices->num_devices < 4)
2726                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2727         else
2728                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2729                                 BTRFS_BLOCK_GROUP_RAID10);
2730
2731         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2732             (!alloc_profile_is_valid(bctl->data.target, 1) ||
2733              (bctl->data.target & ~allowed))) {
2734                 printk(KERN_ERR "btrfs: unable to start balance with target "
2735                        "data profile %llu\n",
2736                        (unsigned long long)bctl->data.target);
2737                 ret = -EINVAL;
2738                 goto out;
2739         }
2740         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2741             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
2742              (bctl->meta.target & ~allowed))) {
2743                 printk(KERN_ERR "btrfs: unable to start balance with target "
2744                        "metadata profile %llu\n",
2745                        (unsigned long long)bctl->meta.target);
2746                 ret = -EINVAL;
2747                 goto out;
2748         }
2749         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2750             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
2751              (bctl->sys.target & ~allowed))) {
2752                 printk(KERN_ERR "btrfs: unable to start balance with target "
2753                        "system profile %llu\n",
2754                        (unsigned long long)bctl->sys.target);
2755                 ret = -EINVAL;
2756                 goto out;
2757         }
2758
2759         /* allow dup'ed data chunks only in mixed mode */
2760         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2761             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
2762                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2763                 ret = -EINVAL;
2764                 goto out;
2765         }
2766
2767         /* allow to reduce meta or sys integrity only if force set */
2768         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2769                         BTRFS_BLOCK_GROUP_RAID10;
2770         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2771              (fs_info->avail_system_alloc_bits & allowed) &&
2772              !(bctl->sys.target & allowed)) ||
2773             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2774              (fs_info->avail_metadata_alloc_bits & allowed) &&
2775              !(bctl->meta.target & allowed))) {
2776                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2777                         printk(KERN_INFO "btrfs: force reducing metadata "
2778                                "integrity\n");
2779                 } else {
2780                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2781                                "integrity, use force if you want this\n");
2782                         ret = -EINVAL;
2783                         goto out;
2784                 }
2785         }
2786
2787         ret = insert_balance_item(fs_info->tree_root, bctl);
2788         if (ret && ret != -EEXIST)
2789                 goto out;
2790
2791         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2792                 BUG_ON(ret == -EEXIST);
2793                 set_balance_control(bctl);
2794         } else {
2795                 BUG_ON(ret != -EEXIST);
2796                 spin_lock(&fs_info->balance_lock);
2797                 update_balance_args(bctl);
2798                 spin_unlock(&fs_info->balance_lock);
2799         }
2800
2801         atomic_inc(&fs_info->balance_running);
2802         mutex_unlock(&fs_info->balance_mutex);
2803
2804         ret = __btrfs_balance(fs_info);
2805
2806         mutex_lock(&fs_info->balance_mutex);
2807         atomic_dec(&fs_info->balance_running);
2808
2809         if (bargs) {
2810                 memset(bargs, 0, sizeof(*bargs));
2811                 update_ioctl_balance_args(fs_info, 0, bargs);
2812         }
2813
2814         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2815             balance_need_close(fs_info)) {
2816                 __cancel_balance(fs_info);
2817         }
2818
2819         wake_up(&fs_info->balance_wait_q);
2820
2821         return ret;
2822 out:
2823         if (bctl->flags & BTRFS_BALANCE_RESUME)
2824                 __cancel_balance(fs_info);
2825         else
2826                 kfree(bctl);
2827         return ret;
2828 }
2829
2830 static int balance_kthread(void *data)
2831 {
2832         struct btrfs_balance_control *bctl =
2833                         (struct btrfs_balance_control *)data;
2834         struct btrfs_fs_info *fs_info = bctl->fs_info;
2835         int ret = 0;
2836
2837         mutex_lock(&fs_info->volume_mutex);
2838         mutex_lock(&fs_info->balance_mutex);
2839
2840         set_balance_control(bctl);
2841
2842         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2843                 printk(KERN_INFO "btrfs: force skipping balance\n");
2844         } else {
2845                 printk(KERN_INFO "btrfs: continuing balance\n");
2846                 ret = btrfs_balance(bctl, NULL);
2847         }
2848
2849         mutex_unlock(&fs_info->balance_mutex);
2850         mutex_unlock(&fs_info->volume_mutex);
2851         return ret;
2852 }
2853
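/*
 * Called during mount: if a balance item exists in the tree root, the
 * balance_control is rebuilt from it (with BTRFS_BALANCE_RESUME set)
 * and handed to a "btrfs-balance" kthread, which resumes the balance
 * unless the skip_balance mount option is set.
 */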
2854 int btrfs_recover_balance(struct btrfs_root *tree_root)
2855 {
2856         struct task_struct *tsk;
2857         struct btrfs_balance_control *bctl;
2858         struct btrfs_balance_item *item;
2859         struct btrfs_disk_balance_args disk_bargs;
2860         struct btrfs_path *path;
2861         struct extent_buffer *leaf;
2862         struct btrfs_key key;
2863         int ret;
2864
2865         path = btrfs_alloc_path();
2866         if (!path)
2867                 return -ENOMEM;
2868
2869         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2870         if (!bctl) {
2871                 ret = -ENOMEM;
2872                 goto out;
2873         }
2874
2875         key.objectid = BTRFS_BALANCE_OBJECTID;
2876         key.type = BTRFS_BALANCE_ITEM_KEY;
2877         key.offset = 0;
2878
2879         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2880         if (ret < 0)
2881                 goto out_bctl;
2882         if (ret > 0) { /* ret = -ENOENT; */
2883                 ret = 0;
2884                 goto out_bctl;
2885         }
2886
2887         leaf = path->nodes[0];
2888         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2889
2890         bctl->fs_info = tree_root->fs_info;
2891         bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2892
2893         btrfs_balance_data(leaf, item, &disk_bargs);
2894         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2895         btrfs_balance_meta(leaf, item, &disk_bargs);
2896         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2897         btrfs_balance_sys(leaf, item, &disk_bargs);
2898         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2899
2900         tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2901         if (IS_ERR(tsk))
2902                 ret = PTR_ERR(tsk);
2903         else
2904                 goto out;
2905
2906 out_bctl:
2907         kfree(bctl);
2908 out:
2909         btrfs_free_path(path);
2910         return ret;
2911 }
2912
2913 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2914 {
2915         int ret = 0;
2916
2917         mutex_lock(&fs_info->balance_mutex);
2918         if (!fs_info->balance_ctl) {
2919                 mutex_unlock(&fs_info->balance_mutex);
2920                 return -ENOTCONN;
2921         }
2922
2923         if (atomic_read(&fs_info->balance_running)) {
2924                 atomic_inc(&fs_info->balance_pause_req);
2925                 mutex_unlock(&fs_info->balance_mutex);
2926
2927                 wait_event(fs_info->balance_wait_q,
2928                            atomic_read(&fs_info->balance_running) == 0);
2929
2930                 mutex_lock(&fs_info->balance_mutex);
2931                 /* we are good with balance_ctl ripped off from under us */
2932                 BUG_ON(atomic_read(&fs_info->balance_running));
2933                 atomic_dec(&fs_info->balance_pause_req);
2934         } else {
2935                 ret = -ENOTCONN;
2936         }
2937
2938         mutex_unlock(&fs_info->balance_mutex);
2939         return ret;
2940 }
2941
2942 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2943 {
2944         mutex_lock(&fs_info->balance_mutex);
2945         if (!fs_info->balance_ctl) {
2946                 mutex_unlock(&fs_info->balance_mutex);
2947                 return -ENOTCONN;
2948         }
2949
2950         atomic_inc(&fs_info->balance_cancel_req);
2951         /*
2952          * if we are running, just wait and return; the balance item is
2953          * deleted in btrfs_balance() in this case
2954          */
2955         if (atomic_read(&fs_info->balance_running)) {
2956                 mutex_unlock(&fs_info->balance_mutex);
2957                 wait_event(fs_info->balance_wait_q,
2958                            atomic_read(&fs_info->balance_running) == 0);
2959                 mutex_lock(&fs_info->balance_mutex);
2960         } else {
2961                 /* __cancel_balance needs volume_mutex */
2962                 mutex_unlock(&fs_info->balance_mutex);
2963                 mutex_lock(&fs_info->volume_mutex);
2964                 mutex_lock(&fs_info->balance_mutex);
2965
2966                 if (fs_info->balance_ctl)
2967                         __cancel_balance(fs_info);
2968
2969                 mutex_unlock(&fs_info->volume_mutex);
2970         }
2971
2972         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2973         atomic_dec(&fs_info->balance_cancel_req);
2974         mutex_unlock(&fs_info->balance_mutex);
2975         return 0;
2976 }
2977
2978 /*
2979  * shrinking a device means finding all of the device extents past
2980  * the new size, and then following the back refs to the chunks.
2981  * The chunk relocation code actually frees the device extents; chunks
 * that hit -ENOSPC are retried once before the shrink is rolled back.
2982  */
2983 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2984 {
2985         struct btrfs_trans_handle *trans;
2986         struct btrfs_root *root = device->dev_root;
2987         struct btrfs_dev_extent *dev_extent = NULL;
2988         struct btrfs_path *path;
2989         u64 length;
2990         u64 chunk_tree;
2991         u64 chunk_objectid;
2992         u64 chunk_offset;
2993         int ret;
2994         int slot;
2995         int failed = 0;
2996         bool retried = false;
2997         struct extent_buffer *l;
2998         struct btrfs_key key;
2999         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3000         u64 old_total = btrfs_super_total_bytes(super_copy);
3001         u64 old_size = device->total_bytes;
3002         u64 diff = device->total_bytes - new_size;
3003
3004         if (new_size >= device->total_bytes)
3005                 return -EINVAL;
3006
3007         path = btrfs_alloc_path();
3008         if (!path)
3009                 return -ENOMEM;
3010
3011         path->reada = 2;
3012
3013         lock_chunks(root);
3014
3015         device->total_bytes = new_size;
3016         if (device->writeable) {
3017                 device->fs_devices->total_rw_bytes -= diff;
3018                 spin_lock(&root->fs_info->free_chunk_lock);
3019                 root->fs_info->free_chunk_space -= diff;
3020                 spin_unlock(&root->fs_info->free_chunk_lock);
3021         }
3022         unlock_chunks(root);
3023
3024 again:
3025         key.objectid = device->devid;
3026         key.offset = (u64)-1;
3027         key.type = BTRFS_DEV_EXTENT_KEY;
3028
3029         do {
3030                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3031                 if (ret < 0)
3032                         goto done;
3033
3034                 ret = btrfs_previous_item(root, path, 0, key.type);
3035                 if (ret < 0)
3036                         goto done;
3037                 if (ret) {
3038                         ret = 0;
3039                         btrfs_release_path(path);
3040                         break;
3041                 }
3042
3043                 l = path->nodes[0];
3044                 slot = path->slots[0];
3045                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3046
3047                 if (key.objectid != device->devid) {
3048                         btrfs_release_path(path);
3049                         break;
3050                 }
3051
3052                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3053                 length = btrfs_dev_extent_length(l, dev_extent);
3054
3055                 if (key.offset + length <= new_size) {
3056                         btrfs_release_path(path);
3057                         break;
3058                 }
3059
3060                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3061                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3062                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3063                 btrfs_release_path(path);
3064
3065                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3066                                            chunk_offset);
3067                 if (ret && ret != -ENOSPC)
3068                         goto done;
3069                 if (ret == -ENOSPC)
3070                         failed++;
3071         } while (key.offset-- > 0);
3072
3073         if (failed && !retried) {
3074                 failed = 0;
3075                 retried = true;
3076                 goto again;
3077         } else if (failed && retried) {
3078                 ret = -ENOSPC;
3079                 lock_chunks(root);
3080
3081                 device->total_bytes = old_size;
3082                 if (device->writeable)
3083                         device->fs_devices->total_rw_bytes += diff;
3084                 spin_lock(&root->fs_info->free_chunk_lock);
3085                 root->fs_info->free_chunk_space += diff;
3086                 spin_unlock(&root->fs_info->free_chunk_lock);
3087                 unlock_chunks(root);
3088                 goto done;
3089         }
3090
3091         /* Shrinking succeeded, else we would be at "done". */
3092         trans = btrfs_start_transaction(root, 0);
3093         if (IS_ERR(trans)) {
3094                 ret = PTR_ERR(trans);
3095                 goto done;
3096         }
3097
3098         lock_chunks(root);
3099
3100         device->disk_total_bytes = new_size;
3101         /* Now btrfs_update_device() will change the on-disk size. */
3102         ret = btrfs_update_device(trans, device);
3103         if (ret) {
3104                 unlock_chunks(root);
3105                 btrfs_end_transaction(trans, root);
3106                 goto done;
3107         }
3108         WARN_ON(diff > old_total);
3109         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3110         unlock_chunks(root);
3111         btrfs_end_transaction(trans, root);
3112 done:
3113         btrfs_free_path(path);
3114         return ret;
3115 }
3116
3117 static int btrfs_add_system_chunk(struct btrfs_root *root,
3118                            struct btrfs_key *key,
3119                            struct btrfs_chunk *chunk, int item_size)
3120 {
3121         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3122         struct btrfs_disk_key disk_key;
3123         u32 array_size;
3124         u8 *ptr;
3125
3126         array_size = btrfs_super_sys_array_size(super_copy);
        /* the disk key stored in front of the chunk item counts too */
3127         if (array_size + item_size + sizeof(disk_key) >
                 BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3128                 return -EFBIG;
3129
3130         ptr = super_copy->sys_chunk_array + array_size;
3131         btrfs_cpu_key_to_disk(&disk_key, key);
3132         memcpy(ptr, &disk_key, sizeof(disk_key));
3133         ptr += sizeof(disk_key);
3134         memcpy(ptr, chunk, item_size);
3135         item_size += sizeof(disk_key);
3136         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3137         return 0;
3138 }
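
/*
 * Illustrative layout of sys_chunk_array after the copies above
 * (on-disk sizes, roughly: 17-byte disk key, 48-byte chunk header,
 * 32 bytes per stripe):
 *
 *   sys_chunk_array: [disk_key][chunk + stripes][disk_key][chunk + ...]
 *
 * sys_array_size always covers key plus item, which is why item_size is
 * grown by sizeof(disk_key) before the size is updated.
 */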
3139
3140 /*
3141  * sort the devices in descending order by max_avail, total_avail
3142  */
3143 static int btrfs_cmp_device_info(const void *a, const void *b)
3144 {
3145         const struct btrfs_device_info *di_a = a;
3146         const struct btrfs_device_info *di_b = b;
3147
3148         if (di_a->max_avail > di_b->max_avail)
3149                 return -1;
3150         if (di_a->max_avail < di_b->max_avail)
3151                 return 1;
3152         if (di_a->total_avail > di_b->total_avail)
3153                 return -1;
3154         if (di_a->total_avail < di_b->total_avail)
3155                 return 1;
3156         return 0;
3157 }
3158
3159 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3160                                struct btrfs_root *extent_root,
3161                                struct map_lookup **map_ret,
3162                                u64 *num_bytes_out, u64 *stripe_size_out,
3163                                u64 start, u64 type)
3164 {
3165         struct btrfs_fs_info *info = extent_root->fs_info;
3166         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3167         struct list_head *cur;
3168         struct map_lookup *map = NULL;
3169         struct extent_map_tree *em_tree;
3170         struct extent_map *em;
3171         struct btrfs_device_info *devices_info = NULL;
3172         u64 total_avail;
3173         int num_stripes;        /* total number of stripes to allocate */
3174         int sub_stripes;        /* sub_stripes info for map */
3175         int dev_stripes;        /* stripes per dev */
3176         int devs_max;           /* max devs to use */
3177         int devs_min;           /* min devs needed */
3178         int devs_increment;     /* ndevs has to be a multiple of this */
3179         int ncopies;            /* how many copies of the data */
3180         int ret;
3181         u64 max_stripe_size;
3182         u64 max_chunk_size;
3183         u64 stripe_size;
3184         u64 num_bytes;
3185         int ndevs;
3186         int i;
3187         int j;
3188
3189         BUG_ON(!alloc_profile_is_valid(type, 0));
3190
3191         if (list_empty(&fs_devices->alloc_list))
3192                 return -ENOSPC;
3193
3194         sub_stripes = 1;
3195         dev_stripes = 1;
3196         devs_increment = 1;
3197         ncopies = 1;
3198         devs_max = 0;   /* 0 == as many as possible */
3199         devs_min = 1;
3200
3201         /*
3202          * define the properties of each RAID type.
3203          * FIXME: move this to a global table and use it in all RAID
3204          * calculation code
3205          */
3206         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3207                 dev_stripes = 2;
3208                 ncopies = 2;
3209                 devs_max = 1;
3210         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3211                 devs_min = 2;
3212         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3213                 devs_increment = 2;
3214                 ncopies = 2;
3215                 devs_max = 2;
3216                 devs_min = 2;
3217         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3218                 sub_stripes = 2;
3219                 devs_increment = 2;
3220                 ncopies = 2;
3221                 devs_min = 4;
3222         } else {
3223                 devs_max = 1;
3224         }
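
        /*
         * Net effect of the chain above, per profile:
         *
         *         dev_stripes devs_max devs_min devs_increment ncopies sub_stripes
         *  DUP         2         1        1          1            2        1
         *  RAID0       1         0        2          1            1        1
         *  RAID1       1         2        2          2            2        1
         *  RAID10      1         0        4          2            2        2
         *  single      1         1        1          1            1        1
         */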
3225
3226         if (type & BTRFS_BLOCK_GROUP_DATA) {
3227                 max_stripe_size = 1024 * 1024 * 1024;
3228                 max_chunk_size = 10 * max_stripe_size;
3229         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3230                 /* for larger filesystems, use larger metadata chunks */
3231                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3232                         max_stripe_size = 1024 * 1024 * 1024;
3233                 else
3234                         max_stripe_size = 256 * 1024 * 1024;
3235                 max_chunk_size = max_stripe_size;
3236         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3237                 max_stripe_size = 32 * 1024 * 1024;
3238                 max_chunk_size = 2 * max_stripe_size;
3239         } else {
3240                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3241                        type);
3242                 BUG_ON(1);
3243         }
3244
3245         /* we don't want a chunk larger than 10% of writeable space */
3246         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3247                              max_chunk_size);
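
        /*
         * div_factor(num, f) evaluates to num * f / 10, so a factor of 1
         * is the 10% cap mentioned above: e.g. with 100G of writeable
         * space, no chunk larger than 10G is allowed here.
         */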
3248
3249         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3250                                GFP_NOFS);
3251         if (!devices_info)
3252                 return -ENOMEM;
3253
3254         cur = fs_devices->alloc_list.next;
3255
3256         /*
3257          * in the first pass through the devices list, we gather information
3258          * about the available holes on each device.
3259          */
3260         ndevs = 0;
3261         while (cur != &fs_devices->alloc_list) {
3262                 struct btrfs_device *device;
3263                 u64 max_avail;
3264                 u64 dev_offset;
3265
3266                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3267
3268                 cur = cur->next;
3269
3270                 if (!device->writeable) {
3271                         printk(KERN_ERR
3272                                "btrfs: read-only device in alloc_list\n");
3273                         WARN_ON(1);
3274                         continue;
3275                 }
3276
3277                 if (!device->in_fs_metadata)
3278                         continue;
3279
3280                 if (device->total_bytes > device->bytes_used)
3281                         total_avail = device->total_bytes - device->bytes_used;
3282                 else
3283                         total_avail = 0;
3284
3285                 /* If there is no space on this device, skip it. */
3286                 if (total_avail == 0)
3287                         continue;
3288
3289                 ret = find_free_dev_extent(device,
3290                                            max_stripe_size * dev_stripes,
3291                                            &dev_offset, &max_avail);
3292                 if (ret && ret != -ENOSPC)
3293                         goto error;
3294
3295                 if (ret == 0)
3296                         max_avail = max_stripe_size * dev_stripes;
3297
3298                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3299                         continue;
3300
3301                 devices_info[ndevs].dev_offset = dev_offset;
3302                 devices_info[ndevs].max_avail = max_avail;
3303                 devices_info[ndevs].total_avail = total_avail;
3304                 devices_info[ndevs].dev = device;
3305                 ++ndevs;
3306         }
3307
3308         /*
3309          * now sort the devices by hole size / available space
3310          */
3311         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3312              btrfs_cmp_device_info, NULL);
3313
3314         /* round down to number of usable stripes */
3315         ndevs -= ndevs % devs_increment;
3316
3317         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3318                 ret = -ENOSPC;
3319                 goto error;
3320         }
3321
3322         if (devs_max && ndevs > devs_max)
3323                 ndevs = devs_max;
3324         /*
3325          * the primary goal is to maximize the number of stripes, so use as many
3326          * devices as possible, even if the stripes are not maximum sized.
3327          */
3328         stripe_size = devices_info[ndevs-1].max_avail;
3329         num_stripes = ndevs * dev_stripes;
3330
3331         if (stripe_size * ndevs > max_chunk_size * ncopies) {
3332                 stripe_size = max_chunk_size * ncopies;
3333                 do_div(stripe_size, ndevs);
3334         }
3335
3336         do_div(stripe_size, dev_stripes);
3337
3338         /* align to BTRFS_STRIPE_LEN */
3339         do_div(stripe_size, BTRFS_STRIPE_LEN);
3340         stripe_size *= BTRFS_STRIPE_LEN;
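
        /*
         * Worked example (illustrative numbers): RAID1 metadata on two
         * devices that each have a 256M hole gives ndevs = 2,
         * dev_stripes = 1, ncopies = 2, stripe_size = 256M and
         * num_stripes = 2.  Assuming neither cap above shrinks it (256M
         * is already 64K-aligned, so the rounding is a no-op), num_bytes
         * below comes out to 256M of logical chunk space backed by 512M
         * of raw disk.
         */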
3341
3342         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3343         if (!map) {
3344                 ret = -ENOMEM;
3345                 goto error;
3346         }
3347         map->num_stripes = num_stripes;
3348
3349         for (i = 0; i < ndevs; ++i) {
3350                 for (j = 0; j < dev_stripes; ++j) {
3351                         int s = i * dev_stripes + j;
3352                         map->stripes[s].dev = devices_info[i].dev;
3353                         map->stripes[s].physical = devices_info[i].dev_offset +
3354                                                    j * stripe_size;
3355                 }
3356         }
3357         map->sector_size = extent_root->sectorsize;
3358         map->stripe_len = BTRFS_STRIPE_LEN;
3359         map->io_align = BTRFS_STRIPE_LEN;
3360         map->io_width = BTRFS_STRIPE_LEN;
3361         map->type = type;
3362         map->sub_stripes = sub_stripes;
3363
3364         *map_ret = map;
3365         num_bytes = stripe_size * (num_stripes / ncopies);
3366
3367         *stripe_size_out = stripe_size;
3368         *num_bytes_out = num_bytes;
3369
3370         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3371
3372         em = alloc_extent_map();
3373         if (!em) {
3374                 ret = -ENOMEM;
3375                 goto error;
3376         }
3377         em->bdev = (struct block_device *)map;
3378         em->start = start;
3379         em->len = num_bytes;
3380         em->block_start = 0;
3381         em->block_len = em->len;
3382
3383         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3384         write_lock(&em_tree->lock);
3385         ret = add_extent_mapping(em_tree, em);
3386         write_unlock(&em_tree->lock);
3387         free_extent_map(em);
3388         if (ret)
3389                 goto error;
3390
3391         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3392                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3393                                      start, num_bytes);
3394         if (ret)
3395                 goto error;
3396
3397         for (i = 0; i < map->num_stripes; ++i) {
3398                 struct btrfs_device *device;
3399                 u64 dev_offset;
3400
3401                 device = map->stripes[i].dev;
3402                 dev_offset = map->stripes[i].physical;
3403
3404                 ret = btrfs_alloc_dev_extent(trans, device,
3405                                 info->chunk_root->root_key.objectid,
3406                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3407                                 start, dev_offset, stripe_size);
3408                 if (ret) {
3409                         btrfs_abort_transaction(trans, extent_root, ret);
3410                         goto error;
3411                 }
3412         }
3413
3414         kfree(devices_info);
3415         return 0;
3416
3417 error:
3418         kfree(map);
3419         kfree(devices_info);
3420         return ret;
3421 }
3422
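/*
 * Second phase of chunk allocation: account stripe_size against each
 * stripe's device, build the on-disk chunk item from the map_lookup and
 * insert it into the chunk tree.  System chunks are also appended to the
 * superblock's sys_chunk_array so the chunk tree itself can be located
 * at mount time, before any chunk lookups are possible.
 */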
3423 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3424                                 struct btrfs_root *extent_root,
3425                                 struct map_lookup *map, u64 chunk_offset,
3426                                 u64 chunk_size, u64 stripe_size)
3427 {
3428         u64 dev_offset;
3429         struct btrfs_key key;
3430         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3431         struct btrfs_device *device;
3432         struct btrfs_chunk *chunk;
3433         struct btrfs_stripe *stripe;
3434         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3435         int index = 0;
3436         int ret;
3437
3438         chunk = kzalloc(item_size, GFP_NOFS);
3439         if (!chunk)
3440                 return -ENOMEM;
3441
3442         index = 0;
3443         while (index < map->num_stripes) {
3444                 device = map->stripes[index].dev;
3445                 device->bytes_used += stripe_size;
3446                 ret = btrfs_update_device(trans, device);
3447                 if (ret)
3448                         goto out_free;
3449                 index++;
3450         }
3451
3452         spin_lock(&extent_root->fs_info->free_chunk_lock);
3453         extent_root->fs_info->free_chunk_space -= (stripe_size *
3454                                                    map->num_stripes);
3455         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3456
3457         index = 0;
3458         stripe = &chunk->stripe;
3459         while (index < map->num_stripes) {
3460                 device = map->stripes[index].dev;
3461                 dev_offset = map->stripes[index].physical;
3462
3463                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3464                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3465                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3466                 stripe++;
3467                 index++;
3468         }
3469
3470         btrfs_set_stack_chunk_length(chunk, chunk_size);
3471         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3472         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3473         btrfs_set_stack_chunk_type(chunk, map->type);
3474         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3475         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3476         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3477         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3478         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3479
3480         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3481         key.type = BTRFS_CHUNK_ITEM_KEY;
3482         key.offset = chunk_offset;
3483
3484         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3485
3486         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3487                 /*
3488                  * TODO: Cleanup of inserted chunk root in case of
3489                  * failure.
3490                  */
3491                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
3492                                              item_size);
3493         }
3494
3495 out_free:
3496         kfree(chunk);
3497         return ret;
3498 }
3499
3500 /*
3501  * Chunk allocation falls into two parts. The first part does the work
3502  * that makes the newly allocated chunk usable, without doing any
3503  * operation that modifies the chunk tree. The second part does the work
3504  * that requires modifying the chunk tree. This division is important for the
3505  * bootstrap process of adding storage to a seed btrfs.
3506  */
3507 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3508                       struct btrfs_root *extent_root, u64 type)
3509 {
3510         u64 chunk_offset;
3511         u64 chunk_size;
3512         u64 stripe_size;
3513         struct map_lookup *map;
3514         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3515         int ret;
3516
3517         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3518                               &chunk_offset);
3519         if (ret)
3520                 return ret;
3521
3522         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3523                                   &stripe_size, chunk_offset, type);
3524         if (ret)
3525                 return ret;
3526
3527         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3528                                    chunk_size, stripe_size);
3529         if (ret)
3530                 return ret;
3531         return 0;
3532 }
3533
3534 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3535                                          struct btrfs_root *root,
3536                                          struct btrfs_device *device)
3537 {
3538         u64 chunk_offset;
3539         u64 sys_chunk_offset;
3540         u64 chunk_size;
3541         u64 sys_chunk_size;
3542         u64 stripe_size;
3543         u64 sys_stripe_size;
3544         u64 alloc_profile;
3545         struct map_lookup *map;
3546         struct map_lookup *sys_map;
3547         struct btrfs_fs_info *fs_info = root->fs_info;
3548         struct btrfs_root *extent_root = fs_info->extent_root;
3549         int ret;
3550
3551         ret = find_next_chunk(fs_info->chunk_root,
3552                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3553         if (ret)
3554                 return ret;
3555
3556         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3557                                 fs_info->avail_metadata_alloc_bits;
3558         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3559
3560         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3561                                   &stripe_size, chunk_offset, alloc_profile);
3562         if (ret)
3563                 return ret;
3564
3565         sys_chunk_offset = chunk_offset + chunk_size;
3566
3567         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3568                                 fs_info->avail_system_alloc_bits;
3569         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3570
3571         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3572                                   &sys_chunk_size, &sys_stripe_size,
3573                                   sys_chunk_offset, alloc_profile);
3574         if (ret)
3575                 goto abort;
3576
3577         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3578         if (ret)
3579                 goto abort;
3580
3581         /*
3582          * Modifying the chunk tree requires allocating new blocks from
3583          * both the system and the metadata block groups, so operations
3584          * that modify the chunk tree can only be performed after both
3585          * block groups have been created.
3586          */
3587         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3588                                    chunk_size, stripe_size);
3589         if (ret)
3590                 goto abort;
3591
3592         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3593                                    sys_chunk_offset, sys_chunk_size,
3594                                    sys_stripe_size);
3595         if (ret)
3596                 goto abort;
3597
3598         return 0;
3599
3600 abort:
3601         btrfs_abort_transaction(trans, root, ret);
3602         return ret;
3603 }
3604
3605 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3606 {
3607         struct extent_map *em;
3608         struct map_lookup *map;
3609         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3610         int readonly = 0;
3611         int i;
3612
3613         read_lock(&map_tree->map_tree.lock);
3614         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3615         read_unlock(&map_tree->map_tree.lock);
3616         if (!em)
3617                 return 1;
3618
3619         if (btrfs_test_opt(root, DEGRADED)) {
3620                 free_extent_map(em);
3621                 return 0;
3622         }
3623
3624         map = (struct map_lookup *)em->bdev;
3625         for (i = 0; i < map->num_stripes; i++) {
3626                 if (!map->stripes[i].dev->writeable) {
3627                         readonly = 1;
3628                         break;
3629                 }
3630         }
3631         free_extent_map(em);
3632         return readonly;
3633 }
3634
3635 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3636 {
3637         extent_map_tree_init(&tree->map_tree);
3638 }
3639
3640 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3641 {
3642         struct extent_map *em;
3643
3644         while (1) {
3645                 write_lock(&tree->map_tree.lock);
3646                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3647                 if (em)
3648                         remove_extent_mapping(&tree->map_tree, em);
3649                 write_unlock(&tree->map_tree.lock);
3650                 if (!em)
3651                         break;
3652                 kfree(em->bdev);
3653                 /* once for us */
3654                 free_extent_map(em);
3655                 /* once for the tree */
3656                 free_extent_map(em);
3657         }
3658 }
3659
3660 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3661 {
3662         struct extent_map *em;
3663         struct map_lookup *map;
3664         struct extent_map_tree *em_tree = &map_tree->map_tree;
3665         int ret;
3666
3667         read_lock(&em_tree->lock);
3668         em = lookup_extent_mapping(em_tree, logical, len);
3669         read_unlock(&em_tree->lock);
3670         BUG_ON(!em);
3671
3672         BUG_ON(em->start > logical || em->start + em->len < logical);
3673         map = (struct map_lookup *)em->bdev;
3674         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3675                 ret = map->num_stripes;
3676         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3677                 ret = map->sub_stripes;
3678         else
3679                 ret = 1;
3680         free_extent_map(em);
3681         return ret;
3682 }
3683
3684 static int find_live_mirror(struct map_lookup *map, int first, int num,
3685                             int optimal)
3686 {
3687         int i;
3688         if (map->stripes[optimal].dev->bdev)
3689                 return optimal;
3690         for (i = first; i < first + num; i++) {
3691                 if (map->stripes[i].dev->bdev)
3692                         return i;
3693         }
3694         /* we couldn't find one that doesn't fail.  Just return something
3695          * and the io error handling code will clean up eventually
3696          */
3697         return optimal;
3698 }
3699
3700 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3701                              u64 logical, u64 *length,
3702                              struct btrfs_bio **bbio_ret,
3703                              int mirror_num)
3704 {
3705         struct extent_map *em;
3706         struct map_lookup *map;
3707         struct extent_map_tree *em_tree = &map_tree->map_tree;
3708         u64 offset;
3709         u64 stripe_offset;
3710         u64 stripe_end_offset;
3711         u64 stripe_nr;
3712         u64 stripe_nr_orig;
3713         u64 stripe_nr_end;
3714         int stripe_index;
3715         int i;
3716         int ret = 0;
3717         int num_stripes;
3718         int max_errors = 0;
3719         struct btrfs_bio *bbio = NULL;
3720
3721         read_lock(&em_tree->lock);
3722         em = lookup_extent_mapping(em_tree, logical, *length);
3723         read_unlock(&em_tree->lock);
3724
3725         if (!em) {
3726                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3727                        (unsigned long long)logical,
3728                        (unsigned long long)*length);
3729                 BUG();
3730         }
3731
3732         BUG_ON(em->start > logical || em->start + em->len < logical);
3733         map = (struct map_lookup *)em->bdev;
3734         offset = logical - em->start;
3735
3736         if (mirror_num > map->num_stripes)
3737                 mirror_num = 0;
3738
3739         stripe_nr = offset;
3740         /*
3741          * stripe_nr counts the total number of stripes we have to stride
3742          * to get to this block
3743          */
3744         do_div(stripe_nr, map->stripe_len);
3745
3746         stripe_offset = stripe_nr * map->stripe_len;
3747         BUG_ON(offset < stripe_offset);
3748
3749         /* stripe_offset is the offset of this block in its stripe */
3750         stripe_offset = offset - stripe_offset;
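
        /*
         * E.g. (illustrative): stripe_len = 64K and offset = 200K yield
         * stripe_nr = 3 and stripe_offset = 8K, i.e. the block starts 8K
         * into the fourth stripe of the chunk.
         */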
3751
3752         if (rw & REQ_DISCARD)
3753                 *length = min_t(u64, em->len - offset, *length);
3754         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3755                 /* we limit the length of each bio to what fits in a stripe */
3756                 *length = min_t(u64, em->len - offset,
3757                                 map->stripe_len - stripe_offset);
3758         } else {
3759                 *length = em->len - offset;
3760         }
3761
3762         if (!bbio_ret)
3763                 goto out;
3764
3765         num_stripes = 1;
3766         stripe_index = 0;
3767         stripe_nr_orig = stripe_nr;
3768         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3769                         (~(map->stripe_len - 1));
3770         do_div(stripe_nr_end, map->stripe_len);
3771         stripe_end_offset = stripe_nr_end * map->stripe_len -
3772                             (offset + *length);
3773         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3774                 if (rw & REQ_DISCARD)
3775                         num_stripes = min_t(u64, map->num_stripes,
3776                                             stripe_nr_end - stripe_nr_orig);
3777                 stripe_index = do_div(stripe_nr, map->num_stripes);
3778         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3779                 if (rw & (REQ_WRITE | REQ_DISCARD))
3780                         num_stripes = map->num_stripes;
3781                 else if (mirror_num)
3782                         stripe_index = mirror_num - 1;
3783                 else {
3784                         stripe_index = find_live_mirror(map, 0,
3785                                             map->num_stripes,
3786                                             current->pid % map->num_stripes);
3787                         mirror_num = stripe_index + 1;
3788                 }
3789
3790         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3791                 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3792                         num_stripes = map->num_stripes;
3793                 } else if (mirror_num) {
3794                         stripe_index = mirror_num - 1;
3795                 } else {
3796                         mirror_num = 1;
3797                 }
3798
3799         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3800                 int factor = map->num_stripes / map->sub_stripes;
3801
3802                 stripe_index = do_div(stripe_nr, factor);
3803                 stripe_index *= map->sub_stripes;
3804
3805                 if (rw & REQ_WRITE)
3806                         num_stripes = map->sub_stripes;
3807                 else if (rw & REQ_DISCARD)
3808                         num_stripes = min_t(u64, map->sub_stripes *
3809                                             (stripe_nr_end - stripe_nr_orig),
3810                                             map->num_stripes);
3811                 else if (mirror_num)
3812                         stripe_index += mirror_num - 1;
3813                 else {
3814                         int old_stripe_index = stripe_index;
3815                         stripe_index = find_live_mirror(map, stripe_index,
3816                                               map->sub_stripes, stripe_index +
3817                                               current->pid % map->sub_stripes);
3818                         mirror_num = stripe_index - old_stripe_index + 1;
3819                 }
3820         } else {
3821                 /*
3822                  * after this do_div call, stripe_nr is the number of stripes
3823                  * on this device we have to walk to find the data, and
3824                  * stripe_index is the number of our device in the stripe array
3825                  */
3826                 stripe_index = do_div(stripe_nr, map->num_stripes);
3827                 mirror_num = stripe_index + 1;
3828         }
3829         BUG_ON(stripe_index >= map->num_stripes);
3830
3831         bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3832         if (!bbio) {
3833                 ret = -ENOMEM;
3834                 goto out;
3835         }
3836         atomic_set(&bbio->error, 0);
3837
3838         if (rw & REQ_DISCARD) {
3839                 int factor = 0;
3840                 int sub_stripes = 0;
3841                 u64 stripes_per_dev = 0;
3842                 u32 remaining_stripes = 0;
3843                 u32 last_stripe = 0;
3844
3845                 if (map->type &
3846                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3847                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3848                                 sub_stripes = 1;
3849                         else
3850                                 sub_stripes = map->sub_stripes;
3851
3852                         factor = map->num_stripes / sub_stripes;
3853                         stripes_per_dev = div_u64_rem(stripe_nr_end -
3854                                                       stripe_nr_orig,
3855                                                       factor,
3856                                                       &remaining_stripes);
3857                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
3858                         last_stripe *= sub_stripes;
3859                 }
3860
3861                 for (i = 0; i < num_stripes; i++) {
3862                         bbio->stripes[i].physical =
3863                                 map->stripes[stripe_index].physical +
3864                                 stripe_offset + stripe_nr * map->stripe_len;
3865                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3866
3867                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3868                                          BTRFS_BLOCK_GROUP_RAID10)) {
3869                                 bbio->stripes[i].length = stripes_per_dev *
3870                                                           map->stripe_len;
3871
3872                                 if (i / sub_stripes < remaining_stripes)
3873                                         bbio->stripes[i].length +=
3874                                                 map->stripe_len;
3875
3876                                 /*
3877                                  * Special for the first stripe and
3878                                  * the last stripe:
3879                                  *
3880                                  * |-------|...|-------|
3881                                  *     |----------|
3882                                  *    off     end_off
3883                                  */
3884                                 if (i < sub_stripes)
3885                                         bbio->stripes[i].length -=
3886                                                 stripe_offset;
3887
3888                                 if (stripe_index >= last_stripe &&
3889                                     stripe_index <= (last_stripe +
3890                                                      sub_stripes - 1))
3891                                         bbio->stripes[i].length -=
3892                                                 stripe_end_offset;
3893
3894                                 if (i == sub_stripes - 1)
3895                                         stripe_offset = 0;
3896                         } else
3897                                 bbio->stripes[i].length = *length;
3898
3899                         stripe_index++;
3900                         if (stripe_index == map->num_stripes) {
3901                                 /* This could only happen for RAID0/10 */
3902                                 stripe_index = 0;
3903                                 stripe_nr++;
3904                         }
3905                 }
3906         } else {
3907                 for (i = 0; i < num_stripes; i++) {
3908                         bbio->stripes[i].physical =
3909                                 map->stripes[stripe_index].physical +
3910                                 stripe_offset +
3911                                 stripe_nr * map->stripe_len;
3912                         bbio->stripes[i].dev =
3913                                 map->stripes[stripe_index].dev;
3914                         stripe_index++;
3915                 }
3916         }
3917
3918         if (rw & REQ_WRITE) {
3919                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3920                                  BTRFS_BLOCK_GROUP_RAID10 |
3921                                  BTRFS_BLOCK_GROUP_DUP)) {
3922                         max_errors = 1;
3923                 }
3924         }
3925
3926         *bbio_ret = bbio;
3927         bbio->num_stripes = num_stripes;
3928         bbio->max_errors = max_errors;
3929         bbio->mirror_num = mirror_num;
3930 out:
3931         free_extent_map(em);
3932         return ret;
3933 }
3934
3935 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3936                       u64 logical, u64 *length,
3937                       struct btrfs_bio **bbio_ret, int mirror_num)
3938 {
3939         return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3940                                  mirror_num);
3941 }
3942
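/*
 * Reverse mapping: given a physical byte offset on a device, collect the
 * logical addresses inside the chunk at chunk_start that map to it;
 * duplicate addresses from mirrored copies are filtered out by the loop
 * over buf below.  Used e.g. when excluding superblock mirrors from
 * block group free space accounting.
 */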
3943 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3944                      u64 chunk_start, u64 physical, u64 devid,
3945                      u64 **logical, int *naddrs, int *stripe_len)
3946 {
3947         struct extent_map_tree *em_tree = &map_tree->map_tree;
3948         struct extent_map *em;
3949         struct map_lookup *map;
3950         u64 *buf;
3951         u64 bytenr;
3952         u64 length;
3953         u64 stripe_nr;
3954         int i, j, nr = 0;
3955
3956         read_lock(&em_tree->lock);
3957         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3958         read_unlock(&em_tree->lock);
3959
3960         BUG_ON(!em || em->start != chunk_start);
3961         map = (struct map_lookup *)em->bdev;
3962
3963         length = em->len;
3964         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3965                 do_div(length, map->num_stripes / map->sub_stripes);
3966         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3967                 do_div(length, map->num_stripes);
3968
3969         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3970         BUG_ON(!buf); /* -ENOMEM */
3971
3972         for (i = 0; i < map->num_stripes; i++) {
3973                 if (devid && map->stripes[i].dev->devid != devid)
3974                         continue;
3975                 if (map->stripes[i].physical > physical ||
3976                     map->stripes[i].physical + length <= physical)
3977                         continue;
3978
3979                 stripe_nr = physical - map->stripes[i].physical;
3980                 do_div(stripe_nr, map->stripe_len);
3981
3982                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3983                         stripe_nr = stripe_nr * map->num_stripes + i;
3984                         do_div(stripe_nr, map->sub_stripes);
3985                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3986                         stripe_nr = stripe_nr * map->num_stripes + i;
3987                 }
3988                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3989                 WARN_ON(nr >= map->num_stripes);
3990                 for (j = 0; j < nr; j++) {
3991                         if (buf[j] == bytenr)
3992                                 break;
3993                 }
3994                 if (j == nr) {
3995                         WARN_ON(nr >= map->num_stripes);
3996                         buf[nr++] = bytenr;
3997                 }
3998         }
3999
4000         *logical = buf;
4001         *naddrs = nr;
4002         *stripe_len = map->stripe_len;
4003
4004         free_extent_map(em);
4005         return 0;
4006 }
4007
4008 static void *merge_stripe_index_into_bio_private(void *bi_private,
4009                                                  unsigned int stripe_index)
4010 {
4011         /*
4012          * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
4013          * at most 1.
4014          * The alternative solution (instead of stealing bits from the
4015          * pointer) would be to allocate an intermediate structure
4016          * that contains the old private pointer plus the stripe_index.
4017          */
4018         BUG_ON((((uintptr_t)bi_private) & 3) != 0);
4019         BUG_ON(stripe_index > 3);
4020         return (void *)(((uintptr_t)bi_private) | stripe_index);
4021 }
4022
4023 static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
4024 {
4025         return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
4026 }
4027
4028 static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
4029 {
4030         return (unsigned int)((uintptr_t)bi_private) & 3;
4031 }
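
/*
 * Round trip of the pointer tagging above (illustrative):
 *
 *   void *p = merge_stripe_index_into_bio_private(bbio, 2);
 *   extract_bbio_from_bio_private(p)          -> bbio
 *   extract_stripe_index_from_bio_private(p)  -> 2
 *
 * This relies on the btrfs_bio allocation being at least 4-byte aligned,
 * so the two low bits of the pointer are free to carry the index.
 */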
4032
4033 static void btrfs_end_bio(struct bio *bio, int err)
4034 {
4035         struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
4036         int is_orig_bio = 0;
4037
4038         if (err) {
4039                 atomic_inc(&bbio->error);
4040                 if (err == -EIO || err == -EREMOTEIO) {
4041                         unsigned int stripe_index =
4042                                 extract_stripe_index_from_bio_private(
4043                                         bio->bi_private);
4044                         struct btrfs_device *dev;
4045
4046                         BUG_ON(stripe_index >= bbio->num_stripes);
4047                         dev = bbio->stripes[stripe_index].dev;
4048                         if (bio->bi_rw & WRITE)
4049                                 btrfs_dev_stat_inc(dev,
4050                                                    BTRFS_DEV_STAT_WRITE_ERRS);
4051                         else
4052                                 btrfs_dev_stat_inc(dev,
4053                                                    BTRFS_DEV_STAT_READ_ERRS);
4054                         if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4055                                 btrfs_dev_stat_inc(dev,
4056                                                    BTRFS_DEV_STAT_FLUSH_ERRS);
4057                         btrfs_dev_stat_print_on_error(dev);
4058                 }
4059         }
4060
4061         if (bio == bbio->orig_bio)
4062                 is_orig_bio = 1;
4063
4064         if (atomic_dec_and_test(&bbio->stripes_pending)) {
4065                 if (!is_orig_bio) {
4066                         bio_put(bio);
4067                         bio = bbio->orig_bio;
4068                 }
4069                 bio->bi_private = bbio->private;
4070                 bio->bi_end_io = bbio->end_io;
4071                 bio->bi_bdev = (struct block_device *)
4072                                         (unsigned long)bbio->mirror_num;
4073                 /* only send an error to the higher layers if it is
4074                  * beyond the tolerance of the multi-bio
4075                  */
4076                 if (atomic_read(&bbio->error) > bbio->max_errors) {
4077                         err = -EIO;
4078                 } else {
4079                         /*
4080                          * this bio is actually up to date, we didn't
4081                          * go over the max number of errors
4082                          */
4083                         set_bit(BIO_UPTODATE, &bio->bi_flags);
4084                         err = 0;
4085                 }
4086                 kfree(bbio);
4087
4088                 bio_endio(bio, err);
4089         } else if (!is_orig_bio) {
4090                 bio_put(bio);
4091         }
4092 }
4093
4094 struct async_sched {
4095         struct bio *bio;
4096         int rw;
4097         struct btrfs_fs_info *info;
4098         struct btrfs_work work;
4099 };
4100
4101 /*
4102  * see run_scheduled_bios for a description of why bios are collected for
4103  * async submit.
4104  *
4105  * This will add one bio to the pending list for a device and make sure
4106  * the work struct is scheduled.
4107  */
4108 static noinline void schedule_bio(struct btrfs_root *root,
4109                                  struct btrfs_device *device,
4110                                  int rw, struct bio *bio)
4111 {
4112         int should_queue = 1;
4113         struct btrfs_pending_bios *pending_bios;
4114
4115         /* don't bother with additional async steps for reads, right now */
4116         if (!(rw & REQ_WRITE)) {
4117                 bio_get(bio);
4118                 btrfsic_submit_bio(rw, bio);
4119                 bio_put(bio);
4120                 return;
4121         }
4122
4123         /*
4124          * nr_async_bios allows us to reliably return congestion to the
4125          * higher layers.  Otherwise, the async bio makes it appear we have
4126          * made progress against dirty pages when we've really just put it
4127          * on a queue for later
4128          */
4129         atomic_inc(&root->fs_info->nr_async_bios);
4130         WARN_ON(bio->bi_next);
4131         bio->bi_next = NULL;
4132         bio->bi_rw |= rw;
4133
4134         spin_lock(&device->io_lock);
4135         if (bio->bi_rw & REQ_SYNC)
4136                 pending_bios = &device->pending_sync_bios;
4137         else
4138                 pending_bios = &device->pending_bios;
4139
4140         if (pending_bios->tail)
4141                 pending_bios->tail->bi_next = bio;
4142
4143         pending_bios->tail = bio;
4144         if (!pending_bios->head)
4145                 pending_bios->head = bio;
4146         if (device->running_pending)
4147                 should_queue = 0;
4148
4149         spin_unlock(&device->io_lock);
4150
4151         if (should_queue)
4152                 btrfs_queue_worker(&root->fs_info->submit_workers,
4153                                    &device->work);
4154 }
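
/*
 * Note that REQ_SYNC bios get their own pending list above so that the
 * worker (run_scheduled_bios) can service latency-sensitive writes
 * separately from bulk writeback.
 */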
4155
4156 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4157                   int mirror_num, int async_submit)
4158 {
4159         struct btrfs_mapping_tree *map_tree;
4160         struct btrfs_device *dev;
4161         struct bio *first_bio = bio;
4162         u64 logical = (u64)bio->bi_sector << 9;
4163         u64 length = 0;
4164         u64 map_length;
4165         int ret;
4166         int dev_nr = 0;
4167         int total_devs = 1;
4168         struct btrfs_bio *bbio = NULL;
4169
4170         length = bio->bi_size;
4171         map_tree = &root->fs_info->mapping_tree;
4172         map_length = length;
4173
4174         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4175                               mirror_num);
4176         if (ret) /* -ENOMEM */
4177                 return ret;
4178
4179         total_devs = bbio->num_stripes;
4180         if (map_length < length) {
4181                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4182                        "len %llu\n", (unsigned long long)logical,
4183                        (unsigned long long)length,
4184                        (unsigned long long)map_length);
4185                 BUG();
4186         }
4187
4188         bbio->orig_bio = first_bio;
4189         bbio->private = first_bio->bi_private;
4190         bbio->end_io = first_bio->bi_end_io;
4191         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4192
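        /*
         * Fan the bio out to every stripe: each stripe but the last gets
         * a clone, the original bio is sent to the last stripe, and
         * btrfs_end_bio() completes the original only once
         * stripes_pending drops to zero.
         */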
4193         while (dev_nr < total_devs) {
4194                 if (dev_nr < total_devs - 1) {
4195                         bio = bio_clone(first_bio, GFP_NOFS);
4196                         BUG_ON(!bio); /* -ENOMEM */
4197                 } else {
4198                         bio = first_bio;
4199                 }
4200                 bio->bi_private = bbio;
4201                 bio->bi_private = merge_stripe_index_into_bio_private(
4202                                 bio->bi_private, (unsigned int)dev_nr);
4203                 bio->bi_end_io = btrfs_end_bio;
4204                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4205                 dev = bbio->stripes[dev_nr].dev;
4206                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4207                         pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4208                                  "(%s id %llu), size=%u\n", rw,
4209                                  (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4210                                  dev->name, dev->devid, bio->bi_size);
4211                         bio->bi_bdev = dev->bdev;
4212                         if (async_submit)
4213                                 schedule_bio(root, dev, rw, bio);
4214                         else
4215                                 btrfsic_submit_bio(rw, bio);
4216                 } else {
4217                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4218                         bio->bi_sector = logical >> 9;
4219                         bio_endio(bio, -EIO);
4220                 }
4221                 dev_nr++;
4222         }
4223         return 0;
4224 }
4225
4226 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4227                                        u8 *uuid, u8 *fsid)
4228 {
4229         struct btrfs_device *device;
4230         struct btrfs_fs_devices *cur_devices;
4231
4232         cur_devices = root->fs_info->fs_devices;
4233         while (cur_devices) {
4234                 if (!fsid ||
4235                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4236                         device = __find_device(&cur_devices->devices,
4237                                                devid, uuid);
4238                         if (device)
4239                                 return device;
4240                 }
4241                 cur_devices = cur_devices->seed;
4242         }
4243         return NULL;
4244 }
4245
4246 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4247                                             u64 devid, u8 *dev_uuid)
4248 {
4249         struct btrfs_device *device;
4250         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4251
4252         device = kzalloc(sizeof(*device), GFP_NOFS);
4253         if (!device)
4254                 return NULL;
4255         list_add(&device->dev_list,
4256                  &fs_devices->devices);
4257         device->dev_root = root->fs_info->dev_root;
4258         device->devid = devid;
4259         device->work.func = pending_bios_fn;
4260         device->fs_devices = fs_devices;
4261         device->missing = 1;
4262         fs_devices->num_devices++;
4263         fs_devices->missing_devices++;
4264         spin_lock_init(&device->io_lock);
4265         INIT_LIST_HEAD(&device->dev_alloc_list);
4266         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4267         return device;
4268 }
4269
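/*
 * Turn one chunk item read from the chunk tree into an in-memory
 * extent_map/map_lookup.  Stripe devices are looked up by devid + uuid;
 * a missing device is fatal unless -o degraded is used, in which case a
 * placeholder is created by add_missing_dev().
 */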
static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                          struct extent_buffer *leaf,
                          struct btrfs_chunk *chunk)
{
        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
        struct map_lookup *map;
        struct extent_map *em;
        u64 logical;
        u64 length;
        u64 devid;
        u8 uuid[BTRFS_UUID_SIZE];
        int num_stripes;
        int ret;
        int i;

        logical = key->offset;
        length = btrfs_chunk_length(leaf, chunk);

        read_lock(&map_tree->map_tree.lock);
        em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
        read_unlock(&map_tree->map_tree.lock);

        /* already mapped? */
        if (em && em->start <= logical && em->start + em->len > logical) {
                free_extent_map(em);
                return 0;
        } else if (em) {
                free_extent_map(em);
        }

        em = alloc_extent_map();
        if (!em)
                return -ENOMEM;
        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
        if (!map) {
                free_extent_map(em);
                return -ENOMEM;
        }

        em->bdev = (struct block_device *)map;
        em->start = logical;
        em->len = length;
        em->block_start = 0;
        em->block_len = em->len;

        map->num_stripes = num_stripes;
        map->io_width = btrfs_chunk_io_width(leaf, chunk);
        map->io_align = btrfs_chunk_io_align(leaf, chunk);
        map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
        map->type = btrfs_chunk_type(leaf, chunk);
        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
        for (i = 0; i < num_stripes; i++) {
                map->stripes[i].physical =
                        btrfs_stripe_offset_nr(leaf, chunk, i);
                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
                read_extent_buffer(leaf, uuid, (unsigned long)
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
                map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
                                                        NULL);
                if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
                        kfree(map);
                        free_extent_map(em);
                        return -EIO;
                }
                if (!map->stripes[i].dev) {
                        map->stripes[i].dev =
                                add_missing_dev(root, devid, uuid);
                        if (!map->stripes[i].dev) {
                                kfree(map);
                                free_extent_map(em);
                                return -EIO;
                        }
                }
                map->stripes[i].dev->in_fs_metadata = 1;
        }

        write_lock(&map_tree->map_tree.lock);
        ret = add_extent_mapping(&map_tree->map_tree, em);
        write_unlock(&map_tree->map_tree.lock);
        BUG_ON(ret); /* Tree corruption */
        free_extent_map(em);

        return 0;
}

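/*
 * Copy the on-disk fields of a device item into the in-memory
 * btrfs_device structure.
 */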
static void fill_device_from_item(struct extent_buffer *leaf,
                                  struct btrfs_dev_item *dev_item,
                                  struct btrfs_device *device)
{
        unsigned long ptr;

        device->devid = btrfs_device_id(leaf, dev_item);
        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
        device->total_bytes = device->disk_total_bytes;
        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
        device->type = btrfs_device_type(leaf, dev_item);
        device->io_align = btrfs_device_io_align(leaf, dev_item);
        device->io_width = btrfs_device_io_width(leaf, dev_item);
        device->sector_size = btrfs_device_sector_size(leaf, dev_item);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

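/*
 * Make sure the seed filesystem with the given fsid is opened and
 * hooked into this filesystem's fs_devices->seed chain.  Returns 0 if
 * it is already in the chain or could be opened, -ENOENT if no such
 * fsid was ever scanned, and -EINVAL if the devices found are not a
 * seed filesystem.
 */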
static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;
        int ret;

        BUG_ON(!mutex_is_locked(&uuid_mutex));

        fs_devices = root->fs_info->fs_devices->seed;
        while (fs_devices) {
                if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
                        ret = 0;
                        goto out;
                }
                fs_devices = fs_devices->seed;
        }

        fs_devices = find_fsid(fsid);
        if (!fs_devices) {
                ret = -ENOENT;
                goto out;
        }

        fs_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(fs_devices)) {
                ret = PTR_ERR(fs_devices);
                goto out;
        }

        ret = __btrfs_open_devices(fs_devices, FMODE_READ,
                                   root->fs_info->bdev_holder);
        if (ret) {
                free_fs_devices(fs_devices);
                goto out;
        }

        if (!fs_devices->seeding) {
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
                ret = -EINVAL;
                goto out;
        }

        fs_devices->seed = root->fs_info->fs_devices->seed;
        root->fs_info->fs_devices->seed = fs_devices;
out:
        return ret;
}

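/*
 * Read one device item from the chunk tree and bring the matching
 * in-memory device up to date.  Devices that belong to a foreign fsid
 * are looked up via their seed filesystem; devices that cannot be
 * found or opened are treated as missing on a degraded mount.
 */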
static int read_one_dev(struct btrfs_root *root,
                        struct extent_buffer *leaf,
                        struct btrfs_dev_item *dev_item)
{
        struct btrfs_device *device;
        u64 devid;
        int ret;
        u8 fs_uuid[BTRFS_UUID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];

        devid = btrfs_device_id(leaf, dev_item);
        read_extent_buffer(leaf, dev_uuid,
                           (unsigned long)btrfs_device_uuid(dev_item),
                           BTRFS_UUID_SIZE);
        read_extent_buffer(leaf, fs_uuid,
                           (unsigned long)btrfs_device_fsid(dev_item),
                           BTRFS_UUID_SIZE);

        if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
                ret = open_seed_devices(root, fs_uuid);
                if (ret && !btrfs_test_opt(root, DEGRADED))
                        return ret;
        }

        device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
        if (!device || !device->bdev) {
                if (!btrfs_test_opt(root, DEGRADED))
                        return -EIO;

                if (!device) {
                        printk(KERN_WARNING "btrfs: devid %llu missing\n",
                               (unsigned long long)devid);
                        device = add_missing_dev(root, devid, dev_uuid);
                        if (!device)
                                return -ENOMEM;
                } else if (!device->missing) {
                        /*
                         * this happens when a device that was properly set
                         * up in the device info lists suddenly goes bad.
                         * device->bdev is NULL, and so we have to set
                         * device->missing to one here
                         */
                        root->fs_info->fs_devices->missing_devices++;
                        device->missing = 1;
                }
        }

        if (device->fs_devices != root->fs_info->fs_devices) {
                BUG_ON(device->writeable);
                if (device->generation !=
                    btrfs_device_generation(leaf, dev_item))
                        return -EINVAL;
        }

        fill_device_from_item(leaf, dev_item, device);
        device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
        if (device->writeable) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += device->total_bytes -
                        device->bytes_used;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }

        return 0;
}

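/*
 * Walk the sys_chunk_array embedded in the super block and map all of
 * the system chunks it describes.  This has to happen before the
 * chunk tree itself can be read.
 */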
int btrfs_read_sys_array(struct btrfs_root *root)
{
        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
        u8 *ptr;
        unsigned long sb_ptr;
        int ret = 0;
        u32 num_stripes;
        u32 array_size;
        u32 len = 0;
        u32 cur;
        struct btrfs_key key;

        sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
                                          BTRFS_SUPER_INFO_SIZE);
        if (!sb)
                return -ENOMEM;
        btrfs_set_buffer_uptodate(sb);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
        /*
         * The sb extent buffer is artificial and just used to read the
         * system array.  btrfs_set_buffer_uptodate() does not properly
         * mark all of its pages up-to-date when the page is larger: the
         * extent does not cover the whole page and consequently
         * check_page_uptodate does not find all the page's extents
         * up-to-date (the hole beyond sb), and write_extent_buffer then
         * triggers a WARN_ON.
         *
         * Regular short extents go through the
         * mark_extent_buffer_dirty/writeback cycle, but sb spans only
         * this function.  Add an explicit SetPageUptodate call to
         * silence the warning, e.g. on PowerPC 64.
         */
        if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
                SetPageUptodate(sb->pages[0]);

        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);

        ptr = super_copy->sys_chunk_array;
        sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
        cur = 0;

        while (cur < array_size) {
                disk_key = (struct btrfs_disk_key *)ptr;
                btrfs_disk_key_to_cpu(&key, disk_key);

                len = sizeof(*disk_key);
                ptr += len;
                sb_ptr += len;
                cur += len;

                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
                        chunk = (struct btrfs_chunk *)sb_ptr;
                        ret = read_one_chunk(root, &key, sb, chunk);
                        if (ret)
                                break;
                        num_stripes = btrfs_chunk_num_stripes(sb, chunk);
                        len = btrfs_chunk_item_size(num_stripes);
                } else {
                        ret = -EIO;
                        break;
                }
                ptr += len;
                sb_ptr += len;
                cur += len;
        }
        free_extent_buffer(sb);
        return ret;
}

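/*
 * Map a logical address to the device that holds the given mirror of
 * it, or return NULL if the block mapping fails.
 */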
struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
                                                   u64 logical, int mirror_num)
{
        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
        int ret;
        u64 map_length = 0;
        struct btrfs_bio *bbio = NULL;
        struct btrfs_device *device;

        BUG_ON(mirror_num == 0);
        ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
                              mirror_num);
        if (ret) {
                BUG_ON(bbio != NULL);
                return NULL;
        }
        BUG_ON(mirror_num != bbio->mirror_num);
        device = bbio->stripes[mirror_num - 1].dev;
        kfree(bbio);
        return device;
}

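/*
 * Read all device items and chunk items from the chunk tree at mount
 * time, populating the device list and the chunk mapping tree.
 */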
int btrfs_read_chunk_tree(struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int ret;
        int slot;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        mutex_lock(&uuid_mutex);
        lock_chunks(root);

        /*
         * first we search for all of the device items, and then we
         * read in all of the chunk items.  This way we can create chunk
         * mappings that reference all of the devices that are found.
         */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = 0;
again:
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
        while (1) {
                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto error;
                        break;
                }
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
                        if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
                                break;
                        if (found_key.type == BTRFS_DEV_ITEM_KEY) {
                                struct btrfs_dev_item *dev_item;
                                dev_item = btrfs_item_ptr(leaf, slot,
                                                  struct btrfs_dev_item);
                                ret = read_one_dev(root, leaf, dev_item);
                                if (ret)
                                        goto error;
                        }
                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
                        struct btrfs_chunk *chunk;
                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
                        ret = read_one_chunk(root, &found_key, leaf, chunk);
                        if (ret)
                                goto error;
                }
                path->slots[0]++;
        }
        if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
                key.objectid = 0;
                btrfs_release_path(path);
                goto again;
        }
        ret = 0;
error:
        unlock_chunks(root);
        mutex_unlock(&uuid_mutex);

        btrfs_free_path(path);
        return ret;
}

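/* Zero all of the error counters of a single device. */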
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
        int i;

        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                btrfs_dev_stat_reset(dev, i);
}

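/*
 * Load the persistent error counters for every device from the device
 * tree at mount time.  A device without a dev_stats item (e.g. right
 * after mkfs) simply starts out with zeroed counters.
 */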
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct extent_buffer *eb;
        int slot;
        int ret = 0;
        struct btrfs_device *device;
        struct btrfs_path *path = NULL;
        int i;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                int item_size;
                struct btrfs_dev_stats_item *ptr;

                key.objectid = 0;
                key.type = BTRFS_DEV_STATS_KEY;
                key.offset = device->devid;
                ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
                if (ret) {
                        printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
                               device->name, (unsigned long long)device->devid);
                        __btrfs_reset_dev_stats(device);
                        device->dev_stats_valid = 1;
                        btrfs_release_path(path);
                        continue;
                }
                slot = path->slots[0];
                eb = path->nodes[0];
                btrfs_item_key_to_cpu(eb, &found_key, slot);
                item_size = btrfs_item_size_nr(eb, slot);

                ptr = btrfs_item_ptr(eb, slot,
                                     struct btrfs_dev_stats_item);

                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
                        if (item_size >= (1 + i) * sizeof(__le64))
                                btrfs_dev_stat_set(device, i,
                                        btrfs_dev_stats_value(eb, ptr, i));
                        else
                                btrfs_dev_stat_reset(device, i);
                }

                device->dev_stats_valid = 1;
                btrfs_dev_stat_print_on_load(device);
                btrfs_release_path(path);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

out:
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}

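/*
 * Write the in-memory error counters of one device back into its
 * dev_stats item, inserting the item (or replacing one that is too
 * small) if necessary.
 */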
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
                                struct btrfs_root *dev_root,
                                struct btrfs_device *device)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *eb;
        struct btrfs_dev_stats_item *ptr;
        int ret;
        int i;

        key.objectid = 0;
        key.type = BTRFS_DEV_STATS_KEY;
        key.offset = device->devid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
        if (ret < 0) {
                printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
                       ret, device->name);
                goto out;
        }

        if (ret == 0 &&
            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
                /* need to delete old one and insert a new one */
                ret = btrfs_del_item(trans, dev_root, path);
                if (ret != 0) {
                        printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
                               device->name, ret);
                        goto out;
                }
                ret = 1;
        }

        if (ret == 1) {
                /* need to insert a new item */
                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, dev_root, path,
                                              &key, sizeof(*ptr));
                if (ret < 0) {
                        printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
                               device->name, ret);
                        goto out;
                }
        }

        eb = path->nodes[0];
        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                btrfs_set_dev_stats_value(eb, ptr, i,
                                          btrfs_dev_stat_read(device, i));
        btrfs_mark_buffer_dirty(eb);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        int ret = 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (!device->dev_stats_valid || !device->dev_stats_dirty)
                        continue;

                ret = update_dev_stat_item(trans, dev_root, device);
                if (!ret)
                        device->dev_stats_dirty = 0;
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

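/* Bump one error counter and emit the ratelimited error summary. */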
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        btrfs_dev_stat_inc(dev, index);
        btrfs_dev_stat_print_on_error(dev);
}

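/*
 * Print the current error counters of a device after an error has been
 * counted, ratelimited to avoid flooding the log.
 */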
void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
        if (!dev->dev_stats_valid)
                return;
        printk_ratelimited(KERN_ERR
                           "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
                           dev->name,
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
                           btrfs_dev_stat_read(dev,
                                               BTRFS_DEV_STAT_CORRUPTION_ERRS),
                           btrfs_dev_stat_read(dev,
                                               BTRFS_DEV_STAT_GENERATION_ERRS));
}

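/*
 * Print the error counters of a device once its dev_stats item has
 * been read at mount time.
 */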
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
        printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
               dev->name,
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

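/*
 * Copy the error counters of one device into the ioctl result buffer,
 * optionally resetting them after the read.
 */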
int btrfs_get_dev_stats(struct btrfs_root *root,
                        struct btrfs_ioctl_get_dev_stats *stats,
                        int reset_after_read)
{
        struct btrfs_device *dev;
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        int i;

        mutex_lock(&fs_devices->device_list_mutex);
        dev = btrfs_find_device(root, stats->devid, NULL, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);

        if (!dev) {
                printk(KERN_WARNING
                       "btrfs: get dev_stats failed, device not found\n");
                return -ENODEV;
        } else if (!dev->dev_stats_valid) {
                printk(KERN_WARNING
                       "btrfs: get dev_stats failed, not yet valid\n");
                return -ENODEV;
        } else if (reset_after_read) {
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
                        if (stats->nr_items > i)
                                stats->values[i] =
                                        btrfs_dev_stat_read_and_reset(dev, i);
                        else
                                btrfs_dev_stat_reset(dev, i);
                }
        } else {
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                        if (stats->nr_items > i)
                                stats->values[i] = btrfs_dev_stat_read(dev, i);
        }
        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
        return 0;
}