btrfs: use free_device where opencoded
fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

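/*
 * Allocation constraints and layout parameters for each RAID profile:
 * stripe sub-grouping, stripes per device, device count limits, how many
 * device failures the profile tolerates and the number of copies kept.
 */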
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid: if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

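/*
 * Release a btrfs_device: free its rcu-protected name string, drop the
 * preallocated flush bio and free the structure itself.
 */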
static void free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		free_device(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	spin_lock_init(&dev->reada_lock);
	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
					u64 devid, const u8 *uuid)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

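/* Find the registered fs_devices for the given filesystem @fsid, or NULL. */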
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

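/*
 * Open the block device at @device_path with @flags, optionally flush dirty
 * pages first, set the btrfs block size and read the primary superblock
 * into @bh.
 */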
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

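/* Splice the bio list @head..@tail back onto the front of @pending_bios. */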
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device. This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block. The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested. This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device. We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held). But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop. Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop. So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

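/*
 * Walk the registered, unopened, non-seeding filesystems and drop any stale
 * device entry whose path matches @cur_dev, freeing the whole fs_devices if
 * that was its last device.
 */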
static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
{
	struct btrfs_fs_devices *fs_devs;
	struct btrfs_device *dev;

	if (!cur_dev->name)
		return;

	list_for_each_entry(fs_devs, &fs_uuids, list) {
		int del = 1;

		if (fs_devs->opened)
			continue;
		if (fs_devs->seeding)
			continue;

		list_for_each_entry(dev, &fs_devs->devices, dev_list) {

			if (dev == cur_dev)
				continue;
			if (!dev->name)
				continue;

			/*
			 * Todo: This won't be enough. What if the same device
			 * comes back (with a new uuid) via its mapper path?
			 * But for now this does help, as mostly an admin will
			 * use either the mapper or the non-mapper path
			 * throughout.
			 */
			rcu_read_lock();
			del = strcmp(rcu_str_deref(dev->name),
				     rcu_str_deref(cur_dev->name));
			rcu_read_unlock();
			if (!del)
				break;
		}

		if (!del) {
			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				free_device(dev);
			}
			break;
		}
	}
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	int ret = 0;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				     disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return PTR_ERR(device);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			free_device(device);
			return -ENOMEM;
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		ret = 1;
		device->fs_devices = fs_devices;
	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When the FS is already mounted:
		 * 1. If you are here and device->name is NULL, that means this
		 *    device was missing at the time of FS mount.
		 * 2. If you are here and device->name is different from
		 *    'path', that means either
		 *      a. The same device disappeared and reappeared with a
		 *         different name, or
		 *      b. The missing-disk-which-was-replaced has reappeared
		 *         now.
		 *
		 * We must allow 1 and 2a above, but 2b would be spurious and
		 * unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path' would
		 * have missed some transactions when it was away, and in case
		 * of 2a the stale bdev has to be updated as well. 2b must not
		 * be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and if you are
			 * here, that means there is more than one disk with
			 * the same uuid and devid. We keep the one with the
			 * larger generation number or the last-in if the
			 * generations are equal.
			 */
			return -EEXIST;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with the largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	/*
	 * if there is new btrfs on an already registered device,
	 * then remove the stale device entry.
	 */
	if (ret > 0)
		btrfs_free_stale_device(device);

	*fs_devices_ret = fs_devices;

	return ret;
}

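/*
 * Clone @orig into a newly allocated fs_devices, duplicating each member
 * device's devid, uuid and name; the new copy is not opened.
 */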
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
						 GFP_KERNEL);
			if (!name) {
				free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata) {
			if (!device->is_tgtdev_for_dev_replace &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || device->is_tgtdev_for_dev_replace) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			if (!device->is_tgtdev_for_dev_replace)
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (device->bdev && device->writeable) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);
}

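/*
 * Drop @device from the fs_devices counters and replace its list entry with
 * a freshly allocated placeholder carrying the same devid, uuid and name, so
 * that RCU readers still see a valid device on the list.
 */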
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (device->writeable &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->missing)
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() uses the device_list_mutex, and a call to
	 * blkdev_put() can sometimes lead the vfs back into this function.
	 * So do the put outside of device_list_mutex, as of now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
					  struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		/* Just open everything we can; ignore failures here */
		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
					  &bdev, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q))
			device->can_discard = 1;
		if (!blk_queue_nonrot(q))
			fs_devices->rotating = 1;

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		fs_devices->open_devices++;
		if (device->writeable &&
		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
		blkdev_put(bdev, flags);
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				    index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
	    (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct page *page;
	int ret = -EINVAL;
	u64 devid;
	u64 transid;
	u64 total_devices;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
		goto error_bdev_put;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	total_devices = btrfs_super_num_devices(disk_super);

	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
	if (ret > 0) {
		if (disk_super->label[0]) {
			pr_info("BTRFS: device label %s ", disk_super->label);
		} else {
			pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
		}

		pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
		ret = 0;
	}
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

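/*
 * Check whether the byte range [*start, *start + len) on @device overlaps
 * any chunk pending in @transaction or pinned in the filesystem. If it does,
 * advance *start past the conflicting stripes and return 1.
 */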
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}

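/*
 * Find the dev extent item in the device tree that covers @start on @device,
 * report its length in @dev_extent_len and delete the item.
 */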
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	WARN_ON(device->is_tgtdev_for_dev_replace);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

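/*
 * Return the logical offset right after the highest mapped chunk, i.e. the
 * start that the next allocated chunk can use.
 */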
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

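/*
 * Remove the dev item of @device from the chunk tree, running the deletion
 * in its own transaction.
 */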
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

3cc31a0d
DS
1764/*
1765 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1766 * filesystem. It's up to the caller to adjust that number regarding eg. device
1767 * replace.
1768 */
1769static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1770 u64 num_devices)
a061fc8d 1771{
a061fc8d 1772 u64 all_avail;
de98ced9 1773 unsigned seq;
418775a2 1774 int i;
a061fc8d 1775
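	/*
	 * Sample the allocated profile bits under the profiles seqlock so
	 * we see a consistent snapshot even if an allocation changes the
	 * avail_*_alloc_bits concurrently.
	 */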
de98ced9 1776 do {
bd45ffbc 1777 seq = read_seqbegin(&fs_info->profiles_lock);
de98ced9 1778
bd45ffbc
AJ
1779 all_avail = fs_info->avail_data_alloc_bits |
1780 fs_info->avail_system_alloc_bits |
1781 fs_info->avail_metadata_alloc_bits;
1782 } while (read_seqretry(&fs_info->profiles_lock, seq));
a061fc8d 1783
418775a2
DS
1784 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1785 if (!(all_avail & btrfs_raid_group[i]))
1786 continue;
a061fc8d 1787
418775a2
DS
1788 if (num_devices < btrfs_raid_array[i].devs_min) {
1789 int ret = btrfs_raid_mindev_error[i];
bd45ffbc 1790
418775a2
DS
1791 if (ret)
1792 return ret;
1793 }
53b381b3
DW
1794 }
1795
bd45ffbc 1796 return 0;
f1fa7f26
AJ
1797}
1798
c9162bdf
OS
1799static struct btrfs_device * btrfs_find_next_active_device(
1800 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
a061fc8d 1801{
2b82032c 1802 struct btrfs_device *next_device;
88acff64
AJ
1803
1804 list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1805 if (next_device != device &&
1806 !next_device->missing && next_device->bdev)
1807 return next_device;
1808 }
1809
1810 return NULL;
1811}
1812
1813/*
1814 * Helper function to check if the given device is part of s_bdev / latest_bdev
1815 * and replace it with the provided or the next active device. In the context
1816 * where this function is called, there should always be another device (or
1817 * this_dev) which is active.
1818 */
1819void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1820 struct btrfs_device *device, struct btrfs_device *this_dev)
1821{
1822 struct btrfs_device *next_device;
1823
1824 if (this_dev)
1825 next_device = this_dev;
1826 else
1827 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1828 device);
1829 ASSERT(next_device);
1830
1831 if (fs_info->sb->s_bdev &&
1832 (fs_info->sb->s_bdev == device->bdev))
1833 fs_info->sb->s_bdev = next_device->bdev;
1834
1835 if (fs_info->fs_devices->latest_bdev == device->bdev)
1836 fs_info->fs_devices->latest_bdev = next_device->bdev;
1837}
1838
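/*
 * Remove a device from a mounted filesystem: check that the remaining
 * devices still satisfy the RAID profile constraints, shrink the device
 * to zero bytes, delete its device item, detach it from the in-memory
 * lists and, for a writeable device, scratch its superblock copies.
 */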
da353f6b
DS
1839int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1840 u64 devid)
f1fa7f26
AJ
1841{
1842 struct btrfs_device *device;
1f78160c 1843 struct btrfs_fs_devices *cur_devices;
2b82032c 1844 u64 num_devices;
a061fc8d
CM
1845 int ret = 0;
1846
2c997384 1847 mutex_lock(&fs_info->volume_mutex);
a061fc8d
CM
1848 mutex_lock(&uuid_mutex);
1849
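	/*
	 * A running device replace temporarily counts the target device
	 * in num_devices, so don't hold it against the RAID constraints.
	 */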
0b246afa
JM
1850 num_devices = fs_info->fs_devices->num_devices;
1851 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
1852 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
8dabb742
SB
1853 WARN_ON(num_devices < 1);
1854 num_devices--;
1855 }
0b246afa 1856 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
8dabb742 1857
0b246afa 1858 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
f1fa7f26 1859 if (ret)
a061fc8d 1860 goto out;
a061fc8d 1861
2ff7e61e
JM
1862 ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1863 &device);
24fc572f 1864 if (ret)
53b381b3 1865 goto out;
dfe25020 1866
63a212ab 1867 if (device->is_tgtdev_for_dev_replace) {
183860f6 1868 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
24fc572f 1869 goto out;
63a212ab
SB
1870 }
1871
0b246afa 1872 if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
183860f6 1873 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
24fc572f 1874 goto out;
2b82032c
YZ
1875 }
1876
1877 if (device->writeable) {
34441361 1878 mutex_lock(&fs_info->chunk_mutex);
2b82032c 1879 list_del_init(&device->dev_alloc_list);
c3929c36 1880 device->fs_devices->rw_devices--;
34441361 1881 mutex_unlock(&fs_info->chunk_mutex);
dfe25020 1882 }
a061fc8d 1883
d7901554 1884 mutex_unlock(&uuid_mutex);
a061fc8d 1885 ret = btrfs_shrink_device(device, 0);
d7901554 1886 mutex_lock(&uuid_mutex);
a061fc8d 1887 if (ret)
9b3517e9 1888 goto error_undo;
a061fc8d 1889
63a212ab
SB
1890 /*
1891 * TODO: the superblock still includes this device in its num_devices
1892 * counter although write_all_supers() is not locked out. This
1893 * could give a filesystem state which requires a degraded mount.
1894 */
0b246afa 1895 ret = btrfs_rm_dev_item(fs_info, device);
a061fc8d 1896 if (ret)
9b3517e9 1897 goto error_undo;
a061fc8d 1898
2b82032c 1899 device->in_fs_metadata = 0;
0b246afa 1900 btrfs_scrub_cancel_dev(fs_info, device);
e5e9a520
CM
1901
1902 /*
1903 * the device list mutex makes sure that we don't change
1904 * the device list while someone else is writing out all
d7306801
FDBM
 1905 * the device supers. Whoever is writing all supers should
1906 * lock the device list mutex before getting the number of
1907 * devices in the super block (super_copy). Conversely,
1908 * whoever updates the number of devices in the super block
1909 * (super_copy) should hold the device list mutex.
e5e9a520 1910 */
1f78160c
XG
1911
1912 cur_devices = device->fs_devices;
0b246afa 1913 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1f78160c 1914 list_del_rcu(&device->dev_list);
e5e9a520 1915
e4404d6e 1916 device->fs_devices->num_devices--;
02db0844 1917 device->fs_devices->total_devices--;
2b82032c 1918
cd02dca5 1919 if (device->missing)
3a7d55c8 1920 device->fs_devices->missing_devices--;
cd02dca5 1921
0b246afa 1922 btrfs_assign_next_active_device(fs_info, device, NULL);
2b82032c 1923
0bfaa9c5 1924 if (device->bdev) {
e4404d6e 1925 device->fs_devices->open_devices--;
0bfaa9c5 1926 /* remove sysfs entry */
0b246afa 1927 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
0bfaa9c5 1928 }
99994cde 1929
0b246afa
JM
1930 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
1931 btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
1932 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2b82032c 1933
cea67ab9
JM
1934 /*
1935 * at this point, the device is zero sized and detached from
1936 * the devices list. All that's left is to zero out the old
1937 * supers and free the device.
1938 */
1939 if (device->writeable)
1940 btrfs_scratch_superblocks(device->bdev, device->name->str);
1941
1942 btrfs_close_bdev(device);
f06c5965 1943 call_rcu(&device->rcu, free_device_rcu);
cea67ab9 1944
1f78160c 1945 if (cur_devices->open_devices == 0) {
e4404d6e 1946 struct btrfs_fs_devices *fs_devices;
0b246afa 1947 fs_devices = fs_info->fs_devices;
e4404d6e 1948 while (fs_devices) {
8321cf25
RS
1949 if (fs_devices->seed == cur_devices) {
1950 fs_devices->seed = cur_devices->seed;
e4404d6e 1951 break;
8321cf25 1952 }
e4404d6e 1953 fs_devices = fs_devices->seed;
2b82032c 1954 }
1f78160c 1955 cur_devices->seed = NULL;
1f78160c 1956 __btrfs_close_devices(cur_devices);
1f78160c 1957 free_fs_devices(cur_devices);
2b82032c
YZ
1958 }
1959
a061fc8d
CM
1960out:
1961 mutex_unlock(&uuid_mutex);
2c997384 1962 mutex_unlock(&fs_info->volume_mutex);
a061fc8d 1963 return ret;
24fc572f 1964
9b3517e9
ID
1965error_undo:
1966 if (device->writeable) {
34441361 1967 mutex_lock(&fs_info->chunk_mutex);
9b3517e9 1968 list_add(&device->dev_alloc_list,
0b246afa 1969 &fs_info->fs_devices->alloc_list);
c3929c36 1970 device->fs_devices->rw_devices++;
34441361 1971 mutex_unlock(&fs_info->chunk_mutex);
9b3517e9 1972 }
24fc572f 1973 goto out;
a061fc8d
CM
1974}
1975
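/*
 * Detach the replace source device from the in-memory lists and adjust
 * the per-fs_devices counters. The caller must hold device_list_mutex;
 * the device itself is freed later by btrfs_rm_dev_replace_free_srcdev().
 */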
084b6e7c
QW
1976void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1977 struct btrfs_device *srcdev)
e93c89c1 1978{
d51908ce
AJ
1979 struct btrfs_fs_devices *fs_devices;
1980
e93c89c1 1981 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1357272f 1982
25e8e911
AJ
1983 /*
 1984 * In case of an fs with no seed, srcdev->fs_devices will point
 1985 * to the fs_devices of fs_info. However, when the device being replaced
 1986 * is a seed device, it will point to the seed's local fs_devices. In
 1987 * short, srcdev will have its correct fs_devices in both cases.
1988 */
1989 fs_devices = srcdev->fs_devices;
d51908ce 1990
e93c89c1 1991 list_del_rcu(&srcdev->dev_list);
619c47f3 1992 list_del(&srcdev->dev_alloc_list);
d51908ce 1993 fs_devices->num_devices--;
82372bc8 1994 if (srcdev->missing)
d51908ce 1995 fs_devices->missing_devices--;
e93c89c1 1996
48b3b9d4 1997 if (srcdev->writeable)
82372bc8 1998 fs_devices->rw_devices--;
1357272f 1999
82372bc8 2000 if (srcdev->bdev)
d51908ce 2001 fs_devices->open_devices--;
084b6e7c
QW
2002}
2003
2004void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2005 struct btrfs_device *srcdev)
2006{
2007 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
e93c89c1 2008
48b3b9d4
AJ
2009 if (srcdev->writeable) {
2010 /* zero out the old super if it is writable */
2011 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2012 }
14238819
AJ
2013
2014 btrfs_close_bdev(srcdev);
f06c5965 2015 call_rcu(&srcdev->rcu, free_device_rcu);
94d5f0c2 2016
94d5f0c2
AJ
 2017 /* if there are no devices left, delete the fs_devices */
2018 if (!fs_devices->num_devices) {
2019 struct btrfs_fs_devices *tmp_fs_devices;
2020
6dd38f81
AJ
2021 /*
2022 * On a mounted FS, num_devices can't be zero unless it's a
2023 * seed. In case of a seed device being replaced, the replace
 2024 * target is added to the sprout FS, so there will be no more
 2025 * devices left under the seed FS.
2026 */
2027 ASSERT(fs_devices->seeding);
2028
94d5f0c2
AJ
2029 tmp_fs_devices = fs_info->fs_devices;
2030 while (tmp_fs_devices) {
2031 if (tmp_fs_devices->seed == fs_devices) {
2032 tmp_fs_devices->seed = fs_devices->seed;
2033 break;
2034 }
2035 tmp_fs_devices = tmp_fs_devices->seed;
2036 }
2037 fs_devices->seed = NULL;
8bef8401
AJ
2038 __btrfs_close_devices(fs_devices);
2039 free_fs_devices(fs_devices);
94d5f0c2 2040 }
e93c89c1
SB
2041}
2042
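/*
 * Tear down the replace target device: remove its sysfs link, drop it
 * from the device lists, hand off s_bdev/latest_bdev if needed, then
 * scratch its superblocks and free it via RCU.
 */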
2043void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2044 struct btrfs_device *tgtdev)
2045{
67a2c45e 2046 mutex_lock(&uuid_mutex);
e93c89c1
SB
2047 WARN_ON(!tgtdev);
2048 mutex_lock(&fs_info->fs_devices->device_list_mutex);
d2ff1b20 2049
32576040 2050 btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
d2ff1b20 2051
779bf3fe 2052 if (tgtdev->bdev)
e93c89c1 2053 fs_info->fs_devices->open_devices--;
779bf3fe 2054
e93c89c1 2055 fs_info->fs_devices->num_devices--;
e93c89c1 2056
88acff64 2057 btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
e93c89c1 2058
e93c89c1 2059 list_del_rcu(&tgtdev->dev_list);
e93c89c1
SB
2060
2061 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
67a2c45e 2062 mutex_unlock(&uuid_mutex);
779bf3fe
AJ
2063
2064 /*
 2065 * The update_dev_time() within btrfs_scratch_superblocks()
 2066 * may lead to a call to btrfs_show_devname(), which will try
 2067 * to hold device_list_mutex. Here this device
 2068 * is already out of the device list, so we don't have to hold
2069 * the device_list_mutex lock.
2070 */
2071 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
14238819
AJ
2072
2073 btrfs_close_bdev(tgtdev);
f06c5965 2074 call_rcu(&tgtdev->rcu, free_device_rcu);
e93c89c1
SB
2075}
2076
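/*
 * Open the block device at @device_path, read its superblock and look
 * the device up by the devid/uuid/fsid recorded there.
 */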
2ff7e61e 2077static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
da353f6b 2078 const char *device_path,
48a3b636 2079 struct btrfs_device **device)
7ba15b7d
SB
2080{
2081 int ret = 0;
2082 struct btrfs_super_block *disk_super;
2083 u64 devid;
2084 u8 *dev_uuid;
2085 struct block_device *bdev;
2086 struct buffer_head *bh;
2087
2088 *device = NULL;
2089 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
0b246afa 2090 fs_info->bdev_holder, 0, &bdev, &bh);
7ba15b7d
SB
2091 if (ret)
2092 return ret;
2093 disk_super = (struct btrfs_super_block *)bh->b_data;
2094 devid = btrfs_stack_device_id(&disk_super->dev_item);
2095 dev_uuid = disk_super->dev_item.uuid;
0b246afa 2096 *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
7ba15b7d
SB
2097 brelse(bh);
2098 if (!*device)
2099 ret = -ENOENT;
2100 blkdev_put(bdev, FMODE_READ);
2101 return ret;
2102}
2103
2ff7e61e 2104int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
da353f6b 2105 const char *device_path,
7ba15b7d
SB
2106 struct btrfs_device **device)
2107{
2108 *device = NULL;
2109 if (strcmp(device_path, "missing") == 0) {
2110 struct list_head *devices;
2111 struct btrfs_device *tmp;
2112
0b246afa 2113 devices = &fs_info->fs_devices->devices;
7ba15b7d
SB
2114 /*
2115 * It is safe to read the devices since the volume_mutex
2116 * is held by the caller.
2117 */
2118 list_for_each_entry(tmp, devices, dev_list) {
2119 if (tmp->in_fs_metadata && !tmp->bdev) {
2120 *device = tmp;
2121 break;
2122 }
2123 }
2124
d74a6259
AJ
2125 if (!*device)
2126 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
7ba15b7d
SB
2127
2128 return 0;
2129 } else {
2ff7e61e 2130 return btrfs_find_device_by_path(fs_info, device_path, device);
7ba15b7d
SB
2131 }
2132}
2133
5c5c0df0
DS
2134/*
2135 * Lookup a device given by device id, or the path if the id is 0.
2136 */
2ff7e61e 2137int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
da353f6b
DS
2138 const char *devpath,
2139 struct btrfs_device **device)
24e0474b
AJ
2140{
2141 int ret;
2142
5c5c0df0 2143 if (devid) {
24e0474b 2144 ret = 0;
0b246afa 2145 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
24e0474b
AJ
2146 if (!*device)
2147 ret = -ENOENT;
2148 } else {
5c5c0df0 2149 if (!devpath || !devpath[0])
b3d1b153
AJ
2150 return -EINVAL;
2151
2ff7e61e 2152 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
24e0474b
AJ
2153 device);
2154 }
2155 return ret;
2156}
2157
2b82032c
YZ
2158/*
 2159 * Does all the dirty work required for changing the file system's UUID.
2160 */
2ff7e61e 2161static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2b82032c 2162{
0b246afa 2163 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2b82032c 2164 struct btrfs_fs_devices *old_devices;
e4404d6e 2165 struct btrfs_fs_devices *seed_devices;
0b246afa 2166 struct btrfs_super_block *disk_super = fs_info->super_copy;
2b82032c
YZ
2167 struct btrfs_device *device;
2168 u64 super_flags;
2169
2170 BUG_ON(!mutex_is_locked(&uuid_mutex));
e4404d6e 2171 if (!fs_devices->seeding)
2b82032c
YZ
2172 return -EINVAL;
2173
2dfeca9b 2174 seed_devices = alloc_fs_devices(NULL);
2208a378
ID
2175 if (IS_ERR(seed_devices))
2176 return PTR_ERR(seed_devices);
2b82032c 2177
e4404d6e
YZ
2178 old_devices = clone_fs_devices(fs_devices);
2179 if (IS_ERR(old_devices)) {
2180 kfree(seed_devices);
2181 return PTR_ERR(old_devices);
2b82032c 2182 }
e4404d6e 2183
2b82032c
YZ
2184 list_add(&old_devices->list, &fs_uuids);
2185
e4404d6e
YZ
2186 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2187 seed_devices->opened = 1;
2188 INIT_LIST_HEAD(&seed_devices->devices);
2189 INIT_LIST_HEAD(&seed_devices->alloc_list);
e5e9a520 2190 mutex_init(&seed_devices->device_list_mutex);
c9513edb 2191
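	/*
	 * Move the current devices over to the new seed fs_devices; the
	 * mounted filesystem continues with an empty device list that the
	 * caller repopulates with the sprout device.
	 */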
0b246afa 2192 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1f78160c
XG
2193 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2194 synchronize_rcu);
2196d6e8
MX
2195 list_for_each_entry(device, &seed_devices->devices, dev_list)
2196 device->fs_devices = seed_devices;
c9513edb 2197
34441361 2198 mutex_lock(&fs_info->chunk_mutex);
e4404d6e 2199 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
34441361 2200 mutex_unlock(&fs_info->chunk_mutex);
e4404d6e 2201
2b82032c
YZ
2202 fs_devices->seeding = 0;
2203 fs_devices->num_devices = 0;
2204 fs_devices->open_devices = 0;
69611ac8 2205 fs_devices->missing_devices = 0;
69611ac8 2206 fs_devices->rotating = 0;
e4404d6e 2207 fs_devices->seed = seed_devices;
2b82032c
YZ
2208
2209 generate_random_uuid(fs_devices->fsid);
0b246afa 2210 memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2b82032c 2211 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
0b246afa 2212 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
f7171750 2213
2b82032c
YZ
2214 super_flags = btrfs_super_flags(disk_super) &
2215 ~BTRFS_SUPER_FLAG_SEEDING;
2216 btrfs_set_super_flags(disk_super, super_flags);
2217
2218 return 0;
2219}
2220
2221/*
01327610 2222 * Store the expected generation for seed devices in device items.
2b82032c
YZ
2223 */
2224static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
5b4aacef 2225 struct btrfs_fs_info *fs_info)
2b82032c 2226{
5b4aacef 2227 struct btrfs_root *root = fs_info->chunk_root;
2b82032c
YZ
2228 struct btrfs_path *path;
2229 struct extent_buffer *leaf;
2230 struct btrfs_dev_item *dev_item;
2231 struct btrfs_device *device;
2232 struct btrfs_key key;
44880fdc 2233 u8 fs_uuid[BTRFS_FSID_SIZE];
2b82032c
YZ
2234 u8 dev_uuid[BTRFS_UUID_SIZE];
2235 u64 devid;
2236 int ret;
2237
2238 path = btrfs_alloc_path();
2239 if (!path)
2240 return -ENOMEM;
2241
2b82032c
YZ
2242 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2243 key.offset = 0;
2244 key.type = BTRFS_DEV_ITEM_KEY;
2245
2246 while (1) {
2247 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2248 if (ret < 0)
2249 goto error;
2250
2251 leaf = path->nodes[0];
2252next_slot:
2253 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2254 ret = btrfs_next_leaf(root, path);
2255 if (ret > 0)
2256 break;
2257 if (ret < 0)
2258 goto error;
2259 leaf = path->nodes[0];
2260 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
b3b4aa74 2261 btrfs_release_path(path);
2b82032c
YZ
2262 continue;
2263 }
2264
2265 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2266 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2267 key.type != BTRFS_DEV_ITEM_KEY)
2268 break;
2269
2270 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2271 struct btrfs_dev_item);
2272 devid = btrfs_device_id(leaf, dev_item);
410ba3a2 2273 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2b82032c 2274 BTRFS_UUID_SIZE);
1473b24e 2275 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
44880fdc 2276 BTRFS_FSID_SIZE);
0b246afa 2277 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
79787eaa 2278 BUG_ON(!device); /* Logic error */
2b82032c
YZ
2279
2280 if (device->fs_devices->seeding) {
2281 btrfs_set_device_generation(leaf, dev_item,
2282 device->generation);
2283 btrfs_mark_buffer_dirty(leaf);
2284 }
2285
2286 path->slots[0]++;
2287 goto next_slot;
2288 }
2289 ret = 0;
2290error:
2291 btrfs_free_path(path);
2292 return ret;
2293}
2294
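/*
 * Add a new device to a mounted filesystem. If the filesystem is a
 * seed, this sprouts a writable filesystem on top of it: a fresh fsid
 * is generated via btrfs_prepare_sprout() and the system chunks are
 * relocated once the transaction commits.
 */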
da353f6b 2295int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
788f20eb 2296{
5112febb 2297 struct btrfs_root *root = fs_info->dev_root;
d5e2003c 2298 struct request_queue *q;
788f20eb
CM
2299 struct btrfs_trans_handle *trans;
2300 struct btrfs_device *device;
2301 struct block_device *bdev;
788f20eb 2302 struct list_head *devices;
0b246afa 2303 struct super_block *sb = fs_info->sb;
606686ee 2304 struct rcu_string *name;
3c1dbdf5 2305 u64 tmp;
2b82032c 2306 int seeding_dev = 0;
788f20eb 2307 int ret = 0;
7132a262 2308 bool unlocked = false;
788f20eb 2309
bc98a42c 2310 if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
f8c5d0b4 2311 return -EROFS;
788f20eb 2312
a5d16333 2313 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
0b246afa 2314 fs_info->bdev_holder);
7f59203a
JB
2315 if (IS_ERR(bdev))
2316 return PTR_ERR(bdev);
a2135011 2317
0b246afa 2318 if (fs_info->fs_devices->seeding) {
2b82032c
YZ
2319 seeding_dev = 1;
2320 down_write(&sb->s_umount);
2321 mutex_lock(&uuid_mutex);
2322 }
2323
8c8bee1d 2324 filemap_write_and_wait(bdev->bd_inode->i_mapping);
a2135011 2325
0b246afa 2326 devices = &fs_info->fs_devices->devices;
d25628bd 2327
0b246afa 2328 mutex_lock(&fs_info->fs_devices->device_list_mutex);
c6e30871 2329 list_for_each_entry(device, devices, dev_list) {
788f20eb
CM
2330 if (device->bdev == bdev) {
2331 ret = -EEXIST;
d25628bd 2332 mutex_unlock(
0b246afa 2333 &fs_info->fs_devices->device_list_mutex);
2b82032c 2334 goto error;
788f20eb
CM
2335 }
2336 }
0b246afa 2337 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
788f20eb 2338
0b246afa 2339 device = btrfs_alloc_device(fs_info, NULL, NULL);
12bd2fc0 2340 if (IS_ERR(device)) {
788f20eb 2341 /* we can safely leave the fs_devices entry around */
12bd2fc0 2342 ret = PTR_ERR(device);
2b82032c 2343 goto error;
788f20eb
CM
2344 }
2345
78f2c9e6 2346 name = rcu_string_strdup(device_path, GFP_KERNEL);
606686ee 2347 if (!name) {
55de4803 2348 free_device(device);
2b82032c
YZ
2349 ret = -ENOMEM;
2350 goto error;
788f20eb 2351 }
606686ee 2352 rcu_assign_pointer(device->name, name);
2b82032c 2353
a22285a6 2354 trans = btrfs_start_transaction(root, 0);
98d5dc13 2355 if (IS_ERR(trans)) {
55de4803 2356 free_device(device);
98d5dc13
TI
2357 ret = PTR_ERR(trans);
2358 goto error;
2359 }
2360
d5e2003c
JB
2361 q = bdev_get_queue(bdev);
2362 if (blk_queue_discard(q))
2363 device->can_discard = 1;
2b82032c 2364 device->writeable = 1;
2b82032c 2365 device->generation = trans->transid;
0b246afa
JM
2366 device->io_width = fs_info->sectorsize;
2367 device->io_align = fs_info->sectorsize;
2368 device->sector_size = fs_info->sectorsize;
7dfb8be1
NB
2369 device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2370 fs_info->sectorsize);
2cc3c559 2371 device->disk_total_bytes = device->total_bytes;
935e5cc9 2372 device->commit_total_bytes = device->total_bytes;
fb456252 2373 device->fs_info = fs_info;
788f20eb 2374 device->bdev = bdev;
dfe25020 2375 device->in_fs_metadata = 1;
63a212ab 2376 device->is_tgtdev_for_dev_replace = 0;
fb01aa85 2377 device->mode = FMODE_EXCL;
27087f37 2378 device->dev_stats_valid = 1;
9f6d2510 2379 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
788f20eb 2380
2b82032c 2381 if (seeding_dev) {
1751e8a6 2382 sb->s_flags &= ~SB_RDONLY;
2ff7e61e 2383 ret = btrfs_prepare_sprout(fs_info);
d31c32f6
AJ
2384 if (ret) {
2385 btrfs_abort_transaction(trans, ret);
2386 goto error_trans;
2387 }
2b82032c 2388 }
788f20eb 2389
0b246afa 2390 device->fs_devices = fs_info->fs_devices;
e5e9a520 2391
0b246afa 2392 mutex_lock(&fs_info->fs_devices->device_list_mutex);
34441361 2393 mutex_lock(&fs_info->chunk_mutex);
0b246afa 2394 list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2b82032c 2395 list_add(&device->dev_alloc_list,
0b246afa
JM
2396 &fs_info->fs_devices->alloc_list);
2397 fs_info->fs_devices->num_devices++;
2398 fs_info->fs_devices->open_devices++;
2399 fs_info->fs_devices->rw_devices++;
2400 fs_info->fs_devices->total_devices++;
2401 fs_info->fs_devices->total_rw_bytes += device->total_bytes;
325cd4ba 2402
a5ed45f8 2403 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2bf64758 2404
e884f4f0 2405 if (!blk_queue_nonrot(q))
0b246afa 2406 fs_info->fs_devices->rotating = 1;
c289811c 2407
0b246afa
JM
2408 tmp = btrfs_super_total_bytes(fs_info->super_copy);
2409 btrfs_set_super_total_bytes(fs_info->super_copy,
7dfb8be1 2410 round_down(tmp + device->total_bytes, fs_info->sectorsize));
788f20eb 2411
0b246afa
JM
2412 tmp = btrfs_super_num_devices(fs_info->super_copy);
2413 btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
0d39376a
AJ
2414
2415 /* add sysfs device entry */
0b246afa 2416 btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
0d39376a 2417
2196d6e8
MX
2418 /*
2419 * we've got more storage, clear any full flags on the space
2420 * infos
2421 */
0b246afa 2422 btrfs_clear_space_info_full(fs_info);
2196d6e8 2423
34441361 2424 mutex_unlock(&fs_info->chunk_mutex);
0b246afa 2425 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
788f20eb 2426
2b82032c 2427 if (seeding_dev) {
34441361 2428 mutex_lock(&fs_info->chunk_mutex);
e4a4dce7 2429 ret = init_first_rw_device(trans, fs_info);
34441361 2430 mutex_unlock(&fs_info->chunk_mutex);
005d6427 2431 if (ret) {
66642832 2432 btrfs_abort_transaction(trans, ret);
d31c32f6 2433 goto error_sysfs;
005d6427 2434 }
2196d6e8
MX
2435 }
2436
c74a0b02 2437 ret = btrfs_add_dev_item(trans, fs_info, device);
2196d6e8 2438 if (ret) {
66642832 2439 btrfs_abort_transaction(trans, ret);
d31c32f6 2440 goto error_sysfs;
2196d6e8
MX
2441 }
2442
2443 if (seeding_dev) {
2444 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2445
0b246afa 2446 ret = btrfs_finish_sprout(trans, fs_info);
005d6427 2447 if (ret) {
66642832 2448 btrfs_abort_transaction(trans, ret);
d31c32f6 2449 goto error_sysfs;
005d6427 2450 }
b2373f25
AJ
2451
 2452 /* Sprouting would change the fsid of the mounted root,
 2453 * so rename the fsid on sysfs as well.
 2454 */
2455 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
0b246afa
JM
2456 fs_info->fsid);
2457 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2458 btrfs_warn(fs_info,
2459 "sysfs: failed to create fsid for sprout");
2b82032c
YZ
2460 }
2461
3a45bb20 2462 ret = btrfs_commit_transaction(trans);
a2135011 2463
2b82032c
YZ
2464 if (seeding_dev) {
2465 mutex_unlock(&uuid_mutex);
2466 up_write(&sb->s_umount);
7132a262 2467 unlocked = true;
788f20eb 2468
79787eaa
JM
2469 if (ret) /* transaction commit */
2470 return ret;
2471
2ff7e61e 2472 ret = btrfs_relocate_sys_chunks(fs_info);
79787eaa 2473 if (ret < 0)
0b246afa 2474 btrfs_handle_fs_error(fs_info, ret,
5d163e0e 2475 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
671415b7
MX
2476 trans = btrfs_attach_transaction(root);
2477 if (IS_ERR(trans)) {
2478 if (PTR_ERR(trans) == -ENOENT)
2479 return 0;
7132a262
AJ
2480 ret = PTR_ERR(trans);
2481 trans = NULL;
2482 goto error_sysfs;
671415b7 2483 }
3a45bb20 2484 ret = btrfs_commit_transaction(trans);
2b82032c 2485 }
c9e9f97b 2486
5a1972bd
QW
2487 /* Update ctime/mtime for libblkid */
2488 update_dev_time(device_path);
2b82032c 2489 return ret;
79787eaa 2490
d31c32f6
AJ
2491error_sysfs:
2492 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
79787eaa 2493error_trans:
0af2c4bf 2494 if (seeding_dev)
1751e8a6 2495 sb->s_flags |= SB_RDONLY;
7132a262
AJ
2496 if (trans)
2497 btrfs_end_transaction(trans);
55de4803 2498 free_device(device);
2b82032c 2499error:
e525fd89 2500 blkdev_put(bdev, FMODE_EXCL);
7132a262 2501 if (seeding_dev && !unlocked) {
2b82032c
YZ
2502 mutex_unlock(&uuid_mutex);
2503 up_write(&sb->s_umount);
2504 }
c9e9f97b 2505 return ret;
788f20eb
CM
2506}
2507
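/*
 * Create and open the target device for a device replace. The target
 * inherits the source device's sizes and is marked
 * is_tgtdev_for_dev_replace so it is not used for chunk allocation.
 */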
2ff7e61e 2508int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
da353f6b 2509 const char *device_path,
1c43366d 2510 struct btrfs_device *srcdev,
e93c89c1
SB
2511 struct btrfs_device **device_out)
2512{
2513 struct request_queue *q;
2514 struct btrfs_device *device;
2515 struct block_device *bdev;
e93c89c1
SB
2516 struct list_head *devices;
2517 struct rcu_string *name;
12bd2fc0 2518 u64 devid = BTRFS_DEV_REPLACE_DEVID;
e93c89c1
SB
2519 int ret = 0;
2520
2521 *device_out = NULL;
1c43366d
MX
2522 if (fs_info->fs_devices->seeding) {
2523 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
e93c89c1 2524 return -EINVAL;
1c43366d 2525 }
e93c89c1
SB
2526
2527 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2528 fs_info->bdev_holder);
1c43366d
MX
2529 if (IS_ERR(bdev)) {
2530 btrfs_err(fs_info, "target device %s is invalid!", device_path);
e93c89c1 2531 return PTR_ERR(bdev);
1c43366d 2532 }
e93c89c1
SB
2533
2534 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2535
2536 devices = &fs_info->fs_devices->devices;
2537 list_for_each_entry(device, devices, dev_list) {
2538 if (device->bdev == bdev) {
5d163e0e
JM
2539 btrfs_err(fs_info,
2540 "target device is in the filesystem!");
e93c89c1
SB
2541 ret = -EEXIST;
2542 goto error;
2543 }
2544 }
2545
1c43366d 2546
7cc8e58d
MX
2547 if (i_size_read(bdev->bd_inode) <
2548 btrfs_device_get_total_bytes(srcdev)) {
5d163e0e
JM
2549 btrfs_err(fs_info,
2550 "target device is smaller than source device!");
1c43366d
MX
2551 ret = -EINVAL;
2552 goto error;
2553 }
2554
2555
12bd2fc0
ID
2556 device = btrfs_alloc_device(NULL, &devid, NULL);
2557 if (IS_ERR(device)) {
2558 ret = PTR_ERR(device);
e93c89c1
SB
2559 goto error;
2560 }
2561
6165572c 2562 name = rcu_string_strdup(device_path, GFP_KERNEL);
e93c89c1 2563 if (!name) {
55de4803 2564 free_device(device);
e93c89c1
SB
2565 ret = -ENOMEM;
2566 goto error;
2567 }
2568 rcu_assign_pointer(device->name, name);
2569
2570 q = bdev_get_queue(bdev);
2571 if (blk_queue_discard(q))
2572 device->can_discard = 1;
0b246afa 2573 mutex_lock(&fs_info->fs_devices->device_list_mutex);
e93c89c1 2574 device->writeable = 1;
e93c89c1 2575 device->generation = 0;
0b246afa
JM
2576 device->io_width = fs_info->sectorsize;
2577 device->io_align = fs_info->sectorsize;
2578 device->sector_size = fs_info->sectorsize;
7cc8e58d
MX
2579 device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2580 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2581 device->bytes_used = btrfs_device_get_bytes_used(srcdev);
935e5cc9
MX
2582 ASSERT(list_empty(&srcdev->resized_list));
2583 device->commit_total_bytes = srcdev->commit_total_bytes;
ce7213c7 2584 device->commit_bytes_used = device->bytes_used;
fb456252 2585 device->fs_info = fs_info;
e93c89c1
SB
2586 device->bdev = bdev;
2587 device->in_fs_metadata = 1;
2588 device->is_tgtdev_for_dev_replace = 1;
2589 device->mode = FMODE_EXCL;
27087f37 2590 device->dev_stats_valid = 1;
9f6d2510 2591 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
e93c89c1
SB
2592 device->fs_devices = fs_info->fs_devices;
2593 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2594 fs_info->fs_devices->num_devices++;
2595 fs_info->fs_devices->open_devices++;
0b246afa 2596 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
e93c89c1
SB
2597
2598 *device_out = device;
2599 return ret;
2600
2601error:
2602 blkdev_put(bdev, FMODE_EXCL);
2603 return ret;
2604}
2605
2606void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2607 struct btrfs_device *tgtdev)
2608{
da17066c
JM
2609 u32 sectorsize = fs_info->sectorsize;
2610
e93c89c1 2611 WARN_ON(fs_info->fs_devices->rw_devices == 0);
da17066c
JM
2612 tgtdev->io_width = sectorsize;
2613 tgtdev->io_align = sectorsize;
2614 tgtdev->sector_size = sectorsize;
fb456252 2615 tgtdev->fs_info = fs_info;
e93c89c1
SB
2616 tgtdev->in_fs_metadata = 1;
2617}
2618
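/*
 * Write the in-memory device state back into an existing device item
 * in the chunk tree. Unlike the insert path, a missing item is an
 * error (-ENOENT).
 */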
d397712b
CM
2619static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2620 struct btrfs_device *device)
0b86a832
CM
2621{
2622 int ret;
2623 struct btrfs_path *path;
0b246afa 2624 struct btrfs_root *root = device->fs_info->chunk_root;
0b86a832
CM
2625 struct btrfs_dev_item *dev_item;
2626 struct extent_buffer *leaf;
2627 struct btrfs_key key;
2628
0b86a832
CM
2629 path = btrfs_alloc_path();
2630 if (!path)
2631 return -ENOMEM;
2632
2633 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2634 key.type = BTRFS_DEV_ITEM_KEY;
2635 key.offset = device->devid;
2636
2637 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2638 if (ret < 0)
2639 goto out;
2640
2641 if (ret > 0) {
2642 ret = -ENOENT;
2643 goto out;
2644 }
2645
2646 leaf = path->nodes[0];
2647 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2648
2649 btrfs_set_device_id(leaf, dev_item, device->devid);
2650 btrfs_set_device_type(leaf, dev_item, device->type);
2651 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2652 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2653 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
7cc8e58d
MX
2654 btrfs_set_device_total_bytes(leaf, dev_item,
2655 btrfs_device_get_disk_total_bytes(device));
2656 btrfs_set_device_bytes_used(leaf, dev_item,
2657 btrfs_device_get_bytes_used(device));
0b86a832
CM
2658 btrfs_mark_buffer_dirty(leaf);
2659
2660out:
2661 btrfs_free_path(path);
2662 return ret;
2663}
2664
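/*
 * Grow @device to @new_size, rounded down to the sector size. Updates
 * the superblock total_bytes and total_rw_bytes under chunk_mutex and
 * queues the device on the resized list before updating its item.
 */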
2196d6e8 2665int btrfs_grow_device(struct btrfs_trans_handle *trans,
8f18cf13
CM
2666 struct btrfs_device *device, u64 new_size)
2667{
0b246afa
JM
2668 struct btrfs_fs_info *fs_info = device->fs_info;
2669 struct btrfs_super_block *super_copy = fs_info->super_copy;
935e5cc9 2670 struct btrfs_fs_devices *fs_devices;
2196d6e8
MX
2671 u64 old_total;
2672 u64 diff;
8f18cf13 2673
2b82032c
YZ
2674 if (!device->writeable)
2675 return -EACCES;
2196d6e8 2676
7dfb8be1
NB
2677 new_size = round_down(new_size, fs_info->sectorsize);
2678
34441361 2679 mutex_lock(&fs_info->chunk_mutex);
2196d6e8 2680 old_total = btrfs_super_total_bytes(super_copy);
0e4324a4 2681 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2196d6e8 2682
63a212ab 2683 if (new_size <= device->total_bytes ||
2196d6e8 2684 device->is_tgtdev_for_dev_replace) {
34441361 2685 mutex_unlock(&fs_info->chunk_mutex);
2b82032c 2686 return -EINVAL;
2196d6e8 2687 }
2b82032c 2688
0b246afa 2689 fs_devices = fs_info->fs_devices;
2b82032c 2690
7dfb8be1
NB
2691 btrfs_set_super_total_bytes(super_copy,
2692 round_down(old_total + diff, fs_info->sectorsize));
2b82032c
YZ
2693 device->fs_devices->total_rw_bytes += diff;
2694
7cc8e58d
MX
2695 btrfs_device_set_total_bytes(device, new_size);
2696 btrfs_device_set_disk_total_bytes(device, new_size);
fb456252 2697 btrfs_clear_space_info_full(device->fs_info);
935e5cc9
MX
2698 if (list_empty(&device->resized_list))
2699 list_add_tail(&device->resized_list,
2700 &fs_devices->resized_devices);
34441361 2701 mutex_unlock(&fs_info->chunk_mutex);
4184ea7f 2702
8f18cf13
CM
2703 return btrfs_update_device(trans, device);
2704}
2705
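/* Delete the chunk item for @chunk_offset from the chunk tree. */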
2706static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
408fbf19 2707 struct btrfs_fs_info *fs_info, u64 chunk_offset)
8f18cf13 2708{
5b4aacef 2709 struct btrfs_root *root = fs_info->chunk_root;
8f18cf13
CM
2710 int ret;
2711 struct btrfs_path *path;
2712 struct btrfs_key key;
2713
8f18cf13
CM
2714 path = btrfs_alloc_path();
2715 if (!path)
2716 return -ENOMEM;
2717
408fbf19 2718 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
8f18cf13
CM
2719 key.offset = chunk_offset;
2720 key.type = BTRFS_CHUNK_ITEM_KEY;
2721
2722 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
79787eaa
JM
2723 if (ret < 0)
2724 goto out;
2725 else if (ret > 0) { /* Logic error or corruption */
0b246afa
JM
2726 btrfs_handle_fs_error(fs_info, -ENOENT,
2727 "Failed lookup while freeing chunk.");
79787eaa
JM
2728 ret = -ENOENT;
2729 goto out;
2730 }
8f18cf13
CM
2731
2732 ret = btrfs_del_item(trans, root, path);
79787eaa 2733 if (ret < 0)
0b246afa
JM
2734 btrfs_handle_fs_error(fs_info, ret,
2735 "Failed to delete chunk item.");
79787eaa 2736out:
8f18cf13 2737 btrfs_free_path(path);
65a246c5 2738 return ret;
8f18cf13
CM
2739}
2740
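/*
 * Remove the chunk at @chunk_offset from the sys_chunk_array embedded
 * in the superblock by shifting the remaining entries over it.
 */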
408fbf19 2741static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
8f18cf13 2742{
0b246afa 2743 struct btrfs_super_block *super_copy = fs_info->super_copy;
8f18cf13
CM
2744 struct btrfs_disk_key *disk_key;
2745 struct btrfs_chunk *chunk;
2746 u8 *ptr;
2747 int ret = 0;
2748 u32 num_stripes;
2749 u32 array_size;
2750 u32 len = 0;
2751 u32 cur;
2752 struct btrfs_key key;
2753
34441361 2754 mutex_lock(&fs_info->chunk_mutex);
8f18cf13
CM
2755 array_size = btrfs_super_sys_array_size(super_copy);
2756
2757 ptr = super_copy->sys_chunk_array;
2758 cur = 0;
2759
2760 while (cur < array_size) {
2761 disk_key = (struct btrfs_disk_key *)ptr;
2762 btrfs_disk_key_to_cpu(&key, disk_key);
2763
2764 len = sizeof(*disk_key);
2765
2766 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2767 chunk = (struct btrfs_chunk *)(ptr + len);
2768 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2769 len += btrfs_chunk_item_size(num_stripes);
2770 } else {
2771 ret = -EIO;
2772 break;
2773 }
408fbf19 2774 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
8f18cf13
CM
2775 key.offset == chunk_offset) {
2776 memmove(ptr, ptr + len, array_size - (cur + len));
2777 array_size -= len;
2778 btrfs_set_super_sys_array_size(super_copy, array_size);
2779 } else {
2780 ptr += len;
2781 cur += len;
2782 }
2783 }
34441361 2784 mutex_unlock(&fs_info->chunk_mutex);
8f18cf13
CM
2785 return ret;
2786}
2787
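/*
 * Look up the extent map for the chunk at @logical. Returns
 * ERR_PTR(-EINVAL) if there is no mapping or if it does not cover the
 * requested logical address.
 */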
592d92ee
LB
2788static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2789 u64 logical, u64 length)
2790{
2791 struct extent_map_tree *em_tree;
2792 struct extent_map *em;
2793
2794 em_tree = &fs_info->mapping_tree.map_tree;
2795 read_lock(&em_tree->lock);
2796 em = lookup_extent_mapping(em_tree, logical, length);
2797 read_unlock(&em_tree->lock);
2798
2799 if (!em) {
2800 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2801 logical, length);
2802 return ERR_PTR(-EINVAL);
2803 }
2804
2805 if (em->start > logical || em->start + em->len < logical) {
2806 btrfs_crit(fs_info,
2807 "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2808 logical, length, em->start, em->start + em->len);
2809 free_extent_map(em);
2810 return ERR_PTR(-EINVAL);
2811 }
2812
2813 /* callers are responsible for dropping em's ref. */
2814 return em;
2815}
2816
47ab2a6c 2817int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
5b4aacef 2818 struct btrfs_fs_info *fs_info, u64 chunk_offset)
8f18cf13 2819{
8f18cf13
CM
2820 struct extent_map *em;
2821 struct map_lookup *map;
2196d6e8 2822 u64 dev_extent_len = 0;
47ab2a6c 2823 int i, ret = 0;
0b246afa 2824 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
8f18cf13 2825
592d92ee
LB
2826 em = get_chunk_map(fs_info, chunk_offset, 1);
2827 if (IS_ERR(em)) {
47ab2a6c
JB
2828 /*
2829 * This is a logic error, but we don't want to just rely on the
bb7ab3b9 2830 * user having built with ASSERT enabled, so if ASSERT doesn't
47ab2a6c
JB
2831 * do anything we still error out.
2832 */
2833 ASSERT(0);
592d92ee 2834 return PTR_ERR(em);
47ab2a6c 2835 }
95617d69 2836 map = em->map_lookup;
34441361 2837 mutex_lock(&fs_info->chunk_mutex);
2ff7e61e 2838 check_system_chunk(trans, fs_info, map->type);
34441361 2839 mutex_unlock(&fs_info->chunk_mutex);
8f18cf13 2840
57ba4cb8
FM
2841 /*
2842 * Take the device list mutex to prevent races with the final phase of
2843 * a device replace operation that replaces the device object associated
2844 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2845 */
2846 mutex_lock(&fs_devices->device_list_mutex);
8f18cf13 2847 for (i = 0; i < map->num_stripes; i++) {
47ab2a6c 2848 struct btrfs_device *device = map->stripes[i].dev;
2196d6e8
MX
2849 ret = btrfs_free_dev_extent(trans, device,
2850 map->stripes[i].physical,
2851 &dev_extent_len);
47ab2a6c 2852 if (ret) {
57ba4cb8 2853 mutex_unlock(&fs_devices->device_list_mutex);
66642832 2854 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
2855 goto out;
2856 }
a061fc8d 2857
2196d6e8 2858 if (device->bytes_used > 0) {
34441361 2859 mutex_lock(&fs_info->chunk_mutex);
2196d6e8
MX
2860 btrfs_device_set_bytes_used(device,
2861 device->bytes_used - dev_extent_len);
a5ed45f8 2862 atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
0b246afa 2863 btrfs_clear_space_info_full(fs_info);
34441361 2864 mutex_unlock(&fs_info->chunk_mutex);
2196d6e8 2865 }
a061fc8d 2866
dfe25020
CM
2867 if (map->stripes[i].dev) {
2868 ret = btrfs_update_device(trans, map->stripes[i].dev);
47ab2a6c 2869 if (ret) {
57ba4cb8 2870 mutex_unlock(&fs_devices->device_list_mutex);
66642832 2871 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
2872 goto out;
2873 }
dfe25020 2874 }
8f18cf13 2875 }
57ba4cb8
FM
2876 mutex_unlock(&fs_devices->device_list_mutex);
2877
408fbf19 2878 ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
47ab2a6c 2879 if (ret) {
66642832 2880 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
2881 goto out;
2882 }
8f18cf13 2883
6bccf3ab 2884 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
1abe9b8a 2885
8f18cf13 2886 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
408fbf19 2887 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
47ab2a6c 2888 if (ret) {
66642832 2889 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
2890 goto out;
2891 }
8f18cf13
CM
2892 }
2893
6bccf3ab 2894 ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
47ab2a6c 2895 if (ret) {
66642832 2896 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
2897 goto out;
2898 }
2b82032c 2899
47ab2a6c 2900out:
2b82032c
YZ
2901 /* once for us */
2902 free_extent_map(em);
47ab2a6c
JB
2903 return ret;
2904}
2b82032c 2905
5b4aacef 2906static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
47ab2a6c 2907{
5b4aacef 2908 struct btrfs_root *root = fs_info->chunk_root;
19c4d2f9 2909 struct btrfs_trans_handle *trans;
47ab2a6c 2910 int ret;
2b82032c 2911
67c5e7d4
FM
2912 /*
2913 * Prevent races with automatic removal of unused block groups.
2914 * After we relocate and before we remove the chunk with offset
2915 * chunk_offset, automatic removal of the block group can kick in,
2916 * resulting in a failure when calling btrfs_remove_chunk() below.
2917 *
2918 * Make sure to acquire this mutex before doing a tree search (dev
2919 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2920 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2921 * we release the path used to search the chunk/dev tree and before
2922 * the current task acquires this mutex and calls us.
2923 */
0b246afa 2924 ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
67c5e7d4 2925
0b246afa 2926 ret = btrfs_can_relocate(fs_info, chunk_offset);
47ab2a6c
JB
2927 if (ret)
2928 return -ENOSPC;
2929
2930 /* step one, relocate all the extents inside this chunk */
2ff7e61e 2931 btrfs_scrub_pause(fs_info);
0b246afa 2932 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2ff7e61e 2933 btrfs_scrub_continue(fs_info);
47ab2a6c
JB
2934 if (ret)
2935 return ret;
2936
19c4d2f9
CM
2937 trans = btrfs_start_trans_remove_block_group(root->fs_info,
2938 chunk_offset);
2939 if (IS_ERR(trans)) {
2940 ret = PTR_ERR(trans);
2941 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2942 return ret;
2943 }
2944
47ab2a6c 2945 /*
19c4d2f9
CM
2946 * step two, delete the device extents and the
2947 * chunk tree entries
47ab2a6c 2948 */
5b4aacef 2949 ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
3a45bb20 2950 btrfs_end_transaction(trans);
19c4d2f9 2951 return ret;
2b82032c
YZ
2952}
2953
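/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk.
 * Chunks that fail with -ENOSPC are retried in one more full pass.
 */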
2ff7e61e 2954static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2b82032c 2955{
0b246afa 2956 struct btrfs_root *chunk_root = fs_info->chunk_root;
2b82032c
YZ
2957 struct btrfs_path *path;
2958 struct extent_buffer *leaf;
2959 struct btrfs_chunk *chunk;
2960 struct btrfs_key key;
2961 struct btrfs_key found_key;
2b82032c 2962 u64 chunk_type;
ba1bf481
JB
2963 bool retried = false;
2964 int failed = 0;
2b82032c
YZ
2965 int ret;
2966
2967 path = btrfs_alloc_path();
2968 if (!path)
2969 return -ENOMEM;
2970
ba1bf481 2971again:
2b82032c
YZ
2972 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2973 key.offset = (u64)-1;
2974 key.type = BTRFS_CHUNK_ITEM_KEY;
2975
2976 while (1) {
0b246afa 2977 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2b82032c 2978 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
67c5e7d4 2979 if (ret < 0) {
0b246afa 2980 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2b82032c 2981 goto error;
67c5e7d4 2982 }
79787eaa 2983 BUG_ON(ret == 0); /* Corruption */
2b82032c
YZ
2984
2985 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2986 key.type);
67c5e7d4 2987 if (ret)
0b246afa 2988 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2b82032c
YZ
2989 if (ret < 0)
2990 goto error;
2991 if (ret > 0)
2992 break;
1a40e23b 2993
2b82032c
YZ
2994 leaf = path->nodes[0];
2995 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1a40e23b 2996
2b82032c
YZ
2997 chunk = btrfs_item_ptr(leaf, path->slots[0],
2998 struct btrfs_chunk);
2999 chunk_type = btrfs_chunk_type(leaf, chunk);
b3b4aa74 3000 btrfs_release_path(path);
8f18cf13 3001
2b82032c 3002 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
0b246afa 3003 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
ba1bf481
JB
3004 if (ret == -ENOSPC)
3005 failed++;
14586651
HS
3006 else
3007 BUG_ON(ret);
2b82032c 3008 }
0b246afa 3009 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
8f18cf13 3010
2b82032c
YZ
3011 if (found_key.offset == 0)
3012 break;
3013 key.offset = found_key.offset - 1;
3014 }
3015 ret = 0;
ba1bf481
JB
3016 if (failed && !retried) {
3017 failed = 0;
3018 retried = true;
3019 goto again;
fae7f21c 3020 } else if (WARN_ON(failed && retried)) {
ba1bf481
JB
3021 ret = -ENOSPC;
3022 }
2b82032c
YZ
3023error:
3024 btrfs_free_path(path);
3025 return ret;
8f18cf13
CM
3026}
3027
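/*
 * Persist the balance state as a BTRFS_BALANCE_OBJECTID item in the
 * tree root so that an interrupted balance can be resumed on the next
 * mount.
 */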
6bccf3ab 3028static int insert_balance_item(struct btrfs_fs_info *fs_info,
0940ebf6
ID
3029 struct btrfs_balance_control *bctl)
3030{
6bccf3ab 3031 struct btrfs_root *root = fs_info->tree_root;
0940ebf6
ID
3032 struct btrfs_trans_handle *trans;
3033 struct btrfs_balance_item *item;
3034 struct btrfs_disk_balance_args disk_bargs;
3035 struct btrfs_path *path;
3036 struct extent_buffer *leaf;
3037 struct btrfs_key key;
3038 int ret, err;
3039
3040 path = btrfs_alloc_path();
3041 if (!path)
3042 return -ENOMEM;
3043
3044 trans = btrfs_start_transaction(root, 0);
3045 if (IS_ERR(trans)) {
3046 btrfs_free_path(path);
3047 return PTR_ERR(trans);
3048 }
3049
3050 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 3051 key.type = BTRFS_TEMPORARY_ITEM_KEY;
0940ebf6
ID
3052 key.offset = 0;
3053
3054 ret = btrfs_insert_empty_item(trans, root, path, &key,
3055 sizeof(*item));
3056 if (ret)
3057 goto out;
3058
3059 leaf = path->nodes[0];
3060 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3061
b159fa28 3062 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
0940ebf6
ID
3063
3064 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3065 btrfs_set_balance_data(leaf, item, &disk_bargs);
3066 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3067 btrfs_set_balance_meta(leaf, item, &disk_bargs);
3068 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3069 btrfs_set_balance_sys(leaf, item, &disk_bargs);
3070
3071 btrfs_set_balance_flags(leaf, item, bctl->flags);
3072
3073 btrfs_mark_buffer_dirty(leaf);
3074out:
3075 btrfs_free_path(path);
3a45bb20 3076 err = btrfs_commit_transaction(trans);
0940ebf6
ID
3077 if (err && !ret)
3078 ret = err;
3079 return ret;
3080}
3081
6bccf3ab 3082static int del_balance_item(struct btrfs_fs_info *fs_info)
0940ebf6 3083{
6bccf3ab 3084 struct btrfs_root *root = fs_info->tree_root;
0940ebf6
ID
3085 struct btrfs_trans_handle *trans;
3086 struct btrfs_path *path;
3087 struct btrfs_key key;
3088 int ret, err;
3089
3090 path = btrfs_alloc_path();
3091 if (!path)
3092 return -ENOMEM;
3093
3094 trans = btrfs_start_transaction(root, 0);
3095 if (IS_ERR(trans)) {
3096 btrfs_free_path(path);
3097 return PTR_ERR(trans);
3098 }
3099
3100 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 3101 key.type = BTRFS_TEMPORARY_ITEM_KEY;
0940ebf6
ID
3102 key.offset = 0;
3103
3104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3105 if (ret < 0)
3106 goto out;
3107 if (ret > 0) {
3108 ret = -ENOENT;
3109 goto out;
3110 }
3111
3112 ret = btrfs_del_item(trans, root, path);
3113out:
3114 btrfs_free_path(path);
3a45bb20 3115 err = btrfs_commit_transaction(trans);
0940ebf6
ID
3116 if (err && !ret)
3117 ret = err;
3118 return ret;
3119}
3120
59641015
ID
3121/*
3122 * This is a heuristic used to reduce the number of chunks balanced on
3123 * resume after balance was interrupted.
3124 */
3125static void update_balance_args(struct btrfs_balance_control *bctl)
3126{
3127 /*
3128 * Turn on soft mode for chunk types that were being converted.
3129 */
3130 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3131 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3132 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3133 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3134 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3135 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3136
3137 /*
 3138 * Turn on the usage filter if it is not already used. The idea is
3139 * that chunks that we have already balanced should be
3140 * reasonably full. Don't do it for chunks that are being
3141 * converted - that will keep us from relocating unconverted
3142 * (albeit full) chunks.
3143 */
3144 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3145 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3146 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3147 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3148 bctl->data.usage = 90;
3149 }
3150 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3151 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3152 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3153 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3154 bctl->sys.usage = 90;
3155 }
3156 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3157 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3158 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3159 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3160 bctl->meta.usage = 90;
3161 }
3162}
3163
c9e9f97b
ID
3164/*
3165 * Should be called with both balance and volume mutexes held to
3166 * serialize other volume operations (add_dev/rm_dev/resize) with
 3167 * the restriper. Same goes for unset_balance_control.
3168 */
3169static void set_balance_control(struct btrfs_balance_control *bctl)
3170{
3171 struct btrfs_fs_info *fs_info = bctl->fs_info;
3172
3173 BUG_ON(fs_info->balance_ctl);
3174
3175 spin_lock(&fs_info->balance_lock);
3176 fs_info->balance_ctl = bctl;
3177 spin_unlock(&fs_info->balance_lock);
3178}
3179
3180static void unset_balance_control(struct btrfs_fs_info *fs_info)
3181{
3182 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3183
3184 BUG_ON(!fs_info->balance_ctl);
3185
3186 spin_lock(&fs_info->balance_lock);
3187 fs_info->balance_ctl = NULL;
3188 spin_unlock(&fs_info->balance_lock);
3189
3190 kfree(bctl);
3191}
3192
ed25e9b2
ID
3193/*
3194 * Balance filters. Return 1 if chunk should be filtered out
3195 * (should not be balanced).
3196 */
899c81ea 3197static int chunk_profiles_filter(u64 chunk_type,
ed25e9b2
ID
3198 struct btrfs_balance_args *bargs)
3199{
899c81ea
ID
3200 chunk_type = chunk_to_extended(chunk_type) &
3201 BTRFS_EXTENDED_PROFILE_MASK;
ed25e9b2 3202
899c81ea 3203 if (bargs->profiles & chunk_type)
ed25e9b2
ID
3204 return 0;
3205
3206 return 1;
3207}
3208
dba72cb3 3209static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
5ce5b3c0 3210 struct btrfs_balance_args *bargs)
bc309467
DS
3211{
3212 struct btrfs_block_group_cache *cache;
3213 u64 chunk_used;
3214 u64 user_thresh_min;
3215 u64 user_thresh_max;
3216 int ret = 1;
3217
3218 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3219 chunk_used = btrfs_block_group_used(&cache->item);
3220
3221 if (bargs->usage_min == 0)
3222 user_thresh_min = 0;
3223 else
3224 user_thresh_min = div_factor_fine(cache->key.offset,
3225 bargs->usage_min);
3226
3227 if (bargs->usage_max == 0)
3228 user_thresh_max = 1;
3229 else if (bargs->usage_max > 100)
3230 user_thresh_max = cache->key.offset;
3231 else
3232 user_thresh_max = div_factor_fine(cache->key.offset,
3233 bargs->usage_max);
3234
3235 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3236 ret = 0;
3237
3238 btrfs_put_block_group(cache);
3239 return ret;
3240}
3241
dba72cb3 3242static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
bc309467 3243 u64 chunk_offset, struct btrfs_balance_args *bargs)
5ce5b3c0
ID
3244{
3245 struct btrfs_block_group_cache *cache;
3246 u64 chunk_used, user_thresh;
3247 int ret = 1;
3248
3249 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3250 chunk_used = btrfs_block_group_used(&cache->item);
3251
bc309467 3252 if (bargs->usage_min == 0)
3e39cea6 3253 user_thresh = 1;
a105bb88
ID
3254 else if (bargs->usage > 100)
3255 user_thresh = cache->key.offset;
3256 else
3257 user_thresh = div_factor_fine(cache->key.offset,
3258 bargs->usage);
3259
5ce5b3c0
ID
3260 if (chunk_used < user_thresh)
3261 ret = 0;
3262
3263 btrfs_put_block_group(cache);
3264 return ret;
3265}
3266
409d404b
ID
3267static int chunk_devid_filter(struct extent_buffer *leaf,
3268 struct btrfs_chunk *chunk,
3269 struct btrfs_balance_args *bargs)
3270{
3271 struct btrfs_stripe *stripe;
3272 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3273 int i;
3274
3275 for (i = 0; i < num_stripes; i++) {
3276 stripe = btrfs_stripe_nr(chunk, i);
3277 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3278 return 0;
3279 }
3280
3281 return 1;
3282}
3283
94e60d5a
ID
3284/* [pstart, pend) */
3285static int chunk_drange_filter(struct extent_buffer *leaf,
3286 struct btrfs_chunk *chunk,
94e60d5a
ID
3287 struct btrfs_balance_args *bargs)
3288{
3289 struct btrfs_stripe *stripe;
3290 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3291 u64 stripe_offset;
3292 u64 stripe_length;
3293 int factor;
3294 int i;
3295
3296 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3297 return 0;
3298
3299 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
53b381b3
DW
3300 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3301 factor = num_stripes / 2;
3302 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3303 factor = num_stripes - 1;
3304 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3305 factor = num_stripes - 2;
3306 } else {
3307 factor = num_stripes;
3308 }
94e60d5a
ID
3309
3310 for (i = 0; i < num_stripes; i++) {
3311 stripe = btrfs_stripe_nr(chunk, i);
3312 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3313 continue;
3314
3315 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3316 stripe_length = btrfs_chunk_length(leaf, chunk);
b8b93add 3317 stripe_length = div_u64(stripe_length, factor);
94e60d5a
ID
3318
3319 if (stripe_offset < bargs->pend &&
3320 stripe_offset + stripe_length > bargs->pstart)
3321 return 0;
3322 }
3323
3324 return 1;
3325}
3326
ea67176a
ID
3327/* [vstart, vend) */
3328static int chunk_vrange_filter(struct extent_buffer *leaf,
3329 struct btrfs_chunk *chunk,
3330 u64 chunk_offset,
3331 struct btrfs_balance_args *bargs)
3332{
3333 if (chunk_offset < bargs->vend &&
3334 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3335 /* at least part of the chunk is inside this vrange */
3336 return 0;
3337
3338 return 1;
3339}
3340
dee32d0a
GAP
3341static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3342 struct btrfs_chunk *chunk,
3343 struct btrfs_balance_args *bargs)
3344{
3345 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3346
3347 if (bargs->stripes_min <= num_stripes
3348 && num_stripes <= bargs->stripes_max)
3349 return 0;
3350
3351 return 1;
3352}
3353
899c81ea 3354static int chunk_soft_convert_filter(u64 chunk_type,
cfa4c961
ID
3355 struct btrfs_balance_args *bargs)
3356{
3357 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3358 return 0;
3359
899c81ea
ID
3360 chunk_type = chunk_to_extended(chunk_type) &
3361 BTRFS_EXTENDED_PROFILE_MASK;
cfa4c961 3362
899c81ea 3363 if (bargs->target == chunk_type)
cfa4c961
ID
3364 return 1;
3365
3366 return 0;
3367}
3368
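/*
 * Apply all configured balance filters to one chunk. Returns 1 if the
 * chunk should be relocated, 0 if some filter excludes it.
 */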
2ff7e61e 3369static int should_balance_chunk(struct btrfs_fs_info *fs_info,
f43ffb60
ID
3370 struct extent_buffer *leaf,
3371 struct btrfs_chunk *chunk, u64 chunk_offset)
3372{
0b246afa 3373 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
f43ffb60
ID
3374 struct btrfs_balance_args *bargs = NULL;
3375 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3376
3377 /* type filter */
3378 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3379 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3380 return 0;
3381 }
3382
3383 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3384 bargs = &bctl->data;
3385 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3386 bargs = &bctl->sys;
3387 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3388 bargs = &bctl->meta;
3389
ed25e9b2
ID
3390 /* profiles filter */
3391 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3392 chunk_profiles_filter(chunk_type, bargs)) {
3393 return 0;
5ce5b3c0
ID
3394 }
3395
3396 /* usage filter */
3397 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
0b246afa 3398 chunk_usage_filter(fs_info, chunk_offset, bargs)) {
5ce5b3c0 3399 return 0;
bc309467 3400 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
0b246afa 3401 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
bc309467 3402 return 0;
409d404b
ID
3403 }
3404
3405 /* devid filter */
3406 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3407 chunk_devid_filter(leaf, chunk, bargs)) {
3408 return 0;
94e60d5a
ID
3409 }
3410
3411 /* drange filter, makes sense only with devid filter */
3412 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
e4ff5fb5 3413 chunk_drange_filter(leaf, chunk, bargs)) {
94e60d5a 3414 return 0;
ea67176a
ID
3415 }
3416
3417 /* vrange filter */
3418 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3419 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3420 return 0;
ed25e9b2
ID
3421 }
3422
dee32d0a
GAP
3423 /* stripes filter */
3424 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3425 chunk_stripes_range_filter(leaf, chunk, bargs)) {
3426 return 0;
3427 }
3428
cfa4c961
ID
3429 /* soft profile changing mode */
3430 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3431 chunk_soft_convert_filter(chunk_type, bargs)) {
3432 return 0;
3433 }
3434
7d824b6f
DS
3435 /*
3436 * limited by count, must be the last filter
3437 */
3438 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3439 if (bargs->limit == 0)
3440 return 0;
3441 else
3442 bargs->limit--;
12907fc7
DS
3443 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3444 /*
3445 * Same logic as the 'limit' filter; the minimum cannot be
01327610 3446 * determined here because we do not have the global information
12907fc7
DS
3447 * about the count of all chunks that satisfy the filters.
3448 */
3449 if (bargs->limit_max == 0)
3450 return 0;
3451 else
3452 bargs->limit_max--;
7d824b6f
DS
3453 }
3454
f43ffb60
ID
3455 return 1;
3456}
3457
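/*
 * Core of the balance operation. The chunk tree is walked twice: a first
 * pass with 'counting' set only accumulates the stat counters and the
 * per-type chunk counts used by the limit_min filter, then the walk is
 * restarted and the surviving chunks are actually relocated.
 */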
c9e9f97b 3458static int __btrfs_balance(struct btrfs_fs_info *fs_info)
ec44a35c 3459{
19a39dce 3460 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
c9e9f97b
ID
3461 struct btrfs_root *chunk_root = fs_info->chunk_root;
3462 struct btrfs_root *dev_root = fs_info->dev_root;
3463 struct list_head *devices;
ec44a35c
CM
3464 struct btrfs_device *device;
3465 u64 old_size;
3466 u64 size_to_free;
12907fc7 3467 u64 chunk_type;
f43ffb60 3468 struct btrfs_chunk *chunk;
5a488b9d 3469 struct btrfs_path *path = NULL;
ec44a35c 3470 struct btrfs_key key;
ec44a35c 3471 struct btrfs_key found_key;
c9e9f97b 3472 struct btrfs_trans_handle *trans;
f43ffb60
ID
3473 struct extent_buffer *leaf;
3474 int slot;
c9e9f97b
ID
3475 int ret;
3476 int enospc_errors = 0;
19a39dce 3477 bool counting = true;
12907fc7 3478 /* The single value limit and min/max limits use the same bytes in the union */
7d824b6f
DS
3479 u64 limit_data = bctl->data.limit;
3480 u64 limit_meta = bctl->meta.limit;
3481 u64 limit_sys = bctl->sys.limit;
12907fc7
DS
3482 u32 count_data = 0;
3483 u32 count_meta = 0;
3484 u32 count_sys = 0;
2c9fe835 3485 int chunk_reserved = 0;
cf25ce51 3486 u64 bytes_used = 0;
ec44a35c 3487
ec44a35c 3488 /* step one make some room on all the devices */
c9e9f97b 3489 devices = &fs_info->fs_devices->devices;
c6e30871 3490 list_for_each_entry(device, devices, dev_list) {
7cc8e58d 3491 old_size = btrfs_device_get_total_bytes(device);
ec44a35c 3492 size_to_free = div_factor(old_size, 1);
ee22184b 3493 size_to_free = min_t(u64, size_to_free, SZ_1M);
2b82032c 3494 if (!device->writeable ||
7cc8e58d
MX
3495 btrfs_device_get_total_bytes(device) -
3496 btrfs_device_get_bytes_used(device) > size_to_free ||
63a212ab 3497 device->is_tgtdev_for_dev_replace)
ec44a35c
CM
3498 continue;
3499
3500 ret = btrfs_shrink_device(device, old_size - size_to_free);
ba1bf481
JB
3501 if (ret == -ENOSPC)
3502 break;
5a488b9d
LB
3503 if (ret) {
3504 /* btrfs_shrink_device never returns ret > 0 */
3505 WARN_ON(ret > 0);
3506 goto error;
3507 }
ec44a35c 3508
a22285a6 3509 trans = btrfs_start_transaction(dev_root, 0);
5a488b9d
LB
3510 if (IS_ERR(trans)) {
3511 ret = PTR_ERR(trans);
3512 btrfs_info_in_rcu(fs_info,
3513 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3514 rcu_str_deref(device->name), ret,
3515 old_size, old_size - size_to_free);
3516 goto error;
3517 }
ec44a35c
CM
3518
3519 ret = btrfs_grow_device(trans, device, old_size);
5a488b9d 3520 if (ret) {
3a45bb20 3521 btrfs_end_transaction(trans);
5a488b9d
LB
3522 /* btrfs_grow_device never returns ret > 0 */
3523 WARN_ON(ret > 0);
3524 btrfs_info_in_rcu(fs_info,
3525 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3526 rcu_str_deref(device->name), ret,
3527 old_size, old_size - size_to_free);
3528 goto error;
3529 }
ec44a35c 3530
3a45bb20 3531 btrfs_end_transaction(trans);
ec44a35c
CM
3532 }
3533
3534 /* step two, relocate all the chunks */
3535 path = btrfs_alloc_path();
17e9f796
MF
3536 if (!path) {
3537 ret = -ENOMEM;
3538 goto error;
3539 }
19a39dce
ID
3540
3541 /* zero out stat counters */
3542 spin_lock(&fs_info->balance_lock);
3543 memset(&bctl->stat, 0, sizeof(bctl->stat));
3544 spin_unlock(&fs_info->balance_lock);
3545again:
7d824b6f 3546 if (!counting) {
12907fc7
DS
3547 /*
3548 * The single value limit and min/max limits use the same bytes
3549 * in the union, so restore them for the second, non-counting pass.
3550 */
7d824b6f
DS
3551 bctl->data.limit = limit_data;
3552 bctl->meta.limit = limit_meta;
3553 bctl->sys.limit = limit_sys;
3554 }
ec44a35c
CM
3555 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3556 key.offset = (u64)-1;
3557 key.type = BTRFS_CHUNK_ITEM_KEY;
3558
d397712b 3559 while (1) {
19a39dce 3560 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
a7e99c69 3561 atomic_read(&fs_info->balance_cancel_req)) {
837d5b6e
ID
3562 ret = -ECANCELED;
3563 goto error;
3564 }
3565
67c5e7d4 3566 mutex_lock(&fs_info->delete_unused_bgs_mutex);
ec44a35c 3567 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
67c5e7d4
FM
3568 if (ret < 0) {
3569 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ec44a35c 3570 goto error;
67c5e7d4 3571 }
ec44a35c
CM
3572
3573 /*
3574 * this shouldn't happen, it means the last relocate
3575 * failed
3576 */
3577 if (ret == 0)
c9e9f97b 3578 BUG(); /* FIXME break ? */
ec44a35c
CM
3579
3580 ret = btrfs_previous_item(chunk_root, path, 0,
3581 BTRFS_CHUNK_ITEM_KEY);
c9e9f97b 3582 if (ret) {
67c5e7d4 3583 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
c9e9f97b 3584 ret = 0;
ec44a35c 3585 break;
c9e9f97b 3586 }
7d9eb12c 3587
f43ffb60
ID
3588 leaf = path->nodes[0];
3589 slot = path->slots[0];
3590 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7d9eb12c 3591
67c5e7d4
FM
3592 if (found_key.objectid != key.objectid) {
3593 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ec44a35c 3594 break;
67c5e7d4 3595 }
7d9eb12c 3596
f43ffb60 3597 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
12907fc7 3598 chunk_type = btrfs_chunk_type(leaf, chunk);
f43ffb60 3599
19a39dce
ID
3600 if (!counting) {
3601 spin_lock(&fs_info->balance_lock);
3602 bctl->stat.considered++;
3603 spin_unlock(&fs_info->balance_lock);
3604 }
3605
2ff7e61e 3606 ret = should_balance_chunk(fs_info, leaf, chunk,
f43ffb60 3607 found_key.offset);
2c9fe835 3608
b3b4aa74 3609 btrfs_release_path(path);
67c5e7d4
FM
3610 if (!ret) {
3611 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
f43ffb60 3612 goto loop;
67c5e7d4 3613 }
f43ffb60 3614
19a39dce 3615 if (counting) {
67c5e7d4 3616 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
19a39dce
ID
3617 spin_lock(&fs_info->balance_lock);
3618 bctl->stat.expected++;
3619 spin_unlock(&fs_info->balance_lock);
12907fc7
DS
3620
3621 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3622 count_data++;
3623 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3624 count_sys++;
3625 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3626 count_meta++;
3627
3628 goto loop;
3629 }
3630
3631 /*
3632 * Apply limit_min filter, no need to check if the LIMITS
3633 * filter is used, limit_min is 0 by default
3634 */
3635 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3636 count_data < bctl->data.limit_min)
3637 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3638 count_meta < bctl->meta.limit_min)
3639 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3640 count_sys < bctl->sys.limit_min)) {
3641 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
19a39dce
ID
3642 goto loop;
3643 }
3644
cf25ce51
LB
3645 ASSERT(fs_info->data_sinfo);
3646 spin_lock(&fs_info->data_sinfo->lock);
3647 bytes_used = fs_info->data_sinfo->bytes_used;
3648 spin_unlock(&fs_info->data_sinfo->lock);
3649
3650 if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3651 !chunk_reserved && !bytes_used) {
2c9fe835
ZL
3652 trans = btrfs_start_transaction(chunk_root, 0);
3653 if (IS_ERR(trans)) {
3654 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3655 ret = PTR_ERR(trans);
3656 goto error;
3657 }
3658
2ff7e61e 3659 ret = btrfs_force_chunk_alloc(trans, fs_info,
2c9fe835 3660 BTRFS_BLOCK_GROUP_DATA);
3a45bb20 3661 btrfs_end_transaction(trans);
2c9fe835
ZL
3662 if (ret < 0) {
3663 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3664 goto error;
3665 }
2c9fe835
ZL
3666 chunk_reserved = 1;
3667 }
3668
5b4aacef 3669 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
67c5e7d4 3670 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
508794eb
JB
3671 if (ret && ret != -ENOSPC)
3672 goto error;
19a39dce 3673 if (ret == -ENOSPC) {
c9e9f97b 3674 enospc_errors++;
19a39dce
ID
3675 } else {
3676 spin_lock(&fs_info->balance_lock);
3677 bctl->stat.completed++;
3678 spin_unlock(&fs_info->balance_lock);
3679 }
f43ffb60 3680loop:
795a3321
ID
3681 if (found_key.offset == 0)
3682 break;
ba1bf481 3683 key.offset = found_key.offset - 1;
ec44a35c 3684 }
c9e9f97b 3685
19a39dce
ID
3686 if (counting) {
3687 btrfs_release_path(path);
3688 counting = false;
3689 goto again;
3690 }
ec44a35c
CM
3691error:
3692 btrfs_free_path(path);
c9e9f97b 3693 if (enospc_errors) {
efe120a0 3694 btrfs_info(fs_info, "%d enospc errors during balance",
5d163e0e 3695 enospc_errors);
c9e9f97b
ID
3696 if (!ret)
3697 ret = -ENOSPC;
3698 }
3699
ec44a35c
CM
3700 return ret;
3701}
3702
0c460c0d
ID
3703/**
3704 * alloc_profile_is_valid - see if a given profile is valid and reduced
3705 * @flags: profile to validate
3706 * @extended: if true @flags is treated as an extended profile
3707 */
3708static int alloc_profile_is_valid(u64 flags, int extended)
3709{
3710 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3711 BTRFS_BLOCK_GROUP_PROFILE_MASK);
3712
3713 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3714
3715 /* 1) check that all other bits are zeroed */
3716 if (flags & ~mask)
3717 return 0;
3718
3719 /* 2) see if profile is reduced */
3720 if (flags == 0)
3721 return !extended; /* "0" is valid for usual profiles */
3722
3723 /* true if exactly one bit set */
3724 return (flags & (flags - 1)) == 0;
3725}
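/*
 * Example: RAID1 is valid in both forms, while "single" is the zero
 * profile and is only valid in the non-extended form; chunk_to_extended()
 * represents it explicitly as BTRFS_AVAIL_ALLOC_BIT_SINGLE.
 */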
3726
837d5b6e
ID
3727static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3728{
a7e99c69
ID
3729 /* cancel requested || normal exit path */
3730 return atomic_read(&fs_info->balance_cancel_req) ||
3731 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3732 atomic_read(&fs_info->balance_cancel_req) == 0);
837d5b6e
ID
3733}
3734
c9e9f97b
ID
3735static void __cancel_balance(struct btrfs_fs_info *fs_info)
3736{
0940ebf6
ID
3737 int ret;
3738
c9e9f97b 3739 unset_balance_control(fs_info);
6bccf3ab 3740 ret = del_balance_item(fs_info);
0f788c58 3741 if (ret)
34d97007 3742 btrfs_handle_fs_error(fs_info, ret, NULL);
ed0fb78f 3743
171938e5 3744 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
c9e9f97b
ID
3745}
3746
bdcd3c97
AM
3747/* Non-zero return value signifies invalidity */
3748static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3749 u64 allowed)
3750{
3751 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3752 (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3753 (bctl_arg->target & ~allowed)));
3754}
3755
c9e9f97b
ID
3756/*
3757 * Should be called with both balance and volume mutexes held
3758 */
3759int btrfs_balance(struct btrfs_balance_control *bctl,
3760 struct btrfs_ioctl_balance_args *bargs)
3761{
3762 struct btrfs_fs_info *fs_info = bctl->fs_info;
14506127 3763 u64 meta_target, data_target;
f43ffb60 3764 u64 allowed;
e4837f8f 3765 int mixed = 0;
c9e9f97b 3766 int ret;
8dabb742 3767 u64 num_devices;
de98ced9 3768 unsigned seq;
c9e9f97b 3769
837d5b6e 3770 if (btrfs_fs_closing(fs_info) ||
a7e99c69
ID
3771 atomic_read(&fs_info->balance_pause_req) ||
3772 atomic_read(&fs_info->balance_cancel_req)) {
c9e9f97b
ID
3773 ret = -EINVAL;
3774 goto out;
3775 }
3776
e4837f8f
ID
3777 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3778 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3779 mixed = 1;
3780
f43ffb60
ID
3781 /*
3782 * In case of mixed groups both data and meta should be picked,
3783 * and identical options should be given for both of them.
3784 */
e4837f8f
ID
3785 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3786 if (mixed && (bctl->flags & allowed)) {
f43ffb60
ID
3787 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3788 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3789 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
5d163e0e
JM
3790 btrfs_err(fs_info,
3791 "with mixed groups data and metadata balance options must be the same");
f43ffb60
ID
3792 ret = -EINVAL;
3793 goto out;
3794 }
3795 }
3796
8dabb742 3797 num_devices = fs_info->fs_devices->num_devices;
73beece9 3798 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
8dabb742
SB
3799 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3800 BUG_ON(num_devices < 1);
3801 num_devices--;
3802 }
73beece9 3803 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
88be159c
AH
3804 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
3805 if (num_devices > 1)
e4d8ec0f 3806 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
8250dabe
AP
3807 if (num_devices > 2)
3808 allowed |= BTRFS_BLOCK_GROUP_RAID5;
3809 if (num_devices > 3)
3810 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3811 BTRFS_BLOCK_GROUP_RAID6);
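	/*
	 * To restate the checks above: with one device only single and dup
	 * conversion targets are allowed, two devices add raid0/raid1, three
	 * devices add raid5, and four or more add raid10 and raid6.
	 */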
bdcd3c97 3812 if (validate_convert_profile(&bctl->data, allowed)) {
5d163e0e
JM
3813 btrfs_err(fs_info,
3814 "unable to start balance with target data profile %llu",
3815 bctl->data.target);
e4d8ec0f
ID
3816 ret = -EINVAL;
3817 goto out;
3818 }
bdcd3c97 3819 if (validate_convert_profile(&bctl->meta, allowed)) {
efe120a0 3820 btrfs_err(fs_info,
5d163e0e
JM
3821 "unable to start balance with target metadata profile %llu",
3822 bctl->meta.target);
e4d8ec0f
ID
3823 ret = -EINVAL;
3824 goto out;
3825 }
bdcd3c97 3826 if (validate_convert_profile(&bctl->sys, allowed)) {
efe120a0 3827 btrfs_err(fs_info,
5d163e0e
JM
3828 "unable to start balance with target system profile %llu",
3829 bctl->sys.target);
e4d8ec0f
ID
3830 ret = -EINVAL;
3831 goto out;
3832 }
3833
e4d8ec0f
ID
3834 /* allow reducing meta or sys integrity only if 'force' is set */
3835 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
53b381b3
DW
3836 BTRFS_BLOCK_GROUP_RAID10 |
3837 BTRFS_BLOCK_GROUP_RAID5 |
3838 BTRFS_BLOCK_GROUP_RAID6;
de98ced9
MX
3839 do {
3840 seq = read_seqbegin(&fs_info->profiles_lock);
3841
3842 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3843 (fs_info->avail_system_alloc_bits & allowed) &&
3844 !(bctl->sys.target & allowed)) ||
3845 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3846 (fs_info->avail_metadata_alloc_bits & allowed) &&
3847 !(bctl->meta.target & allowed))) {
3848 if (bctl->flags & BTRFS_BALANCE_FORCE) {
5d163e0e
JM
3849 btrfs_info(fs_info,
3850 "force reducing metadata integrity");
de98ced9 3851 } else {
5d163e0e
JM
3852 btrfs_err(fs_info,
3853 "balance will reduce metadata integrity, use force if you want this");
de98ced9
MX
3854 ret = -EINVAL;
3855 goto out;
3856 }
e4d8ec0f 3857 }
de98ced9 3858 } while (read_seqretry(&fs_info->profiles_lock, seq));
e4d8ec0f 3859
14506127
AB
3860 /* if we're not converting, the target field is uninitialized */
3861 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
3862 bctl->meta.target : fs_info->avail_metadata_alloc_bits;
3863 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
3864 bctl->data.target : fs_info->avail_data_alloc_bits;
3865 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
3866 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
ee592d07 3867 btrfs_warn(fs_info,
5d163e0e 3868 "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
14506127 3869 meta_target, data_target);
ee592d07
ST
3870 }
3871
6bccf3ab 3872 ret = insert_balance_item(fs_info, bctl);
59641015 3873 if (ret && ret != -EEXIST)
0940ebf6
ID
3874 goto out;
3875
59641015
ID
3876 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3877 BUG_ON(ret == -EEXIST);
3878 set_balance_control(bctl);
3879 } else {
3880 BUG_ON(ret != -EEXIST);
3881 spin_lock(&fs_info->balance_lock);
3882 update_balance_args(bctl);
3883 spin_unlock(&fs_info->balance_lock);
3884 }
c9e9f97b 3885
837d5b6e 3886 atomic_inc(&fs_info->balance_running);
c9e9f97b
ID
3887 mutex_unlock(&fs_info->balance_mutex);
3888
3889 ret = __btrfs_balance(fs_info);
3890
3891 mutex_lock(&fs_info->balance_mutex);
837d5b6e 3892 atomic_dec(&fs_info->balance_running);
c9e9f97b
ID
3893
3894 if (bargs) {
3895 memset(bargs, 0, sizeof(*bargs));
19a39dce 3896 update_ioctl_balance_args(fs_info, 0, bargs);
c9e9f97b
ID
3897 }
3898
3a01aa7a
ID
3899 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3900 balance_need_close(fs_info)) {
3901 __cancel_balance(fs_info);
3902 }
3903
837d5b6e 3904 wake_up(&fs_info->balance_wait_q);
c9e9f97b
ID
3905
3906 return ret;
3907out:
59641015
ID
3908 if (bctl->flags & BTRFS_BALANCE_RESUME)
3909 __cancel_balance(fs_info);
ed0fb78f 3910 else {
59641015 3911 kfree(bctl);
171938e5 3912 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
ed0fb78f 3913 }
59641015
ID
3914 return ret;
3915}
3916
3917static int balance_kthread(void *data)
3918{
2b6ba629 3919 struct btrfs_fs_info *fs_info = data;
9555c6c1 3920 int ret = 0;
59641015
ID
3921
3922 mutex_lock(&fs_info->volume_mutex);
3923 mutex_lock(&fs_info->balance_mutex);
3924
2b6ba629 3925 if (fs_info->balance_ctl) {
efe120a0 3926 btrfs_info(fs_info, "continuing balance");
2b6ba629 3927 ret = btrfs_balance(fs_info->balance_ctl, NULL);
9555c6c1 3928 }
59641015
ID
3929
3930 mutex_unlock(&fs_info->balance_mutex);
3931 mutex_unlock(&fs_info->volume_mutex);
2b6ba629 3932
59641015
ID
3933 return ret;
3934}
3935
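/*
 * Kick off a background resume of an interrupted balance. Called on mount:
 * if a balance_ctl was reconstructed from the balance item, restart the
 * operation on a kthread unless the "skip_balance" mount option is set.
 */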
2b6ba629
ID
3936int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3937{
3938 struct task_struct *tsk;
3939
3940 spin_lock(&fs_info->balance_lock);
3941 if (!fs_info->balance_ctl) {
3942 spin_unlock(&fs_info->balance_lock);
3943 return 0;
3944 }
3945 spin_unlock(&fs_info->balance_lock);
3946
3cdde224 3947 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
efe120a0 3948 btrfs_info(fs_info, "force skipping balance");
2b6ba629
ID
3949 return 0;
3950 }
3951
3952 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
cd633972 3953 return PTR_ERR_OR_ZERO(tsk);
2b6ba629
ID
3954}
3955
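/*
 * Read the balance item left in the tree root (if any) and rebuild
 * balance_ctl from it, marking it BTRFS_BALANCE_RESUME so a later
 * btrfs_balance() call continues instead of starting from scratch.
 */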
68310a5e 3956int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
59641015 3957{
59641015
ID
3958 struct btrfs_balance_control *bctl;
3959 struct btrfs_balance_item *item;
3960 struct btrfs_disk_balance_args disk_bargs;
3961 struct btrfs_path *path;
3962 struct extent_buffer *leaf;
3963 struct btrfs_key key;
3964 int ret;
3965
3966 path = btrfs_alloc_path();
3967 if (!path)
3968 return -ENOMEM;
3969
59641015 3970 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 3971 key.type = BTRFS_TEMPORARY_ITEM_KEY;
59641015
ID
3972 key.offset = 0;
3973
68310a5e 3974 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
59641015 3975 if (ret < 0)
68310a5e 3976 goto out;
59641015
ID
3977 if (ret > 0) { /* ret = -ENOENT; */
3978 ret = 0;
68310a5e
ID
3979 goto out;
3980 }
3981
3982 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3983 if (!bctl) {
3984 ret = -ENOMEM;
3985 goto out;
59641015
ID
3986 }
3987
3988 leaf = path->nodes[0];
3989 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3990
68310a5e
ID
3991 bctl->fs_info = fs_info;
3992 bctl->flags = btrfs_balance_flags(leaf, item);
3993 bctl->flags |= BTRFS_BALANCE_RESUME;
59641015
ID
3994
3995 btrfs_balance_data(leaf, item, &disk_bargs);
3996 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3997 btrfs_balance_meta(leaf, item, &disk_bargs);
3998 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3999 btrfs_balance_sys(leaf, item, &disk_bargs);
4000 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4001
171938e5 4002 WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
ed0fb78f 4003
68310a5e
ID
4004 mutex_lock(&fs_info->volume_mutex);
4005 mutex_lock(&fs_info->balance_mutex);
59641015 4006
68310a5e
ID
4007 set_balance_control(bctl);
4008
4009 mutex_unlock(&fs_info->balance_mutex);
4010 mutex_unlock(&fs_info->volume_mutex);
59641015
ID
4011out:
4012 btrfs_free_path(path);
ec44a35c
CM
4013 return ret;
4014}
4015
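/*
 * Pause a running balance: raise balance_pause_req and wait until the
 * balance thread leaves __btrfs_balance(). The balance item and
 * balance_ctl stay around, so the operation can be resumed afterwards.
 */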
837d5b6e
ID
4016int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4017{
4018 int ret = 0;
4019
4020 mutex_lock(&fs_info->balance_mutex);
4021 if (!fs_info->balance_ctl) {
4022 mutex_unlock(&fs_info->balance_mutex);
4023 return -ENOTCONN;
4024 }
4025
4026 if (atomic_read(&fs_info->balance_running)) {
4027 atomic_inc(&fs_info->balance_pause_req);
4028 mutex_unlock(&fs_info->balance_mutex);
4029
4030 wait_event(fs_info->balance_wait_q,
4031 atomic_read(&fs_info->balance_running) == 0);
4032
4033 mutex_lock(&fs_info->balance_mutex);
4034 /* we are good with balance_ctl ripped off from under us */
4035 BUG_ON(atomic_read(&fs_info->balance_running));
4036 atomic_dec(&fs_info->balance_pause_req);
4037 } else {
4038 ret = -ENOTCONN;
4039 }
4040
4041 mutex_unlock(&fs_info->balance_mutex);
4042 return ret;
4043}
4044
a7e99c69
ID
4045int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4046{
bc98a42c 4047 if (sb_rdonly(fs_info->sb))
e649e587
ID
4048 return -EROFS;
4049
a7e99c69
ID
4050 mutex_lock(&fs_info->balance_mutex);
4051 if (!fs_info->balance_ctl) {
4052 mutex_unlock(&fs_info->balance_mutex);
4053 return -ENOTCONN;
4054 }
4055
4056 atomic_inc(&fs_info->balance_cancel_req);
4057 /*
4058 * if we are running just wait and return, balance item is
4059 * deleted in btrfs_balance in this case
4060 */
4061 if (atomic_read(&fs_info->balance_running)) {
4062 mutex_unlock(&fs_info->balance_mutex);
4063 wait_event(fs_info->balance_wait_q,
4064 atomic_read(&fs_info->balance_running) == 0);
4065 mutex_lock(&fs_info->balance_mutex);
4066 } else {
4067 /* __cancel_balance needs volume_mutex */
4068 mutex_unlock(&fs_info->balance_mutex);
4069 mutex_lock(&fs_info->volume_mutex);
4070 mutex_lock(&fs_info->balance_mutex);
4071
4072 if (fs_info->balance_ctl)
4073 __cancel_balance(fs_info);
4074
4075 mutex_unlock(&fs_info->volume_mutex);
4076 }
4077
4078 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
4079 atomic_dec(&fs_info->balance_cancel_req);
4080 mutex_unlock(&fs_info->balance_mutex);
4081 return 0;
4082}
4083
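/*
 * Background scanner that walks every root item in the tree root and adds
 * uuid tree entries for each subvolume uuid / received_uuid it finds,
 * reserving room for the two uuid items per subvolume in one transaction.
 */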
803b2f54
SB
4084static int btrfs_uuid_scan_kthread(void *data)
4085{
4086 struct btrfs_fs_info *fs_info = data;
4087 struct btrfs_root *root = fs_info->tree_root;
4088 struct btrfs_key key;
803b2f54
SB
4089 struct btrfs_path *path = NULL;
4090 int ret = 0;
4091 struct extent_buffer *eb;
4092 int slot;
4093 struct btrfs_root_item root_item;
4094 u32 item_size;
f45388f3 4095 struct btrfs_trans_handle *trans = NULL;
803b2f54
SB
4096
4097 path = btrfs_alloc_path();
4098 if (!path) {
4099 ret = -ENOMEM;
4100 goto out;
4101 }
4102
4103 key.objectid = 0;
4104 key.type = BTRFS_ROOT_ITEM_KEY;
4105 key.offset = 0;
4106
803b2f54 4107 while (1) {
6174d3cb 4108 ret = btrfs_search_forward(root, &key, path, 0);
803b2f54
SB
4109 if (ret) {
4110 if (ret > 0)
4111 ret = 0;
4112 break;
4113 }
4114
4115 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4116 (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4117 key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4118 key.objectid > BTRFS_LAST_FREE_OBJECTID)
4119 goto skip;
4120
4121 eb = path->nodes[0];
4122 slot = path->slots[0];
4123 item_size = btrfs_item_size_nr(eb, slot);
4124 if (item_size < sizeof(root_item))
4125 goto skip;
4126
803b2f54
SB
4127 read_extent_buffer(eb, &root_item,
4128 btrfs_item_ptr_offset(eb, slot),
4129 (int)sizeof(root_item));
4130 if (btrfs_root_refs(&root_item) == 0)
4131 goto skip;
f45388f3
FDBM
4132
4133 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4134 !btrfs_is_empty_uuid(root_item.received_uuid)) {
4135 if (trans)
4136 goto update_tree;
4137
4138 btrfs_release_path(path);
803b2f54
SB
4139 /*
4140 * 1 - subvol uuid item
4141 * 1 - received_subvol uuid item
4142 */
4143 trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4144 if (IS_ERR(trans)) {
4145 ret = PTR_ERR(trans);
4146 break;
4147 }
f45388f3
FDBM
4148 continue;
4149 } else {
4150 goto skip;
4151 }
4152update_tree:
4153 if (!btrfs_is_empty_uuid(root_item.uuid)) {
6bccf3ab 4154 ret = btrfs_uuid_tree_add(trans, fs_info,
803b2f54
SB
4155 root_item.uuid,
4156 BTRFS_UUID_KEY_SUBVOL,
4157 key.objectid);
4158 if (ret < 0) {
efe120a0 4159 btrfs_warn(fs_info, "uuid_tree_add failed %d",
803b2f54 4160 ret);
803b2f54
SB
4161 break;
4162 }
4163 }
4164
4165 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
6bccf3ab 4166 ret = btrfs_uuid_tree_add(trans, fs_info,
803b2f54
SB
4167 root_item.received_uuid,
4168 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4169 key.objectid);
4170 if (ret < 0) {
efe120a0 4171 btrfs_warn(fs_info, "uuid_tree_add failed %d",
803b2f54 4172 ret);
803b2f54
SB
4173 break;
4174 }
4175 }
4176
f45388f3 4177skip:
803b2f54 4178 if (trans) {
3a45bb20 4179 ret = btrfs_end_transaction(trans);
f45388f3 4180 trans = NULL;
803b2f54
SB
4181 if (ret)
4182 break;
4183 }
4184
803b2f54
SB
4185 btrfs_release_path(path);
4186 if (key.offset < (u64)-1) {
4187 key.offset++;
4188 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4189 key.offset = 0;
4190 key.type = BTRFS_ROOT_ITEM_KEY;
4191 } else if (key.objectid < (u64)-1) {
4192 key.offset = 0;
4193 key.type = BTRFS_ROOT_ITEM_KEY;
4194 key.objectid++;
4195 } else {
4196 break;
4197 }
4198 cond_resched();
4199 }
4200
4201out:
4202 btrfs_free_path(path);
f45388f3 4203 if (trans && !IS_ERR(trans))
3a45bb20 4204 btrfs_end_transaction(trans);
803b2f54 4205 if (ret)
efe120a0 4206 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
70f80175 4207 else
afcdd129 4208 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
803b2f54
SB
4209 up(&fs_info->uuid_tree_rescan_sem);
4210 return 0;
4211}
4212
70f80175
SB
4213/*
4214 * Callback for btrfs_uuid_tree_iterate().
4215 * returns:
4216 * 0 check succeeded, the entry is not outdated.
bb7ab3b9 4217 * < 0 if an error occurred.
70f80175
SB
4218 * > 0 if the check failed, which means the caller shall remove the entry.
4219 */
4220static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
4221 u8 *uuid, u8 type, u64 subid)
4222{
4223 struct btrfs_key key;
4224 int ret = 0;
4225 struct btrfs_root *subvol_root;
4226
4227 if (type != BTRFS_UUID_KEY_SUBVOL &&
4228 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
4229 goto out;
4230
4231 key.objectid = subid;
4232 key.type = BTRFS_ROOT_ITEM_KEY;
4233 key.offset = (u64)-1;
4234 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
4235 if (IS_ERR(subvol_root)) {
4236 ret = PTR_ERR(subvol_root);
4237 if (ret == -ENOENT)
4238 ret = 1;
4239 goto out;
4240 }
4241
4242 switch (type) {
4243 case BTRFS_UUID_KEY_SUBVOL:
4244 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
4245 ret = 1;
4246 break;
4247 case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
4248 if (memcmp(uuid, subvol_root->root_item.received_uuid,
4249 BTRFS_UUID_SIZE))
4250 ret = 1;
4251 break;
4252 }
4253
4254out:
4255 return ret;
4256}
4257
4258static int btrfs_uuid_rescan_kthread(void *data)
4259{
4260 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
4261 int ret;
4262
4263 /*
4264 * 1st step is to iterate through the existing UUID tree and
4265 * to delete all entries that contain outdated data.
4266 * 2nd step is to add all missing entries to the UUID tree.
4267 */
4268 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
4269 if (ret < 0) {
efe120a0 4270 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
70f80175
SB
4271 up(&fs_info->uuid_tree_rescan_sem);
4272 return ret;
4273 }
4274 return btrfs_uuid_scan_kthread(data);
4275}
4276
f7a81ea4
SB
4277int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4278{
4279 struct btrfs_trans_handle *trans;
4280 struct btrfs_root *tree_root = fs_info->tree_root;
4281 struct btrfs_root *uuid_root;
803b2f54
SB
4282 struct task_struct *task;
4283 int ret;
f7a81ea4
SB
4284
4285 /*
4286 * 1 - root node
4287 * 1 - root item
4288 */
4289 trans = btrfs_start_transaction(tree_root, 2);
4290 if (IS_ERR(trans))
4291 return PTR_ERR(trans);
4292
4293 uuid_root = btrfs_create_tree(trans, fs_info,
4294 BTRFS_UUID_TREE_OBJECTID);
4295 if (IS_ERR(uuid_root)) {
6d13f549 4296 ret = PTR_ERR(uuid_root);
66642832 4297 btrfs_abort_transaction(trans, ret);
3a45bb20 4298 btrfs_end_transaction(trans);
6d13f549 4299 return ret;
f7a81ea4
SB
4300 }
4301
4302 fs_info->uuid_root = uuid_root;
4303
3a45bb20 4304 ret = btrfs_commit_transaction(trans);
803b2f54
SB
4305 if (ret)
4306 return ret;
4307
4308 down(&fs_info->uuid_tree_rescan_sem);
4309 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4310 if (IS_ERR(task)) {
70f80175 4311 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
efe120a0 4312 btrfs_warn(fs_info, "failed to start uuid_scan task");
803b2f54
SB
4313 up(&fs_info->uuid_tree_rescan_sem);
4314 return PTR_ERR(task);
4315 }
4316
4317 return 0;
f7a81ea4 4318}
803b2f54 4319
70f80175
SB
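/*
 * Like btrfs_create_uuid_tree()'s scan, but for an existing uuid tree:
 * first drop stale entries, then re-add anything missing (see
 * btrfs_uuid_rescan_kthread() above).
 */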
4320int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
4321{
4322 struct task_struct *task;
4323
4324 down(&fs_info->uuid_tree_rescan_sem);
4325 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
4326 if (IS_ERR(task)) {
4327 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
efe120a0 4328 btrfs_warn(fs_info, "failed to start uuid_rescan task");
70f80175
SB
4329 up(&fs_info->uuid_tree_rescan_sem);
4330 return PTR_ERR(task);
4331 }
4332
4333 return 0;
4334}
4335
8f18cf13
CM
4336/*
4337 * shrinking a device means finding all of the device extents past
4338 * the new size, and then following the back refs to the chunks.
4339 * The chunk relocation code actually frees the device extent
4340 */
4341int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4342{
0b246afa
JM
4343 struct btrfs_fs_info *fs_info = device->fs_info;
4344 struct btrfs_root *root = fs_info->dev_root;
8f18cf13 4345 struct btrfs_trans_handle *trans;
8f18cf13
CM
4346 struct btrfs_dev_extent *dev_extent = NULL;
4347 struct btrfs_path *path;
4348 u64 length;
8f18cf13
CM
4349 u64 chunk_offset;
4350 int ret;
4351 int slot;
ba1bf481
JB
4352 int failed = 0;
4353 bool retried = false;
53e489bc 4354 bool checked_pending_chunks = false;
8f18cf13
CM
4355 struct extent_buffer *l;
4356 struct btrfs_key key;
0b246afa 4357 struct btrfs_super_block *super_copy = fs_info->super_copy;
8f18cf13 4358 u64 old_total = btrfs_super_total_bytes(super_copy);
7cc8e58d 4359 u64 old_size = btrfs_device_get_total_bytes(device);
7dfb8be1
NB
4360 u64 diff;
4361
4362 new_size = round_down(new_size, fs_info->sectorsize);
0e4324a4 4363 diff = round_down(old_size - new_size, fs_info->sectorsize);
8f18cf13 4364
63a212ab
SB
4365 if (device->is_tgtdev_for_dev_replace)
4366 return -EINVAL;
4367
8f18cf13
CM
4368 path = btrfs_alloc_path();
4369 if (!path)
4370 return -ENOMEM;
4371
e4058b54 4372 path->reada = READA_FORWARD;
8f18cf13 4373
34441361 4374 mutex_lock(&fs_info->chunk_mutex);
7d9eb12c 4375
7cc8e58d 4376 btrfs_device_set_total_bytes(device, new_size);
2bf64758 4377 if (device->writeable) {
2b82032c 4378 device->fs_devices->total_rw_bytes -= diff;
a5ed45f8 4379 atomic64_sub(diff, &fs_info->free_chunk_space);
2bf64758 4380 }
34441361 4381 mutex_unlock(&fs_info->chunk_mutex);
8f18cf13 4382
ba1bf481 4383again:
8f18cf13
CM
4384 key.objectid = device->devid;
4385 key.offset = (u64)-1;
4386 key.type = BTRFS_DEV_EXTENT_KEY;
4387
213e64da 4388 do {
0b246afa 4389 mutex_lock(&fs_info->delete_unused_bgs_mutex);
8f18cf13 4390 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
67c5e7d4 4391 if (ret < 0) {
0b246afa 4392 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
8f18cf13 4393 goto done;
67c5e7d4 4394 }
8f18cf13
CM
4395
4396 ret = btrfs_previous_item(root, path, 0, key.type);
67c5e7d4 4397 if (ret)
0b246afa 4398 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
8f18cf13
CM
4399 if (ret < 0)
4400 goto done;
4401 if (ret) {
4402 ret = 0;
b3b4aa74 4403 btrfs_release_path(path);
bf1fb512 4404 break;
8f18cf13
CM
4405 }
4406
4407 l = path->nodes[0];
4408 slot = path->slots[0];
4409 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4410
ba1bf481 4411 if (key.objectid != device->devid) {
0b246afa 4412 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
b3b4aa74 4413 btrfs_release_path(path);
bf1fb512 4414 break;
ba1bf481 4415 }
8f18cf13
CM
4416
4417 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4418 length = btrfs_dev_extent_length(l, dev_extent);
4419
ba1bf481 4420 if (key.offset + length <= new_size) {
0b246afa 4421 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
b3b4aa74 4422 btrfs_release_path(path);
d6397bae 4423 break;
ba1bf481 4424 }
8f18cf13 4425
8f18cf13 4426 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
b3b4aa74 4427 btrfs_release_path(path);
8f18cf13 4428
0b246afa
JM
4429 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4430 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ba1bf481 4431 if (ret && ret != -ENOSPC)
8f18cf13 4432 goto done;
ba1bf481
JB
4433 if (ret == -ENOSPC)
4434 failed++;
213e64da 4435 } while (key.offset-- > 0);
ba1bf481
JB
4436
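	/*
	 * Some relocations may have failed with -ENOSPC; retry the whole
	 * scan once, since chunks relocated successfully in the first pass
	 * may have freed enough space for the remaining ones to succeed.
	 */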
4437 if (failed && !retried) {
4438 failed = 0;
4439 retried = true;
4440 goto again;
4441 } else if (failed && retried) {
4442 ret = -ENOSPC;
ba1bf481 4443 goto done;
8f18cf13
CM
4444 }
4445
d6397bae 4446 /* Shrinking succeeded, else we would be at "done". */
a22285a6 4447 trans = btrfs_start_transaction(root, 0);
98d5dc13
TI
4448 if (IS_ERR(trans)) {
4449 ret = PTR_ERR(trans);
4450 goto done;
4451 }
4452
34441361 4453 mutex_lock(&fs_info->chunk_mutex);
53e489bc
FM
4454
4455 /*
4456 * We checked in the above loop all device extents that were already in
4457 * the device tree. However before we have updated the device's
4458 * total_bytes to the new size, we might have had chunk allocations that
4459 * have not completed yet (new block groups attached to transaction
4460 * handles), and therefore their device extents were not yet in the
4461 * device tree and we missed them in the loop above. So if we have any
4462 * pending chunk using a device extent that overlaps the device range
4463 * that we can no longer use, commit the current transaction and
4464 * repeat the search on the device tree - this way we guarantee we will
4465 * not have chunks using device extents that end beyond 'new_size'.
4466 */
4467 if (!checked_pending_chunks) {
4468 u64 start = new_size;
4469 u64 len = old_size - new_size;
4470
499f377f
JM
4471 if (contains_pending_extent(trans->transaction, device,
4472 &start, len)) {
34441361 4473 mutex_unlock(&fs_info->chunk_mutex);
53e489bc
FM
4474 checked_pending_chunks = true;
4475 failed = 0;
4476 retried = false;
3a45bb20 4477 ret = btrfs_commit_transaction(trans);
53e489bc
FM
4478 if (ret)
4479 goto done;
4480 goto again;
4481 }
4482 }
4483
7cc8e58d 4484 btrfs_device_set_disk_total_bytes(device, new_size);
935e5cc9
MX
4485 if (list_empty(&device->resized_list))
4486 list_add_tail(&device->resized_list,
0b246afa 4487 &fs_info->fs_devices->resized_devices);
d6397bae 4488
d6397bae 4489 WARN_ON(diff > old_total);
7dfb8be1
NB
4490 btrfs_set_super_total_bytes(super_copy,
4491 round_down(old_total - diff, fs_info->sectorsize));
34441361 4492 mutex_unlock(&fs_info->chunk_mutex);
2196d6e8
MX
4493
4494 /* Now btrfs_update_device() will change the on-disk size. */
4495 ret = btrfs_update_device(trans, device);
3a45bb20 4496 btrfs_end_transaction(trans);
8f18cf13
CM
4497done:
4498 btrfs_free_path(path);
53e489bc 4499 if (ret) {
34441361 4500 mutex_lock(&fs_info->chunk_mutex);
53e489bc
FM
4501 btrfs_device_set_total_bytes(device, old_size);
4502 if (device->writeable)
4503 device->fs_devices->total_rw_bytes += diff;
a5ed45f8 4504 atomic64_add(diff, &fs_info->free_chunk_space);
34441361 4505 mutex_unlock(&fs_info->chunk_mutex);
53e489bc 4506 }
8f18cf13
CM
4507 return ret;
4508}
4509
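/*
 * Append a chunk item (preceded by its disk key) to the sys_chunk_array in
 * the in-memory superblock, so system chunks can be found at mount time
 * before the chunk tree itself is readable.
 */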
2ff7e61e 4510static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
0b86a832
CM
4511 struct btrfs_key *key,
4512 struct btrfs_chunk *chunk, int item_size)
4513{
0b246afa 4514 struct btrfs_super_block *super_copy = fs_info->super_copy;
0b86a832
CM
4515 struct btrfs_disk_key disk_key;
4516 u32 array_size;
4517 u8 *ptr;
4518
34441361 4519 mutex_lock(&fs_info->chunk_mutex);
0b86a832 4520 array_size = btrfs_super_sys_array_size(super_copy);
5f43f86e 4521 if (array_size + item_size + sizeof(disk_key)
fe48a5c0 4522 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
34441361 4523 mutex_unlock(&fs_info->chunk_mutex);
0b86a832 4524 return -EFBIG;
fe48a5c0 4525 }
0b86a832
CM
4526
4527 ptr = super_copy->sys_chunk_array + array_size;
4528 btrfs_cpu_key_to_disk(&disk_key, key);
4529 memcpy(ptr, &disk_key, sizeof(disk_key));
4530 ptr += sizeof(disk_key);
4531 memcpy(ptr, chunk, item_size);
4532 item_size += sizeof(disk_key);
4533 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
34441361 4534 mutex_unlock(&fs_info->chunk_mutex);
fe48a5c0 4535
0b86a832
CM
4536 return 0;
4537}
4538
73c5de00
AJ
4539/*
4540 * sort the devices in descending order by max_avail, total_avail
4541 */
4542static int btrfs_cmp_device_info(const void *a, const void *b)
9b3f68b9 4543{
73c5de00
AJ
4544 const struct btrfs_device_info *di_a = a;
4545 const struct btrfs_device_info *di_b = b;
9b3f68b9 4546
73c5de00 4547 if (di_a->max_avail > di_b->max_avail)
b2117a39 4548 return -1;
73c5de00 4549 if (di_a->max_avail < di_b->max_avail)
b2117a39 4550 return 1;
73c5de00
AJ
4551 if (di_a->total_avail > di_b->total_avail)
4552 return -1;
4553 if (di_a->total_avail < di_b->total_avail)
4554 return 1;
4555 return 0;
b2117a39 4556}
0b86a832 4557
53b381b3
DW
4558static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4559{
ffe2d203 4560 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
53b381b3
DW
4561 return;
4562
ceda0864 4563 btrfs_set_fs_incompat(info, RAID56);
53b381b3
DW
4564}
4565
da17066c 4566#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \
23f8f9b7
GH
4567 - sizeof(struct btrfs_chunk)) \
4568 / sizeof(struct btrfs_stripe) + 1)
4569
4570#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
4571 - 2 * sizeof(struct btrfs_disk_key) \
4572 - 2 * sizeof(struct btrfs_chunk)) \
4573 / sizeof(struct btrfs_stripe) + 1)
4574
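/*
 * Rough sanity check of the second macro, assuming the usual on-disk sizes
 * (BTRFS_SYSTEM_CHUNK_ARRAY_SIZE = 2048, disk key = 17 bytes, chunk item =
 * 80 bytes including its embedded first stripe, stripe = 32 bytes):
 *
 *	(2048 - 2 * 17 - 2 * 80) / 32 + 1 = 58
 *
 * so a system chunk is limited to roughly 58 stripes; the "+ 1" accounts
 * for the stripe embedded in struct btrfs_chunk itself.
 */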
73c5de00 4575static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
72b468c8 4576 u64 start, u64 type)
b2117a39 4577{
2ff7e61e 4578 struct btrfs_fs_info *info = trans->fs_info;
73c5de00 4579 struct btrfs_fs_devices *fs_devices = info->fs_devices;
ebcc9301 4580 struct btrfs_device *device;
73c5de00
AJ
4581 struct map_lookup *map = NULL;
4582 struct extent_map_tree *em_tree;
4583 struct extent_map *em;
4584 struct btrfs_device_info *devices_info = NULL;
4585 u64 total_avail;
4586 int num_stripes; /* total number of stripes to allocate */
53b381b3
DW
4587 int data_stripes; /* number of stripes that count for
4588 block group size */
73c5de00
AJ
4589 int sub_stripes; /* sub_stripes info for map */
4590 int dev_stripes; /* stripes per dev */
4591 int devs_max; /* max devs to use */
4592 int devs_min; /* min devs needed */
4593 int devs_increment; /* ndevs has to be a multiple of this */
4594 int ncopies; /* how many copies of the data there are */
4595 int ret;
4596 u64 max_stripe_size;
4597 u64 max_chunk_size;
4598 u64 stripe_size;
4599 u64 num_bytes;
4600 int ndevs;
4601 int i;
4602 int j;
31e50229 4603 int index;
593060d7 4604
0c460c0d 4605 BUG_ON(!alloc_profile_is_valid(type, 0));
9b3f68b9 4606
73c5de00
AJ
4607 if (list_empty(&fs_devices->alloc_list))
4608 return -ENOSPC;
b2117a39 4609
31e50229 4610 index = __get_raid_index(type);
73c5de00 4611
31e50229
LB
4612 sub_stripes = btrfs_raid_array[index].sub_stripes;
4613 dev_stripes = btrfs_raid_array[index].dev_stripes;
4614 devs_max = btrfs_raid_array[index].devs_max;
4615 devs_min = btrfs_raid_array[index].devs_min;
4616 devs_increment = btrfs_raid_array[index].devs_increment;
4617 ncopies = btrfs_raid_array[index].ncopies;
b2117a39 4618
9b3f68b9 4619 if (type & BTRFS_BLOCK_GROUP_DATA) {
ee22184b 4620 max_stripe_size = SZ_1G;
73c5de00 4621 max_chunk_size = 10 * max_stripe_size;
23f8f9b7
GH
4622 if (!devs_max)
4623 devs_max = BTRFS_MAX_DEVS(info->chunk_root);
9b3f68b9 4624 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1100373f 4625 /* for larger filesystems, use larger metadata chunks */
ee22184b
BL
4626 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4627 max_stripe_size = SZ_1G;
1100373f 4628 else
ee22184b 4629 max_stripe_size = SZ_256M;
73c5de00 4630 max_chunk_size = max_stripe_size;
23f8f9b7
GH
4631 if (!devs_max)
4632 devs_max = BTRFS_MAX_DEVS(info->chunk_root);
a40a90a0 4633 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
ee22184b 4634 max_stripe_size = SZ_32M;
73c5de00 4635 max_chunk_size = 2 * max_stripe_size;
23f8f9b7
GH
4636 if (!devs_max)
4637 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
73c5de00 4638 } else {
351fd353 4639 btrfs_err(info, "invalid chunk type 0x%llx requested",
73c5de00
AJ
4640 type);
4641 BUG_ON(1);
9b3f68b9
CM
4642 }
4643
2b82032c
YZ
4644 /* we don't want a chunk larger than 10% of writeable space */
4645 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4646 max_chunk_size);
9b3f68b9 4647
31e818fe 4648 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
73c5de00
AJ
4649 GFP_NOFS);
4650 if (!devices_info)
4651 return -ENOMEM;
0cad8a11 4652
9f680ce0 4653 /*
73c5de00
AJ
4654 * in the first pass through the devices list, we gather information
4655 * about the available holes on each device.
9f680ce0 4656 */
73c5de00 4657 ndevs = 0;
ebcc9301 4658 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
73c5de00
AJ
4659 u64 max_avail;
4660 u64 dev_offset;
b2117a39 4661
73c5de00 4662 if (!device->writeable) {
31b1a2bd 4663 WARN(1, KERN_ERR
efe120a0 4664 "BTRFS: read-only device in alloc_list\n");
73c5de00
AJ
4665 continue;
4666 }
b2117a39 4667
63a212ab
SB
4668 if (!device->in_fs_metadata ||
4669 device->is_tgtdev_for_dev_replace)
73c5de00 4670 continue;
b2117a39 4671
73c5de00
AJ
4672 if (device->total_bytes > device->bytes_used)
4673 total_avail = device->total_bytes - device->bytes_used;
4674 else
4675 total_avail = 0;
38c01b96 4676
4677 /* If there is no space on this device, skip it. */
4678 if (total_avail == 0)
4679 continue;
b2117a39 4680
6df9a95e 4681 ret = find_free_dev_extent(trans, device,
73c5de00
AJ
4682 max_stripe_size * dev_stripes,
4683 &dev_offset, &max_avail);
4684 if (ret && ret != -ENOSPC)
4685 goto error;
b2117a39 4686
73c5de00
AJ
4687 if (ret == 0)
4688 max_avail = max_stripe_size * dev_stripes;
b2117a39 4689
73c5de00
AJ
4690 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
4691 continue;
b2117a39 4692
063d006f
ES
4693 if (ndevs == fs_devices->rw_devices) {
4694 WARN(1, "%s: found more than %llu devices\n",
4695 __func__, fs_devices->rw_devices);
4696 break;
4697 }
73c5de00
AJ
4698 devices_info[ndevs].dev_offset = dev_offset;
4699 devices_info[ndevs].max_avail = max_avail;
4700 devices_info[ndevs].total_avail = total_avail;
4701 devices_info[ndevs].dev = device;
4702 ++ndevs;
4703 }
b2117a39 4704
73c5de00
AJ
4705 /*
4706 * now sort the devices by hole size / available space
4707 */
4708 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4709 btrfs_cmp_device_info, NULL);
b2117a39 4710
73c5de00 4711 /* round down to number of usable stripes */
e5600fd6 4712 ndevs = round_down(ndevs, devs_increment);
b2117a39 4713
73c5de00
AJ
4714 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
4715 ret = -ENOSPC;
4716 goto error;
b2117a39 4717 }
9f680ce0 4718
f148ef4d
NB
4719 ndevs = min(ndevs, devs_max);
4720
73c5de00
AJ
4721 /*
4722 * the primary goal is to maximize the number of stripes, so use as many
4723 * devices as possible, even if the stripes are not maximum sized.
4724 */
4725 stripe_size = devices_info[ndevs-1].max_avail;
4726 num_stripes = ndevs * dev_stripes;
b2117a39 4727
53b381b3
DW
4728 /*
4729 * this will have to be fixed for RAID1 and RAID10 over
4730 * more drives
4731 */
4732 data_stripes = num_stripes / ncopies;
4733
500ceed8 4734 if (type & BTRFS_BLOCK_GROUP_RAID5)
53b381b3 4735 data_stripes = num_stripes - 1;
500ceed8
NB
4736
4737 if (type & BTRFS_BLOCK_GROUP_RAID6)
53b381b3 4738 data_stripes = num_stripes - 2;
86db2578
CM
4739
4740 /*
4741 * Use the number of data stripes to figure out how big this chunk
4742 * is really going to be in terms of logical address space,
4743 * and compare that answer with the max chunk size
4744 */
4745 if (stripe_size * data_stripes > max_chunk_size) {
4746 u64 mask = (1ULL << 24) - 1;
b8b93add
DS
4747
4748 stripe_size = div_u64(max_chunk_size, data_stripes);
86db2578
CM
4749
4750 /* bump the answer up to a 16MB boundary */
4751 stripe_size = (stripe_size + mask) & ~mask;
4752
4753 /* but don't go higher than the limits we found
4754 * while searching for free extents
4755 */
4756 if (stripe_size > devices_info[ndevs-1].max_avail)
4757 stripe_size = devices_info[ndevs-1].max_avail;
4758 }
4759
b8b93add 4760 stripe_size = div_u64(stripe_size, dev_stripes);
37db63a4
ID
4761
4762 /* align to BTRFS_STRIPE_LEN */
500ceed8 4763 stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
b2117a39
MX
4764
4765 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4766 if (!map) {
4767 ret = -ENOMEM;
4768 goto error;
4769 }
4770 map->num_stripes = num_stripes;
9b3f68b9 4771
73c5de00
AJ
4772 for (i = 0; i < ndevs; ++i) {
4773 for (j = 0; j < dev_stripes; ++j) {
4774 int s = i * dev_stripes + j;
4775 map->stripes[s].dev = devices_info[i].dev;
4776 map->stripes[s].physical = devices_info[i].dev_offset +
4777 j * stripe_size;
6324fbf3 4778 }
6324fbf3 4779 }
500ceed8
NB
4780 map->stripe_len = BTRFS_STRIPE_LEN;
4781 map->io_align = BTRFS_STRIPE_LEN;
4782 map->io_width = BTRFS_STRIPE_LEN;
2b82032c 4783 map->type = type;
2b82032c 4784 map->sub_stripes = sub_stripes;
0b86a832 4785
53b381b3 4786 num_bytes = stripe_size * data_stripes;
0b86a832 4787
6bccf3ab 4788 trace_btrfs_chunk_alloc(info, map, start, num_bytes);
1abe9b8a 4789
172ddd60 4790 em = alloc_extent_map();
2b82032c 4791 if (!em) {
298a8f9c 4792 kfree(map);
b2117a39
MX
4793 ret = -ENOMEM;
4794 goto error;
593060d7 4795 }
298a8f9c 4796 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
95617d69 4797 em->map_lookup = map;
2b82032c 4798 em->start = start;
73c5de00 4799 em->len = num_bytes;
2b82032c
YZ
4800 em->block_start = 0;
4801 em->block_len = em->len;
6df9a95e 4802 em->orig_block_len = stripe_size;
593060d7 4803
0b246afa 4804 em_tree = &info->mapping_tree.map_tree;
890871be 4805 write_lock(&em_tree->lock);
09a2a8f9 4806 ret = add_extent_mapping(em_tree, em, 0);
0f5d42b2 4807 if (ret) {
1efb72a3 4808 write_unlock(&em_tree->lock);
0f5d42b2 4809 free_extent_map(em);
1dd4602f 4810 goto error;
0f5d42b2 4811 }
0b86a832 4812
1efb72a3
NB
4813 list_add_tail(&em->list, &trans->transaction->pending_chunks);
4814 refcount_inc(&em->refs);
4815 write_unlock(&em_tree->lock);
4816
0174484d 4817 ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
6df9a95e
JB
4818 if (ret)
4819 goto error_del_extent;
2b82032c 4820
7cc8e58d
MX
4821 for (i = 0; i < map->num_stripes; i++) {
4822 num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4823 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4824 }
43530c46 4825
a5ed45f8 4826 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
1c116187 4827
0f5d42b2 4828 free_extent_map(em);
0b246afa 4829 check_raid56_incompat_flag(info, type);
53b381b3 4830
b2117a39 4831 kfree(devices_info);
2b82032c 4832 return 0;
b2117a39 4833
6df9a95e 4834error_del_extent:
0f5d42b2
JB
4835 write_lock(&em_tree->lock);
4836 remove_extent_mapping(em_tree, em);
4837 write_unlock(&em_tree->lock);
4838
4839 /* One for our allocation */
4840 free_extent_map(em);
4841 /* One for the tree reference */
4842 free_extent_map(em);
495e64f4
FM
4843 /* One for the pending_chunks list reference */
4844 free_extent_map(em);
b2117a39 4845error:
b2117a39
MX
4846 kfree(devices_info);
4847 return ret;
2b82032c
YZ
4848}
4849
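/*
 * Second phase of chunk allocation: with the extent map already inserted
 * by __btrfs_alloc_chunk(), write the chunk item into the chunk tree,
 * update the per-device items and dev extents, and mirror system chunks
 * into the superblock's sys_chunk_array.
 */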
6df9a95e 4850int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
6bccf3ab 4851 struct btrfs_fs_info *fs_info,
6df9a95e 4852 u64 chunk_offset, u64 chunk_size)
2b82032c 4853{
6bccf3ab
JM
4854 struct btrfs_root *extent_root = fs_info->extent_root;
4855 struct btrfs_root *chunk_root = fs_info->chunk_root;
2b82032c 4856 struct btrfs_key key;
2b82032c
YZ
4857 struct btrfs_device *device;
4858 struct btrfs_chunk *chunk;
4859 struct btrfs_stripe *stripe;
6df9a95e
JB
4860 struct extent_map *em;
4861 struct map_lookup *map;
4862 size_t item_size;
4863 u64 dev_offset;
4864 u64 stripe_size;
4865 int i = 0;
140e639f 4866 int ret = 0;
2b82032c 4867
592d92ee
LB
4868 em = get_chunk_map(fs_info, chunk_offset, chunk_size);
4869 if (IS_ERR(em))
4870 return PTR_ERR(em);
6df9a95e 4871
95617d69 4872 map = em->map_lookup;
6df9a95e
JB
4873 item_size = btrfs_chunk_item_size(map->num_stripes);
4874 stripe_size = em->orig_block_len;
4875
2b82032c 4876 chunk = kzalloc(item_size, GFP_NOFS);
6df9a95e
JB
4877 if (!chunk) {
4878 ret = -ENOMEM;
4879 goto out;
4880 }
4881
50460e37
FM
4882 /*
4883 * Take the device list mutex to prevent races with the final phase of
4884 * a device replace operation that replaces the device object associated
4885 * with the map's stripes, because the device object's id can change
4886 * at any time during that final phase of the device replace operation
4887 * (dev-replace.c:btrfs_dev_replace_finishing()).
4888 */
0b246afa 4889 mutex_lock(&fs_info->fs_devices->device_list_mutex);
6df9a95e
JB
4890 for (i = 0; i < map->num_stripes; i++) {
4891 device = map->stripes[i].dev;
4892 dev_offset = map->stripes[i].physical;
2b82032c 4893
0b86a832 4894 ret = btrfs_update_device(trans, device);
3acd3953 4895 if (ret)
50460e37 4896 break;
b5d9071c
NB
4897 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
4898 dev_offset, stripe_size);
6df9a95e 4899 if (ret)
50460e37
FM
4900 break;
4901 }
4902 if (ret) {
0b246afa 4903 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
50460e37 4904 goto out;
2b82032c
YZ
4905 }
4906
2b82032c 4907 stripe = &chunk->stripe;
6df9a95e
JB
4908 for (i = 0; i < map->num_stripes; i++) {
4909 device = map->stripes[i].dev;
4910 dev_offset = map->stripes[i].physical;
0b86a832 4911
e17cade2
CM
4912 btrfs_set_stack_stripe_devid(stripe, device->devid);
4913 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4914 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2b82032c 4915 stripe++;
0b86a832 4916 }
0b246afa 4917 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0b86a832 4918
2b82032c 4919 btrfs_set_stack_chunk_length(chunk, chunk_size);
0b86a832 4920 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2b82032c
YZ
4921 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4922 btrfs_set_stack_chunk_type(chunk, map->type);
4923 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4924 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4925 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
0b246afa 4926 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
2b82032c 4927 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
0b86a832 4928
2b82032c
YZ
4929 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4930 key.type = BTRFS_CHUNK_ITEM_KEY;
4931 key.offset = chunk_offset;
0b86a832 4932
2b82032c 4933 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4ed1d16e
MF
4934 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4935 /*
4936 * TODO: Cleanup of inserted chunk root in case of
4937 * failure.
4938 */
2ff7e61e 4939 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
8f18cf13 4940 }
1abe9b8a 4941
6df9a95e 4942out:
0b86a832 4943 kfree(chunk);
6df9a95e 4944 free_extent_map(em);
4ed1d16e 4945 return ret;
2b82032c 4946}
0b86a832 4947
2b82032c
YZ
4948/*
4949 * Chunk allocation falls into two parts. The first part does the work
4950 * that makes the newly allocated chunk usable, but does not do any
4951 * operation that modifies the chunk tree. The second part does the work
4952 * that requires modifying the chunk tree. This division is important for the
4953 * bootstrap process of adding storage to a seed btrfs.
4954 */
4955int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2ff7e61e 4956 struct btrfs_fs_info *fs_info, u64 type)
2b82032c
YZ
4957{
4958 u64 chunk_offset;
2b82032c 4959
0b246afa
JM
4960 ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
4961 chunk_offset = find_next_chunk(fs_info);
72b468c8 4962 return __btrfs_alloc_chunk(trans, chunk_offset, type);
2b82032c
YZ
4963}
4964
d397712b 4965static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
e4a4dce7 4966 struct btrfs_fs_info *fs_info)
2b82032c
YZ
4967{
4968 u64 chunk_offset;
4969 u64 sys_chunk_offset;
2b82032c 4970 u64 alloc_profile;
2b82032c
YZ
4971 int ret;
4972
6df9a95e 4973 chunk_offset = find_next_chunk(fs_info);
1b86826d 4974 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
72b468c8 4975 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
79787eaa
JM
4976 if (ret)
4977 return ret;
2b82032c 4978
0b246afa 4979 sys_chunk_offset = find_next_chunk(fs_info);
1b86826d 4980 alloc_profile = btrfs_system_alloc_profile(fs_info);
72b468c8 4981 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
79787eaa 4982 return ret;
2b82032c
YZ
4983}
4984
d20983b4
MX
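/*
 * Max number of stripe failures a chunk of this type can absorb, as used
 * for the missing-device checks below: 1 for raid1/raid10/raid5/dup,
 * 2 for raid6, 0 for everything else.
 */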
4985static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4986{
4987 int max_errors;
4988
4989 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4990 BTRFS_BLOCK_GROUP_RAID10 |
4991 BTRFS_BLOCK_GROUP_RAID5 |
4992 BTRFS_BLOCK_GROUP_DUP)) {
4993 max_errors = 1;
4994 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4995 max_errors = 2;
4996 } else {
4997 max_errors = 0;
005d6427 4998 }
2b82032c 4999
d20983b4 5000 return max_errors;
2b82032c
YZ
5001}
5002
2ff7e61e 5003int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2b82032c
YZ
5004{
5005 struct extent_map *em;
5006 struct map_lookup *map;
2b82032c 5007 int readonly = 0;
d20983b4 5008 int miss_ndevs = 0;
2b82032c
YZ
5009 int i;
5010
592d92ee
LB
5011 em = get_chunk_map(fs_info, chunk_offset, 1);
5012 if (IS_ERR(em))
2b82032c
YZ
5013 return 1;
5014
95617d69 5015 map = em->map_lookup;
2b82032c 5016 for (i = 0; i < map->num_stripes; i++) {
d20983b4
MX
5017 if (map->stripes[i].dev->missing) {
5018 miss_ndevs++;
5019 continue;
5020 }
5021
2b82032c
YZ
5022 if (!map->stripes[i].dev->writeable) {
5023 readonly = 1;
d20983b4 5024 goto end;
2b82032c
YZ
5025 }
5026 }
d20983b4
MX
5027
5028 /*
5029 * If the number of missing devices is larger than max errors,
5030 * we can not write the data into that chunk successfully, so
5031 * set it readonly.
5032 */
5033 if (miss_ndevs > btrfs_chunk_max_errors(map))
5034 readonly = 1;
5035end:
0b86a832 5036 free_extent_map(em);
2b82032c 5037 return readonly;
0b86a832
CM
5038}
5039
5040void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
5041{
a8067e02 5042 extent_map_tree_init(&tree->map_tree);
0b86a832
CM
5043}
5044
5045void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5046{
5047 struct extent_map *em;
5048
d397712b 5049 while (1) {
890871be 5050 write_lock(&tree->map_tree.lock);
0b86a832
CM
5051 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
5052 if (em)
5053 remove_extent_mapping(&tree->map_tree, em);
890871be 5054 write_unlock(&tree->map_tree.lock);
0b86a832
CM
5055 if (!em)
5056 break;
0b86a832
CM
5057 /* once for us */
5058 free_extent_map(em);
5059 /* once for the tree */
5060 free_extent_map(em);
5061 }
5062}
5063
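/*
 * How many copies of the data at @logical exist: mirrored profiles report
 * their stripe/sub-stripe count, raid5 and raid6 count the reconstruction
 * possibilities (2 and 3), and a running dev-replace adds one for the
 * target device.
 */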
5d964051 5064int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
f188591e
CM
5065{
5066 struct extent_map *em;
5067 struct map_lookup *map;
f188591e
CM
5068 int ret;
5069
592d92ee
LB
5070 em = get_chunk_map(fs_info, logical, len);
5071 if (IS_ERR(em))
5072 /*
5073 * We could return errors for these cases, but that could get
5074 * ugly and we'd probably do the same thing which is just not do
5075 * anything else and exit, so return 1 so the callers don't try
5076 * to use other copies.
5077 */
fb7669b5 5078 return 1;
fb7669b5 5079
95617d69 5080 map = em->map_lookup;
f188591e
CM
5081 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
5082 ret = map->num_stripes;
321aecc6
CM
5083 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5084 ret = map->sub_stripes;
53b381b3
DW
5085 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5086 ret = 2;
5087 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5088 ret = 3;
f188591e
CM
5089 else
5090 ret = 1;
5091 free_extent_map(em);
ad6d620e 5092
73beece9 5093 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
6fad823f
LB
5094 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5095 fs_info->dev_replace.tgtdev)
ad6d620e 5096 ret++;
73beece9 5097 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
ad6d620e 5098
f188591e
CM
5099 return ret;
5100}
5101
2ff7e61e 5102unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
53b381b3
DW
5103 u64 logical)
5104{
5105 struct extent_map *em;
5106 struct map_lookup *map;
0b246afa 5107 unsigned long len = fs_info->sectorsize;
53b381b3 5108
592d92ee 5109 em = get_chunk_map(fs_info, logical, len);
53b381b3 5110
69f03f13
NB
5111 if (!WARN_ON(IS_ERR(em))) {
5112 map = em->map_lookup;
5113 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5114 len = map->stripe_len * nr_data_stripes(map);
5115 free_extent_map(em);
5116 }
53b381b3
DW
5117 return len;
5118}
5119
e4ff5fb5 5120int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
53b381b3
DW
5121{
5122 struct extent_map *em;
5123 struct map_lookup *map;
53b381b3
DW
5124 int ret = 0;
5125
592d92ee 5126 em = get_chunk_map(fs_info, logical, len);
53b381b3 5127
69f03f13
NB
5128	if (!WARN_ON(IS_ERR(em))) {
5129 map = em->map_lookup;
5130 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5131 ret = 1;
5132 free_extent_map(em);
5133 }
53b381b3
DW
5134 return ret;
5135}
5136
30d9861f
SB
5137static int find_live_mirror(struct btrfs_fs_info *fs_info,
5138 struct map_lookup *map, int first, int num,
5139 int optimal, int dev_replace_is_ongoing)
dfe25020
CM
5140{
5141 int i;
30d9861f
SB
5142 int tolerance;
5143 struct btrfs_device *srcdev;
5144
5145 if (dev_replace_is_ongoing &&
5146 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5147 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5148 srcdev = fs_info->dev_replace.srcdev;
5149 else
5150 srcdev = NULL;
5151
5152 /*
5153 * try to avoid the drive that is the source drive for a
5154	 * dev-replace procedure; only choose it if no other non-missing
5155 * mirror is available
5156 */
5157 for (tolerance = 0; tolerance < 2; tolerance++) {
5158 if (map->stripes[optimal].dev->bdev &&
5159 (tolerance || map->stripes[optimal].dev != srcdev))
5160 return optimal;
5161 for (i = first; i < first + num; i++) {
5162 if (map->stripes[i].dev->bdev &&
5163 (tolerance || map->stripes[i].dev != srcdev))
5164 return i;
5165 }
dfe25020 5166 }
30d9861f 5167
dfe25020
CM
5168 /* we couldn't find one that doesn't fail. Just return something
5169 * and the io error handling code will clean up eventually
5170 */
5171 return optimal;
5172}
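
The two-pass tolerance loop above first refuses the dev-replace source device and only accepts it on the second pass, when nothing else is usable. A standalone sketch of that selection with a simplified device model (not the kernel structs):

#include <stdio.h>

struct dev { int has_bdev; int is_srcdev; };

static int pick_mirror(struct dev *d, int num, int optimal)
{
	int tolerance, i;

	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (d[optimal].has_bdev &&
		    (tolerance || !d[optimal].is_srcdev))
			return optimal;
		for (i = 0; i < num; i++)
			if (d[i].has_bdev &&
			    (tolerance || !d[i].is_srcdev))
				return i;
	}
	return optimal;	/* nothing healthy; the I/O path reports it */
}

int main(void)
{
	struct dev d[2] = { { 1, 1 }, { 1, 0 } };

	/* d[0] is the replace source, so the first pass picks d[1] */
	printf("picked mirror %d\n", pick_mirror(d, 2, 0));
	return 0;
}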
5173
53b381b3
DW
5174static inline int parity_smaller(u64 a, u64 b)
5175{
5176 return a > b;
5177}
5178
5179/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
8e5cfb55 5180static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
53b381b3
DW
5181{
5182 struct btrfs_bio_stripe s;
5183 int i;
5184 u64 l;
5185 int again = 1;
5186
5187 while (again) {
5188 again = 0;
cc7539ed 5189 for (i = 0; i < num_stripes - 1; i++) {
8e5cfb55
ZL
5190 if (parity_smaller(bbio->raid_map[i],
5191 bbio->raid_map[i+1])) {
53b381b3 5192 s = bbio->stripes[i];
8e5cfb55 5193 l = bbio->raid_map[i];
53b381b3 5194 bbio->stripes[i] = bbio->stripes[i+1];
8e5cfb55 5195 bbio->raid_map[i] = bbio->raid_map[i+1];
53b381b3 5196 bbio->stripes[i+1] = s;
8e5cfb55 5197 bbio->raid_map[i+1] = l;
2c8cdd6e 5198
53b381b3
DW
5199 again = 1;
5200 }
5201 }
5202 }
5203}
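
This works because RAID5_P_STRIPE and RAID6_Q_STRIPE are the two largest u64 values, so an ascending sort on raid_map necessarily pushes the parity and syndrome stripes to the tail. A standalone sketch with stand-in sentinel values:

#include <stdint.h>
#include <stdio.h>

#define P_STRIPE ((uint64_t)-2)	/* stand-in for RAID5_P_STRIPE */
#define Q_STRIPE ((uint64_t)-1)	/* stand-in for RAID6_Q_STRIPE */

int main(void)
{
	uint64_t raid_map[] = { Q_STRIPE, 65536, 0, P_STRIPE, 131072 };
	int n = 5, i, again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				uint64_t t = raid_map[i];

				raid_map[i] = raid_map[i + 1];
				raid_map[i + 1] = t;
				again = 1;
			}
		}
	}
	/* prints data addresses first, then P, then Q */
	for (i = 0; i < n; i++)
		printf("%llu\n", (unsigned long long)raid_map[i]);
	return 0;
}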
5204
6e9606d2
ZL
5205static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5206{
5207 struct btrfs_bio *bbio = kzalloc(
e57cf21e 5208 /* the size of the btrfs_bio */
6e9606d2 5209 sizeof(struct btrfs_bio) +
e57cf21e 5210 /* plus the variable array for the stripes */
6e9606d2 5211 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
e57cf21e 5212 /* plus the variable array for the tgt dev */
6e9606d2 5213 sizeof(int) * (real_stripes) +
e57cf21e
CM
5214 /*
5215 * plus the raid_map, which includes both the tgt dev
5216 * and the stripes
5217 */
5218 sizeof(u64) * (total_stripes),
277fb5fc 5219 GFP_NOFS|__GFP_NOFAIL);
6e9606d2
ZL
5220
5221 atomic_set(&bbio->error, 0);
140475ae 5222 refcount_set(&bbio->refs, 1);
6e9606d2
ZL
5223
5224 return bbio;
5225}
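
alloc_btrfs_bio() sizes one allocation to hold the struct plus three trailing arrays; later code (the tgtdev_map and raid_map setup in __btrfs_map_block) carves pointers out of that single block. A userspace sketch of the same layout trick, with simplified stand-in types:

#include <stdint.h>
#include <stdlib.h>

struct stripe { uint64_t physical, length; };

struct bbio_model {
	int num_stripes;
	struct stripe stripes[];	/* trailing data follows */
};

static struct bbio_model *alloc_model(int total, int real)
{
	/* one zeroed block: header + stripes + tgtdev ints + raid_map */
	struct bbio_model *b = calloc(1, sizeof(*b) +
				      sizeof(struct stripe) * total +
				      sizeof(int) * real +
				      sizeof(uint64_t) * total);

	if (!b)
		return NULL;
	b->num_stripes = total;
	return b;
}

int main(void)
{
	struct bbio_model *b = alloc_model(4, 2);
	int *tgtdev_map;
	uint64_t *raid_map;

	if (!b)
		return 1;
	/* carve the trailing arrays out of the same allocation */
	tgtdev_map = (int *)(b->stripes + 4);
	raid_map = (uint64_t *)(tgtdev_map + 2);
	(void)raid_map;
	free(b);	/* one free releases everything */
	return 0;
}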
5226
5227void btrfs_get_bbio(struct btrfs_bio *bbio)
5228{
140475ae
ER
5229 WARN_ON(!refcount_read(&bbio->refs));
5230 refcount_inc(&bbio->refs);
6e9606d2
ZL
5231}
5232
5233void btrfs_put_bbio(struct btrfs_bio *bbio)
5234{
5235 if (!bbio)
5236 return;
140475ae 5237 if (refcount_dec_and_test(&bbio->refs))
6e9606d2
ZL
5238 kfree(bbio);
5239}
5240
0b3d4cd3
LB
5241/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5242/*
5243 * Note that discard is not sent to the target device of a device
5244 * replace operation.
5245 */
5246static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5247 u64 logical, u64 length,
5248 struct btrfs_bio **bbio_ret)
5249{
5250 struct extent_map *em;
5251 struct map_lookup *map;
5252 struct btrfs_bio *bbio;
5253 u64 offset;
5254 u64 stripe_nr;
5255 u64 stripe_nr_end;
5256 u64 stripe_end_offset;
5257 u64 stripe_cnt;
5258 u64 stripe_len;
5259 u64 stripe_offset;
5260 u64 num_stripes;
5261 u32 stripe_index;
5262 u32 factor = 0;
5263 u32 sub_stripes = 0;
5264 u64 stripes_per_dev = 0;
5265 u32 remaining_stripes = 0;
5266 u32 last_stripe = 0;
5267 int ret = 0;
5268 int i;
5269
5270	/* discard always returns a bbio */
5271 ASSERT(bbio_ret);
5272
5273 em = get_chunk_map(fs_info, logical, length);
5274 if (IS_ERR(em))
5275 return PTR_ERR(em);
5276
5277 map = em->map_lookup;
5278 /* we don't discard raid56 yet */
5279 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5280 ret = -EOPNOTSUPP;
5281 goto out;
5282 }
5283
5284 offset = logical - em->start;
5285 length = min_t(u64, em->len - offset, length);
5286
5287 stripe_len = map->stripe_len;
5288 /*
5289 * stripe_nr counts the total number of stripes we have to stride
5290 * to get to this block
5291 */
5292 stripe_nr = div64_u64(offset, stripe_len);
5293
5294 /* stripe_offset is the offset of this block in its stripe */
5295 stripe_offset = offset - stripe_nr * stripe_len;
5296
5297 stripe_nr_end = round_up(offset + length, map->stripe_len);
42c61ab6 5298 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
0b3d4cd3
LB
5299 stripe_cnt = stripe_nr_end - stripe_nr;
5300 stripe_end_offset = stripe_nr_end * map->stripe_len -
5301 (offset + length);
5302 /*
5303 * after this, stripe_nr is the number of stripes on this
5304 * device we have to walk to find the data, and stripe_index is
5305 * the number of our device in the stripe array
5306 */
5307 num_stripes = 1;
5308 stripe_index = 0;
5309 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5310 BTRFS_BLOCK_GROUP_RAID10)) {
5311 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5312 sub_stripes = 1;
5313 else
5314 sub_stripes = map->sub_stripes;
5315
5316 factor = map->num_stripes / sub_stripes;
5317 num_stripes = min_t(u64, map->num_stripes,
5318 sub_stripes * stripe_cnt);
5319 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5320 stripe_index *= sub_stripes;
5321 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5322 &remaining_stripes);
5323 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5324 last_stripe *= sub_stripes;
5325 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5326 BTRFS_BLOCK_GROUP_DUP)) {
5327 num_stripes = map->num_stripes;
5328 } else {
5329 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5330 &stripe_index);
5331 }
5332
5333 bbio = alloc_btrfs_bio(num_stripes, 0);
5334 if (!bbio) {
5335 ret = -ENOMEM;
5336 goto out;
5337 }
5338
5339 for (i = 0; i < num_stripes; i++) {
5340 bbio->stripes[i].physical =
5341 map->stripes[stripe_index].physical +
5342 stripe_offset + stripe_nr * map->stripe_len;
5343 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5344
5345 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5346 BTRFS_BLOCK_GROUP_RAID10)) {
5347 bbio->stripes[i].length = stripes_per_dev *
5348 map->stripe_len;
5349
5350 if (i / sub_stripes < remaining_stripes)
5351 bbio->stripes[i].length +=
5352 map->stripe_len;
5353
5354 /*
5355 * Special for the first stripe and
5356 * the last stripe:
5357 *
5358 * |-------|...|-------|
5359 * |----------|
5360 * off end_off
5361 */
5362 if (i < sub_stripes)
5363 bbio->stripes[i].length -=
5364 stripe_offset;
5365
5366 if (stripe_index >= last_stripe &&
5367 stripe_index <= (last_stripe +
5368 sub_stripes - 1))
5369 bbio->stripes[i].length -=
5370 stripe_end_offset;
5371
5372 if (i == sub_stripes - 1)
5373 stripe_offset = 0;
5374 } else {
5375 bbio->stripes[i].length = length;
5376 }
5377
5378 stripe_index++;
5379 if (stripe_index == map->num_stripes) {
5380 stripe_index = 0;
5381 stripe_nr++;
5382 }
5383 }
5384
5385 *bbio_ret = bbio;
5386 bbio->map_type = map->type;
5387 bbio->num_stripes = num_stripes;
5388out:
5389 free_extent_map(em);
5390 return ret;
5391}
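
The stripe arithmetic above reduces to integer division and remainders. A standalone sketch with illustrative numbers, mirroring the stripe_nr/stripe_offset/stripe_cnt computation for a discard range:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 65536;	/* BTRFS_STRIPE_LEN */
	uint64_t offset = 200000, length = 300000;

	uint64_t stripe_nr = offset / stripe_len;
	uint64_t stripe_offset = offset - stripe_nr * stripe_len;
	/* round_up(offset + length, stripe_len) / stripe_len */
	uint64_t stripe_nr_end = (offset + length + stripe_len - 1) /
				 stripe_len;
	uint64_t stripe_cnt = stripe_nr_end - stripe_nr;
	uint64_t stripe_end_offset = stripe_nr_end * stripe_len -
				     (offset + length);

	/* prints nr=3 off=3392 cnt=5 end_off=24288 */
	printf("nr=%llu off=%llu cnt=%llu end_off=%llu\n",
	       (unsigned long long)stripe_nr,
	       (unsigned long long)stripe_offset,
	       (unsigned long long)stripe_cnt,
	       (unsigned long long)stripe_end_offset);
	return 0;
}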
5392
5ab56090
LB
5393/*
5394 * In dev-replace case, for repair case (that's the only case where the mirror
5395 * is selected explicitly when calling btrfs_map_block), blocks left of the
5396 * left cursor can also be read from the target drive.
5397 *
5398 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5399 * array of stripes.
5400 * For READ, it also needs to be supported using the same mirror number.
5401 *
5402 * If the requested block is not left of the left cursor, EIO is returned. This
5403 * can happen because btrfs_num_copies() returns one more in the dev-replace
5404 * case.
5405 */
5406static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5407 u64 logical, u64 length,
5408 u64 srcdev_devid, int *mirror_num,
5409 u64 *physical)
5410{
5411 struct btrfs_bio *bbio = NULL;
5412 int num_stripes;
5413 int index_srcdev = 0;
5414 int found = 0;
5415 u64 physical_of_found = 0;
5416 int i;
5417 int ret = 0;
5418
5419 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5420 logical, &length, &bbio, 0, 0);
5421 if (ret) {
5422 ASSERT(bbio == NULL);
5423 return ret;
5424 }
5425
5426 num_stripes = bbio->num_stripes;
5427 if (*mirror_num > num_stripes) {
5428 /*
5429 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5430 * that means that the requested area is not left of the left
5431 * cursor
5432 */
5433 btrfs_put_bbio(bbio);
5434 return -EIO;
5435 }
5436
5437 /*
5438 * process the rest of the function using the mirror_num of the source
5439 * drive. Therefore look it up first. At the end, patch the device
5440	 * pointer to that of the target drive.
5441 */
5442 for (i = 0; i < num_stripes; i++) {
5443 if (bbio->stripes[i].dev->devid != srcdev_devid)
5444 continue;
5445
5446 /*
5447 * In case of DUP, in order to keep it simple, only add the
5448 * mirror with the lowest physical address
5449 */
5450 if (found &&
5451 physical_of_found <= bbio->stripes[i].physical)
5452 continue;
5453
5454 index_srcdev = i;
5455 found = 1;
5456 physical_of_found = bbio->stripes[i].physical;
5457 }
5458
5459 btrfs_put_bbio(bbio);
5460
5461 ASSERT(found);
5462 if (!found)
5463 return -EIO;
5464
5465 *mirror_num = index_srcdev + 1;
5466 *physical = physical_of_found;
5467 return ret;
5468}
5469
73c0f228
LB
5470static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5471 struct btrfs_bio **bbio_ret,
5472 struct btrfs_dev_replace *dev_replace,
5473 int *num_stripes_ret, int *max_errors_ret)
5474{
5475 struct btrfs_bio *bbio = *bbio_ret;
5476 u64 srcdev_devid = dev_replace->srcdev->devid;
5477 int tgtdev_indexes = 0;
5478 int num_stripes = *num_stripes_ret;
5479 int max_errors = *max_errors_ret;
5480 int i;
5481
5482 if (op == BTRFS_MAP_WRITE) {
5483 int index_where_to_add;
5484
5485 /*
5486 * duplicate the write operations while the dev replace
5487 * procedure is running. Since the copying of the old disk to
5488 * the new disk takes place at run time while the filesystem is
5489 * mounted writable, the regular write operations to the old
5490 * disk have to be duplicated to go to the new disk as well.
5491 *
5492 * Note that device->missing is handled by the caller, and that
5493 * the write to the old disk is already set up in the stripes
5494 * array.
5495 */
5496 index_where_to_add = num_stripes;
5497 for (i = 0; i < num_stripes; i++) {
5498 if (bbio->stripes[i].dev->devid == srcdev_devid) {
5499 /* write to new disk, too */
5500 struct btrfs_bio_stripe *new =
5501 bbio->stripes + index_where_to_add;
5502 struct btrfs_bio_stripe *old =
5503 bbio->stripes + i;
5504
5505 new->physical = old->physical;
5506 new->length = old->length;
5507 new->dev = dev_replace->tgtdev;
5508 bbio->tgtdev_map[i] = index_where_to_add;
5509 index_where_to_add++;
5510 max_errors++;
5511 tgtdev_indexes++;
5512 }
5513 }
5514 num_stripes = index_where_to_add;
5515 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5516 int index_srcdev = 0;
5517 int found = 0;
5518 u64 physical_of_found = 0;
5519
5520 /*
5521 * During the dev-replace procedure, the target drive can also
5522 * be used to read data in case it is needed to repair a corrupt
5523 * block elsewhere. This is possible if the requested area is
5524 * left of the left cursor. In this area, the target drive is a
5525 * full copy of the source drive.
5526 */
5527 for (i = 0; i < num_stripes; i++) {
5528 if (bbio->stripes[i].dev->devid == srcdev_devid) {
5529 /*
5530 * In case of DUP, in order to keep it simple,
5531 * only add the mirror with the lowest physical
5532 * address
5533 */
5534 if (found &&
5535 physical_of_found <=
5536 bbio->stripes[i].physical)
5537 continue;
5538 index_srcdev = i;
5539 found = 1;
5540 physical_of_found = bbio->stripes[i].physical;
5541 }
5542 }
5543 if (found) {
5544 struct btrfs_bio_stripe *tgtdev_stripe =
5545 bbio->stripes + num_stripes;
5546
5547 tgtdev_stripe->physical = physical_of_found;
5548 tgtdev_stripe->length =
5549 bbio->stripes[index_srcdev].length;
5550 tgtdev_stripe->dev = dev_replace->tgtdev;
5551 bbio->tgtdev_map[index_srcdev] = num_stripes;
5552
5553 tgtdev_indexes++;
5554 num_stripes++;
5555 }
5556 }
5557
5558 *num_stripes_ret = num_stripes;
5559 *max_errors_ret = max_errors;
5560 bbio->num_tgtdevs = tgtdev_indexes;
5561 *bbio_ret = bbio;
5562}
5563
2b19a1fe
LB
5564static bool need_full_stripe(enum btrfs_map_op op)
5565{
5566 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5567}
5568
cf8cddd3
CH
5569static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5570 enum btrfs_map_op op,
f2d8d74d 5571 u64 logical, u64 *length,
a1d3c478 5572 struct btrfs_bio **bbio_ret,
8e5cfb55 5573 int mirror_num, int need_raid_map)
0b86a832
CM
5574{
5575 struct extent_map *em;
5576 struct map_lookup *map;
0b86a832 5577 u64 offset;
593060d7
CM
5578 u64 stripe_offset;
5579 u64 stripe_nr;
53b381b3 5580 u64 stripe_len;
9d644a62 5581 u32 stripe_index;
cea9e445 5582 int i;
de11cc12 5583 int ret = 0;
f2d8d74d 5584 int num_stripes;
a236aed1 5585 int max_errors = 0;
2c8cdd6e 5586 int tgtdev_indexes = 0;
a1d3c478 5587 struct btrfs_bio *bbio = NULL;
472262f3
SB
5588 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
5589 int dev_replace_is_ongoing = 0;
5590 int num_alloc_stripes;
ad6d620e
SB
5591 int patch_the_first_stripe_for_dev_replace = 0;
5592 u64 physical_to_patch_in_first_stripe = 0;
53b381b3 5593 u64 raid56_full_stripe_start = (u64)-1;
0b86a832 5594
0b3d4cd3
LB
5595 if (op == BTRFS_MAP_DISCARD)
5596 return __btrfs_map_block_for_discard(fs_info, logical,
5597 *length, bbio_ret);
5598
592d92ee
LB
5599 em = get_chunk_map(fs_info, logical, *length);
5600 if (IS_ERR(em))
5601 return PTR_ERR(em);
0b86a832 5602
95617d69 5603 map = em->map_lookup;
0b86a832 5604 offset = logical - em->start;
593060d7 5605
53b381b3 5606 stripe_len = map->stripe_len;
593060d7
CM
5607 stripe_nr = offset;
5608 /*
5609 * stripe_nr counts the total number of stripes we have to stride
5610 * to get to this block
5611 */
47c5713f 5612 stripe_nr = div64_u64(stripe_nr, stripe_len);
593060d7 5613
53b381b3 5614 stripe_offset = stripe_nr * stripe_len;
e042d1ec 5615 if (offset < stripe_offset) {
5d163e0e
JM
5616 btrfs_crit(fs_info,
5617 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
e042d1ec
JB
5618 stripe_offset, offset, em->start, logical,
5619 stripe_len);
5620 free_extent_map(em);
5621 return -EINVAL;
5622 }
593060d7
CM
5623
5624	/* stripe_offset is the offset of this block in its stripe */
5625 stripe_offset = offset - stripe_offset;
5626
53b381b3 5627 /* if we're here for raid56, we need to know the stripe aligned start */
ffe2d203 5628 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
53b381b3
DW
5629 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5630 raid56_full_stripe_start = offset;
5631
5632 /* allow a write of a full stripe, but make sure we don't
5633 * allow straddling of stripes
5634 */
47c5713f
DS
5635 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5636 full_stripe_len);
53b381b3
DW
5637 raid56_full_stripe_start *= full_stripe_len;
5638 }
5639
0b3d4cd3 5640 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
53b381b3
DW
5641 u64 max_len;
5642		/* For writes to RAID[56], allow a full stripeset across all disks.
5643		 * For other RAID types and for RAID[56] reads, just allow a single
5644		 * stripe (on a single disk). */
ffe2d203 5645 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
cf8cddd3 5646 (op == BTRFS_MAP_WRITE)) {
53b381b3
DW
5647 max_len = stripe_len * nr_data_stripes(map) -
5648 (offset - raid56_full_stripe_start);
5649 } else {
5650 /* we limit the length of each bio to what fits in a stripe */
5651 max_len = stripe_len - stripe_offset;
5652 }
5653 *length = min_t(u64, em->len - offset, max_len);
cea9e445
CM
5654 } else {
5655 *length = em->len - offset;
5656 }
f2d8d74d 5657
53b381b3
DW
5658	/* This is for when we're called from btrfs_merge_bio_hook() and all
5659	 * it cares about is the length */
a1d3c478 5660 if (!bbio_ret)
cea9e445
CM
5661 goto out;
5662
73beece9 5663 btrfs_dev_replace_lock(dev_replace, 0);
472262f3
SB
5664 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5665 if (!dev_replace_is_ongoing)
73beece9
LB
5666 btrfs_dev_replace_unlock(dev_replace, 0);
5667 else
5668 btrfs_dev_replace_set_lock_blocking(dev_replace);
472262f3 5669
ad6d620e 5670 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
2b19a1fe 5671 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
5ab56090
LB
5672 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
5673 dev_replace->srcdev->devid,
5674 &mirror_num,
5675 &physical_to_patch_in_first_stripe);
5676 if (ret)
ad6d620e 5677 goto out;
5ab56090
LB
5678 else
5679 patch_the_first_stripe_for_dev_replace = 1;
ad6d620e
SB
5680 } else if (mirror_num > map->num_stripes) {
5681 mirror_num = 0;
5682 }
5683
f2d8d74d 5684 num_stripes = 1;
cea9e445 5685 stripe_index = 0;
fce3bb9a 5686 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
47c5713f
DS
5687 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5688 &stripe_index);
de483734 5689 if (!need_full_stripe(op))
28e1cc7d 5690 mirror_num = 1;
fce3bb9a 5691 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
de483734 5692 if (need_full_stripe(op))
f2d8d74d 5693 num_stripes = map->num_stripes;
2fff734f 5694 else if (mirror_num)
f188591e 5695 stripe_index = mirror_num - 1;
dfe25020 5696 else {
30d9861f 5697 stripe_index = find_live_mirror(fs_info, map, 0,
dfe25020 5698 map->num_stripes,
30d9861f
SB
5699 current->pid % map->num_stripes,
5700 dev_replace_is_ongoing);
a1d3c478 5701 mirror_num = stripe_index + 1;
dfe25020 5702 }
2fff734f 5703
611f0e00 5704 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
de483734 5705 if (need_full_stripe(op)) {
f2d8d74d 5706 num_stripes = map->num_stripes;
a1d3c478 5707 } else if (mirror_num) {
f188591e 5708 stripe_index = mirror_num - 1;
a1d3c478
JS
5709 } else {
5710 mirror_num = 1;
5711 }
2fff734f 5712
321aecc6 5713 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
9d644a62 5714 u32 factor = map->num_stripes / map->sub_stripes;
321aecc6 5715
47c5713f 5716 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
321aecc6
CM
5717 stripe_index *= map->sub_stripes;
5718
de483734 5719 if (need_full_stripe(op))
f2d8d74d 5720 num_stripes = map->sub_stripes;
321aecc6
CM
5721 else if (mirror_num)
5722 stripe_index += mirror_num - 1;
dfe25020 5723 else {
3e74317a 5724 int old_stripe_index = stripe_index;
30d9861f
SB
5725 stripe_index = find_live_mirror(fs_info, map,
5726 stripe_index,
dfe25020 5727 map->sub_stripes, stripe_index +
30d9861f
SB
5728 current->pid % map->sub_stripes,
5729 dev_replace_is_ongoing);
3e74317a 5730 mirror_num = stripe_index - old_stripe_index + 1;
dfe25020 5731 }
53b381b3 5732
ffe2d203 5733 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
de483734 5734 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
53b381b3 5735 /* push stripe_nr back to the start of the full stripe */
42c61ab6 5736 stripe_nr = div64_u64(raid56_full_stripe_start,
b8b93add 5737 stripe_len * nr_data_stripes(map));
53b381b3
DW
5738
5739 /* RAID[56] write or recovery. Return all stripes */
5740 num_stripes = map->num_stripes;
5741 max_errors = nr_parity_stripes(map);
5742
53b381b3
DW
5743 *length = map->stripe_len;
5744 stripe_index = 0;
5745 stripe_offset = 0;
5746 } else {
5747 /*
5748 * Mirror #0 or #1 means the original data block.
5749 * Mirror #2 is RAID5 parity block.
5750 * Mirror #3 is RAID6 Q block.
5751 */
47c5713f
DS
5752 stripe_nr = div_u64_rem(stripe_nr,
5753 nr_data_stripes(map), &stripe_index);
53b381b3
DW
5754 if (mirror_num > 1)
5755 stripe_index = nr_data_stripes(map) +
5756 mirror_num - 2;
5757
5758 /* We distribute the parity blocks across stripes */
47c5713f
DS
5759 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
5760 &stripe_index);
de483734 5761 if (!need_full_stripe(op) && mirror_num <= 1)
28e1cc7d 5762 mirror_num = 1;
53b381b3 5763 }
8790d502
CM
5764 } else {
5765 /*
47c5713f
DS
5766 * after this, stripe_nr is the number of stripes on this
5767 * device we have to walk to find the data, and stripe_index is
5768 * the number of our device in the stripe array
8790d502 5769 */
47c5713f
DS
5770 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5771 &stripe_index);
a1d3c478 5772 mirror_num = stripe_index + 1;
8790d502 5773 }
e042d1ec 5774 if (stripe_index >= map->num_stripes) {
5d163e0e
JM
5775 btrfs_crit(fs_info,
5776 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
e042d1ec
JB
5777 stripe_index, map->num_stripes);
5778 ret = -EINVAL;
5779 goto out;
5780 }
cea9e445 5781
472262f3 5782 num_alloc_stripes = num_stripes;
6fad823f 5783 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
0b3d4cd3 5784 if (op == BTRFS_MAP_WRITE)
ad6d620e 5785 num_alloc_stripes <<= 1;
cf8cddd3 5786 if (op == BTRFS_MAP_GET_READ_MIRRORS)
ad6d620e 5787 num_alloc_stripes++;
2c8cdd6e 5788 tgtdev_indexes = num_stripes;
ad6d620e 5789 }
2c8cdd6e 5790
6e9606d2 5791 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
de11cc12
LZ
5792 if (!bbio) {
5793 ret = -ENOMEM;
5794 goto out;
5795 }
6fad823f 5796 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
2c8cdd6e 5797 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
de11cc12 5798
8e5cfb55 5799 /* build raid_map */
2b19a1fe
LB
5800 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
5801 (need_full_stripe(op) || mirror_num > 1)) {
8e5cfb55 5802 u64 tmp;
9d644a62 5803 unsigned rot;
8e5cfb55
ZL
5804
5805 bbio->raid_map = (u64 *)((void *)bbio->stripes +
5806 sizeof(struct btrfs_bio_stripe) *
5807 num_alloc_stripes +
5808 sizeof(int) * tgtdev_indexes);
5809
5810 /* Work out the disk rotation on this stripe-set */
47c5713f 5811 div_u64_rem(stripe_nr, num_stripes, &rot);
8e5cfb55
ZL
5812
5813 /* Fill in the logical address of each stripe */
5814 tmp = stripe_nr * nr_data_stripes(map);
5815 for (i = 0; i < nr_data_stripes(map); i++)
5816 bbio->raid_map[(i+rot) % num_stripes] =
5817 em->start + (tmp + i) * map->stripe_len;
5818
5819 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5820 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5821 bbio->raid_map[(i+rot+1) % num_stripes] =
5822 RAID6_Q_STRIPE;
5823 }
5824
b89203f7 5825
0b3d4cd3
LB
5826 for (i = 0; i < num_stripes; i++) {
5827 bbio->stripes[i].physical =
5828 map->stripes[stripe_index].physical +
5829 stripe_offset +
5830 stripe_nr * map->stripe_len;
5831 bbio->stripes[i].dev =
5832 map->stripes[stripe_index].dev;
5833 stripe_index++;
593060d7 5834 }
de11cc12 5835
2b19a1fe 5836 if (need_full_stripe(op))
d20983b4 5837 max_errors = btrfs_chunk_max_errors(map);
de11cc12 5838
8e5cfb55
ZL
5839 if (bbio->raid_map)
5840 sort_parity_stripes(bbio, num_stripes);
cc7539ed 5841
73c0f228 5842 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
2b19a1fe 5843 need_full_stripe(op)) {
73c0f228
LB
5844 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
5845 &max_errors);
472262f3
SB
5846 }
5847
de11cc12 5848 *bbio_ret = bbio;
10f11900 5849 bbio->map_type = map->type;
de11cc12
LZ
5850 bbio->num_stripes = num_stripes;
5851 bbio->max_errors = max_errors;
5852 bbio->mirror_num = mirror_num;
ad6d620e
SB
5853
5854 /*
5855 * this is the case that REQ_READ && dev_replace_is_ongoing &&
5856 * mirror_num == num_stripes + 1 && dev_replace target drive is
5857 * available as a mirror
5858 */
5859 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5860 WARN_ON(num_stripes > 1);
5861 bbio->stripes[0].dev = dev_replace->tgtdev;
5862 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5863 bbio->mirror_num = map->num_stripes + 1;
5864 }
cea9e445 5865out:
73beece9
LB
5866 if (dev_replace_is_ongoing) {
5867 btrfs_dev_replace_clear_lock_blocking(dev_replace);
5868 btrfs_dev_replace_unlock(dev_replace, 0);
5869 }
0b86a832 5870 free_extent_map(em);
de11cc12 5871 return ret;
0b86a832
CM
5872}
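
In the raid_map setup above, the parity position rotates by stripe_nr modulo the stripe count while the data stripes receive their logical addresses. A standalone sketch of that rotation for a 4-device RAID6 full stripe; the sentinels are stand-ins for RAID5_P_STRIPE/RAID6_Q_STRIPE:

#include <stdint.h>
#include <stdio.h>

#define P ((uint64_t)-2)
#define Q ((uint64_t)-1)

int main(void)
{
	int num_stripes = 4, nr_data = 2;	/* 2 data + P + Q */
	uint64_t stripe_len = 65536, em_start = 1048576, stripe_nr = 3;
	uint64_t raid_map[4];
	unsigned rot = stripe_nr % num_stripes;	/* disk rotation */
	uint64_t tmp = stripe_nr * nr_data;
	int i;

	/* fill in the logical address of each data stripe */
	for (i = 0; i < nr_data; i++)
		raid_map[(i + rot) % num_stripes] =
			em_start + (tmp + i) * stripe_len;
	raid_map[(i + rot) % num_stripes] = P;
	raid_map[(i + rot + 1) % num_stripes] = Q;

	for (i = 0; i < num_stripes; i++)
		printf("slot %d -> %llu\n", i,
		       (unsigned long long)raid_map[i]);
	return 0;
}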
5873
cf8cddd3 5874int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
f2d8d74d 5875 u64 logical, u64 *length,
a1d3c478 5876 struct btrfs_bio **bbio_ret, int mirror_num)
f2d8d74d 5877{
b3d3fa51 5878 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
8e5cfb55 5879 mirror_num, 0);
f2d8d74d
CM
5880}
5881
af8e2d1d 5882/* For Scrub/replace */
cf8cddd3 5883int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
af8e2d1d 5884 u64 logical, u64 *length,
825ad4c9 5885 struct btrfs_bio **bbio_ret)
af8e2d1d 5886{
825ad4c9 5887 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
af8e2d1d
MX
5888}
5889
ab8d0fc4 5890int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
a512bbf8
YZ
5891 u64 chunk_start, u64 physical, u64 devid,
5892 u64 **logical, int *naddrs, int *stripe_len)
5893{
a512bbf8
YZ
5894 struct extent_map *em;
5895 struct map_lookup *map;
5896 u64 *buf;
5897 u64 bytenr;
5898 u64 length;
5899 u64 stripe_nr;
53b381b3 5900 u64 rmap_len;
a512bbf8
YZ
5901 int i, j, nr = 0;
5902
592d92ee
LB
5903 em = get_chunk_map(fs_info, chunk_start, 1);
5904 if (IS_ERR(em))
835d974f 5905 return -EIO;
835d974f 5906
95617d69 5907 map = em->map_lookup;
a512bbf8 5908 length = em->len;
53b381b3
DW
5909 rmap_len = map->stripe_len;
5910
a512bbf8 5911 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
b8b93add 5912 length = div_u64(length, map->num_stripes / map->sub_stripes);
a512bbf8 5913 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
b8b93add 5914 length = div_u64(length, map->num_stripes);
ffe2d203 5915 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
b8b93add 5916 length = div_u64(length, nr_data_stripes(map));
53b381b3
DW
5917 rmap_len = map->stripe_len * nr_data_stripes(map);
5918 }
a512bbf8 5919
31e818fe 5920 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
79787eaa 5921 BUG_ON(!buf); /* -ENOMEM */
a512bbf8
YZ
5922
5923 for (i = 0; i < map->num_stripes; i++) {
5924 if (devid && map->stripes[i].dev->devid != devid)
5925 continue;
5926 if (map->stripes[i].physical > physical ||
5927 map->stripes[i].physical + length <= physical)
5928 continue;
5929
5930 stripe_nr = physical - map->stripes[i].physical;
42c61ab6 5931 stripe_nr = div64_u64(stripe_nr, map->stripe_len);
a512bbf8
YZ
5932
5933 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5934 stripe_nr = stripe_nr * map->num_stripes + i;
b8b93add 5935 stripe_nr = div_u64(stripe_nr, map->sub_stripes);
a512bbf8
YZ
5936 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5937 stripe_nr = stripe_nr * map->num_stripes + i;
53b381b3
DW
5938 } /* else if RAID[56], multiply by nr_data_stripes().
5939 * Alternatively, just use rmap_len below instead of
5940 * map->stripe_len */
5941
5942 bytenr = chunk_start + stripe_nr * rmap_len;
934d375b 5943 WARN_ON(nr >= map->num_stripes);
a512bbf8
YZ
5944 for (j = 0; j < nr; j++) {
5945 if (buf[j] == bytenr)
5946 break;
5947 }
934d375b
CM
5948 if (j == nr) {
5949 WARN_ON(nr >= map->num_stripes);
a512bbf8 5950 buf[nr++] = bytenr;
934d375b 5951 }
a512bbf8
YZ
5952 }
5953
a512bbf8
YZ
5954 *logical = buf;
5955 *naddrs = nr;
53b381b3 5956 *stripe_len = rmap_len;
a512bbf8
YZ
5957
5958 free_extent_map(em);
5959 return 0;
f2d8d74d
CM
5960}
5961
4246a0b6 5962static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
8408c716 5963{
326e1dbb
MS
5964 bio->bi_private = bbio->private;
5965 bio->bi_end_io = bbio->end_io;
4246a0b6 5966 bio_endio(bio);
326e1dbb 5967
6e9606d2 5968 btrfs_put_bbio(bbio);
8408c716
MX
5969}
5970
4246a0b6 5971static void btrfs_end_bio(struct bio *bio)
8790d502 5972{
9be3395b 5973 struct btrfs_bio *bbio = bio->bi_private;
7d2b4daa 5974 int is_orig_bio = 0;
8790d502 5975
4e4cbee9 5976 if (bio->bi_status) {
a1d3c478 5977 atomic_inc(&bbio->error);
4e4cbee9
CH
5978 if (bio->bi_status == BLK_STS_IOERR ||
5979 bio->bi_status == BLK_STS_TARGET) {
442a4f63 5980 unsigned int stripe_index =
9be3395b 5981 btrfs_io_bio(bio)->stripe_index;
65f53338 5982 struct btrfs_device *dev;
442a4f63
SB
5983
5984 BUG_ON(stripe_index >= bbio->num_stripes);
5985 dev = bbio->stripes[stripe_index].dev;
597a60fa 5986 if (dev->bdev) {
37226b21 5987 if (bio_op(bio) == REQ_OP_WRITE)
1cb34c8e 5988 btrfs_dev_stat_inc_and_print(dev,
597a60fa
SB
5989 BTRFS_DEV_STAT_WRITE_ERRS);
5990 else
1cb34c8e 5991 btrfs_dev_stat_inc_and_print(dev,
597a60fa 5992 BTRFS_DEV_STAT_READ_ERRS);
70fd7614 5993 if (bio->bi_opf & REQ_PREFLUSH)
1cb34c8e 5994 btrfs_dev_stat_inc_and_print(dev,
597a60fa 5995 BTRFS_DEV_STAT_FLUSH_ERRS);
597a60fa 5996 }
442a4f63
SB
5997 }
5998 }
8790d502 5999
a1d3c478 6000 if (bio == bbio->orig_bio)
7d2b4daa
CM
6001 is_orig_bio = 1;
6002
c404e0dc
MX
6003 btrfs_bio_counter_dec(bbio->fs_info);
6004
a1d3c478 6005 if (atomic_dec_and_test(&bbio->stripes_pending)) {
7d2b4daa
CM
6006 if (!is_orig_bio) {
6007 bio_put(bio);
a1d3c478 6008 bio = bbio->orig_bio;
7d2b4daa 6009 }
c7b22bb1 6010
9be3395b 6011 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
a236aed1 6012 /* only send an error to the higher layers if it is
53b381b3 6013 * beyond the tolerance of the btrfs bio
a236aed1 6014 */
a1d3c478 6015 if (atomic_read(&bbio->error) > bbio->max_errors) {
4e4cbee9 6016 bio->bi_status = BLK_STS_IOERR;
5dbc8fca 6017 } else {
1259ab75
CM
6018 /*
6019 * this bio is actually up to date, we didn't
6020 * go over the max number of errors
6021 */
2dbe0c77 6022 bio->bi_status = BLK_STS_OK;
1259ab75 6023 }
c55f1396 6024
4246a0b6 6025 btrfs_end_bbio(bbio, bio);
7d2b4daa 6026 } else if (!is_orig_bio) {
8790d502
CM
6027 bio_put(bio);
6028 }
8790d502
CM
6029}
6030
8b712842
CM
6031/*
6032 * see run_scheduled_bios for a description of why bios are collected for
6033 * async submit.
6034 *
6035 * This will add one bio to the pending list for a device and make sure
6036 * the work struct is scheduled.
6037 */
2ff7e61e 6038static noinline void btrfs_schedule_bio(struct btrfs_device *device,
4e49ea4a 6039 struct bio *bio)
8b712842 6040{
0b246afa 6041 struct btrfs_fs_info *fs_info = device->fs_info;
8b712842 6042 int should_queue = 1;
ffbd517d 6043 struct btrfs_pending_bios *pending_bios;
8b712842 6044
53b381b3 6045 if (device->missing || !device->bdev) {
4246a0b6 6046 bio_io_error(bio);
53b381b3
DW
6047 return;
6048 }
6049
8b712842 6050 /* don't bother with additional async steps for reads, right now */
37226b21 6051 if (bio_op(bio) == REQ_OP_READ) {
492bb6de 6052 bio_get(bio);
4e49ea4a 6053 btrfsic_submit_bio(bio);
492bb6de 6054 bio_put(bio);
143bede5 6055 return;
8b712842
CM
6056 }
6057
492bb6de 6058 WARN_ON(bio->bi_next);
8b712842 6059 bio->bi_next = NULL;
8b712842
CM
6060
6061 spin_lock(&device->io_lock);
67f055c7 6062 if (op_is_sync(bio->bi_opf))
ffbd517d
CM
6063 pending_bios = &device->pending_sync_bios;
6064 else
6065 pending_bios = &device->pending_bios;
8b712842 6066
ffbd517d
CM
6067 if (pending_bios->tail)
6068 pending_bios->tail->bi_next = bio;
8b712842 6069
ffbd517d
CM
6070 pending_bios->tail = bio;
6071 if (!pending_bios->head)
6072 pending_bios->head = bio;
8b712842
CM
6073 if (device->running_pending)
6074 should_queue = 0;
6075
6076 spin_unlock(&device->io_lock);
6077
6078 if (should_queue)
0b246afa 6079 btrfs_queue_work(fs_info->submit_workers, &device->work);
8b712842
CM
6080}
6081
2ff7e61e
JM
6082static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6083 u64 physical, int dev_nr, int async)
de1ee92a
JB
6084{
6085 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
2ff7e61e 6086 struct btrfs_fs_info *fs_info = bbio->fs_info;
de1ee92a
JB
6087
6088 bio->bi_private = bbio;
9be3395b 6089 btrfs_io_bio(bio)->stripe_index = dev_nr;
de1ee92a 6090 bio->bi_end_io = btrfs_end_bio;
4f024f37 6091 bio->bi_iter.bi_sector = physical >> 9;
de1ee92a
JB
6092#ifdef DEBUG
6093 {
6094 struct rcu_string *name;
6095
6096 rcu_read_lock();
6097 name = rcu_dereference(dev->name);
ab8d0fc4
JM
6098 btrfs_debug(fs_info,
6099 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6100 bio_op(bio), bio->bi_opf,
6101 (u64)bio->bi_iter.bi_sector,
6102 (u_long)dev->bdev->bd_dev, name->str, dev->devid,
6103 bio->bi_iter.bi_size);
de1ee92a
JB
6104 rcu_read_unlock();
6105 }
6106#endif
74d46992 6107 bio_set_dev(bio, dev->bdev);
c404e0dc 6108
2ff7e61e 6109 btrfs_bio_counter_inc_noblocked(fs_info);
c404e0dc 6110
de1ee92a 6111 if (async)
2ff7e61e 6112 btrfs_schedule_bio(dev, bio);
de1ee92a 6113 else
4e49ea4a 6114 btrfsic_submit_bio(bio);
de1ee92a
JB
6115}
6116
de1ee92a
JB
6117static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6118{
6119 atomic_inc(&bbio->error);
6120 if (atomic_dec_and_test(&bbio->stripes_pending)) {
01327610 6121 /* Should be the original bio. */
8408c716
MX
6122 WARN_ON(bio != bbio->orig_bio);
6123
9be3395b 6124 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
4f024f37 6125 bio->bi_iter.bi_sector = logical >> 9;
102ed2c5
AJ
6126 if (atomic_read(&bbio->error) > bbio->max_errors)
6127 bio->bi_status = BLK_STS_IOERR;
6128 else
6129 bio->bi_status = BLK_STS_OK;
4246a0b6 6130 btrfs_end_bbio(bbio, bio);
de1ee92a
JB
6131 }
6132}
6133
58efbc9f
OS
6134blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6135 int mirror_num, int async_submit)
0b86a832 6136{
0b86a832 6137 struct btrfs_device *dev;
8790d502 6138 struct bio *first_bio = bio;
4f024f37 6139 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
0b86a832
CM
6140 u64 length = 0;
6141 u64 map_length;
0b86a832 6142 int ret;
08da757d
ZL
6143 int dev_nr;
6144 int total_devs;
a1d3c478 6145 struct btrfs_bio *bbio = NULL;
0b86a832 6146
4f024f37 6147 length = bio->bi_iter.bi_size;
0b86a832 6148 map_length = length;
cea9e445 6149
0b246afa 6150 btrfs_bio_counter_inc_blocked(fs_info);
bd7d63c2 6151 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
37226b21 6152 &map_length, &bbio, mirror_num, 1);
c404e0dc 6153 if (ret) {
0b246afa 6154 btrfs_bio_counter_dec(fs_info);
58efbc9f 6155 return errno_to_blk_status(ret);
c404e0dc 6156 }
cea9e445 6157
a1d3c478 6158 total_devs = bbio->num_stripes;
53b381b3
DW
6159 bbio->orig_bio = first_bio;
6160 bbio->private = first_bio->bi_private;
6161 bbio->end_io = first_bio->bi_end_io;
0b246afa 6162 bbio->fs_info = fs_info;
53b381b3
DW
6163 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6164
ad1ba2a0 6165 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
37226b21 6166 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
53b381b3
DW
6167		/* In this case, map_length has been set to the length of
6168		 * a single stripe, not the whole write */
37226b21 6169 if (bio_op(bio) == REQ_OP_WRITE) {
2ff7e61e
JM
6170 ret = raid56_parity_write(fs_info, bio, bbio,
6171 map_length);
53b381b3 6172 } else {
2ff7e61e
JM
6173 ret = raid56_parity_recover(fs_info, bio, bbio,
6174 map_length, mirror_num, 1);
53b381b3 6175 }
4245215d 6176
0b246afa 6177 btrfs_bio_counter_dec(fs_info);
58efbc9f 6178 return errno_to_blk_status(ret);
53b381b3
DW
6179 }
6180
cea9e445 6181 if (map_length < length) {
0b246afa 6182 btrfs_crit(fs_info,
5d163e0e
JM
6183 "mapping failed logical %llu bio len %llu len %llu",
6184 logical, length, map_length);
cea9e445
CM
6185 BUG();
6186 }
a1d3c478 6187
08da757d 6188 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
de1ee92a 6189 dev = bbio->stripes[dev_nr].dev;
37226b21 6190 if (!dev || !dev->bdev ||
a967efb3 6191 (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
de1ee92a 6192 bbio_error(bbio, first_bio, logical);
de1ee92a
JB
6193 continue;
6194 }
6195
3aa8e074 6196 if (dev_nr < total_devs - 1)
8b6c1d56 6197 bio = btrfs_bio_clone(first_bio);
3aa8e074 6198 else
a1d3c478 6199 bio = first_bio;
de1ee92a 6200
2ff7e61e
JM
6201 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6202 dev_nr, async_submit);
8790d502 6203 }
0b246afa 6204 btrfs_bio_counter_dec(fs_info);
58efbc9f 6205 return BLK_STS_OK;
0b86a832
CM
6206}
6207
aa1b8cd4 6208struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
2b82032c 6209 u8 *uuid, u8 *fsid)
0b86a832 6210{
2b82032c
YZ
6211 struct btrfs_device *device;
6212 struct btrfs_fs_devices *cur_devices;
6213
aa1b8cd4 6214 cur_devices = fs_info->fs_devices;
2b82032c
YZ
6215 while (cur_devices) {
6216 if (!fsid ||
44880fdc 6217 !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
35c70103 6218 device = find_device(cur_devices, devid, uuid);
2b82032c
YZ
6219 if (device)
6220 return device;
6221 }
6222 cur_devices = cur_devices->seed;
6223 }
6224 return NULL;
0b86a832
CM
6225}
6226
2ff7e61e 6227static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
dfe25020
CM
6228 u64 devid, u8 *dev_uuid)
6229{
6230 struct btrfs_device *device;
dfe25020 6231
12bd2fc0
ID
6232 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6233 if (IS_ERR(device))
adfb69af 6234 return device;
12bd2fc0
ID
6235
6236 list_add(&device->dev_list, &fs_devices->devices);
e4404d6e 6237 device->fs_devices = fs_devices;
dfe25020 6238 fs_devices->num_devices++;
12bd2fc0
ID
6239
6240 device->missing = 1;
cd02dca5 6241 fs_devices->missing_devices++;
12bd2fc0 6242
dfe25020
CM
6243 return device;
6244}
6245
12bd2fc0
ID
6246/**
6247 * btrfs_alloc_device - allocate struct btrfs_device
6248 * @fs_info: used only for generating a new devid, can be NULL if
6249 * devid is provided (i.e. @devid != NULL).
6250 * @devid: a pointer to devid for this device. If NULL a new devid
6251 * is generated.
6252 * @uuid: a pointer to UUID for this device. If NULL a new UUID
6253 * is generated.
6254 *
6255 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
48dae9cf
DS
6256 * on error. Returned struct is not linked onto any lists and must be
6257 * destroyed with free_device.
12bd2fc0
ID
6258 */
6259struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6260 const u64 *devid,
6261 const u8 *uuid)
6262{
6263 struct btrfs_device *dev;
6264 u64 tmp;
6265
fae7f21c 6266 if (WARN_ON(!devid && !fs_info))
12bd2fc0 6267 return ERR_PTR(-EINVAL);
12bd2fc0
ID
6268
6269 dev = __alloc_device();
6270 if (IS_ERR(dev))
6271 return dev;
6272
6273 if (devid)
6274 tmp = *devid;
6275 else {
6276 int ret;
6277
6278 ret = find_next_devid(fs_info, &tmp);
6279 if (ret) {
55de4803 6280 free_device(dev);
12bd2fc0
ID
6281 return ERR_PTR(ret);
6282 }
6283 }
6284 dev->devid = tmp;
6285
6286 if (uuid)
6287 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6288 else
6289 generate_random_uuid(dev->uuid);
6290
9e0af237
LB
6291 btrfs_init_work(&dev->work, btrfs_submit_helper,
6292 pending_bios_fn, NULL, NULL);
12bd2fc0
ID
6293
6294 return dev;
6295}
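
btrfs_alloc_device() reports failure through the pointer itself via ERR_PTR(), which every caller in this file unwraps with IS_ERR()/PTR_ERR(). A minimal userspace model of that convention; the helpers below imitate the kernel macros and device_model is illustrative:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	/* errors occupy the top page of the address space */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct device_model { uint64_t devid; };

static struct device_model *alloc_device(const uint64_t *devid)
{
	struct device_model *d;

	if (!devid)
		return ERR_PTR(-EINVAL);	/* caller must supply one here */
	d = calloc(1, sizeof(*d));
	if (!d)
		return ERR_PTR(-ENOMEM);
	d->devid = *devid;
	return d;
}

int main(void)
{
	uint64_t id = 1;
	struct device_model *d = alloc_device(&id);

	if (IS_ERR(d))
		return (int)-PTR_ERR(d);
	printf("devid %llu\n", (unsigned long long)d->devid);
	free(d);	/* the kernel code pairs this with free_device() */
	return 0;
}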
6296
e06cd3dd 6297/* Return -EIO if any error, otherwise return 0. */
2ff7e61e 6298static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
e06cd3dd
LB
6299 struct extent_buffer *leaf,
6300 struct btrfs_chunk *chunk, u64 logical)
0b86a832 6301{
0b86a832 6302 u64 length;
f04b772b 6303 u64 stripe_len;
e06cd3dd
LB
6304 u16 num_stripes;
6305 u16 sub_stripes;
6306 u64 type;
0b86a832 6307
e17cade2 6308 length = btrfs_chunk_length(leaf, chunk);
f04b772b
QW
6309 stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6310 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
e06cd3dd
LB
6311 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6312 type = btrfs_chunk_type(leaf, chunk);
6313
f04b772b 6314 if (!num_stripes) {
0b246afa 6315 btrfs_err(fs_info, "invalid chunk num_stripes: %u",
f04b772b
QW
6316 num_stripes);
6317 return -EIO;
6318 }
0b246afa
JM
6319 if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
6320 btrfs_err(fs_info, "invalid chunk logical %llu", logical);
f04b772b
QW
6321 return -EIO;
6322 }
0b246afa
JM
6323 if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
6324 btrfs_err(fs_info, "invalid chunk sectorsize %u",
e06cd3dd
LB
6325 btrfs_chunk_sector_size(leaf, chunk));
6326 return -EIO;
6327 }
0b246afa
JM
6328 if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
6329 btrfs_err(fs_info, "invalid chunk length %llu", length);
f04b772b
QW
6330 return -EIO;
6331 }
3d8da678 6332 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
0b246afa 6333 btrfs_err(fs_info, "invalid chunk stripe length: %llu",
f04b772b
QW
6334 stripe_len);
6335 return -EIO;
6336 }
6337 if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
e06cd3dd 6338 type) {
0b246afa 6339 btrfs_err(fs_info, "unrecognized chunk type: %llu",
f04b772b
QW
6340 ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
6341 BTRFS_BLOCK_GROUP_PROFILE_MASK) &
6342 btrfs_chunk_type(leaf, chunk));
6343 return -EIO;
6344 }
e06cd3dd
LB
6345 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
6346 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
6347 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
6348 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
6349 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
6350 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
6351 num_stripes != 1)) {
0b246afa 6352 btrfs_err(fs_info,
e06cd3dd
LB
6353 "invalid num_stripes:sub_stripes %u:%u for profile %llu",
6354 num_stripes, sub_stripes,
6355 type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
6356 return -EIO;
6357 }
6358
6359 return 0;
6360}
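
The per-profile stripe-count constraints at the end of the validator can be stated as a small predicate. A standalone sketch with illustrative flag values, not the on-disk BTRFS_BLOCK_GROUP_* bits:

#include <stdint.h>
#include <stdio.h>

#define G_RAID0  (1u << 0)
#define G_RAID1  (1u << 1)
#define G_DUP    (1u << 2)
#define G_RAID10 (1u << 3)
#define G_RAID5  (1u << 4)
#define G_RAID6  (1u << 5)
#define G_PROFILE_MASK (G_RAID0 | G_RAID1 | G_DUP | \
			G_RAID10 | G_RAID5 | G_RAID6)

static int stripes_valid(uint32_t type, int num, int sub)
{
	if ((type & G_RAID10) && sub != 2)
		return 0;
	if ((type & G_RAID1) && num < 1)
		return 0;
	if ((type & G_RAID5) && num < 2)
		return 0;
	if ((type & G_RAID6) && num < 3)
		return 0;
	if ((type & G_DUP) && num > 2)
		return 0;
	if (!(type & G_PROFILE_MASK) && num != 1)
		return 0;	/* "single" must have exactly one stripe */
	return 1;
}

int main(void)
{
	/* RAID6 needs at least 3 stripes, so this prints 0 */
	printf("raid6 with 2 stripes: %d\n", stripes_valid(G_RAID6, 2, 1));
	return 0;
}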
6361
5a2b8e60 6362static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
2b902dfc 6363 u64 devid, u8 *uuid, bool error)
5a2b8e60 6364{
2b902dfc
AJ
6365 if (error)
6366 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6367 devid, uuid);
6368 else
6369 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6370 devid, uuid);
5a2b8e60
AJ
6371}
6372
2ff7e61e 6373static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
e06cd3dd
LB
6374 struct extent_buffer *leaf,
6375 struct btrfs_chunk *chunk)
6376{
0b246afa 6377 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
e06cd3dd
LB
6378 struct map_lookup *map;
6379 struct extent_map *em;
6380 u64 logical;
6381 u64 length;
e06cd3dd
LB
6382 u64 devid;
6383 u8 uuid[BTRFS_UUID_SIZE];
6384 int num_stripes;
6385 int ret;
6386 int i;
6387
6388 logical = key->offset;
6389 length = btrfs_chunk_length(leaf, chunk);
e06cd3dd
LB
6390 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6391
2ff7e61e 6392 ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
e06cd3dd
LB
6393 if (ret)
6394 return ret;
a061fc8d 6395
890871be 6396 read_lock(&map_tree->map_tree.lock);
0b86a832 6397 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
890871be 6398 read_unlock(&map_tree->map_tree.lock);
0b86a832
CM
6399
6400 /* already mapped? */
6401 if (em && em->start <= logical && em->start + em->len > logical) {
6402 free_extent_map(em);
0b86a832
CM
6403 return 0;
6404 } else if (em) {
6405 free_extent_map(em);
6406 }
0b86a832 6407
172ddd60 6408 em = alloc_extent_map();
0b86a832
CM
6409 if (!em)
6410 return -ENOMEM;
593060d7 6411 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
0b86a832
CM
6412 if (!map) {
6413 free_extent_map(em);
6414 return -ENOMEM;
6415 }
6416
298a8f9c 6417 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
95617d69 6418 em->map_lookup = map;
0b86a832
CM
6419 em->start = logical;
6420 em->len = length;
70c8a91c 6421 em->orig_start = 0;
0b86a832 6422 em->block_start = 0;
c8b97818 6423 em->block_len = em->len;
0b86a832 6424
593060d7
CM
6425 map->num_stripes = num_stripes;
6426 map->io_width = btrfs_chunk_io_width(leaf, chunk);
6427 map->io_align = btrfs_chunk_io_align(leaf, chunk);
593060d7
CM
6428 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6429 map->type = btrfs_chunk_type(leaf, chunk);
321aecc6 6430 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
593060d7
CM
6431 for (i = 0; i < num_stripes; i++) {
6432 map->stripes[i].physical =
6433 btrfs_stripe_offset_nr(leaf, chunk, i);
6434 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
a443755f
CM
6435 read_extent_buffer(leaf, uuid, (unsigned long)
6436 btrfs_stripe_dev_uuid_nr(chunk, i),
6437 BTRFS_UUID_SIZE);
0b246afa 6438 map->stripes[i].dev = btrfs_find_device(fs_info, devid,
aa1b8cd4 6439 uuid, NULL);
3cdde224 6440 if (!map->stripes[i].dev &&
0b246afa 6441 !btrfs_test_opt(fs_info, DEGRADED)) {
593060d7 6442 free_extent_map(em);
2b902dfc 6443 btrfs_report_missing_device(fs_info, devid, uuid, true);
45dbdbc9 6444 return -ENOENT;
593060d7 6445 }
dfe25020
CM
6446 if (!map->stripes[i].dev) {
6447 map->stripes[i].dev =
2ff7e61e
JM
6448 add_missing_dev(fs_info->fs_devices, devid,
6449 uuid);
adfb69af 6450 if (IS_ERR(map->stripes[i].dev)) {
dfe25020 6451 free_extent_map(em);
adfb69af
AJ
6452 btrfs_err(fs_info,
6453 "failed to init missing dev %llu: %ld",
6454 devid, PTR_ERR(map->stripes[i].dev));
6455 return PTR_ERR(map->stripes[i].dev);
dfe25020 6456 }
2b902dfc 6457 btrfs_report_missing_device(fs_info, devid, uuid, false);
dfe25020
CM
6458 }
6459 map->stripes[i].dev->in_fs_metadata = 1;
0b86a832
CM
6460 }
6461
890871be 6462 write_lock(&map_tree->map_tree.lock);
09a2a8f9 6463 ret = add_extent_mapping(&map_tree->map_tree, em, 0);
890871be 6464 write_unlock(&map_tree->map_tree.lock);
79787eaa 6465 BUG_ON(ret); /* Tree corruption */
0b86a832
CM
6466 free_extent_map(em);
6467
6468 return 0;
6469}
6470
143bede5 6471static void fill_device_from_item(struct extent_buffer *leaf,
0b86a832
CM
6472 struct btrfs_dev_item *dev_item,
6473 struct btrfs_device *device)
6474{
6475 unsigned long ptr;
0b86a832
CM
6476
6477 device->devid = btrfs_device_id(leaf, dev_item);
d6397bae
CB
6478 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6479 device->total_bytes = device->disk_total_bytes;
935e5cc9 6480 device->commit_total_bytes = device->disk_total_bytes;
0b86a832 6481 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
ce7213c7 6482 device->commit_bytes_used = device->bytes_used;
0b86a832
CM
6483 device->type = btrfs_device_type(leaf, dev_item);
6484 device->io_align = btrfs_device_io_align(leaf, dev_item);
6485 device->io_width = btrfs_device_io_width(leaf, dev_item);
6486 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
8dabb742 6487 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
63a212ab 6488 device->is_tgtdev_for_dev_replace = 0;
0b86a832 6489
410ba3a2 6490 ptr = btrfs_device_uuid(dev_item);
e17cade2 6491 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
0b86a832
CM
6492}
6493
2ff7e61e 6494static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
5f375835 6495 u8 *fsid)
2b82032c
YZ
6496{
6497 struct btrfs_fs_devices *fs_devices;
6498 int ret;
6499
b367e47f 6500 BUG_ON(!mutex_is_locked(&uuid_mutex));
2dfeca9b 6501 ASSERT(fsid);
2b82032c 6502
0b246afa 6503 fs_devices = fs_info->fs_devices->seed;
2b82032c 6504 while (fs_devices) {
44880fdc 6505 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
5f375835
MX
6506 return fs_devices;
6507
2b82032c
YZ
6508 fs_devices = fs_devices->seed;
6509 }
6510
6511 fs_devices = find_fsid(fsid);
6512 if (!fs_devices) {
0b246afa 6513 if (!btrfs_test_opt(fs_info, DEGRADED))
5f375835
MX
6514 return ERR_PTR(-ENOENT);
6515
6516 fs_devices = alloc_fs_devices(fsid);
6517 if (IS_ERR(fs_devices))
6518 return fs_devices;
6519
6520 fs_devices->seeding = 1;
6521 fs_devices->opened = 1;
6522 return fs_devices;
2b82032c 6523 }
e4404d6e
YZ
6524
6525 fs_devices = clone_fs_devices(fs_devices);
5f375835
MX
6526 if (IS_ERR(fs_devices))
6527 return fs_devices;
2b82032c 6528
97288f2c 6529 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
0b246afa 6530 fs_info->bdev_holder);
48d28232
JL
6531 if (ret) {
6532 free_fs_devices(fs_devices);
5f375835 6533 fs_devices = ERR_PTR(ret);
2b82032c 6534 goto out;
48d28232 6535 }
2b82032c
YZ
6536
6537 if (!fs_devices->seeding) {
6538 __btrfs_close_devices(fs_devices);
e4404d6e 6539 free_fs_devices(fs_devices);
5f375835 6540 fs_devices = ERR_PTR(-EINVAL);
2b82032c
YZ
6541 goto out;
6542 }
6543
0b246afa
JM
6544 fs_devices->seed = fs_info->fs_devices->seed;
6545 fs_info->fs_devices->seed = fs_devices;
2b82032c 6546out:
5f375835 6547 return fs_devices;
2b82032c
YZ
6548}
6549
2ff7e61e 6550static int read_one_dev(struct btrfs_fs_info *fs_info,
0b86a832
CM
6551 struct extent_buffer *leaf,
6552 struct btrfs_dev_item *dev_item)
6553{
0b246afa 6554 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
0b86a832
CM
6555 struct btrfs_device *device;
6556 u64 devid;
6557 int ret;
44880fdc 6558 u8 fs_uuid[BTRFS_FSID_SIZE];
a443755f
CM
6559 u8 dev_uuid[BTRFS_UUID_SIZE];
6560
0b86a832 6561 devid = btrfs_device_id(leaf, dev_item);
410ba3a2 6562 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
a443755f 6563 BTRFS_UUID_SIZE);
1473b24e 6564 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
44880fdc 6565 BTRFS_FSID_SIZE);
2b82032c 6566
44880fdc 6567 if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
2ff7e61e 6568 fs_devices = open_seed_devices(fs_info, fs_uuid);
5f375835
MX
6569 if (IS_ERR(fs_devices))
6570 return PTR_ERR(fs_devices);
2b82032c
YZ
6571 }
6572
0b246afa 6573 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
5f375835 6574 if (!device) {
c5502451 6575 if (!btrfs_test_opt(fs_info, DEGRADED)) {
2b902dfc
AJ
6576 btrfs_report_missing_device(fs_info, devid,
6577 dev_uuid, true);
45dbdbc9 6578 return -ENOENT;
c5502451 6579 }
2b82032c 6580
2ff7e61e 6581 device = add_missing_dev(fs_devices, devid, dev_uuid);
adfb69af
AJ
6582 if (IS_ERR(device)) {
6583 btrfs_err(fs_info,
6584 "failed to add missing dev %llu: %ld",
6585 devid, PTR_ERR(device));
6586 return PTR_ERR(device);
6587 }
2b902dfc 6588 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
5f375835 6589 } else {
c5502451 6590 if (!device->bdev) {
2b902dfc
AJ
6591 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6592 btrfs_report_missing_device(fs_info,
6593 devid, dev_uuid, true);
45dbdbc9 6594 return -ENOENT;
2b902dfc
AJ
6595 }
6596 btrfs_report_missing_device(fs_info, devid,
6597 dev_uuid, false);
c5502451 6598 }
5f375835
MX
6599
6600		if (!device->bdev && !device->missing) {
cd02dca5
CM
6601			/*
6602			 * This happens when a device that was properly set up
6603			 * in the device info lists suddenly goes bad.
6604			 * device->bdev is NULL, so we have to set
6605			 * device->missing to one here.
6606			 */
5f375835 6607 device->fs_devices->missing_devices++;
cd02dca5 6608 device->missing = 1;
2b82032c 6609 }
5f375835
MX
6610
6611 /* Move the device to its own fs_devices */
6612 if (device->fs_devices != fs_devices) {
6613 ASSERT(device->missing);
6614
6615 list_move(&device->dev_list, &fs_devices->devices);
6616 device->fs_devices->num_devices--;
6617 fs_devices->num_devices++;
6618
6619 device->fs_devices->missing_devices--;
6620 fs_devices->missing_devices++;
6621
6622 device->fs_devices = fs_devices;
6623 }
2b82032c
YZ
6624 }
6625
0b246afa 6626 if (device->fs_devices != fs_info->fs_devices) {
2b82032c
YZ
6627 BUG_ON(device->writeable);
6628 if (device->generation !=
6629 btrfs_device_generation(leaf, dev_item))
6630 return -EINVAL;
6324fbf3 6631 }
0b86a832
CM
6632
6633 fill_device_from_item(leaf, dev_item, device);
dfe25020 6634 device->in_fs_metadata = 1;
63a212ab 6635 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
2b82032c 6636 device->fs_devices->total_rw_bytes += device->total_bytes;
a5ed45f8
NB
6637 atomic64_add(device->total_bytes - device->bytes_used,
6638 &fs_info->free_chunk_space);
2bf64758 6639 }
0b86a832 6640 ret = 0;
0b86a832
CM
6641 return ret;
6642}
6643
6bccf3ab 6644int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
0b86a832 6645{
6bccf3ab 6646 struct btrfs_root *root = fs_info->tree_root;
ab8d0fc4 6647 struct btrfs_super_block *super_copy = fs_info->super_copy;
a061fc8d 6648 struct extent_buffer *sb;
0b86a832 6649 struct btrfs_disk_key *disk_key;
0b86a832 6650 struct btrfs_chunk *chunk;
1ffb22cf
DS
6651 u8 *array_ptr;
6652 unsigned long sb_array_offset;
84eed90f 6653 int ret = 0;
0b86a832
CM
6654 u32 num_stripes;
6655 u32 array_size;
6656 u32 len = 0;
1ffb22cf 6657 u32 cur_offset;
e06cd3dd 6658 u64 type;
84eed90f 6659 struct btrfs_key key;
0b86a832 6660
0b246afa 6661 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
a83fffb7
DS
6662 /*
6663	 * This will create an extent buffer of nodesize; superblock size is
6664	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6665	 * overallocate, but we can keep it as-is since only the first page is used.
6666 */
2ff7e61e 6667 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
c871b0f2
LB
6668 if (IS_ERR(sb))
6669 return PTR_ERR(sb);
4db8c528 6670 set_extent_buffer_uptodate(sb);
85d4e461 6671 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
8a334426 6672 /*
01327610 6673 * The sb extent buffer is artificial and just used to read the system array.
4db8c528 6674	 * set_extent_buffer_uptodate() call does not properly mark all its
8a334426
DS
6675 * pages up-to-date when the page is larger: extent does not cover the
6676 * whole page and consequently check_page_uptodate does not find all
6677 * the page's extents up-to-date (the hole beyond sb),
6678 * write_extent_buffer then triggers a WARN_ON.
6679 *
6680 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6681 * but sb spans only this function. Add an explicit SetPageUptodate call
6682 * to silence the warning eg. on PowerPC 64.
6683 */
09cbfeaf 6684 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
727011e0 6685 SetPageUptodate(sb->pages[0]);
4008c04a 6686
a061fc8d 6687 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
0b86a832
CM
6688 array_size = btrfs_super_sys_array_size(super_copy);
6689
1ffb22cf
DS
6690 array_ptr = super_copy->sys_chunk_array;
6691 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6692 cur_offset = 0;
0b86a832 6693
1ffb22cf
DS
6694 while (cur_offset < array_size) {
6695 disk_key = (struct btrfs_disk_key *)array_ptr;
e3540eab
DS
6696 len = sizeof(*disk_key);
6697 if (cur_offset + len > array_size)
6698 goto out_short_read;
6699
0b86a832
CM
6700 btrfs_disk_key_to_cpu(&key, disk_key);
6701
1ffb22cf
DS
6702 array_ptr += len;
6703 sb_array_offset += len;
6704 cur_offset += len;
0b86a832 6705
0d81ba5d 6706 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1ffb22cf 6707 chunk = (struct btrfs_chunk *)sb_array_offset;
e3540eab
DS
6708 /*
6709 * At least one btrfs_chunk with one stripe must be
6710			 * present; the exact stripe count check comes afterwards
6711 */
6712 len = btrfs_chunk_item_size(1);
6713 if (cur_offset + len > array_size)
6714 goto out_short_read;
6715
6716 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
f5cdedd7 6717 if (!num_stripes) {
ab8d0fc4
JM
6718 btrfs_err(fs_info,
6719 "invalid number of stripes %u in sys_array at offset %u",
f5cdedd7
DS
6720 num_stripes, cur_offset);
6721 ret = -EIO;
6722 break;
6723 }
6724
e06cd3dd
LB
6725 type = btrfs_chunk_type(sb, chunk);
6726 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
ab8d0fc4 6727 btrfs_err(fs_info,
e06cd3dd
LB
6728 "invalid chunk type %llu in sys_array at offset %u",
6729 type, cur_offset);
6730 ret = -EIO;
6731 break;
6732 }
6733
e3540eab
DS
6734 len = btrfs_chunk_item_size(num_stripes);
6735 if (cur_offset + len > array_size)
6736 goto out_short_read;
6737
2ff7e61e 6738 ret = read_one_chunk(fs_info, &key, sb, chunk);
84eed90f
CM
6739 if (ret)
6740 break;
0b86a832 6741 } else {
ab8d0fc4
JM
6742 btrfs_err(fs_info,
6743 "unexpected item type %u in sys_array at offset %u",
6744 (u32)key.type, cur_offset);
84eed90f
CM
6745 ret = -EIO;
6746 break;
0b86a832 6747 }
1ffb22cf
DS
6748 array_ptr += len;
6749 sb_array_offset += len;
6750 cur_offset += len;
0b86a832 6751 }
d865177a 6752 clear_extent_buffer_uptodate(sb);
1c8b5b6e 6753 free_extent_buffer_stale(sb);
84eed90f 6754 return ret;
e3540eab
DS
6755
6756out_short_read:
ab8d0fc4 6757 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
e3540eab 6758 len, cur_offset);
d865177a 6759 clear_extent_buffer_uptodate(sb);
1c8b5b6e 6760 free_extent_buffer_stale(sb);
e3540eab 6761 return -EIO;
0b86a832
CM
6762}
6763
21634a19
QW
6764/*
6765 * Check if all chunks in the fs are OK for read-write degraded mount
6766 *
6767 * Return true if all chunks meet the minimal RW mount requirements.
6768 * Return false if any chunk doesn't meet the minimal RW mount requirements.
6769 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
	read_unlock(&map_tree->map_tree.lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

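			/*
			 * A stripe counts as missing if its device was never
			 * found, has no open bdev, is flagged missing, or
			 * failed its last flush.
			 */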
			if (!dev || !dev->bdev || dev->missing ||
			    dev->last_flush_error)
				missing++;
		}
		if (missing > max_tolerated) {
			btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->map_tree.lock);
		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->map_tree.lock);
	}
out:
	return ret;
}
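/*
 * Read the chunk tree to bootstrap the logical-to-physical mapping: first
 * all device items, then all chunk items, followed by validation of the
 * device count and total size against the super block.
 */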
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
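	/*
	 * Concretely: device items sit at (BTRFS_DEV_ITEMS_OBJECTID = 1,
	 * BTRFS_DEV_ITEM_KEY, devid), chunk items at
	 * (BTRFS_FIRST_CHUNK_TREE_OBJECTID = 256, BTRFS_CHUNK_ITEM_KEY,
	 * chunk start), so one forward scan from objectid 1 visits every
	 * device item before the first chunk item.
	 */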
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(fs_info, leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading the chunk tree we have all device information, so
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	 "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}
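/*
 * Late device initialization: point every device of this filesystem, and of
 * all its seed filesystems, back at @fs_info.
 */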
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->fs_info = fs_info;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}
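/*
 * Load the persistent dev_stats items for all devices from the device tree.
 * Devices without an on-disk item simply start with zeroed counters.
 */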
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = BTRFS_DEV_STATS_OBJECTID;
		key.type = BTRFS_PERSISTENT_ITEM_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

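		/*
		 * Items written by older kernels may carry fewer than
		 * BTRFS_DEV_STAT_VALUES_MAX counters; any counter beyond the
		 * on-disk item size is reset instead of read.
		 */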
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
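/*
 * Persist the in-memory stats counters of @device to the matching dev_stats
 * item in the device tree; an existing item that is too small for the
 * current number of counters is deleted and re-inserted at full size.
 */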
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction(). Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, fs_info, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
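
/*
 * A minimal userspace sketch (not part of this file) of driving the handler
 * above through the BTRFS_IOC_GET_DEV_STATS ioctl; fd is an open descriptor
 * on any path inside the filesystem, error handling omitted:
 *
 *	struct btrfs_ioctl_get_dev_stats args = {
 *		.devid = 1,
 *		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,
 *		.flags = 0,		// or BTRFS_DEV_STATS_RESET
 *	};
 *	ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args);
 *	// args.values[BTRFS_DEV_STAT_WRITE_ERRS] etc. now hold the counters
 */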
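/*
 * Wipe the btrfs magic from every super block copy on @bdev so the device is
 * no longer detected as part of a filesystem, then notify udev and update
 * the device node times for libblkid.
 */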
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
		copy_num++) {

		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
		brelse(bh);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

/*
 * Update the commit size of all resized devices; these values are used when
 * writing out the super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);
}

/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
					   struct btrfs_transaction *transaction)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

	if (list_empty(&transaction->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(em, &transaction->pending_chunks, list) {
		map = em->map_lookup;

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
}

void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = fs_info;
		fs_devices = fs_devices->seed;
	}
}

void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = NULL;
		fs_devices = fs_devices->seed;
	}
}