btrfs: zoned: mark block groups to copy for device-replace
[linux-block.git] / fs / btrfs / volumes.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
0b86a832
CM
2/*
3 * Copyright (C) 2007 Oracle. All rights reserved.
0b86a832 4 */
c1d7c514 5
0b86a832 6#include <linux/sched.h>
fccc0007 7#include <linux/sched/mm.h>
0b86a832 8#include <linux/bio.h>
5a0e3ad6 9#include <linux/slab.h>
f2d8d74d 10#include <linux/blkdev.h>
442a4f63 11#include <linux/ratelimit.h>
59641015 12#include <linux/kthread.h>
53b381b3 13#include <linux/raid/pq.h>
803b2f54 14#include <linux/semaphore.h>
8da4b8c4 15#include <linux/uuid.h>
f8e10cd3 16#include <linux/list_sort.h>
784352fe 17#include "misc.h"
0b86a832
CM
18#include "ctree.h"
19#include "extent_map.h"
20#include "disk-io.h"
21#include "transaction.h"
22#include "print-tree.h"
23#include "volumes.h"
53b381b3 24#include "raid56.h"
8b712842 25#include "async-thread.h"
21adbd5c 26#include "check-integrity.h"
606686ee 27#include "rcu-string.h"
8dabb742 28#include "dev-replace.h"
99994cde 29#include "sysfs.h"
82fc28fb 30#include "tree-checker.h"
8719aaae 31#include "space-info.h"
aac0023c 32#include "block-group.h"
b0643e59 33#include "discard.h"
5b316468 34#include "zoned.h"
0b86a832 35
af902047
ZL
36const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
37 [BTRFS_RAID_RAID10] = {
38 .sub_stripes = 2,
39 .dev_stripes = 1,
40 .devs_max = 0, /* 0 == as many as possible */
41 .devs_min = 4,
8789f4fe 42 .tolerated_failures = 1,
af902047
ZL
43 .devs_increment = 2,
44 .ncopies = 2,
b50836ed 45 .nparity = 0,
ed23467b 46 .raid_name = "raid10",
41a6e891 47 .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
f9fbcaa2 48 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
af902047
ZL
49 },
50 [BTRFS_RAID_RAID1] = {
51 .sub_stripes = 1,
52 .dev_stripes = 1,
53 .devs_max = 2,
54 .devs_min = 2,
8789f4fe 55 .tolerated_failures = 1,
af902047
ZL
56 .devs_increment = 2,
57 .ncopies = 2,
b50836ed 58 .nparity = 0,
ed23467b 59 .raid_name = "raid1",
41a6e891 60 .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
f9fbcaa2 61 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
af902047 62 },
47e6f742
DS
63 [BTRFS_RAID_RAID1C3] = {
64 .sub_stripes = 1,
65 .dev_stripes = 1,
cf93e15e 66 .devs_max = 3,
47e6f742
DS
67 .devs_min = 3,
68 .tolerated_failures = 2,
69 .devs_increment = 3,
70 .ncopies = 3,
db26a024 71 .nparity = 0,
47e6f742
DS
72 .raid_name = "raid1c3",
73 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
74 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
75 },
8d6fac00
DS
76 [BTRFS_RAID_RAID1C4] = {
77 .sub_stripes = 1,
78 .dev_stripes = 1,
cf93e15e 79 .devs_max = 4,
8d6fac00
DS
80 .devs_min = 4,
81 .tolerated_failures = 3,
82 .devs_increment = 4,
83 .ncopies = 4,
db26a024 84 .nparity = 0,
8d6fac00
DS
85 .raid_name = "raid1c4",
86 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
87 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
88 },
af902047
ZL
89 [BTRFS_RAID_DUP] = {
90 .sub_stripes = 1,
91 .dev_stripes = 2,
92 .devs_max = 1,
93 .devs_min = 1,
8789f4fe 94 .tolerated_failures = 0,
af902047
ZL
95 .devs_increment = 1,
96 .ncopies = 2,
b50836ed 97 .nparity = 0,
ed23467b 98 .raid_name = "dup",
41a6e891 99 .bg_flag = BTRFS_BLOCK_GROUP_DUP,
f9fbcaa2 100 .mindev_error = 0,
af902047
ZL
101 },
102 [BTRFS_RAID_RAID0] = {
103 .sub_stripes = 1,
104 .dev_stripes = 1,
105 .devs_max = 0,
106 .devs_min = 2,
8789f4fe 107 .tolerated_failures = 0,
af902047
ZL
108 .devs_increment = 1,
109 .ncopies = 1,
b50836ed 110 .nparity = 0,
ed23467b 111 .raid_name = "raid0",
41a6e891 112 .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
f9fbcaa2 113 .mindev_error = 0,
af902047
ZL
114 },
115 [BTRFS_RAID_SINGLE] = {
116 .sub_stripes = 1,
117 .dev_stripes = 1,
118 .devs_max = 1,
119 .devs_min = 1,
8789f4fe 120 .tolerated_failures = 0,
af902047
ZL
121 .devs_increment = 1,
122 .ncopies = 1,
b50836ed 123 .nparity = 0,
ed23467b 124 .raid_name = "single",
41a6e891 125 .bg_flag = 0,
f9fbcaa2 126 .mindev_error = 0,
af902047
ZL
127 },
128 [BTRFS_RAID_RAID5] = {
129 .sub_stripes = 1,
130 .dev_stripes = 1,
131 .devs_max = 0,
132 .devs_min = 2,
8789f4fe 133 .tolerated_failures = 1,
af902047 134 .devs_increment = 1,
da612e31 135 .ncopies = 1,
b50836ed 136 .nparity = 1,
ed23467b 137 .raid_name = "raid5",
41a6e891 138 .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
f9fbcaa2 139 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
af902047
ZL
140 },
141 [BTRFS_RAID_RAID6] = {
142 .sub_stripes = 1,
143 .dev_stripes = 1,
144 .devs_max = 0,
145 .devs_min = 3,
8789f4fe 146 .tolerated_failures = 2,
af902047 147 .devs_increment = 1,
da612e31 148 .ncopies = 1,
b50836ed 149 .nparity = 2,
ed23467b 150 .raid_name = "raid6",
41a6e891 151 .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
f9fbcaa2 152 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
af902047
ZL
153 },
154};
155
158da513 156const char *btrfs_bg_type_to_raid_name(u64 flags)
ed23467b 157{
158da513
DS
158 const int index = btrfs_bg_flags_to_raid_index(flags);
159
160 if (index >= BTRFS_NR_RAID_TYPES)
ed23467b
AJ
161 return NULL;
162
158da513 163 return btrfs_raid_array[index].raid_name;
ed23467b
AJ
164}
165
f89e09cf
AJ
166/*
167 * Fill @buf with textual description of @bg_flags, no more than @size_buf
168 * bytes including terminating null byte.
169 */
170void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
171{
172 int i;
173 int ret;
174 char *bp = buf;
175 u64 flags = bg_flags;
176 u32 size_bp = size_buf;
177
178 if (!flags) {
179 strcpy(bp, "NONE");
180 return;
181 }
182
183#define DESCRIBE_FLAG(flag, desc) \
184 do { \
185 if (flags & (flag)) { \
186 ret = snprintf(bp, size_bp, "%s|", (desc)); \
187 if (ret < 0 || ret >= size_bp) \
188 goto out_overflow; \
189 size_bp -= ret; \
190 bp += ret; \
191 flags &= ~(flag); \
192 } \
193 } while (0)
194
195 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
196 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
197 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
198
199 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
200 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
201 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
202 btrfs_raid_array[i].raid_name);
203#undef DESCRIBE_FLAG
204
205 if (flags) {
206 ret = snprintf(bp, size_bp, "0x%llx|", flags);
207 size_bp -= ret;
208 }
209
210 if (size_bp < size_buf)
211 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
212
213 /*
214 * The text is trimmed, it's up to the caller to provide sufficiently
215 * large buffer
216 */
217out_overflow:;
218}
219
6f8e0fc7 220static int init_first_rw_device(struct btrfs_trans_handle *trans);
2ff7e61e 221static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
48a3b636 222static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
733f4fbb 223static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
5ab56090
LB
224static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
225 enum btrfs_map_op op,
226 u64 logical, u64 *length,
227 struct btrfs_bio **bbio_ret,
228 int mirror_num, int need_raid_map);
2b82032c 229
9c6b1c4d
DS
230/*
231 * Device locking
232 * ==============
233 *
234 * There are several mutexes that protect manipulation of devices and low-level
235 * structures like chunks but not block groups, extents or files
236 *
237 * uuid_mutex (global lock)
238 * ------------------------
239 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
240 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
241 * device) or requested by the device= mount option
242 *
243 * the mutex can be very coarse and can cover long-running operations
244 *
245 * protects: updates to fs_devices counters like missing devices, rw devices,
52042d8e 246 * seeding, structure cloning, opening/closing devices at mount/umount time
9c6b1c4d
DS
247 *
248 * global::fs_devs - add, remove, updates to the global list
249 *
18c850fd
JB
250 * does not protect: manipulation of the fs_devices::devices list in general
251 * but in mount context it could be used to exclude list modifications by eg.
252 * scan ioctl
9c6b1c4d
DS
253 *
254 * btrfs_device::name - renames (write side), read is RCU
255 *
256 * fs_devices::device_list_mutex (per-fs, with RCU)
257 * ------------------------------------------------
258 * protects updates to fs_devices::devices, ie. adding and deleting
259 *
260 * simple list traversal with read-only actions can be done with RCU protection
261 *
262 * may be used to exclude some operations from running concurrently without any
263 * modifications to the list (see write_all_supers)
264 *
18c850fd
JB
265 * Is not required at mount and close times, because our device list is
266 * protected by the uuid_mutex at that point.
267 *
9c6b1c4d
DS
268 * balance_mutex
269 * -------------
270 * protects balance structures (status, state) and context accessed from
271 * several places (internally, ioctl)
272 *
273 * chunk_mutex
274 * -----------
275 * protects chunks, adding or removing during allocation, trim or when a new
0b6f5d40
NB
276 * device is added/removed. Additionally it also protects post_commit_list of
277 * individual devices, since they can be added to the transaction's
278 * post_commit_list only with chunk_mutex held.
9c6b1c4d
DS
279 *
280 * cleaner_mutex
281 * -------------
282 * a big lock that is held by the cleaner thread and prevents running subvolume
283 * cleaning together with relocation or delayed iputs
284 *
285 *
286 * Lock nesting
287 * ============
288 *
289 * uuid_mutex
ae3e715f
AJ
290 * device_list_mutex
291 * chunk_mutex
292 * balance_mutex
89595e80
AJ
293 *
294 *
c3e1f96c
GR
295 * Exclusive operations
296 * ====================
89595e80
AJ
297 *
298 * Maintains the exclusivity of the following operations that apply to the
299 * whole filesystem and cannot run in parallel.
300 *
301 * - Balance (*)
302 * - Device add
303 * - Device remove
304 * - Device replace (*)
305 * - Resize
306 *
307 * The device operations (as above) can be in one of the following states:
308 *
309 * - Running state
310 * - Paused state
311 * - Completed state
312 *
313 * Only device operations marked with (*) can go into the Paused state for the
314 * following reasons:
315 *
316 * - ioctl (only Balance can be Paused through ioctl)
317 * - filesystem remounted as read-only
318 * - filesystem unmounted and mounted as read-only
319 * - system power-cycle and filesystem mounted as read-only
320 * - filesystem or device errors leading to forced read-only
321 *
c3e1f96c
GR
322 * The status of exclusive operation is set and cleared atomically.
323 * During the course of Paused state, fs_info::exclusive_operation remains set.
89595e80
AJ
324 * A device operation in Paused or Running state can be canceled or resumed
325 * either by ioctl (Balance only) or when remounted as read-write.
c3e1f96c 326 * The exclusive status is cleared when the device operation is canceled or
89595e80 327 * completed.
9c6b1c4d
DS
328 */
329
67a2c45e 330DEFINE_MUTEX(uuid_mutex);
8a4b83cc 331static LIST_HEAD(fs_uuids);
4143cb8b 332struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
c73eccf7
AJ
333{
334 return &fs_uuids;
335}
8a4b83cc 336
2dfeca9b
DS
337/*
338 * alloc_fs_devices - allocate struct btrfs_fs_devices
7239ff4b
NB
339 * @fsid: if not NULL, copy the UUID to fs_devices::fsid
340 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
2dfeca9b
DS
341 *
342 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
343 * The returned struct is not linked onto any lists and can be destroyed with
344 * kfree() right away.
345 */
7239ff4b
NB
346static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
347 const u8 *metadata_fsid)
2208a378
ID
348{
349 struct btrfs_fs_devices *fs_devs;
350
78f2c9e6 351 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
2208a378
ID
352 if (!fs_devs)
353 return ERR_PTR(-ENOMEM);
354
355 mutex_init(&fs_devs->device_list_mutex);
356
357 INIT_LIST_HEAD(&fs_devs->devices);
358 INIT_LIST_HEAD(&fs_devs->alloc_list);
c4babc5e 359 INIT_LIST_HEAD(&fs_devs->fs_list);
944d3f9f 360 INIT_LIST_HEAD(&fs_devs->seed_list);
2208a378
ID
361 if (fsid)
362 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
2208a378 363
7239ff4b
NB
364 if (metadata_fsid)
365 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
366 else if (fsid)
367 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
368
2208a378
ID
369 return fs_devs;
370}
371
a425f9d4 372void btrfs_free_device(struct btrfs_device *device)
48dae9cf 373{
bbbf7243 374 WARN_ON(!list_empty(&device->post_commit_list));
48dae9cf 375 rcu_string_free(device->name);
1c11b63e 376 extent_io_tree_release(&device->alloc_state);
48dae9cf 377 bio_put(device->flush_bio);
5b316468 378 btrfs_destroy_dev_zone_info(device);
48dae9cf
DS
379 kfree(device);
380}
381
e4404d6e
YZ
382static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
383{
384 struct btrfs_device *device;
385 WARN_ON(fs_devices->opened);
386 while (!list_empty(&fs_devices->devices)) {
387 device = list_entry(fs_devices->devices.next,
388 struct btrfs_device, dev_list);
389 list_del(&device->dev_list);
a425f9d4 390 btrfs_free_device(device);
e4404d6e
YZ
391 }
392 kfree(fs_devices);
393}
394
ffc5a379 395void __exit btrfs_cleanup_fs_uuids(void)
8a4b83cc
CM
396{
397 struct btrfs_fs_devices *fs_devices;
8a4b83cc 398
2b82032c
YZ
399 while (!list_empty(&fs_uuids)) {
400 fs_devices = list_entry(fs_uuids.next,
c4babc5e
AJ
401 struct btrfs_fs_devices, fs_list);
402 list_del(&fs_devices->fs_list);
e4404d6e 403 free_fs_devices(fs_devices);
8a4b83cc 404 }
8a4b83cc
CM
405}
406
48dae9cf
DS
407/*
408 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
409 * Returned struct is not linked onto any lists and must be destroyed using
a425f9d4 410 * btrfs_free_device.
48dae9cf 411 */
154f7cb8 412static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
12bd2fc0
ID
413{
414 struct btrfs_device *dev;
415
78f2c9e6 416 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
12bd2fc0
ID
417 if (!dev)
418 return ERR_PTR(-ENOMEM);
419
e0ae9994
DS
420 /*
421 * Preallocate a bio that's always going to be used for flushing device
422 * barriers and matches the device lifespan
423 */
424 dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
425 if (!dev->flush_bio) {
426 kfree(dev);
427 return ERR_PTR(-ENOMEM);
428 }
e0ae9994 429
12bd2fc0
ID
430 INIT_LIST_HEAD(&dev->dev_list);
431 INIT_LIST_HEAD(&dev->dev_alloc_list);
bbbf7243 432 INIT_LIST_HEAD(&dev->post_commit_list);
12bd2fc0 433
12bd2fc0 434 atomic_set(&dev->reada_in_flight, 0);
addc3fa7 435 atomic_set(&dev->dev_stats_ccnt, 0);
c41ec452 436 btrfs_device_data_ordered_init(dev);
9bcaaea7 437 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
d0164adc 438 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
154f7cb8
QW
439 extent_io_tree_init(fs_info, &dev->alloc_state,
440 IO_TREE_DEVICE_ALLOC_STATE, NULL);
12bd2fc0
ID
441
442 return dev;
443}
444
7239ff4b
NB
445static noinline struct btrfs_fs_devices *find_fsid(
446 const u8 *fsid, const u8 *metadata_fsid)
8a4b83cc 447{
8a4b83cc
CM
448 struct btrfs_fs_devices *fs_devices;
449
7239ff4b
NB
450 ASSERT(fsid);
451
7a62d0f0 452 /* Handle non-split brain cases */
c4babc5e 453 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
7239ff4b
NB
454 if (metadata_fsid) {
455 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
456 && memcmp(metadata_fsid, fs_devices->metadata_uuid,
457 BTRFS_FSID_SIZE) == 0)
458 return fs_devices;
459 } else {
460 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
461 return fs_devices;
462 }
8a4b83cc
CM
463 }
464 return NULL;
465}
466
c6730a0e
SY
467static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
468 struct btrfs_super_block *disk_super)
469{
470
471 struct btrfs_fs_devices *fs_devices;
472
473 /*
474 * Handle scanned device having completed its fsid change but
475 * belonging to a fs_devices that was created by first scanning
476 * a device which didn't have its fsid/metadata_uuid changed
477 * at all and the CHANGING_FSID_V2 flag set.
478 */
479 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
480 if (fs_devices->fsid_change &&
481 memcmp(disk_super->metadata_uuid, fs_devices->fsid,
482 BTRFS_FSID_SIZE) == 0 &&
483 memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
484 BTRFS_FSID_SIZE) == 0) {
485 return fs_devices;
486 }
487 }
488 /*
489 * Handle scanned device having completed its fsid change but
490 * belonging to a fs_devices that was created by a device that
491 * has an outdated pair of fsid/metadata_uuid and
492 * CHANGING_FSID_V2 flag set.
493 */
494 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
495 if (fs_devices->fsid_change &&
496 memcmp(fs_devices->metadata_uuid,
497 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
498 memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
499 BTRFS_FSID_SIZE) == 0) {
500 return fs_devices;
501 }
502 }
503
504 return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
505}
506
507
beaf8ab3
SB
508static int
509btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
510 int flush, struct block_device **bdev,
8f32380d 511 struct btrfs_super_block **disk_super)
beaf8ab3
SB
512{
513 int ret;
514
515 *bdev = blkdev_get_by_path(device_path, flags, holder);
516
517 if (IS_ERR(*bdev)) {
518 ret = PTR_ERR(*bdev);
beaf8ab3
SB
519 goto error;
520 }
521
522 if (flush)
523 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
9f6d2510 524 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
beaf8ab3
SB
525 if (ret) {
526 blkdev_put(*bdev, flags);
527 goto error;
528 }
529 invalidate_bdev(*bdev);
8f32380d
JT
530 *disk_super = btrfs_read_dev_super(*bdev);
531 if (IS_ERR(*disk_super)) {
532 ret = PTR_ERR(*disk_super);
beaf8ab3
SB
533 blkdev_put(*bdev, flags);
534 goto error;
535 }
536
537 return 0;
538
539error:
540 *bdev = NULL;
beaf8ab3
SB
541 return ret;
542}
543
70bc7088
AJ
544static bool device_path_matched(const char *path, struct btrfs_device *device)
545{
546 int found;
547
548 rcu_read_lock();
549 found = strcmp(rcu_str_deref(device->name), path);
550 rcu_read_unlock();
551
552 return found == 0;
553}
554
d8367db3
AJ
555/*
556 * Search and remove all stale (devices which are not mounted) devices.
557 * When both inputs are NULL, it will search and release all stale devices.
558 * path: Optional. When provided will it release all unmounted devices
559 * matching this path only.
560 * skip_dev: Optional. Will skip this device when searching for the stale
561 * devices.
70bc7088
AJ
562 * Return: 0 for success or if @path is NULL.
563 * -EBUSY if @path is a mounted device.
564 * -ENOENT if @path does not match any device in the list.
d8367db3 565 */
70bc7088 566static int btrfs_free_stale_devices(const char *path,
fa6d2ae5 567 struct btrfs_device *skip_device)
4fde46f0 568{
fa6d2ae5
AJ
569 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
570 struct btrfs_device *device, *tmp_device;
70bc7088
AJ
571 int ret = 0;
572
573 if (path)
574 ret = -ENOENT;
4fde46f0 575
fa6d2ae5 576 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
4fde46f0 577
70bc7088 578 mutex_lock(&fs_devices->device_list_mutex);
fa6d2ae5
AJ
579 list_for_each_entry_safe(device, tmp_device,
580 &fs_devices->devices, dev_list) {
fa6d2ae5 581 if (skip_device && skip_device == device)
d8367db3 582 continue;
fa6d2ae5 583 if (path && !device->name)
4fde46f0 584 continue;
70bc7088 585 if (path && !device_path_matched(path, device))
38cf665d 586 continue;
70bc7088
AJ
587 if (fs_devices->opened) {
588 /* for an already deleted device return 0 */
589 if (path && ret != 0)
590 ret = -EBUSY;
591 break;
592 }
4fde46f0 593
4fde46f0 594 /* delete the stale device */
7bcb8164
AJ
595 fs_devices->num_devices--;
596 list_del(&device->dev_list);
597 btrfs_free_device(device);
598
70bc7088 599 ret = 0;
7bcb8164
AJ
600 }
601 mutex_unlock(&fs_devices->device_list_mutex);
70bc7088 602
7bcb8164
AJ
603 if (fs_devices->num_devices == 0) {
604 btrfs_sysfs_remove_fsid(fs_devices);
605 list_del(&fs_devices->fs_list);
606 free_fs_devices(fs_devices);
4fde46f0
AJ
607 }
608 }
70bc7088
AJ
609
610 return ret;
4fde46f0
AJ
611}
612
18c850fd
JB
613/*
614 * This is only used on mount, and we are protected from competing things
615 * messing with our fs_devices by the uuid_mutex, thus we do not need the
616 * fs_devices->device_list_mutex here.
617 */
0fb08bcc
AJ
618static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
619 struct btrfs_device *device, fmode_t flags,
620 void *holder)
621{
622 struct request_queue *q;
623 struct block_device *bdev;
0fb08bcc
AJ
624 struct btrfs_super_block *disk_super;
625 u64 devid;
626 int ret;
627
628 if (device->bdev)
629 return -EINVAL;
630 if (!device->name)
631 return -EINVAL;
632
633 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
8f32380d 634 &bdev, &disk_super);
0fb08bcc
AJ
635 if (ret)
636 return ret;
637
0fb08bcc
AJ
638 devid = btrfs_stack_device_id(&disk_super->dev_item);
639 if (devid != device->devid)
8f32380d 640 goto error_free_page;
0fb08bcc
AJ
641
642 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
8f32380d 643 goto error_free_page;
0fb08bcc
AJ
644
645 device->generation = btrfs_super_generation(disk_super);
646
647 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
7239ff4b
NB
648 if (btrfs_super_incompat_flags(disk_super) &
649 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
650 pr_err(
651 "BTRFS: Invalid seeding and uuid-changed device detected\n");
8f32380d 652 goto error_free_page;
7239ff4b
NB
653 }
654
ebbede42 655 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
0395d84f 656 fs_devices->seeding = true;
0fb08bcc 657 } else {
ebbede42
AJ
658 if (bdev_read_only(bdev))
659 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
660 else
661 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
0fb08bcc
AJ
662 }
663
664 q = bdev_get_queue(bdev);
0fb08bcc 665 if (!blk_queue_nonrot(q))
7f0432d0 666 fs_devices->rotating = true;
0fb08bcc
AJ
667
668 device->bdev = bdev;
e12c9621 669 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
0fb08bcc
AJ
670 device->mode = flags;
671
672 fs_devices->open_devices++;
ebbede42
AJ
673 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
674 device->devid != BTRFS_DEV_REPLACE_DEVID) {
0fb08bcc 675 fs_devices->rw_devices++;
b1b8e386 676 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
0fb08bcc 677 }
8f32380d 678 btrfs_release_disk_super(disk_super);
0fb08bcc
AJ
679
680 return 0;
681
8f32380d
JT
682error_free_page:
683 btrfs_release_disk_super(disk_super);
0fb08bcc
AJ
684 blkdev_put(bdev, flags);
685
686 return -EINVAL;
687}
688
7a62d0f0
NB
689/*
690 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
c0d81c7c
SY
691 * being created with a disk that has already completed its fsid change. Such
692 * disk can belong to an fs which has its FSID changed or to one which doesn't.
693 * Handle both cases here.
7a62d0f0
NB
694 */
695static struct btrfs_fs_devices *find_fsid_inprogress(
696 struct btrfs_super_block *disk_super)
697{
698 struct btrfs_fs_devices *fs_devices;
699
700 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
701 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
702 BTRFS_FSID_SIZE) != 0 &&
703 memcmp(fs_devices->metadata_uuid, disk_super->fsid,
704 BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
705 return fs_devices;
706 }
707 }
708
c0d81c7c 709 return find_fsid(disk_super->fsid, NULL);
7a62d0f0
NB
710}
711
cc5de4e7
NB
712
713static struct btrfs_fs_devices *find_fsid_changed(
714 struct btrfs_super_block *disk_super)
715{
716 struct btrfs_fs_devices *fs_devices;
717
718 /*
719 * Handles the case where scanned device is part of an fs that had
720 * multiple successful changes of FSID but curently device didn't
05840710
NB
721 * observe it. Meaning our fsid will be different than theirs. We need
722 * to handle two subcases :
723 * 1 - The fs still continues to have different METADATA/FSID uuids.
724 * 2 - The fs is switched back to its original FSID (METADATA/FSID
725 * are equal).
cc5de4e7
NB
726 */
727 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
05840710 728 /* Changed UUIDs */
cc5de4e7
NB
729 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
730 BTRFS_FSID_SIZE) != 0 &&
731 memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
732 BTRFS_FSID_SIZE) == 0 &&
733 memcmp(fs_devices->fsid, disk_super->fsid,
05840710
NB
734 BTRFS_FSID_SIZE) != 0)
735 return fs_devices;
736
737 /* Unchanged UUIDs */
738 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
739 BTRFS_FSID_SIZE) == 0 &&
740 memcmp(fs_devices->fsid, disk_super->metadata_uuid,
741 BTRFS_FSID_SIZE) == 0)
cc5de4e7 742 return fs_devices;
cc5de4e7
NB
743 }
744
745 return NULL;
746}
1362089d
NB
747
748static struct btrfs_fs_devices *find_fsid_reverted_metadata(
749 struct btrfs_super_block *disk_super)
750{
751 struct btrfs_fs_devices *fs_devices;
752
753 /*
754 * Handle the case where the scanned device is part of an fs whose last
755 * metadata UUID change reverted it to the original FSID. At the same
756 * time * fs_devices was first created by another constitutent device
757 * which didn't fully observe the operation. This results in an
758 * btrfs_fs_devices created with metadata/fsid different AND
759 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
760 * fs_devices equal to the FSID of the disk.
761 */
762 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
763 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
764 BTRFS_FSID_SIZE) != 0 &&
765 memcmp(fs_devices->metadata_uuid, disk_super->fsid,
766 BTRFS_FSID_SIZE) == 0 &&
767 fs_devices->fsid_change)
768 return fs_devices;
769 }
770
771 return NULL;
772}
60999ca4
DS
773/*
774 * Add new device to list of registered devices
775 *
776 * Returns:
e124ece5
AJ
777 * device pointer which was just added or updated when successful
778 * error pointer when failed
60999ca4 779 */
e124ece5 780static noinline struct btrfs_device *device_list_add(const char *path,
4306a974
AJ
781 struct btrfs_super_block *disk_super,
782 bool *new_device_added)
8a4b83cc
CM
783{
784 struct btrfs_device *device;
7a62d0f0 785 struct btrfs_fs_devices *fs_devices = NULL;
606686ee 786 struct rcu_string *name;
8a4b83cc 787 u64 found_transid = btrfs_super_generation(disk_super);
3acbcbfc 788 u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
7239ff4b
NB
789 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
790 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
d1a63002
NB
791 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
792 BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
7239ff4b 793
cc5de4e7 794 if (fsid_change_in_progress) {
c0d81c7c 795 if (!has_metadata_uuid)
cc5de4e7 796 fs_devices = find_fsid_inprogress(disk_super);
c0d81c7c 797 else
cc5de4e7 798 fs_devices = find_fsid_changed(disk_super);
7a62d0f0 799 } else if (has_metadata_uuid) {
c6730a0e 800 fs_devices = find_fsid_with_metadata_uuid(disk_super);
7a62d0f0 801 } else {
1362089d
NB
802 fs_devices = find_fsid_reverted_metadata(disk_super);
803 if (!fs_devices)
804 fs_devices = find_fsid(disk_super->fsid, NULL);
7a62d0f0
NB
805 }
806
8a4b83cc 807
8a4b83cc 808 if (!fs_devices) {
7239ff4b
NB
809 if (has_metadata_uuid)
810 fs_devices = alloc_fs_devices(disk_super->fsid,
811 disk_super->metadata_uuid);
812 else
813 fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
814
2208a378 815 if (IS_ERR(fs_devices))
e124ece5 816 return ERR_CAST(fs_devices);
2208a378 817
92900e51
AV
818 fs_devices->fsid_change = fsid_change_in_progress;
819
9c6d173e 820 mutex_lock(&fs_devices->device_list_mutex);
c4babc5e 821 list_add(&fs_devices->fs_list, &fs_uuids);
2208a378 822
8a4b83cc
CM
823 device = NULL;
824 } else {
9c6d173e 825 mutex_lock(&fs_devices->device_list_mutex);
09ba3bc9 826 device = btrfs_find_device(fs_devices, devid,
b2598edf 827 disk_super->dev_item.uuid, NULL);
7a62d0f0
NB
828
829 /*
830 * If this disk has been pulled into an fs devices created by
831 * a device which had the CHANGING_FSID_V2 flag then replace the
832 * metadata_uuid/fsid values of the fs_devices.
833 */
1362089d 834 if (fs_devices->fsid_change &&
7a62d0f0
NB
835 found_transid > fs_devices->latest_generation) {
836 memcpy(fs_devices->fsid, disk_super->fsid,
837 BTRFS_FSID_SIZE);
1362089d
NB
838
839 if (has_metadata_uuid)
840 memcpy(fs_devices->metadata_uuid,
841 disk_super->metadata_uuid,
842 BTRFS_FSID_SIZE);
843 else
844 memcpy(fs_devices->metadata_uuid,
845 disk_super->fsid, BTRFS_FSID_SIZE);
7a62d0f0
NB
846
847 fs_devices->fsid_change = false;
848 }
8a4b83cc 849 }
443f24fe 850
8a4b83cc 851 if (!device) {
9c6d173e
AJ
852 if (fs_devices->opened) {
853 mutex_unlock(&fs_devices->device_list_mutex);
e124ece5 854 return ERR_PTR(-EBUSY);
9c6d173e 855 }
2b82032c 856
12bd2fc0
ID
857 device = btrfs_alloc_device(NULL, &devid,
858 disk_super->dev_item.uuid);
859 if (IS_ERR(device)) {
9c6d173e 860 mutex_unlock(&fs_devices->device_list_mutex);
8a4b83cc 861 /* we can safely leave the fs_devices entry around */
e124ece5 862 return device;
8a4b83cc 863 }
606686ee
JB
864
865 name = rcu_string_strdup(path, GFP_NOFS);
866 if (!name) {
a425f9d4 867 btrfs_free_device(device);
9c6d173e 868 mutex_unlock(&fs_devices->device_list_mutex);
e124ece5 869 return ERR_PTR(-ENOMEM);
8a4b83cc 870 }
606686ee 871 rcu_assign_pointer(device->name, name);
90519d66 872
1f78160c 873 list_add_rcu(&device->dev_list, &fs_devices->devices);
f7171750 874 fs_devices->num_devices++;
e5e9a520 875
2b82032c 876 device->fs_devices = fs_devices;
4306a974 877 *new_device_added = true;
327f18cc
AJ
878
879 if (disk_super->label[0])
aa6c0df7
AJ
880 pr_info(
881 "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
882 disk_super->label, devid, found_transid, path,
883 current->comm, task_pid_nr(current));
327f18cc 884 else
aa6c0df7
AJ
885 pr_info(
886 "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
887 disk_super->fsid, devid, found_transid, path,
888 current->comm, task_pid_nr(current));
327f18cc 889
606686ee 890 } else if (!device->name || strcmp(device->name->str, path)) {
b96de000
AJ
891 /*
892 * When FS is already mounted.
893 * 1. If you are here and if the device->name is NULL that
894 * means this device was missing at time of FS mount.
895 * 2. If you are here and if the device->name is different
896 * from 'path' that means either
897 * a. The same device disappeared and reappeared with
898 * different name. or
899 * b. The missing-disk-which-was-replaced, has
900 * reappeared now.
901 *
902 * We must allow 1 and 2a above. But 2b would be a spurious
903 * and unintentional.
904 *
905 * Further in case of 1 and 2a above, the disk at 'path'
906 * would have missed some transaction when it was away and
907 * in case of 2a the stale bdev has to be updated as well.
908 * 2b must not be allowed at all time.
909 */
910
911 /*
0f23ae74
CM
912 * For now, we do allow update to btrfs_fs_device through the
913 * btrfs dev scan cli after FS has been mounted. We're still
914 * tracking a problem where systems fail mount by subvolume id
915 * when we reject replacement on a mounted FS.
b96de000 916 */
0f23ae74 917 if (!fs_devices->opened && found_transid < device->generation) {
77bdae4d
AJ
918 /*
919 * That is if the FS is _not_ mounted and if you
920 * are here, that means there is more than one
921 * disk with same uuid and devid.We keep the one
922 * with larger generation number or the last-in if
923 * generation are equal.
924 */
9c6d173e 925 mutex_unlock(&fs_devices->device_list_mutex);
e124ece5 926 return ERR_PTR(-EEXIST);
77bdae4d 927 }
b96de000 928
a9261d41
AJ
929 /*
930 * We are going to replace the device path for a given devid,
931 * make sure it's the same device if the device is mounted
932 */
933 if (device->bdev) {
4e7b5671
CH
934 int error;
935 dev_t path_dev;
a9261d41 936
4e7b5671
CH
937 error = lookup_bdev(path, &path_dev);
938 if (error) {
a9261d41 939 mutex_unlock(&fs_devices->device_list_mutex);
4e7b5671 940 return ERR_PTR(error);
a9261d41
AJ
941 }
942
4e7b5671 943 if (device->bdev->bd_dev != path_dev) {
a9261d41 944 mutex_unlock(&fs_devices->device_list_mutex);
0697d9a6
JT
945 /*
946 * device->fs_info may not be reliable here, so
947 * pass in a NULL instead. This avoids a
948 * possible use-after-free when the fs_info and
949 * fs_info->sb are already torn down.
950 */
951 btrfs_warn_in_rcu(NULL,
79dae17d
AJ
952 "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
953 path, devid, found_transid,
954 current->comm,
955 task_pid_nr(current));
a9261d41
AJ
956 return ERR_PTR(-EEXIST);
957 }
a9261d41 958 btrfs_info_in_rcu(device->fs_info,
79dae17d
AJ
959 "devid %llu device path %s changed to %s scanned by %s (%d)",
960 devid, rcu_str_deref(device->name),
961 path, current->comm,
962 task_pid_nr(current));
a9261d41
AJ
963 }
964
606686ee 965 name = rcu_string_strdup(path, GFP_NOFS);
9c6d173e
AJ
966 if (!name) {
967 mutex_unlock(&fs_devices->device_list_mutex);
e124ece5 968 return ERR_PTR(-ENOMEM);
9c6d173e 969 }
606686ee
JB
970 rcu_string_free(device->name);
971 rcu_assign_pointer(device->name, name);
e6e674bd 972 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
cd02dca5 973 fs_devices->missing_devices--;
e6e674bd 974 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
cd02dca5 975 }
8a4b83cc
CM
976 }
977
77bdae4d
AJ
978 /*
979 * Unmount does not free the btrfs_device struct but would zero
980 * generation along with most of the other members. So just update
981 * it back. We need it to pick the disk with largest generation
982 * (as above).
983 */
d1a63002 984 if (!fs_devices->opened) {
77bdae4d 985 device->generation = found_transid;
d1a63002
NB
986 fs_devices->latest_generation = max_t(u64, found_transid,
987 fs_devices->latest_generation);
988 }
77bdae4d 989
f2788d2f
AJ
990 fs_devices->total_devices = btrfs_super_num_devices(disk_super);
991
9c6d173e 992 mutex_unlock(&fs_devices->device_list_mutex);
e124ece5 993 return device;
8a4b83cc
CM
994}
995
e4404d6e
YZ
996static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
997{
998 struct btrfs_fs_devices *fs_devices;
999 struct btrfs_device *device;
1000 struct btrfs_device *orig_dev;
d2979aa2 1001 int ret = 0;
e4404d6e 1002
7239ff4b 1003 fs_devices = alloc_fs_devices(orig->fsid, NULL);
2208a378
ID
1004 if (IS_ERR(fs_devices))
1005 return fs_devices;
e4404d6e 1006
adbbb863 1007 mutex_lock(&orig->device_list_mutex);
02db0844 1008 fs_devices->total_devices = orig->total_devices;
e4404d6e
YZ
1009
1010 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
606686ee
JB
1011 struct rcu_string *name;
1012
12bd2fc0
ID
1013 device = btrfs_alloc_device(NULL, &orig_dev->devid,
1014 orig_dev->uuid);
d2979aa2
AJ
1015 if (IS_ERR(device)) {
1016 ret = PTR_ERR(device);
e4404d6e 1017 goto error;
d2979aa2 1018 }
e4404d6e 1019
606686ee
JB
1020 /*
1021 * This is ok to do without rcu read locked because we hold the
1022 * uuid mutex so nothing we touch in here is going to disappear.
1023 */
e755f780 1024 if (orig_dev->name) {
78f2c9e6
DS
1025 name = rcu_string_strdup(orig_dev->name->str,
1026 GFP_KERNEL);
e755f780 1027 if (!name) {
a425f9d4 1028 btrfs_free_device(device);
d2979aa2 1029 ret = -ENOMEM;
e755f780
AJ
1030 goto error;
1031 }
1032 rcu_assign_pointer(device->name, name);
fd2696f3 1033 }
e4404d6e 1034
e4404d6e
YZ
1035 list_add(&device->dev_list, &fs_devices->devices);
1036 device->fs_devices = fs_devices;
1037 fs_devices->num_devices++;
1038 }
adbbb863 1039 mutex_unlock(&orig->device_list_mutex);
e4404d6e
YZ
1040 return fs_devices;
1041error:
adbbb863 1042 mutex_unlock(&orig->device_list_mutex);
e4404d6e 1043 free_fs_devices(fs_devices);
d2979aa2 1044 return ERR_PTR(ret);
e4404d6e
YZ
1045}
1046
3712ccb7 1047static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
bacce86a 1048 struct btrfs_device **latest_dev)
dfe25020 1049{
c6e30871 1050 struct btrfs_device *device, *next;
a6b0d5c8 1051
46224705 1052 /* This is the initialized path, it is safe to release the devices. */
c6e30871 1053 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
3712ccb7 1054 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
401e29c1 1055 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
3712ccb7 1056 &device->dev_state) &&
998a0671
AJ
1057 !test_bit(BTRFS_DEV_STATE_MISSING,
1058 &device->dev_state) &&
3712ccb7
NB
1059 (!*latest_dev ||
1060 device->generation > (*latest_dev)->generation)) {
1061 *latest_dev = device;
a6b0d5c8 1062 }
2b82032c 1063 continue;
a6b0d5c8 1064 }
2b82032c 1065
cf89af14
AJ
1066 /*
1067 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1068 * in btrfs_init_dev_replace() so just continue.
1069 */
1070 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1071 continue;
1072
2b82032c 1073 if (device->bdev) {
d4d77629 1074 blkdev_put(device->bdev, device->mode);
2b82032c
YZ
1075 device->bdev = NULL;
1076 fs_devices->open_devices--;
1077 }
ebbede42 1078 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2b82032c 1079 list_del_init(&device->dev_alloc_list);
ebbede42 1080 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2b82032c 1081 }
e4404d6e
YZ
1082 list_del_init(&device->dev_list);
1083 fs_devices->num_devices--;
a425f9d4 1084 btrfs_free_device(device);
dfe25020 1085 }
2b82032c 1086
3712ccb7
NB
1087}
1088
1089/*
1090 * After we have read the system tree and know devids belonging to this
1091 * filesystem, remove the device which does not belong there.
1092 */
bacce86a 1093void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
3712ccb7
NB
1094{
1095 struct btrfs_device *latest_dev = NULL;
944d3f9f 1096 struct btrfs_fs_devices *seed_dev;
3712ccb7
NB
1097
1098 mutex_lock(&uuid_mutex);
bacce86a 1099 __btrfs_free_extra_devids(fs_devices, &latest_dev);
944d3f9f
NB
1100
1101 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
bacce86a 1102 __btrfs_free_extra_devids(seed_dev, &latest_dev);
2b82032c 1103
443f24fe 1104 fs_devices->latest_bdev = latest_dev->bdev;
a6b0d5c8 1105
dfe25020 1106 mutex_unlock(&uuid_mutex);
dfe25020 1107}
a0af469b 1108
14238819
AJ
1109static void btrfs_close_bdev(struct btrfs_device *device)
1110{
08ffcae8
DS
1111 if (!device->bdev)
1112 return;
1113
ebbede42 1114 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
14238819
AJ
1115 sync_blockdev(device->bdev);
1116 invalidate_bdev(device->bdev);
1117 }
1118
08ffcae8 1119 blkdev_put(device->bdev, device->mode);
14238819
AJ
1120}
1121
959b1c04 1122static void btrfs_close_one_device(struct btrfs_device *device)
f448341a
AJ
1123{
1124 struct btrfs_fs_devices *fs_devices = device->fs_devices;
f448341a 1125
ebbede42 1126 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
f448341a
AJ
1127 device->devid != BTRFS_DEV_REPLACE_DEVID) {
1128 list_del_init(&device->dev_alloc_list);
1129 fs_devices->rw_devices--;
1130 }
1131
e6e674bd 1132 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
f448341a
AJ
1133 fs_devices->missing_devices--;
1134
959b1c04 1135 btrfs_close_bdev(device);
321f69f8 1136 if (device->bdev) {
3fff3975 1137 fs_devices->open_devices--;
321f69f8 1138 device->bdev = NULL;
f448341a 1139 }
321f69f8 1140 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
5b316468 1141 btrfs_destroy_dev_zone_info(device);
f448341a 1142
321f69f8
JT
1143 device->fs_info = NULL;
1144 atomic_set(&device->dev_stats_ccnt, 0);
1145 extent_io_tree_release(&device->alloc_state);
959b1c04 1146
321f69f8
JT
1147 /* Verify the device is back in a pristine state */
1148 ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1149 ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1150 ASSERT(list_empty(&device->dev_alloc_list));
1151 ASSERT(list_empty(&device->post_commit_list));
1152 ASSERT(atomic_read(&device->reada_in_flight) == 0);
f448341a
AJ
1153}
1154
54eed6ae 1155static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
8a4b83cc 1156{
2037a093 1157 struct btrfs_device *device, *tmp;
e4404d6e 1158
425c6ed6
JB
1159 lockdep_assert_held(&uuid_mutex);
1160
2b82032c 1161 if (--fs_devices->opened > 0)
54eed6ae 1162 return;
8a4b83cc 1163
425c6ed6 1164 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
959b1c04 1165 btrfs_close_one_device(device);
c9513edb 1166
e4404d6e
YZ
1167 WARN_ON(fs_devices->open_devices);
1168 WARN_ON(fs_devices->rw_devices);
2b82032c 1169 fs_devices->opened = 0;
0395d84f 1170 fs_devices->seeding = false;
c4989c2f 1171 fs_devices->fs_info = NULL;
8a4b83cc
CM
1172}
1173
54eed6ae 1174void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
2b82032c 1175{
944d3f9f
NB
1176 LIST_HEAD(list);
1177 struct btrfs_fs_devices *tmp;
2b82032c
YZ
1178
1179 mutex_lock(&uuid_mutex);
54eed6ae 1180 close_fs_devices(fs_devices);
944d3f9f
NB
1181 if (!fs_devices->opened)
1182 list_splice_init(&fs_devices->seed_list, &list);
e4404d6e 1183
944d3f9f 1184 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
0226e0eb 1185 close_fs_devices(fs_devices);
944d3f9f 1186 list_del(&fs_devices->seed_list);
e4404d6e
YZ
1187 free_fs_devices(fs_devices);
1188 }
425c6ed6 1189 mutex_unlock(&uuid_mutex);
2b82032c
YZ
1190}
1191
897fb573 1192static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
e4404d6e 1193 fmode_t flags, void *holder)
8a4b83cc 1194{
8a4b83cc 1195 struct btrfs_device *device;
443f24fe 1196 struct btrfs_device *latest_dev = NULL;
96c2e067 1197 struct btrfs_device *tmp_device;
8a4b83cc 1198
d4d77629
TH
1199 flags |= FMODE_EXCL;
1200
96c2e067
AJ
1201 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1202 dev_list) {
1203 int ret;
a0af469b 1204
96c2e067
AJ
1205 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1206 if (ret == 0 &&
1207 (!latest_dev || device->generation > latest_dev->generation)) {
9f050db4 1208 latest_dev = device;
96c2e067
AJ
1209 } else if (ret == -ENODATA) {
1210 fs_devices->num_devices--;
1211 list_del(&device->dev_list);
1212 btrfs_free_device(device);
1213 }
8a4b83cc 1214 }
1ed802c9
AJ
1215 if (fs_devices->open_devices == 0)
1216 return -EINVAL;
1217
2b82032c 1218 fs_devices->opened = 1;
443f24fe 1219 fs_devices->latest_bdev = latest_dev->bdev;
2b82032c 1220 fs_devices->total_rw_bytes = 0;
c4a816c6 1221 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
33fd2f71 1222 fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1ed802c9
AJ
1223
1224 return 0;
2b82032c
YZ
1225}
1226
f8e10cd3
AJ
1227static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1228{
1229 struct btrfs_device *dev1, *dev2;
1230
1231 dev1 = list_entry(a, struct btrfs_device, dev_list);
1232 dev2 = list_entry(b, struct btrfs_device, dev_list);
1233
1234 if (dev1->devid < dev2->devid)
1235 return -1;
1236 else if (dev1->devid > dev2->devid)
1237 return 1;
1238 return 0;
1239}
1240
2b82032c 1241int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
97288f2c 1242 fmode_t flags, void *holder)
2b82032c
YZ
1243{
1244 int ret;
1245
f5194e34 1246 lockdep_assert_held(&uuid_mutex);
18c850fd
JB
1247 /*
1248 * The device_list_mutex cannot be taken here in case opening the
1249 * underlying device takes further locks like bd_mutex.
1250 *
1251 * We also don't need the lock here as this is called during mount and
1252 * exclusion is provided by uuid_mutex
1253 */
f5194e34 1254
2b82032c 1255 if (fs_devices->opened) {
e4404d6e
YZ
1256 fs_devices->opened++;
1257 ret = 0;
2b82032c 1258 } else {
f8e10cd3 1259 list_sort(NULL, &fs_devices->devices, devid_cmp);
897fb573 1260 ret = open_fs_devices(fs_devices, flags, holder);
2b82032c 1261 }
542c5908 1262
8a4b83cc
CM
1263 return ret;
1264}
1265
/*
 * Drop the page reference backing @super, a superblock pointer that lives
 * inside a page (as handed out by btrfs_read_disk_super()).
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
1272
b335eab8 1273static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
12659251 1274 u64 bytenr, u64 bytenr_orig)
6cf86a00 1275{
b335eab8
NB
1276 struct btrfs_super_block *disk_super;
1277 struct page *page;
6cf86a00
AJ
1278 void *p;
1279 pgoff_t index;
1280
1281 /* make sure our super fits in the device */
1282 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
b335eab8 1283 return ERR_PTR(-EINVAL);
6cf86a00
AJ
1284
1285 /* make sure our super fits in the page */
b335eab8
NB
1286 if (sizeof(*disk_super) > PAGE_SIZE)
1287 return ERR_PTR(-EINVAL);
6cf86a00
AJ
1288
1289 /* make sure our super doesn't straddle pages on disk */
1290 index = bytenr >> PAGE_SHIFT;
b335eab8
NB
1291 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1292 return ERR_PTR(-EINVAL);
6cf86a00
AJ
1293
1294 /* pull in the page with our super */
b335eab8 1295 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
6cf86a00 1296
b335eab8
NB
1297 if (IS_ERR(page))
1298 return ERR_CAST(page);
6cf86a00 1299
b335eab8 1300 p = page_address(page);
6cf86a00
AJ
1301
1302 /* align our pointer to the offset of the super block */
b335eab8 1303 disk_super = p + offset_in_page(bytenr);
6cf86a00 1304
12659251 1305 if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
b335eab8 1306 btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
8f32380d 1307 btrfs_release_disk_super(p);
b335eab8 1308 return ERR_PTR(-EINVAL);
6cf86a00
AJ
1309 }
1310
b335eab8
NB
1311 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1312 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
6cf86a00 1313
b335eab8 1314 return disk_super;
6cf86a00
AJ
1315}
1316
228a73ab
AJ
1317int btrfs_forget_devices(const char *path)
1318{
1319 int ret;
1320
1321 mutex_lock(&uuid_mutex);
1322 ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1323 mutex_unlock(&uuid_mutex);
1324
1325 return ret;
1326}
1327
6f60cbd3
DS
1328/*
1329 * Look for a btrfs signature on a device. This may be called out of the mount path
1330 * and we are not allowed to call set_blocksize during the scan. The superblock
1331 * is read via pagecache
1332 */
36350e95
GJ
1333struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1334 void *holder)
8a4b83cc
CM
1335{
1336 struct btrfs_super_block *disk_super;
4306a974 1337 bool new_device_added = false;
36350e95 1338 struct btrfs_device *device = NULL;
8a4b83cc 1339 struct block_device *bdev;
12659251
NA
1340 u64 bytenr, bytenr_orig;
1341 int ret;
8a4b83cc 1342
899f9307
DS
1343 lockdep_assert_held(&uuid_mutex);
1344
6f60cbd3
DS
1345 /*
1346 * we would like to check all the supers, but that would make
1347 * a btrfs mount succeed after a mkfs from a different FS.
1348 * So, we need to add a special mount option to scan for
1349 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1350 */
d4d77629 1351 flags |= FMODE_EXCL;
6f60cbd3
DS
1352
1353 bdev = blkdev_get_by_path(path, flags, holder);
b6ed73bc 1354 if (IS_ERR(bdev))
36350e95 1355 return ERR_CAST(bdev);
6f60cbd3 1356
12659251
NA
1357 bytenr_orig = btrfs_sb_offset(0);
1358 ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
1359 if (ret)
1360 return ERR_PTR(ret);
1361
1362 disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
b335eab8
NB
1363 if (IS_ERR(disk_super)) {
1364 device = ERR_CAST(disk_super);
6f60cbd3 1365 goto error_bdev_put;
05a5c55d 1366 }
6f60cbd3 1367
4306a974 1368 device = device_list_add(path, disk_super, &new_device_added);
36350e95 1369 if (!IS_ERR(device)) {
4306a974
AJ
1370 if (new_device_added)
1371 btrfs_free_stale_devices(path, device);
1372 }
6f60cbd3 1373
8f32380d 1374 btrfs_release_disk_super(disk_super);
6f60cbd3
DS
1375
1376error_bdev_put:
d4d77629 1377 blkdev_put(bdev, flags);
b6ed73bc 1378
36350e95 1379 return device;
8a4b83cc 1380}
0b86a832 1381
1c11b63e
JM
1382/*
1383 * Try to find a chunk that intersects [start, start + len] range and when one
1384 * such is found, record the end of it in *start
1385 */
1c11b63e
JM
1386static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1387 u64 len)
6df9a95e 1388{
1c11b63e 1389 u64 physical_start, physical_end;
6df9a95e 1390
1c11b63e 1391 lockdep_assert_held(&device->fs_info->chunk_mutex);
6df9a95e 1392
1c11b63e
JM
1393 if (!find_first_extent_bit(&device->alloc_state, *start,
1394 &physical_start, &physical_end,
1395 CHUNK_ALLOCATED, NULL)) {
c152b63e 1396
1c11b63e
JM
1397 if (in_range(physical_start, *start, len) ||
1398 in_range(*start, physical_start,
1399 physical_end - physical_start)) {
1400 *start = physical_end + 1;
1401 return true;
6df9a95e
JB
1402 }
1403 }
1c11b63e 1404 return false;
6df9a95e
JB
1405}
1406
3b4ffa40
NA
1407static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1408{
1409 switch (device->fs_devices->chunk_alloc_policy) {
1410 case BTRFS_CHUNK_ALLOC_REGULAR:
1411 /*
1412 * We don't want to overwrite the superblock on the drive nor
1413 * any area used by the boot loader (grub for example), so we
1414 * make sure to start at an offset of at least 1MB.
1415 */
1416 return max_t(u64, start, SZ_1M);
1cd6121f
NA
1417 case BTRFS_CHUNK_ALLOC_ZONED:
1418 /*
1419 * We don't care about the starting region like regular
1420 * allocator, because we anyway use/reserve the first two zones
1421 * for superblock logging.
1422 */
1423 return ALIGN(start, device->zone_info->zone_size);
3b4ffa40
NA
1424 default:
1425 BUG();
1426 }
1427}
1428
1cd6121f
NA
1429static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1430 u64 *hole_start, u64 *hole_size,
1431 u64 num_bytes)
1432{
1433 u64 zone_size = device->zone_info->zone_size;
1434 u64 pos;
1435 int ret;
1436 bool changed = false;
1437
1438 ASSERT(IS_ALIGNED(*hole_start, zone_size));
1439
1440 while (*hole_size > 0) {
1441 pos = btrfs_find_allocatable_zones(device, *hole_start,
1442 *hole_start + *hole_size,
1443 num_bytes);
1444 if (pos != *hole_start) {
1445 *hole_size = *hole_start + *hole_size - pos;
1446 *hole_start = pos;
1447 changed = true;
1448 if (*hole_size < num_bytes)
1449 break;
1450 }
1451
1452 ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
1453
1454 /* Range is ensured to be empty */
1455 if (!ret)
1456 return changed;
1457
1458 /* Given hole range was invalid (outside of device) */
1459 if (ret == -ERANGE) {
1460 *hole_start += *hole_size;
1461 *hole_size = 0;
1462 return 1;
1463 }
1464
1465 *hole_start += zone_size;
1466 *hole_size -= zone_size;
1467 changed = true;
1468 }
1469
1470 return changed;
1471}
1472
3b4ffa40
NA
1473/**
1474 * dev_extent_hole_check - check if specified hole is suitable for allocation
1475 * @device: the device which we have the hole
1476 * @hole_start: starting position of the hole
1477 * @hole_size: the size of the hole
1478 * @num_bytes: the size of the free space that we need
1479 *
1cd6121f 1480 * This function may modify @hole_start and @hole_size to reflect the suitable
3b4ffa40
NA
1481 * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1482 */
1483static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1484 u64 *hole_size, u64 num_bytes)
1485{
1486 bool changed = false;
1487 u64 hole_end = *hole_start + *hole_size;
1488
1cd6121f
NA
1489 for (;;) {
1490 /*
1491 * Check before we set max_hole_start, otherwise we could end up
1492 * sending back this offset anyway.
1493 */
1494 if (contains_pending_extent(device, hole_start, *hole_size)) {
1495 if (hole_end >= *hole_start)
1496 *hole_size = hole_end - *hole_start;
1497 else
1498 *hole_size = 0;
1499 changed = true;
1500 }
1501
1502 switch (device->fs_devices->chunk_alloc_policy) {
1503 case BTRFS_CHUNK_ALLOC_REGULAR:
1504 /* No extra check */
1505 break;
1506 case BTRFS_CHUNK_ALLOC_ZONED:
1507 if (dev_extent_hole_check_zoned(device, hole_start,
1508 hole_size, num_bytes)) {
1509 changed = true;
1510 /*
1511 * The changed hole can contain pending extent.
1512 * Loop again to check that.
1513 */
1514 continue;
1515 }
1516 break;
1517 default:
1518 BUG();
1519 }
3b4ffa40 1520
3b4ffa40 1521 break;
3b4ffa40
NA
1522 }
1523
1524 return changed;
1525}
6df9a95e 1526
0b86a832 1527/*
499f377f
JM
1528 * find_free_dev_extent_start - find free space in the specified device
1529 * @device: the device which we search the free space in
1530 * @num_bytes: the size of the free space that we need
1531 * @search_start: the position from which to begin the search
1532 * @start: store the start of the free space.
1533 * @len: the size of the free space. that we find, or the size
1534 * of the max free space if we don't find suitable free space
7bfc837d 1535 *
0b86a832
CM
1536 * this uses a pretty simple search, the expectation is that it is
1537 * called very infrequently and that a given device has a small number
1538 * of extents
7bfc837d
MX
1539 *
1540 * @start is used to store the start of the free space if we find. But if we
1541 * don't find suitable free space, it will be used to store the start position
1542 * of the max free space.
1543 *
1544 * @len is used to store the size of the free space that we find.
1545 * But if we don't find suitable free space, it is used to store the size of
1546 * the max free space.
135da976
QW
1547 *
1548 * NOTE: This function will search *commit* root of device tree, and does extra
1549 * check to ensure dev extents are not double allocated.
1550 * This makes the function safe to allocate dev extents but may not report
1551 * correct usable device space, as device extent freed in current transaction
1552 * is not reported as avaiable.
0b86a832 1553 */
9e3246a5
QW
1554static int find_free_dev_extent_start(struct btrfs_device *device,
1555 u64 num_bytes, u64 search_start, u64 *start,
1556 u64 *len)
0b86a832 1557{
0b246afa
JM
1558 struct btrfs_fs_info *fs_info = device->fs_info;
1559 struct btrfs_root *root = fs_info->dev_root;
0b86a832 1560 struct btrfs_key key;
7bfc837d 1561 struct btrfs_dev_extent *dev_extent;
2b82032c 1562 struct btrfs_path *path;
7bfc837d
MX
1563 u64 hole_size;
1564 u64 max_hole_start;
1565 u64 max_hole_size;
1566 u64 extent_end;
0b86a832
CM
1567 u64 search_end = device->total_bytes;
1568 int ret;
7bfc837d 1569 int slot;
0b86a832 1570 struct extent_buffer *l;
8cdc7c5b 1571
3b4ffa40 1572 search_start = dev_extent_search_start(device, search_start);
0b86a832 1573
1cd6121f
NA
1574 WARN_ON(device->zone_info &&
1575 !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
1576
6df9a95e
JB
1577 path = btrfs_alloc_path();
1578 if (!path)
1579 return -ENOMEM;
f2ab7618 1580
7bfc837d
MX
1581 max_hole_start = search_start;
1582 max_hole_size = 0;
1583
f2ab7618 1584again:
401e29c1
AJ
1585 if (search_start >= search_end ||
1586 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7bfc837d 1587 ret = -ENOSPC;
6df9a95e 1588 goto out;
7bfc837d
MX
1589 }
1590
e4058b54 1591 path->reada = READA_FORWARD;
6df9a95e
JB
1592 path->search_commit_root = 1;
1593 path->skip_locking = 1;
7bfc837d 1594
0b86a832
CM
1595 key.objectid = device->devid;
1596 key.offset = search_start;
1597 key.type = BTRFS_DEV_EXTENT_KEY;
7bfc837d 1598
125ccb0a 1599 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
0b86a832 1600 if (ret < 0)
7bfc837d 1601 goto out;
1fcbac58
YZ
1602 if (ret > 0) {
1603 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1604 if (ret < 0)
7bfc837d 1605 goto out;
1fcbac58 1606 }
7bfc837d 1607
0b86a832
CM
1608 while (1) {
1609 l = path->nodes[0];
1610 slot = path->slots[0];
1611 if (slot >= btrfs_header_nritems(l)) {
1612 ret = btrfs_next_leaf(root, path);
1613 if (ret == 0)
1614 continue;
1615 if (ret < 0)
7bfc837d
MX
1616 goto out;
1617
1618 break;
0b86a832
CM
1619 }
1620 btrfs_item_key_to_cpu(l, &key, slot);
1621
1622 if (key.objectid < device->devid)
1623 goto next;
1624
1625 if (key.objectid > device->devid)
7bfc837d 1626 break;
0b86a832 1627
962a298f 1628 if (key.type != BTRFS_DEV_EXTENT_KEY)
7bfc837d 1629 goto next;
9779b72f 1630
7bfc837d
MX
1631 if (key.offset > search_start) {
1632 hole_size = key.offset - search_start;
3b4ffa40
NA
1633 dev_extent_hole_check(device, &search_start, &hole_size,
1634 num_bytes);
6df9a95e 1635
7bfc837d
MX
1636 if (hole_size > max_hole_size) {
1637 max_hole_start = search_start;
1638 max_hole_size = hole_size;
1639 }
9779b72f 1640
7bfc837d
MX
1641 /*
1642 * If this free space is greater than which we need,
1643 * it must be the max free space that we have found
1644 * until now, so max_hole_start must point to the start
1645 * of this free space and the length of this free space
1646 * is stored in max_hole_size. Thus, we return
1647 * max_hole_start and max_hole_size and go back to the
1648 * caller.
1649 */
1650 if (hole_size >= num_bytes) {
1651 ret = 0;
1652 goto out;
0b86a832
CM
1653 }
1654 }
0b86a832 1655
0b86a832 1656 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
7bfc837d
MX
1657 extent_end = key.offset + btrfs_dev_extent_length(l,
1658 dev_extent);
1659 if (extent_end > search_start)
1660 search_start = extent_end;
0b86a832
CM
1661next:
1662 path->slots[0]++;
1663 cond_resched();
1664 }
0b86a832 1665
38c01b96 1666 /*
1667 * At this point, search_start should be the end of
1668 * allocated dev extents, and when shrinking the device,
1669 * search_end may be smaller than search_start.
1670 */
f2ab7618 1671 if (search_end > search_start) {
38c01b96 1672 hole_size = search_end - search_start;
3b4ffa40
NA
1673 if (dev_extent_hole_check(device, &search_start, &hole_size,
1674 num_bytes)) {
f2ab7618
ZL
1675 btrfs_release_path(path);
1676 goto again;
1677 }
0b86a832 1678
f2ab7618
ZL
1679 if (hole_size > max_hole_size) {
1680 max_hole_start = search_start;
1681 max_hole_size = hole_size;
1682 }
6df9a95e
JB
1683 }
1684
7bfc837d 1685 /* See above. */
f2ab7618 1686 if (max_hole_size < num_bytes)
7bfc837d
MX
1687 ret = -ENOSPC;
1688 else
1689 ret = 0;
1690
1691out:
2b82032c 1692 btrfs_free_path(path);
7bfc837d 1693 *start = max_hole_start;
b2117a39 1694 if (len)
7bfc837d 1695 *len = max_hole_size;
0b86a832
CM
1696 return ret;
1697}
1698
60dfdf25 1699int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
499f377f
JM
1700 u64 *start, u64 *len)
1701{
499f377f 1702 /* FIXME use last free of some kind */
60dfdf25 1703 return find_free_dev_extent_start(device, num_bytes, 0, start, len);
499f377f
JM
1704}
1705
b2950863 1706static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
8f18cf13 1707 struct btrfs_device *device,
2196d6e8 1708 u64 start, u64 *dev_extent_len)
8f18cf13 1709{
0b246afa
JM
1710 struct btrfs_fs_info *fs_info = device->fs_info;
1711 struct btrfs_root *root = fs_info->dev_root;
8f18cf13
CM
1712 int ret;
1713 struct btrfs_path *path;
8f18cf13 1714 struct btrfs_key key;
a061fc8d
CM
1715 struct btrfs_key found_key;
1716 struct extent_buffer *leaf = NULL;
1717 struct btrfs_dev_extent *extent = NULL;
8f18cf13
CM
1718
1719 path = btrfs_alloc_path();
1720 if (!path)
1721 return -ENOMEM;
1722
1723 key.objectid = device->devid;
1724 key.offset = start;
1725 key.type = BTRFS_DEV_EXTENT_KEY;
924cd8fb 1726again:
8f18cf13 1727 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
a061fc8d
CM
1728 if (ret > 0) {
1729 ret = btrfs_previous_item(root, path, key.objectid,
1730 BTRFS_DEV_EXTENT_KEY);
b0b802d7
TI
1731 if (ret)
1732 goto out;
a061fc8d
CM
1733 leaf = path->nodes[0];
1734 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1735 extent = btrfs_item_ptr(leaf, path->slots[0],
1736 struct btrfs_dev_extent);
1737 BUG_ON(found_key.offset > start || found_key.offset +
1738 btrfs_dev_extent_length(leaf, extent) < start);
924cd8fb
MX
1739 key = found_key;
1740 btrfs_release_path(path);
1741 goto again;
a061fc8d
CM
1742 } else if (ret == 0) {
1743 leaf = path->nodes[0];
1744 extent = btrfs_item_ptr(leaf, path->slots[0],
1745 struct btrfs_dev_extent);
79787eaa 1746 } else {
0b246afa 1747 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
79787eaa 1748 goto out;
a061fc8d 1749 }
8f18cf13 1750
2196d6e8
MX
1751 *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1752
8f18cf13 1753 ret = btrfs_del_item(trans, root, path);
79787eaa 1754 if (ret) {
0b246afa
JM
1755 btrfs_handle_fs_error(fs_info, ret,
1756 "Failed to remove dev extent item");
13212b54 1757 } else {
3204d33c 1758 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
79787eaa 1759 }
b0b802d7 1760out:
8f18cf13
CM
1761 btrfs_free_path(path);
1762 return ret;
1763}
1764
48a3b636
ES
1765static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1766 struct btrfs_device *device,
48a3b636 1767 u64 chunk_offset, u64 start, u64 num_bytes)
0b86a832
CM
1768{
1769 int ret;
1770 struct btrfs_path *path;
0b246afa
JM
1771 struct btrfs_fs_info *fs_info = device->fs_info;
1772 struct btrfs_root *root = fs_info->dev_root;
0b86a832
CM
1773 struct btrfs_dev_extent *extent;
1774 struct extent_buffer *leaf;
1775 struct btrfs_key key;
1776
e12c9621 1777 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
401e29c1 1778 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
0b86a832
CM
1779 path = btrfs_alloc_path();
1780 if (!path)
1781 return -ENOMEM;
1782
0b86a832 1783 key.objectid = device->devid;
2b82032c 1784 key.offset = start;
0b86a832
CM
1785 key.type = BTRFS_DEV_EXTENT_KEY;
1786 ret = btrfs_insert_empty_item(trans, root, path, &key,
1787 sizeof(*extent));
2cdcecbc
MF
1788 if (ret)
1789 goto out;
0b86a832
CM
1790
1791 leaf = path->nodes[0];
1792 extent = btrfs_item_ptr(leaf, path->slots[0],
1793 struct btrfs_dev_extent);
b5d9071c
NB
1794 btrfs_set_dev_extent_chunk_tree(leaf, extent,
1795 BTRFS_CHUNK_TREE_OBJECTID);
0ca00afb
NB
1796 btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1797 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
e17cade2
CM
1798 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1799
0b86a832
CM
1800 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1801 btrfs_mark_buffer_dirty(leaf);
2cdcecbc 1802out:
0b86a832
CM
1803 btrfs_free_path(path);
1804 return ret;
1805}
1806
6df9a95e 1807static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
0b86a832 1808{
6df9a95e
JB
1809 struct extent_map_tree *em_tree;
1810 struct extent_map *em;
1811 struct rb_node *n;
1812 u64 ret = 0;
0b86a832 1813
c8bf1b67 1814 em_tree = &fs_info->mapping_tree;
6df9a95e 1815 read_lock(&em_tree->lock);
07e1ce09 1816 n = rb_last(&em_tree->map.rb_root);
6df9a95e
JB
1817 if (n) {
1818 em = rb_entry(n, struct extent_map, rb_node);
1819 ret = em->start + em->len;
0b86a832 1820 }
6df9a95e
JB
1821 read_unlock(&em_tree->lock);
1822
0b86a832
CM
1823 return ret;
1824}
1825
53f10659
ID
1826static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1827 u64 *devid_ret)
0b86a832
CM
1828{
1829 int ret;
1830 struct btrfs_key key;
1831 struct btrfs_key found_key;
2b82032c
YZ
1832 struct btrfs_path *path;
1833
2b82032c
YZ
1834 path = btrfs_alloc_path();
1835 if (!path)
1836 return -ENOMEM;
0b86a832
CM
1837
1838 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1839 key.type = BTRFS_DEV_ITEM_KEY;
1840 key.offset = (u64)-1;
1841
53f10659 1842 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
0b86a832
CM
1843 if (ret < 0)
1844 goto error;
1845
a06dee4d
AJ
1846 if (ret == 0) {
1847 /* Corruption */
1848 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1849 ret = -EUCLEAN;
1850 goto error;
1851 }
0b86a832 1852
53f10659
ID
1853 ret = btrfs_previous_item(fs_info->chunk_root, path,
1854 BTRFS_DEV_ITEMS_OBJECTID,
0b86a832
CM
1855 BTRFS_DEV_ITEM_KEY);
1856 if (ret) {
53f10659 1857 *devid_ret = 1;
0b86a832
CM
1858 } else {
1859 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1860 path->slots[0]);
53f10659 1861 *devid_ret = found_key.offset + 1;
0b86a832
CM
1862 }
1863 ret = 0;
1864error:
2b82032c 1865 btrfs_free_path(path);
0b86a832
CM
1866 return ret;
1867}
1868
1869/*
1870 * the device information is stored in the chunk root
1871 * the btrfs_device struct should be fully filled in
1872 */
c74a0b02 1873static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
48a3b636 1874 struct btrfs_device *device)
0b86a832
CM
1875{
1876 int ret;
1877 struct btrfs_path *path;
1878 struct btrfs_dev_item *dev_item;
1879 struct extent_buffer *leaf;
1880 struct btrfs_key key;
1881 unsigned long ptr;
0b86a832 1882
0b86a832
CM
1883 path = btrfs_alloc_path();
1884 if (!path)
1885 return -ENOMEM;
1886
0b86a832
CM
1887 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1888 key.type = BTRFS_DEV_ITEM_KEY;
2b82032c 1889 key.offset = device->devid;
0b86a832 1890
8e87e856
NB
1891 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1892 &key, sizeof(*dev_item));
0b86a832
CM
1893 if (ret)
1894 goto out;
1895
1896 leaf = path->nodes[0];
1897 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1898
1899 btrfs_set_device_id(leaf, dev_item, device->devid);
2b82032c 1900 btrfs_set_device_generation(leaf, dev_item, 0);
0b86a832
CM
1901 btrfs_set_device_type(leaf, dev_item, device->type);
1902 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1903 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1904 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
7cc8e58d
MX
1905 btrfs_set_device_total_bytes(leaf, dev_item,
1906 btrfs_device_get_disk_total_bytes(device));
1907 btrfs_set_device_bytes_used(leaf, dev_item,
1908 btrfs_device_get_bytes_used(device));
e17cade2
CM
1909 btrfs_set_device_group(leaf, dev_item, 0);
1910 btrfs_set_device_seek_speed(leaf, dev_item, 0);
1911 btrfs_set_device_bandwidth(leaf, dev_item, 0);
c3027eb5 1912 btrfs_set_device_start_offset(leaf, dev_item, 0);
0b86a832 1913
410ba3a2 1914 ptr = btrfs_device_uuid(dev_item);
e17cade2 1915 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1473b24e 1916 ptr = btrfs_device_fsid(dev_item);
de37aa51
NB
1917 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1918 ptr, BTRFS_FSID_SIZE);
0b86a832 1919 btrfs_mark_buffer_dirty(leaf);
0b86a832 1920
2b82032c 1921 ret = 0;
0b86a832
CM
1922out:
1923 btrfs_free_path(path);
1924 return ret;
1925}
8f18cf13 1926
5a1972bd
QW
1927/*
1928 * Function to update ctime/mtime for a given device path.
1929 * Mainly used for ctime/mtime based probe like libblkid.
1930 */
da353f6b 1931static void update_dev_time(const char *path_name)
5a1972bd
QW
1932{
1933 struct file *filp;
1934
1935 filp = filp_open(path_name, O_RDWR, 0);
98af592f 1936 if (IS_ERR(filp))
5a1972bd
QW
1937 return;
1938 file_update_time(filp);
1939 filp_close(filp, NULL);
5a1972bd
QW
1940}
1941
f331a952 1942static int btrfs_rm_dev_item(struct btrfs_device *device)
a061fc8d 1943{
f331a952 1944 struct btrfs_root *root = device->fs_info->chunk_root;
a061fc8d
CM
1945 int ret;
1946 struct btrfs_path *path;
a061fc8d 1947 struct btrfs_key key;
a061fc8d
CM
1948 struct btrfs_trans_handle *trans;
1949
a061fc8d
CM
1950 path = btrfs_alloc_path();
1951 if (!path)
1952 return -ENOMEM;
1953
a22285a6 1954 trans = btrfs_start_transaction(root, 0);
98d5dc13
TI
1955 if (IS_ERR(trans)) {
1956 btrfs_free_path(path);
1957 return PTR_ERR(trans);
1958 }
a061fc8d
CM
1959 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1960 key.type = BTRFS_DEV_ITEM_KEY;
1961 key.offset = device->devid;
1962
1963 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5e9f2ad5
NB
1964 if (ret) {
1965 if (ret > 0)
1966 ret = -ENOENT;
1967 btrfs_abort_transaction(trans, ret);
1968 btrfs_end_transaction(trans);
a061fc8d
CM
1969 goto out;
1970 }
1971
1972 ret = btrfs_del_item(trans, root, path);
5e9f2ad5
NB
1973 if (ret) {
1974 btrfs_abort_transaction(trans, ret);
1975 btrfs_end_transaction(trans);
1976 }
1977
a061fc8d
CM
1978out:
1979 btrfs_free_path(path);
5e9f2ad5
NB
1980 if (!ret)
1981 ret = btrfs_commit_transaction(trans);
a061fc8d
CM
1982 return ret;
1983}
1984
3cc31a0d
DS
1985/*
1986 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1987 * filesystem. It's up to the caller to adjust that number regarding eg. device
1988 * replace.
1989 */
1990static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1991 u64 num_devices)
a061fc8d 1992{
a061fc8d 1993 u64 all_avail;
de98ced9 1994 unsigned seq;
418775a2 1995 int i;
a061fc8d 1996
de98ced9 1997 do {
bd45ffbc 1998 seq = read_seqbegin(&fs_info->profiles_lock);
de98ced9 1999
bd45ffbc
AJ
2000 all_avail = fs_info->avail_data_alloc_bits |
2001 fs_info->avail_system_alloc_bits |
2002 fs_info->avail_metadata_alloc_bits;
2003 } while (read_seqretry(&fs_info->profiles_lock, seq));
a061fc8d 2004
418775a2 2005 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
41a6e891 2006 if (!(all_avail & btrfs_raid_array[i].bg_flag))
418775a2 2007 continue;
a061fc8d 2008
418775a2 2009 if (num_devices < btrfs_raid_array[i].devs_min) {
f9fbcaa2 2010 int ret = btrfs_raid_array[i].mindev_error;
bd45ffbc 2011
418775a2
DS
2012 if (ret)
2013 return ret;
2014 }
53b381b3
DW
2015 }
2016
bd45ffbc 2017 return 0;
f1fa7f26
AJ
2018}
2019
c9162bdf
OS
2020static struct btrfs_device * btrfs_find_next_active_device(
2021 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
a061fc8d 2022{
2b82032c 2023 struct btrfs_device *next_device;
88acff64
AJ
2024
2025 list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
2026 if (next_device != device &&
e6e674bd
AJ
2027 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2028 && next_device->bdev)
88acff64
AJ
2029 return next_device;
2030 }
2031
2032 return NULL;
2033}
2034
2035/*
2036 * Helper function to check if the given device is part of s_bdev / latest_bdev
2037 * and replace it with the provided or the next active device, in the context
2038 * where this function called, there should be always be another device (or
2039 * this_dev) which is active.
2040 */
b105e927 2041void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
e493e8f9 2042 struct btrfs_device *next_device)
88acff64 2043{
d6507cf1 2044 struct btrfs_fs_info *fs_info = device->fs_info;
88acff64 2045
e493e8f9 2046 if (!next_device)
88acff64 2047 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
e493e8f9 2048 device);
88acff64
AJ
2049 ASSERT(next_device);
2050
2051 if (fs_info->sb->s_bdev &&
2052 (fs_info->sb->s_bdev == device->bdev))
2053 fs_info->sb->s_bdev = next_device->bdev;
2054
2055 if (fs_info->fs_devices->latest_bdev == device->bdev)
2056 fs_info->fs_devices->latest_bdev = next_device->bdev;
2057}
2058
1da73967
AJ
2059/*
2060 * Return btrfs_fs_devices::num_devices excluding the device that's being
2061 * currently replaced.
2062 */
2063static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2064{
2065 u64 num_devices = fs_info->fs_devices->num_devices;
2066
cb5583dd 2067 down_read(&fs_info->dev_replace.rwsem);
1da73967
AJ
2068 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2069 ASSERT(num_devices > 1);
2070 num_devices--;
2071 }
cb5583dd 2072 up_read(&fs_info->dev_replace.rwsem);
1da73967
AJ
2073
2074 return num_devices;
2075}
2076
313b0858
JB
2077void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2078 struct block_device *bdev,
2079 const char *device_path)
6fbceb9f 2080{
6fbceb9f
JT
2081 struct btrfs_super_block *disk_super;
2082 int copy_num;
2083
2084 if (!bdev)
2085 return;
2086
2087 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
8f32380d
JT
2088 struct page *page;
2089 int ret;
6fbceb9f 2090
8f32380d
JT
2091 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2092 if (IS_ERR(disk_super))
2093 continue;
6fbceb9f 2094
12659251
NA
2095 if (bdev_is_zoned(bdev)) {
2096 btrfs_reset_sb_log_zones(bdev, copy_num);
2097 continue;
2098 }
2099
6fbceb9f 2100 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
8f32380d
JT
2101
2102 page = virt_to_page(disk_super);
2103 set_page_dirty(page);
2104 lock_page(page);
2105 /* write_on_page() unlocks the page */
2106 ret = write_one_page(page);
2107 if (ret)
2108 btrfs_warn(fs_info,
2109 "error clearing superblock number %d (%d)",
2110 copy_num, ret);
2111 btrfs_release_disk_super(disk_super);
2112
6fbceb9f
JT
2113 }
2114
2115 /* Notify udev that device has changed */
2116 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2117
2118 /* Update ctime/mtime for device path for libblkid */
2119 update_dev_time(device_path);
2120}
2121
da353f6b 2122int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
53f8a74c 2123 u64 devid)
f1fa7f26
AJ
2124{
2125 struct btrfs_device *device;
1f78160c 2126 struct btrfs_fs_devices *cur_devices;
b5185197 2127 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2b82032c 2128 u64 num_devices;
a061fc8d
CM
2129 int ret = 0;
2130
a061fc8d
CM
2131 mutex_lock(&uuid_mutex);
2132
1da73967 2133 num_devices = btrfs_num_devices(fs_info);
8dabb742 2134
0b246afa 2135 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
f1fa7f26 2136 if (ret)
a061fc8d 2137 goto out;
a061fc8d 2138
a27a94c2
NB
2139 device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2140
2141 if (IS_ERR(device)) {
2142 if (PTR_ERR(device) == -ENOENT &&
2143 strcmp(device_path, "missing") == 0)
2144 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2145 else
2146 ret = PTR_ERR(device);
53b381b3 2147 goto out;
a27a94c2 2148 }
dfe25020 2149
eede2bf3
OS
2150 if (btrfs_pinned_by_swapfile(fs_info, device)) {
2151 btrfs_warn_in_rcu(fs_info,
2152 "cannot remove device %s (devid %llu) due to active swapfile",
2153 rcu_str_deref(device->name), device->devid);
2154 ret = -ETXTBSY;
2155 goto out;
2156 }
2157
401e29c1 2158 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
183860f6 2159 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
24fc572f 2160 goto out;
63a212ab
SB
2161 }
2162
ebbede42
AJ
2163 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2164 fs_info->fs_devices->rw_devices == 1) {
183860f6 2165 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
24fc572f 2166 goto out;
2b82032c
YZ
2167 }
2168
ebbede42 2169 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
34441361 2170 mutex_lock(&fs_info->chunk_mutex);
2b82032c 2171 list_del_init(&device->dev_alloc_list);
c3929c36 2172 device->fs_devices->rw_devices--;
34441361 2173 mutex_unlock(&fs_info->chunk_mutex);
dfe25020 2174 }
a061fc8d 2175
d7901554 2176 mutex_unlock(&uuid_mutex);
a061fc8d 2177 ret = btrfs_shrink_device(device, 0);
66d204a1
FM
2178 if (!ret)
2179 btrfs_reada_remove_dev(device);
d7901554 2180 mutex_lock(&uuid_mutex);
a061fc8d 2181 if (ret)
9b3517e9 2182 goto error_undo;
a061fc8d 2183
63a212ab
SB
2184 /*
2185 * TODO: the superblock still includes this device in its num_devices
2186 * counter although write_all_supers() is not locked out. This
2187 * could give a filesystem state which requires a degraded mount.
2188 */
f331a952 2189 ret = btrfs_rm_dev_item(device);
a061fc8d 2190 if (ret)
9b3517e9 2191 goto error_undo;
a061fc8d 2192
e12c9621 2193 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
163e97ee 2194 btrfs_scrub_cancel_dev(device);
e5e9a520
CM
2195
2196 /*
2197 * the device list mutex makes sure that we don't change
2198 * the device list while someone else is writing out all
d7306801
FDBM
2199 * the device supers. Whoever is writing all supers, should
2200 * lock the device list mutex before getting the number of
2201 * devices in the super block (super_copy). Conversely,
2202 * whoever updates the number of devices in the super block
2203 * (super_copy) should hold the device list mutex.
e5e9a520 2204 */
1f78160c 2205
41a52a0f
AJ
2206 /*
2207 * In normal cases the cur_devices == fs_devices. But in case
2208 * of deleting a seed device, the cur_devices should point to
2209 * its own fs_devices listed under the fs_devices->seed.
2210 */
1f78160c 2211 cur_devices = device->fs_devices;
b5185197 2212 mutex_lock(&fs_devices->device_list_mutex);
1f78160c 2213 list_del_rcu(&device->dev_list);
e5e9a520 2214
41a52a0f
AJ
2215 cur_devices->num_devices--;
2216 cur_devices->total_devices--;
b4993e64
AJ
2217 /* Update total_devices of the parent fs_devices if it's seed */
2218 if (cur_devices != fs_devices)
2219 fs_devices->total_devices--;
2b82032c 2220
e6e674bd 2221 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
41a52a0f 2222 cur_devices->missing_devices--;
cd02dca5 2223
d6507cf1 2224 btrfs_assign_next_active_device(device, NULL);
2b82032c 2225
0bfaa9c5 2226 if (device->bdev) {
41a52a0f 2227 cur_devices->open_devices--;
0bfaa9c5 2228 /* remove sysfs entry */
53f8a74c 2229 btrfs_sysfs_remove_device(device);
0bfaa9c5 2230 }
99994cde 2231
0b246afa
JM
2232 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2233 btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
b5185197 2234 mutex_unlock(&fs_devices->device_list_mutex);
2b82032c 2235
cea67ab9
JM
2236 /*
2237 * at this point, the device is zero sized and detached from
2238 * the devices list. All that's left is to zero out the old
2239 * supers and free the device.
2240 */
ebbede42 2241 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
8f32380d
JT
2242 btrfs_scratch_superblocks(fs_info, device->bdev,
2243 device->name->str);
cea67ab9
JM
2244
2245 btrfs_close_bdev(device);
8e75fd89
NB
2246 synchronize_rcu();
2247 btrfs_free_device(device);
cea67ab9 2248
1f78160c 2249 if (cur_devices->open_devices == 0) {
944d3f9f 2250 list_del_init(&cur_devices->seed_list);
0226e0eb 2251 close_fs_devices(cur_devices);
1f78160c 2252 free_fs_devices(cur_devices);
2b82032c
YZ
2253 }
2254
a061fc8d
CM
2255out:
2256 mutex_unlock(&uuid_mutex);
a061fc8d 2257 return ret;
24fc572f 2258
9b3517e9 2259error_undo:
66d204a1 2260 btrfs_reada_undo_remove_dev(device);
ebbede42 2261 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
34441361 2262 mutex_lock(&fs_info->chunk_mutex);
9b3517e9 2263 list_add(&device->dev_alloc_list,
b5185197 2264 &fs_devices->alloc_list);
c3929c36 2265 device->fs_devices->rw_devices++;
34441361 2266 mutex_unlock(&fs_info->chunk_mutex);
9b3517e9 2267 }
24fc572f 2268 goto out;
a061fc8d
CM
2269}
2270
68a9db5f 2271void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
e93c89c1 2272{
d51908ce
AJ
2273 struct btrfs_fs_devices *fs_devices;
2274
68a9db5f 2275 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
1357272f 2276
25e8e911
AJ
2277 /*
2278 * in case of fs with no seed, srcdev->fs_devices will point
2279 * to fs_devices of fs_info. However when the dev being replaced is
2280 * a seed dev it will point to the seed's local fs_devices. In short
2281 * srcdev will have its correct fs_devices in both the cases.
2282 */
2283 fs_devices = srcdev->fs_devices;
d51908ce 2284
e93c89c1 2285 list_del_rcu(&srcdev->dev_list);
619c47f3 2286 list_del(&srcdev->dev_alloc_list);
d51908ce 2287 fs_devices->num_devices--;
e6e674bd 2288 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
d51908ce 2289 fs_devices->missing_devices--;
e93c89c1 2290
ebbede42 2291 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
82372bc8 2292 fs_devices->rw_devices--;
1357272f 2293
82372bc8 2294 if (srcdev->bdev)
d51908ce 2295 fs_devices->open_devices--;
084b6e7c
QW
2296}
2297
65237ee3 2298void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
084b6e7c
QW
2299{
2300 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
e93c89c1 2301
a466c85e
JB
2302 mutex_lock(&uuid_mutex);
2303
14238819 2304 btrfs_close_bdev(srcdev);
8e75fd89
NB
2305 synchronize_rcu();
2306 btrfs_free_device(srcdev);
94d5f0c2 2307
94d5f0c2
AJ
2308 /* if this is no devs we rather delete the fs_devices */
2309 if (!fs_devices->num_devices) {
6dd38f81
AJ
2310 /*
2311 * On a mounted FS, num_devices can't be zero unless it's a
2312 * seed. In case of a seed device being replaced, the replace
2313 * target added to the sprout FS, so there will be no more
2314 * device left under the seed FS.
2315 */
2316 ASSERT(fs_devices->seeding);
2317
944d3f9f 2318 list_del_init(&fs_devices->seed_list);
0226e0eb 2319 close_fs_devices(fs_devices);
8bef8401 2320 free_fs_devices(fs_devices);
94d5f0c2 2321 }
a466c85e 2322 mutex_unlock(&uuid_mutex);
e93c89c1
SB
2323}
2324
4f5ad7bd 2325void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
e93c89c1 2326{
4f5ad7bd 2327 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
d9a071f0 2328
d9a071f0 2329 mutex_lock(&fs_devices->device_list_mutex);
d2ff1b20 2330
53f8a74c 2331 btrfs_sysfs_remove_device(tgtdev);
d2ff1b20 2332
779bf3fe 2333 if (tgtdev->bdev)
d9a071f0 2334 fs_devices->open_devices--;
779bf3fe 2335
d9a071f0 2336 fs_devices->num_devices--;
e93c89c1 2337
d6507cf1 2338 btrfs_assign_next_active_device(tgtdev, NULL);
e93c89c1 2339
e93c89c1 2340 list_del_rcu(&tgtdev->dev_list);
e93c89c1 2341
d9a071f0 2342 mutex_unlock(&fs_devices->device_list_mutex);
779bf3fe
AJ
2343
2344 /*
2345 * The update_dev_time() with in btrfs_scratch_superblocks()
2346 * may lead to a call to btrfs_show_devname() which will try
2347 * to hold device_list_mutex. And here this device
2348 * is already out of device list, so we don't have to hold
2349 * the device_list_mutex lock.
2350 */
8f32380d
JT
2351 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2352 tgtdev->name->str);
14238819
AJ
2353
2354 btrfs_close_bdev(tgtdev);
8e75fd89
NB
2355 synchronize_rcu();
2356 btrfs_free_device(tgtdev);
e93c89c1
SB
2357}
2358
b444ad46
NB
2359static struct btrfs_device *btrfs_find_device_by_path(
2360 struct btrfs_fs_info *fs_info, const char *device_path)
7ba15b7d
SB
2361{
2362 int ret = 0;
2363 struct btrfs_super_block *disk_super;
2364 u64 devid;
2365 u8 *dev_uuid;
2366 struct block_device *bdev;
b444ad46 2367 struct btrfs_device *device;
7ba15b7d 2368
7ba15b7d 2369 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
8f32380d 2370 fs_info->bdev_holder, 0, &bdev, &disk_super);
7ba15b7d 2371 if (ret)
b444ad46 2372 return ERR_PTR(ret);
8f32380d 2373
7ba15b7d
SB
2374 devid = btrfs_stack_device_id(&disk_super->dev_item);
2375 dev_uuid = disk_super->dev_item.uuid;
7239ff4b 2376 if (btrfs_fs_incompat(fs_info, METADATA_UUID))
e4319cd9 2377 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
b2598edf 2378 disk_super->metadata_uuid);
7239ff4b 2379 else
e4319cd9 2380 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
b2598edf 2381 disk_super->fsid);
7239ff4b 2382
8f32380d 2383 btrfs_release_disk_super(disk_super);
b444ad46
NB
2384 if (!device)
2385 device = ERR_PTR(-ENOENT);
7ba15b7d 2386 blkdev_put(bdev, FMODE_READ);
b444ad46 2387 return device;
7ba15b7d
SB
2388}
2389
5c5c0df0
DS
2390/*
2391 * Lookup a device given by device id, or the path if the id is 0.
2392 */
a27a94c2 2393struct btrfs_device *btrfs_find_device_by_devspec(
6e927ceb
AJ
2394 struct btrfs_fs_info *fs_info, u64 devid,
2395 const char *device_path)
24e0474b 2396{
a27a94c2 2397 struct btrfs_device *device;
24e0474b 2398
5c5c0df0 2399 if (devid) {
e4319cd9 2400 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
b2598edf 2401 NULL);
a27a94c2
NB
2402 if (!device)
2403 return ERR_PTR(-ENOENT);
6e927ceb
AJ
2404 return device;
2405 }
2406
2407 if (!device_path || !device_path[0])
2408 return ERR_PTR(-EINVAL);
2409
2410 if (strcmp(device_path, "missing") == 0) {
2411 /* Find first missing device */
2412 list_for_each_entry(device, &fs_info->fs_devices->devices,
2413 dev_list) {
2414 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2415 &device->dev_state) && !device->bdev)
2416 return device;
d95a830c 2417 }
6e927ceb 2418 return ERR_PTR(-ENOENT);
24e0474b 2419 }
6e927ceb
AJ
2420
2421 return btrfs_find_device_by_path(fs_info, device_path);
24e0474b
AJ
2422}
2423
2b82032c
YZ
2424/*
2425 * does all the dirty work required for changing file system's UUID.
2426 */
2ff7e61e 2427static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2b82032c 2428{
0b246afa 2429 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2b82032c 2430 struct btrfs_fs_devices *old_devices;
e4404d6e 2431 struct btrfs_fs_devices *seed_devices;
0b246afa 2432 struct btrfs_super_block *disk_super = fs_info->super_copy;
2b82032c
YZ
2433 struct btrfs_device *device;
2434 u64 super_flags;
2435
a32bf9a3 2436 lockdep_assert_held(&uuid_mutex);
e4404d6e 2437 if (!fs_devices->seeding)
2b82032c
YZ
2438 return -EINVAL;
2439
427c8fdd
NB
2440 /*
2441 * Private copy of the seed devices, anchored at
2442 * fs_info->fs_devices->seed_list
2443 */
7239ff4b 2444 seed_devices = alloc_fs_devices(NULL, NULL);
2208a378
ID
2445 if (IS_ERR(seed_devices))
2446 return PTR_ERR(seed_devices);
2b82032c 2447
427c8fdd
NB
2448 /*
2449 * It's necessary to retain a copy of the original seed fs_devices in
2450 * fs_uuids so that filesystems which have been seeded can successfully
2451 * reference the seed device from open_seed_devices. This also supports
2452 * multiple fs seed.
2453 */
e4404d6e
YZ
2454 old_devices = clone_fs_devices(fs_devices);
2455 if (IS_ERR(old_devices)) {
2456 kfree(seed_devices);
2457 return PTR_ERR(old_devices);
2b82032c 2458 }
e4404d6e 2459
c4babc5e 2460 list_add(&old_devices->fs_list, &fs_uuids);
2b82032c 2461
e4404d6e
YZ
2462 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2463 seed_devices->opened = 1;
2464 INIT_LIST_HEAD(&seed_devices->devices);
2465 INIT_LIST_HEAD(&seed_devices->alloc_list);
e5e9a520 2466 mutex_init(&seed_devices->device_list_mutex);
c9513edb 2467
321a4bf7 2468 mutex_lock(&fs_devices->device_list_mutex);
1f78160c
XG
2469 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2470 synchronize_rcu);
2196d6e8
MX
2471 list_for_each_entry(device, &seed_devices->devices, dev_list)
2472 device->fs_devices = seed_devices;
c9513edb 2473
0395d84f 2474 fs_devices->seeding = false;
2b82032c
YZ
2475 fs_devices->num_devices = 0;
2476 fs_devices->open_devices = 0;
69611ac8 2477 fs_devices->missing_devices = 0;
7f0432d0 2478 fs_devices->rotating = false;
944d3f9f 2479 list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2b82032c
YZ
2480
2481 generate_random_uuid(fs_devices->fsid);
7239ff4b 2482 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2b82032c 2483 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
321a4bf7 2484 mutex_unlock(&fs_devices->device_list_mutex);
f7171750 2485
2b82032c
YZ
2486 super_flags = btrfs_super_flags(disk_super) &
2487 ~BTRFS_SUPER_FLAG_SEEDING;
2488 btrfs_set_super_flags(disk_super, super_flags);
2489
2490 return 0;
2491}
2492
2493/*
01327610 2494 * Store the expected generation for seed devices in device items.
2b82032c 2495 */
5c466629 2496static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2b82032c 2497{
5c466629 2498 struct btrfs_fs_info *fs_info = trans->fs_info;
5b4aacef 2499 struct btrfs_root *root = fs_info->chunk_root;
2b82032c
YZ
2500 struct btrfs_path *path;
2501 struct extent_buffer *leaf;
2502 struct btrfs_dev_item *dev_item;
2503 struct btrfs_device *device;
2504 struct btrfs_key key;
44880fdc 2505 u8 fs_uuid[BTRFS_FSID_SIZE];
2b82032c
YZ
2506 u8 dev_uuid[BTRFS_UUID_SIZE];
2507 u64 devid;
2508 int ret;
2509
2510 path = btrfs_alloc_path();
2511 if (!path)
2512 return -ENOMEM;
2513
2b82032c
YZ
2514 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2515 key.offset = 0;
2516 key.type = BTRFS_DEV_ITEM_KEY;
2517
2518 while (1) {
2519 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2520 if (ret < 0)
2521 goto error;
2522
2523 leaf = path->nodes[0];
2524next_slot:
2525 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2526 ret = btrfs_next_leaf(root, path);
2527 if (ret > 0)
2528 break;
2529 if (ret < 0)
2530 goto error;
2531 leaf = path->nodes[0];
2532 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
b3b4aa74 2533 btrfs_release_path(path);
2b82032c
YZ
2534 continue;
2535 }
2536
2537 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2538 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2539 key.type != BTRFS_DEV_ITEM_KEY)
2540 break;
2541
2542 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2543 struct btrfs_dev_item);
2544 devid = btrfs_device_id(leaf, dev_item);
410ba3a2 2545 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2b82032c 2546 BTRFS_UUID_SIZE);
1473b24e 2547 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
44880fdc 2548 BTRFS_FSID_SIZE);
e4319cd9 2549 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
b2598edf 2550 fs_uuid);
79787eaa 2551 BUG_ON(!device); /* Logic error */
2b82032c
YZ
2552
2553 if (device->fs_devices->seeding) {
2554 btrfs_set_device_generation(leaf, dev_item,
2555 device->generation);
2556 btrfs_mark_buffer_dirty(leaf);
2557 }
2558
2559 path->slots[0]++;
2560 goto next_slot;
2561 }
2562 ret = 0;
2563error:
2564 btrfs_free_path(path);
2565 return ret;
2566}
2567
da353f6b 2568int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
788f20eb 2569{
5112febb 2570 struct btrfs_root *root = fs_info->dev_root;
d5e2003c 2571 struct request_queue *q;
788f20eb
CM
2572 struct btrfs_trans_handle *trans;
2573 struct btrfs_device *device;
2574 struct block_device *bdev;
0b246afa 2575 struct super_block *sb = fs_info->sb;
606686ee 2576 struct rcu_string *name;
5da54bc1 2577 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
39379faa
NA
2578 u64 orig_super_total_bytes;
2579 u64 orig_super_num_devices;
2b82032c 2580 int seeding_dev = 0;
788f20eb 2581 int ret = 0;
44cab9ba 2582 bool locked = false;
788f20eb 2583
5da54bc1 2584 if (sb_rdonly(sb) && !fs_devices->seeding)
f8c5d0b4 2585 return -EROFS;
788f20eb 2586
a5d16333 2587 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
0b246afa 2588 fs_info->bdev_holder);
7f59203a
JB
2589 if (IS_ERR(bdev))
2590 return PTR_ERR(bdev);
a2135011 2591
b70f5097
NA
2592 if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2593 ret = -EINVAL;
2594 goto error;
2595 }
2596
5da54bc1 2597 if (fs_devices->seeding) {
2b82032c
YZ
2598 seeding_dev = 1;
2599 down_write(&sb->s_umount);
2600 mutex_lock(&uuid_mutex);
44cab9ba 2601 locked = true;
2b82032c
YZ
2602 }
2603
b9ba017f 2604 sync_blockdev(bdev);
a2135011 2605
f4cfa9bd
NB
2606 rcu_read_lock();
2607 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
788f20eb
CM
2608 if (device->bdev == bdev) {
2609 ret = -EEXIST;
f4cfa9bd 2610 rcu_read_unlock();
2b82032c 2611 goto error;
788f20eb
CM
2612 }
2613 }
f4cfa9bd 2614 rcu_read_unlock();
788f20eb 2615
0b246afa 2616 device = btrfs_alloc_device(fs_info, NULL, NULL);
12bd2fc0 2617 if (IS_ERR(device)) {
788f20eb 2618 /* we can safely leave the fs_devices entry around */
12bd2fc0 2619 ret = PTR_ERR(device);
2b82032c 2620 goto error;
788f20eb
CM
2621 }
2622
78f2c9e6 2623 name = rcu_string_strdup(device_path, GFP_KERNEL);
606686ee 2624 if (!name) {
2b82032c 2625 ret = -ENOMEM;
5c4cf6c9 2626 goto error_free_device;
788f20eb 2627 }
606686ee 2628 rcu_assign_pointer(device->name, name);
2b82032c 2629
5b316468
NA
2630 device->fs_info = fs_info;
2631 device->bdev = bdev;
2632
2633 ret = btrfs_get_dev_zone_info(device);
2634 if (ret)
2635 goto error_free_device;
2636
a22285a6 2637 trans = btrfs_start_transaction(root, 0);
98d5dc13 2638 if (IS_ERR(trans)) {
98d5dc13 2639 ret = PTR_ERR(trans);
5b316468 2640 goto error_free_zone;
98d5dc13
TI
2641 }
2642
d5e2003c 2643 q = bdev_get_queue(bdev);
ebbede42 2644 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2b82032c 2645 device->generation = trans->transid;
0b246afa
JM
2646 device->io_width = fs_info->sectorsize;
2647 device->io_align = fs_info->sectorsize;
2648 device->sector_size = fs_info->sectorsize;
7dfb8be1
NB
2649 device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2650 fs_info->sectorsize);
2cc3c559 2651 device->disk_total_bytes = device->total_bytes;
935e5cc9 2652 device->commit_total_bytes = device->total_bytes;
e12c9621 2653 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
401e29c1 2654 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
fb01aa85 2655 device->mode = FMODE_EXCL;
27087f37 2656 device->dev_stats_valid = 1;
9f6d2510 2657 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
788f20eb 2658
2b82032c 2659 if (seeding_dev) {
a0a1db70 2660 btrfs_clear_sb_rdonly(sb);
2ff7e61e 2661 ret = btrfs_prepare_sprout(fs_info);
d31c32f6
AJ
2662 if (ret) {
2663 btrfs_abort_transaction(trans, ret);
2664 goto error_trans;
2665 }
2b82032c 2666 }
788f20eb 2667
5da54bc1 2668 device->fs_devices = fs_devices;
e5e9a520 2669
5da54bc1 2670 mutex_lock(&fs_devices->device_list_mutex);
34441361 2671 mutex_lock(&fs_info->chunk_mutex);
5da54bc1
AJ
2672 list_add_rcu(&device->dev_list, &fs_devices->devices);
2673 list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2674 fs_devices->num_devices++;
2675 fs_devices->open_devices++;
2676 fs_devices->rw_devices++;
2677 fs_devices->total_devices++;
2678 fs_devices->total_rw_bytes += device->total_bytes;
325cd4ba 2679
a5ed45f8 2680 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2bf64758 2681
e884f4f0 2682 if (!blk_queue_nonrot(q))
7f0432d0 2683 fs_devices->rotating = true;
c289811c 2684
39379faa 2685 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
0b246afa 2686 btrfs_set_super_total_bytes(fs_info->super_copy,
39379faa
NA
2687 round_down(orig_super_total_bytes + device->total_bytes,
2688 fs_info->sectorsize));
788f20eb 2689
39379faa
NA
2690 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2691 btrfs_set_super_num_devices(fs_info->super_copy,
2692 orig_super_num_devices + 1);
0d39376a 2693
2196d6e8
MX
2694 /*
2695 * we've got more storage, clear any full flags on the space
2696 * infos
2697 */
0b246afa 2698 btrfs_clear_space_info_full(fs_info);
2196d6e8 2699
34441361 2700 mutex_unlock(&fs_info->chunk_mutex);
ca10845a
JB
2701
2702 /* Add sysfs device entry */
cd36da2e 2703 btrfs_sysfs_add_device(device);
ca10845a 2704
5da54bc1 2705 mutex_unlock(&fs_devices->device_list_mutex);
788f20eb 2706
2b82032c 2707 if (seeding_dev) {
34441361 2708 mutex_lock(&fs_info->chunk_mutex);
6f8e0fc7 2709 ret = init_first_rw_device(trans);
34441361 2710 mutex_unlock(&fs_info->chunk_mutex);
005d6427 2711 if (ret) {
66642832 2712 btrfs_abort_transaction(trans, ret);
d31c32f6 2713 goto error_sysfs;
005d6427 2714 }
2196d6e8
MX
2715 }
2716
8e87e856 2717 ret = btrfs_add_dev_item(trans, device);
2196d6e8 2718 if (ret) {
66642832 2719 btrfs_abort_transaction(trans, ret);
d31c32f6 2720 goto error_sysfs;
2196d6e8
MX
2721 }
2722
2723 if (seeding_dev) {
5c466629 2724 ret = btrfs_finish_sprout(trans);
005d6427 2725 if (ret) {
66642832 2726 btrfs_abort_transaction(trans, ret);
d31c32f6 2727 goto error_sysfs;
005d6427 2728 }
b2373f25 2729
8e560081
NB
2730 /*
2731 * fs_devices now represents the newly sprouted filesystem and
2732 * its fsid has been changed by btrfs_prepare_sprout
2733 */
2734 btrfs_sysfs_update_sprout_fsid(fs_devices);
2b82032c
YZ
2735 }
2736
3a45bb20 2737 ret = btrfs_commit_transaction(trans);
a2135011 2738
2b82032c
YZ
2739 if (seeding_dev) {
2740 mutex_unlock(&uuid_mutex);
2741 up_write(&sb->s_umount);
44cab9ba 2742 locked = false;
788f20eb 2743
79787eaa
JM
2744 if (ret) /* transaction commit */
2745 return ret;
2746
2ff7e61e 2747 ret = btrfs_relocate_sys_chunks(fs_info);
79787eaa 2748 if (ret < 0)
0b246afa 2749 btrfs_handle_fs_error(fs_info, ret,
5d163e0e 2750 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
671415b7
MX
2751 trans = btrfs_attach_transaction(root);
2752 if (IS_ERR(trans)) {
2753 if (PTR_ERR(trans) == -ENOENT)
2754 return 0;
7132a262
AJ
2755 ret = PTR_ERR(trans);
2756 trans = NULL;
2757 goto error_sysfs;
671415b7 2758 }
3a45bb20 2759 ret = btrfs_commit_transaction(trans);
2b82032c 2760 }
c9e9f97b 2761
7f551d96
AJ
2762 /*
2763 * Now that we have written a new super block to this device, check all
2764 * other fs_devices list if device_path alienates any other scanned
2765 * device.
2766 * We can ignore the return value as it typically returns -EINVAL and
2767 * only succeeds if the device was an alien.
2768 */
2769 btrfs_forget_devices(device_path);
2770
2771 /* Update ctime/mtime for blkid or udev */
5a1972bd 2772 update_dev_time(device_path);
7f551d96 2773
2b82032c 2774 return ret;
79787eaa 2775
d31c32f6 2776error_sysfs:
53f8a74c 2777 btrfs_sysfs_remove_device(device);
39379faa
NA
2778 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2779 mutex_lock(&fs_info->chunk_mutex);
2780 list_del_rcu(&device->dev_list);
2781 list_del(&device->dev_alloc_list);
2782 fs_info->fs_devices->num_devices--;
2783 fs_info->fs_devices->open_devices--;
2784 fs_info->fs_devices->rw_devices--;
2785 fs_info->fs_devices->total_devices--;
2786 fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2787 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2788 btrfs_set_super_total_bytes(fs_info->super_copy,
2789 orig_super_total_bytes);
2790 btrfs_set_super_num_devices(fs_info->super_copy,
2791 orig_super_num_devices);
2792 mutex_unlock(&fs_info->chunk_mutex);
2793 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
79787eaa 2794error_trans:
0af2c4bf 2795 if (seeding_dev)
a0a1db70 2796 btrfs_set_sb_rdonly(sb);
7132a262
AJ
2797 if (trans)
2798 btrfs_end_transaction(trans);
5b316468
NA
2799error_free_zone:
2800 btrfs_destroy_dev_zone_info(device);
5c4cf6c9 2801error_free_device:
a425f9d4 2802 btrfs_free_device(device);
2b82032c 2803error:
e525fd89 2804 blkdev_put(bdev, FMODE_EXCL);
44cab9ba 2805 if (locked) {
2b82032c
YZ
2806 mutex_unlock(&uuid_mutex);
2807 up_write(&sb->s_umount);
2808 }
c9e9f97b 2809 return ret;
788f20eb
CM
2810}
2811
d397712b
CM
2812static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2813 struct btrfs_device *device)
0b86a832
CM
2814{
2815 int ret;
2816 struct btrfs_path *path;
0b246afa 2817 struct btrfs_root *root = device->fs_info->chunk_root;
0b86a832
CM
2818 struct btrfs_dev_item *dev_item;
2819 struct extent_buffer *leaf;
2820 struct btrfs_key key;
2821
0b86a832
CM
2822 path = btrfs_alloc_path();
2823 if (!path)
2824 return -ENOMEM;
2825
2826 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2827 key.type = BTRFS_DEV_ITEM_KEY;
2828 key.offset = device->devid;
2829
2830 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2831 if (ret < 0)
2832 goto out;
2833
2834 if (ret > 0) {
2835 ret = -ENOENT;
2836 goto out;
2837 }
2838
2839 leaf = path->nodes[0];
2840 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2841
2842 btrfs_set_device_id(leaf, dev_item, device->devid);
2843 btrfs_set_device_type(leaf, dev_item, device->type);
2844 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2845 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2846 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
7cc8e58d
MX
2847 btrfs_set_device_total_bytes(leaf, dev_item,
2848 btrfs_device_get_disk_total_bytes(device));
2849 btrfs_set_device_bytes_used(leaf, dev_item,
2850 btrfs_device_get_bytes_used(device));
0b86a832
CM
2851 btrfs_mark_buffer_dirty(leaf);
2852
2853out:
2854 btrfs_free_path(path);
2855 return ret;
2856}
2857
2196d6e8 2858int btrfs_grow_device(struct btrfs_trans_handle *trans,
8f18cf13
CM
2859 struct btrfs_device *device, u64 new_size)
2860{
0b246afa
JM
2861 struct btrfs_fs_info *fs_info = device->fs_info;
2862 struct btrfs_super_block *super_copy = fs_info->super_copy;
2196d6e8
MX
2863 u64 old_total;
2864 u64 diff;
8f18cf13 2865
ebbede42 2866 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2b82032c 2867 return -EACCES;
2196d6e8 2868
7dfb8be1
NB
2869 new_size = round_down(new_size, fs_info->sectorsize);
2870
34441361 2871 mutex_lock(&fs_info->chunk_mutex);
2196d6e8 2872 old_total = btrfs_super_total_bytes(super_copy);
0e4324a4 2873 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2196d6e8 2874
63a212ab 2875 if (new_size <= device->total_bytes ||
401e29c1 2876 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
34441361 2877 mutex_unlock(&fs_info->chunk_mutex);
2b82032c 2878 return -EINVAL;
2196d6e8 2879 }
2b82032c 2880
7dfb8be1
NB
2881 btrfs_set_super_total_bytes(super_copy,
2882 round_down(old_total + diff, fs_info->sectorsize));
2b82032c
YZ
2883 device->fs_devices->total_rw_bytes += diff;
2884
7cc8e58d
MX
2885 btrfs_device_set_total_bytes(device, new_size);
2886 btrfs_device_set_disk_total_bytes(device, new_size);
fb456252 2887 btrfs_clear_space_info_full(device->fs_info);
bbbf7243
NB
2888 if (list_empty(&device->post_commit_list))
2889 list_add_tail(&device->post_commit_list,
2890 &trans->transaction->dev_update_list);
34441361 2891 mutex_unlock(&fs_info->chunk_mutex);
4184ea7f 2892
8f18cf13
CM
2893 return btrfs_update_device(trans, device);
2894}
2895
f4208794 2896static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
8f18cf13 2897{
f4208794 2898 struct btrfs_fs_info *fs_info = trans->fs_info;
5b4aacef 2899 struct btrfs_root *root = fs_info->chunk_root;
8f18cf13
CM
2900 int ret;
2901 struct btrfs_path *path;
2902 struct btrfs_key key;
2903
8f18cf13
CM
2904 path = btrfs_alloc_path();
2905 if (!path)
2906 return -ENOMEM;
2907
408fbf19 2908 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
8f18cf13
CM
2909 key.offset = chunk_offset;
2910 key.type = BTRFS_CHUNK_ITEM_KEY;
2911
2912 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
79787eaa
JM
2913 if (ret < 0)
2914 goto out;
2915 else if (ret > 0) { /* Logic error or corruption */
0b246afa
JM
2916 btrfs_handle_fs_error(fs_info, -ENOENT,
2917 "Failed lookup while freeing chunk.");
79787eaa
JM
2918 ret = -ENOENT;
2919 goto out;
2920 }
8f18cf13
CM
2921
2922 ret = btrfs_del_item(trans, root, path);
79787eaa 2923 if (ret < 0)
0b246afa
JM
2924 btrfs_handle_fs_error(fs_info, ret,
2925 "Failed to delete chunk item.");
79787eaa 2926out:
8f18cf13 2927 btrfs_free_path(path);
65a246c5 2928 return ret;
8f18cf13
CM
2929}
2930
408fbf19 2931static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
8f18cf13 2932{
0b246afa 2933 struct btrfs_super_block *super_copy = fs_info->super_copy;
8f18cf13
CM
2934 struct btrfs_disk_key *disk_key;
2935 struct btrfs_chunk *chunk;
2936 u8 *ptr;
2937 int ret = 0;
2938 u32 num_stripes;
2939 u32 array_size;
2940 u32 len = 0;
2941 u32 cur;
2942 struct btrfs_key key;
2943
34441361 2944 mutex_lock(&fs_info->chunk_mutex);
8f18cf13
CM
2945 array_size = btrfs_super_sys_array_size(super_copy);
2946
2947 ptr = super_copy->sys_chunk_array;
2948 cur = 0;
2949
2950 while (cur < array_size) {
2951 disk_key = (struct btrfs_disk_key *)ptr;
2952 btrfs_disk_key_to_cpu(&key, disk_key);
2953
2954 len = sizeof(*disk_key);
2955
2956 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2957 chunk = (struct btrfs_chunk *)(ptr + len);
2958 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2959 len += btrfs_chunk_item_size(num_stripes);
2960 } else {
2961 ret = -EIO;
2962 break;
2963 }
408fbf19 2964 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
8f18cf13
CM
2965 key.offset == chunk_offset) {
2966 memmove(ptr, ptr + len, array_size - (cur + len));
2967 array_size -= len;
2968 btrfs_set_super_sys_array_size(super_copy, array_size);
2969 } else {
2970 ptr += len;
2971 cur += len;
2972 }
2973 }
34441361 2974 mutex_unlock(&fs_info->chunk_mutex);
8f18cf13
CM
2975 return ret;
2976}
2977
60ca842e
OS
2978/*
2979 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2980 * @logical: Logical block offset in bytes.
2981 * @length: Length of extent in bytes.
2982 *
2983 * Return: Chunk mapping or ERR_PTR.
2984 */
2985struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2986 u64 logical, u64 length)
592d92ee
LB
2987{
2988 struct extent_map_tree *em_tree;
2989 struct extent_map *em;
2990
c8bf1b67 2991 em_tree = &fs_info->mapping_tree;
592d92ee
LB
2992 read_lock(&em_tree->lock);
2993 em = lookup_extent_mapping(em_tree, logical, length);
2994 read_unlock(&em_tree->lock);
2995
2996 if (!em) {
2997 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2998 logical, length);
2999 return ERR_PTR(-EINVAL);
3000 }
3001
3002 if (em->start > logical || em->start + em->len < logical) {
3003 btrfs_crit(fs_info,
3004 "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
3005 logical, length, em->start, em->start + em->len);
3006 free_extent_map(em);
3007 return ERR_PTR(-EINVAL);
3008 }
3009
3010 /* callers are responsible for dropping em's ref. */
3011 return em;
3012}
3013
97aff912 3014int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
8f18cf13 3015{
97aff912 3016 struct btrfs_fs_info *fs_info = trans->fs_info;
8f18cf13
CM
3017 struct extent_map *em;
3018 struct map_lookup *map;
2196d6e8 3019 u64 dev_extent_len = 0;
47ab2a6c 3020 int i, ret = 0;
0b246afa 3021 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
8f18cf13 3022
60ca842e 3023 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
592d92ee 3024 if (IS_ERR(em)) {
47ab2a6c
JB
3025 /*
3026 * This is a logic error, but we don't want to just rely on the
bb7ab3b9 3027 * user having built with ASSERT enabled, so if ASSERT doesn't
47ab2a6c
JB
3028 * do anything we still error out.
3029 */
3030 ASSERT(0);
592d92ee 3031 return PTR_ERR(em);
47ab2a6c 3032 }
95617d69 3033 map = em->map_lookup;
34441361 3034 mutex_lock(&fs_info->chunk_mutex);
451a2c13 3035 check_system_chunk(trans, map->type);
34441361 3036 mutex_unlock(&fs_info->chunk_mutex);
8f18cf13 3037
57ba4cb8
FM
3038 /*
3039 * Take the device list mutex to prevent races with the final phase of
3040 * a device replace operation that replaces the device object associated
3041 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
3042 */
3043 mutex_lock(&fs_devices->device_list_mutex);
8f18cf13 3044 for (i = 0; i < map->num_stripes; i++) {
47ab2a6c 3045 struct btrfs_device *device = map->stripes[i].dev;
2196d6e8
MX
3046 ret = btrfs_free_dev_extent(trans, device,
3047 map->stripes[i].physical,
3048 &dev_extent_len);
47ab2a6c 3049 if (ret) {
57ba4cb8 3050 mutex_unlock(&fs_devices->device_list_mutex);
66642832 3051 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
3052 goto out;
3053 }
a061fc8d 3054
2196d6e8 3055 if (device->bytes_used > 0) {
34441361 3056 mutex_lock(&fs_info->chunk_mutex);
2196d6e8
MX
3057 btrfs_device_set_bytes_used(device,
3058 device->bytes_used - dev_extent_len);
a5ed45f8 3059 atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
0b246afa 3060 btrfs_clear_space_info_full(fs_info);
34441361 3061 mutex_unlock(&fs_info->chunk_mutex);
2196d6e8 3062 }
a061fc8d 3063
64bc6c2a
NB
3064 ret = btrfs_update_device(trans, device);
3065 if (ret) {
3066 mutex_unlock(&fs_devices->device_list_mutex);
3067 btrfs_abort_transaction(trans, ret);
3068 goto out;
dfe25020 3069 }
8f18cf13 3070 }
57ba4cb8
FM
3071 mutex_unlock(&fs_devices->device_list_mutex);
3072
f4208794 3073 ret = btrfs_free_chunk(trans, chunk_offset);
47ab2a6c 3074 if (ret) {
66642832 3075 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
3076 goto out;
3077 }
8f18cf13 3078
6bccf3ab 3079 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
1abe9b8a 3080
8f18cf13 3081 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
408fbf19 3082 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
47ab2a6c 3083 if (ret) {
66642832 3084 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
3085 goto out;
3086 }
8f18cf13
CM
3087 }
3088
5a98ec01 3089 ret = btrfs_remove_block_group(trans, chunk_offset, em);
47ab2a6c 3090 if (ret) {
66642832 3091 btrfs_abort_transaction(trans, ret);
47ab2a6c
JB
3092 goto out;
3093 }
2b82032c 3094
47ab2a6c 3095out:
2b82032c
YZ
3096 /* once for us */
3097 free_extent_map(em);
47ab2a6c
JB
3098 return ret;
3099}
2b82032c 3100
5b4aacef 3101static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
47ab2a6c 3102{
5b4aacef 3103 struct btrfs_root *root = fs_info->chunk_root;
19c4d2f9 3104 struct btrfs_trans_handle *trans;
b0643e59 3105 struct btrfs_block_group *block_group;
47ab2a6c 3106 int ret;
2b82032c 3107
67c5e7d4
FM
3108 /*
3109 * Prevent races with automatic removal of unused block groups.
3110 * After we relocate and before we remove the chunk with offset
3111 * chunk_offset, automatic removal of the block group can kick in,
3112 * resulting in a failure when calling btrfs_remove_chunk() below.
3113 *
3114 * Make sure to acquire this mutex before doing a tree search (dev
3115 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3116 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3117 * we release the path used to search the chunk/dev tree and before
3118 * the current task acquires this mutex and calls us.
3119 */
a32bf9a3 3120 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
67c5e7d4 3121
47ab2a6c 3122 /* step one, relocate all the extents inside this chunk */
2ff7e61e 3123 btrfs_scrub_pause(fs_info);
0b246afa 3124 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2ff7e61e 3125 btrfs_scrub_continue(fs_info);
47ab2a6c
JB
3126 if (ret)
3127 return ret;
3128
b0643e59
DZ
3129 block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3130 if (!block_group)
3131 return -ENOENT;
3132 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3133 btrfs_put_block_group(block_group);
3134
19c4d2f9
CM
3135 trans = btrfs_start_trans_remove_block_group(root->fs_info,
3136 chunk_offset);
3137 if (IS_ERR(trans)) {
3138 ret = PTR_ERR(trans);
3139 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3140 return ret;
3141 }
3142
47ab2a6c 3143 /*
19c4d2f9
CM
3144 * step two, delete the device extents and the
3145 * chunk tree entries
47ab2a6c 3146 */
97aff912 3147 ret = btrfs_remove_chunk(trans, chunk_offset);
3a45bb20 3148 btrfs_end_transaction(trans);
19c4d2f9 3149 return ret;
2b82032c
YZ
3150}
3151
2ff7e61e 3152static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2b82032c 3153{
0b246afa 3154 struct btrfs_root *chunk_root = fs_info->chunk_root;
2b82032c
YZ
3155 struct btrfs_path *path;
3156 struct extent_buffer *leaf;
3157 struct btrfs_chunk *chunk;
3158 struct btrfs_key key;
3159 struct btrfs_key found_key;
2b82032c 3160 u64 chunk_type;
ba1bf481
JB
3161 bool retried = false;
3162 int failed = 0;
2b82032c
YZ
3163 int ret;
3164
3165 path = btrfs_alloc_path();
3166 if (!path)
3167 return -ENOMEM;
3168
ba1bf481 3169again:
2b82032c
YZ
3170 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3171 key.offset = (u64)-1;
3172 key.type = BTRFS_CHUNK_ITEM_KEY;
3173
3174 while (1) {
0b246afa 3175 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2b82032c 3176 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
67c5e7d4 3177 if (ret < 0) {
0b246afa 3178 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2b82032c 3179 goto error;
67c5e7d4 3180 }
79787eaa 3181 BUG_ON(ret == 0); /* Corruption */
2b82032c
YZ
3182
3183 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3184 key.type);
67c5e7d4 3185 if (ret)
0b246afa 3186 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2b82032c
YZ
3187 if (ret < 0)
3188 goto error;
3189 if (ret > 0)
3190 break;
1a40e23b 3191
2b82032c
YZ
3192 leaf = path->nodes[0];
3193 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1a40e23b 3194
2b82032c
YZ
3195 chunk = btrfs_item_ptr(leaf, path->slots[0],
3196 struct btrfs_chunk);
3197 chunk_type = btrfs_chunk_type(leaf, chunk);
b3b4aa74 3198 btrfs_release_path(path);
8f18cf13 3199
2b82032c 3200 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
0b246afa 3201 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
ba1bf481
JB
3202 if (ret == -ENOSPC)
3203 failed++;
14586651
HS
3204 else
3205 BUG_ON(ret);
2b82032c 3206 }
0b246afa 3207 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
8f18cf13 3208
2b82032c
YZ
3209 if (found_key.offset == 0)
3210 break;
3211 key.offset = found_key.offset - 1;
3212 }
3213 ret = 0;
ba1bf481
JB
3214 if (failed && !retried) {
3215 failed = 0;
3216 retried = true;
3217 goto again;
fae7f21c 3218 } else if (WARN_ON(failed && retried)) {
ba1bf481
JB
3219 ret = -ENOSPC;
3220 }
2b82032c
YZ
3221error:
3222 btrfs_free_path(path);
3223 return ret;
8f18cf13
CM
3224}
3225
a6f93c71
LB
3226/*
3227 * return 1 : allocate a data chunk successfully,
3228 * return <0: errors during allocating a data chunk,
3229 * return 0 : no need to allocate a data chunk.
3230 */
3231static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3232 u64 chunk_offset)
3233{
32da5386 3234 struct btrfs_block_group *cache;
a6f93c71
LB
3235 u64 bytes_used;
3236 u64 chunk_type;
3237
3238 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3239 ASSERT(cache);
3240 chunk_type = cache->flags;
3241 btrfs_put_block_group(cache);
3242
5ae21692
JT
3243 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3244 return 0;
3245
3246 spin_lock(&fs_info->data_sinfo->lock);
3247 bytes_used = fs_info->data_sinfo->bytes_used;
3248 spin_unlock(&fs_info->data_sinfo->lock);
3249
3250 if (!bytes_used) {
3251 struct btrfs_trans_handle *trans;
3252 int ret;
3253
3254 trans = btrfs_join_transaction(fs_info->tree_root);
3255 if (IS_ERR(trans))
3256 return PTR_ERR(trans);
3257
3258 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3259 btrfs_end_transaction(trans);
3260 if (ret < 0)
3261 return ret;
3262 return 1;
a6f93c71 3263 }
5ae21692 3264
a6f93c71
LB
3265 return 0;
3266}
3267
6bccf3ab 3268static int insert_balance_item(struct btrfs_fs_info *fs_info,
0940ebf6
ID
3269 struct btrfs_balance_control *bctl)
3270{
6bccf3ab 3271 struct btrfs_root *root = fs_info->tree_root;
0940ebf6
ID
3272 struct btrfs_trans_handle *trans;
3273 struct btrfs_balance_item *item;
3274 struct btrfs_disk_balance_args disk_bargs;
3275 struct btrfs_path *path;
3276 struct extent_buffer *leaf;
3277 struct btrfs_key key;
3278 int ret, err;
3279
3280 path = btrfs_alloc_path();
3281 if (!path)
3282 return -ENOMEM;
3283
3284 trans = btrfs_start_transaction(root, 0);
3285 if (IS_ERR(trans)) {
3286 btrfs_free_path(path);
3287 return PTR_ERR(trans);
3288 }
3289
3290 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 3291 key.type = BTRFS_TEMPORARY_ITEM_KEY;
0940ebf6
ID
3292 key.offset = 0;
3293
3294 ret = btrfs_insert_empty_item(trans, root, path, &key,
3295 sizeof(*item));
3296 if (ret)
3297 goto out;
3298
3299 leaf = path->nodes[0];
3300 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3301
b159fa28 3302 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
0940ebf6
ID
3303
3304 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3305 btrfs_set_balance_data(leaf, item, &disk_bargs);
3306 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3307 btrfs_set_balance_meta(leaf, item, &disk_bargs);
3308 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3309 btrfs_set_balance_sys(leaf, item, &disk_bargs);
3310
3311 btrfs_set_balance_flags(leaf, item, bctl->flags);
3312
3313 btrfs_mark_buffer_dirty(leaf);
3314out:
3315 btrfs_free_path(path);
3a45bb20 3316 err = btrfs_commit_transaction(trans);
0940ebf6
ID
3317 if (err && !ret)
3318 ret = err;
3319 return ret;
3320}
3321
6bccf3ab 3322static int del_balance_item(struct btrfs_fs_info *fs_info)
0940ebf6 3323{
6bccf3ab 3324 struct btrfs_root *root = fs_info->tree_root;
0940ebf6
ID
3325 struct btrfs_trans_handle *trans;
3326 struct btrfs_path *path;
3327 struct btrfs_key key;
3328 int ret, err;
3329
3330 path = btrfs_alloc_path();
3331 if (!path)
3332 return -ENOMEM;
3333
3502a8c0 3334 trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
0940ebf6
ID
3335 if (IS_ERR(trans)) {
3336 btrfs_free_path(path);
3337 return PTR_ERR(trans);
3338 }
3339
3340 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 3341 key.type = BTRFS_TEMPORARY_ITEM_KEY;
0940ebf6
ID
3342 key.offset = 0;
3343
3344 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3345 if (ret < 0)
3346 goto out;
3347 if (ret > 0) {
3348 ret = -ENOENT;
3349 goto out;
3350 }
3351
3352 ret = btrfs_del_item(trans, root, path);
3353out:
3354 btrfs_free_path(path);
3a45bb20 3355 err = btrfs_commit_transaction(trans);
0940ebf6
ID
3356 if (err && !ret)
3357 ret = err;
3358 return ret;
3359}
3360
59641015
ID
3361/*
3362 * This is a heuristic used to reduce the number of chunks balanced on
3363 * resume after balance was interrupted.
3364 */
3365static void update_balance_args(struct btrfs_balance_control *bctl)
3366{
3367 /*
3368 * Turn on soft mode for chunk types that were being converted.
3369 */
3370 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3371 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3372 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3373 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3374 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3375 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3376
3377 /*
3378 * Turn on usage filter if is not already used. The idea is
3379 * that chunks that we have already balanced should be
3380 * reasonably full. Don't do it for chunks that are being
3381 * converted - that will keep us from relocating unconverted
3382 * (albeit full) chunks.
3383 */
3384 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3385 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3386 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3387 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3388 bctl->data.usage = 90;
3389 }
3390 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3391 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3392 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3393 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3394 bctl->sys.usage = 90;
3395 }
3396 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3397 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3398 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3399 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3400 bctl->meta.usage = 90;
3401 }
3402}
3403
149196a2
DS
3404/*
3405 * Clear the balance status in fs_info and delete the balance item from disk.
3406 */
3407static void reset_balance_state(struct btrfs_fs_info *fs_info)
c9e9f97b
ID
3408{
3409 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
149196a2 3410 int ret;
c9e9f97b
ID
3411
3412 BUG_ON(!fs_info->balance_ctl);
3413
3414 spin_lock(&fs_info->balance_lock);
3415 fs_info->balance_ctl = NULL;
3416 spin_unlock(&fs_info->balance_lock);
3417
3418 kfree(bctl);
149196a2
DS
3419 ret = del_balance_item(fs_info);
3420 if (ret)
3421 btrfs_handle_fs_error(fs_info, ret, NULL);
c9e9f97b
ID
3422}
3423
ed25e9b2
ID
3424/*
3425 * Balance filters. Return 1 if chunk should be filtered out
3426 * (should not be balanced).
3427 */
899c81ea 3428static int chunk_profiles_filter(u64 chunk_type,
ed25e9b2
ID
3429 struct btrfs_balance_args *bargs)
3430{
899c81ea
ID
3431 chunk_type = chunk_to_extended(chunk_type) &
3432 BTRFS_EXTENDED_PROFILE_MASK;
ed25e9b2 3433
899c81ea 3434 if (bargs->profiles & chunk_type)
ed25e9b2
ID
3435 return 0;
3436
3437 return 1;
3438}
3439
dba72cb3 3440static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
5ce5b3c0 3441 struct btrfs_balance_args *bargs)
bc309467 3442{
32da5386 3443 struct btrfs_block_group *cache;
bc309467
DS
3444 u64 chunk_used;
3445 u64 user_thresh_min;
3446 u64 user_thresh_max;
3447 int ret = 1;
3448
3449 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
bf38be65 3450 chunk_used = cache->used;
bc309467
DS
3451
3452 if (bargs->usage_min == 0)
3453 user_thresh_min = 0;
3454 else
b3470b5d
DS
3455 user_thresh_min = div_factor_fine(cache->length,
3456 bargs->usage_min);
bc309467
DS
3457
3458 if (bargs->usage_max == 0)
3459 user_thresh_max = 1;
3460 else if (bargs->usage_max > 100)
b3470b5d 3461 user_thresh_max = cache->length;
bc309467 3462 else
b3470b5d
DS
3463 user_thresh_max = div_factor_fine(cache->length,
3464 bargs->usage_max);
bc309467
DS
3465
3466 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3467 ret = 0;
3468
3469 btrfs_put_block_group(cache);
3470 return ret;
3471}
3472
dba72cb3 3473static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
bc309467 3474 u64 chunk_offset, struct btrfs_balance_args *bargs)
5ce5b3c0 3475{
32da5386 3476 struct btrfs_block_group *cache;
5ce5b3c0
ID
3477 u64 chunk_used, user_thresh;
3478 int ret = 1;
3479
3480 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
bf38be65 3481 chunk_used = cache->used;
5ce5b3c0 3482
bc309467 3483 if (bargs->usage_min == 0)
3e39cea6 3484 user_thresh = 1;
a105bb88 3485 else if (bargs->usage > 100)
b3470b5d 3486 user_thresh = cache->length;
a105bb88 3487 else
b3470b5d 3488 user_thresh = div_factor_fine(cache->length, bargs->usage);
a105bb88 3489
5ce5b3c0
ID
3490 if (chunk_used < user_thresh)
3491 ret = 0;
3492
3493 btrfs_put_block_group(cache);
3494 return ret;
3495}
3496
409d404b
ID
3497static int chunk_devid_filter(struct extent_buffer *leaf,
3498 struct btrfs_chunk *chunk,
3499 struct btrfs_balance_args *bargs)
3500{
3501 struct btrfs_stripe *stripe;
3502 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3503 int i;
3504
3505 for (i = 0; i < num_stripes; i++) {
3506 stripe = btrfs_stripe_nr(chunk, i);
3507 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3508 return 0;
3509 }
3510
3511 return 1;
3512}
3513
946c9256
DS
3514static u64 calc_data_stripes(u64 type, int num_stripes)
3515{
3516 const int index = btrfs_bg_flags_to_raid_index(type);
3517 const int ncopies = btrfs_raid_array[index].ncopies;
3518 const int nparity = btrfs_raid_array[index].nparity;
3519
3520 if (nparity)
3521 return num_stripes - nparity;
3522 else
3523 return num_stripes / ncopies;
3524}
3525
94e60d5a
ID
3526/* [pstart, pend) */
3527static int chunk_drange_filter(struct extent_buffer *leaf,
3528 struct btrfs_chunk *chunk,
94e60d5a
ID
3529 struct btrfs_balance_args *bargs)
3530{
3531 struct btrfs_stripe *stripe;
3532 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3533 u64 stripe_offset;
3534 u64 stripe_length;
946c9256 3535 u64 type;
94e60d5a
ID
3536 int factor;
3537 int i;
3538
3539 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3540 return 0;
3541
946c9256
DS
3542 type = btrfs_chunk_type(leaf, chunk);
3543 factor = calc_data_stripes(type, num_stripes);
94e60d5a
ID
3544
3545 for (i = 0; i < num_stripes; i++) {
3546 stripe = btrfs_stripe_nr(chunk, i);
3547 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3548 continue;
3549
3550 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3551 stripe_length = btrfs_chunk_length(leaf, chunk);
b8b93add 3552 stripe_length = div_u64(stripe_length, factor);
94e60d5a
ID
3553
3554 if (stripe_offset < bargs->pend &&
3555 stripe_offset + stripe_length > bargs->pstart)
3556 return 0;
3557 }
3558
3559 return 1;
3560}
3561
ea67176a
ID
3562/* [vstart, vend) */
3563static int chunk_vrange_filter(struct extent_buffer *leaf,
3564 struct btrfs_chunk *chunk,
3565 u64 chunk_offset,
3566 struct btrfs_balance_args *bargs)
3567{
3568 if (chunk_offset < bargs->vend &&
3569 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3570 /* at least part of the chunk is inside this vrange */
3571 return 0;
3572
3573 return 1;
3574}
3575
dee32d0a
GAP
3576static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3577 struct btrfs_chunk *chunk,
3578 struct btrfs_balance_args *bargs)
3579{
3580 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3581
3582 if (bargs->stripes_min <= num_stripes
3583 && num_stripes <= bargs->stripes_max)
3584 return 0;
3585
3586 return 1;
3587}
3588
899c81ea 3589static int chunk_soft_convert_filter(u64 chunk_type,
cfa4c961
ID
3590 struct btrfs_balance_args *bargs)
3591{
3592 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3593 return 0;
3594
899c81ea
ID
3595 chunk_type = chunk_to_extended(chunk_type) &
3596 BTRFS_EXTENDED_PROFILE_MASK;
cfa4c961 3597
899c81ea 3598 if (bargs->target == chunk_type)
cfa4c961
ID
3599 return 1;
3600
3601 return 0;
3602}
3603
6ec0896c 3604static int should_balance_chunk(struct extent_buffer *leaf,
f43ffb60
ID
3605 struct btrfs_chunk *chunk, u64 chunk_offset)
3606{
6ec0896c 3607 struct btrfs_fs_info *fs_info = leaf->fs_info;
0b246afa 3608 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
f43ffb60
ID
3609 struct btrfs_balance_args *bargs = NULL;
3610 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3611
3612 /* type filter */
3613 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3614 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3615 return 0;
3616 }
3617
3618 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3619 bargs = &bctl->data;
3620 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3621 bargs = &bctl->sys;
3622 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3623 bargs = &bctl->meta;
3624
ed25e9b2
ID
3625 /* profiles filter */
3626 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3627 chunk_profiles_filter(chunk_type, bargs)) {
3628 return 0;
5ce5b3c0
ID
3629 }
3630
3631 /* usage filter */
3632 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
0b246afa 3633 chunk_usage_filter(fs_info, chunk_offset, bargs)) {
5ce5b3c0 3634 return 0;
bc309467 3635 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
0b246afa 3636 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
bc309467 3637 return 0;
409d404b
ID
3638 }
3639
3640 /* devid filter */
3641 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3642 chunk_devid_filter(leaf, chunk, bargs)) {
3643 return 0;
94e60d5a
ID
3644 }
3645
3646 /* drange filter, makes sense only with devid filter */
3647 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
e4ff5fb5 3648 chunk_drange_filter(leaf, chunk, bargs)) {
94e60d5a 3649 return 0;
ea67176a
ID
3650 }
3651
3652 /* vrange filter */
3653 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3654 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3655 return 0;
ed25e9b2
ID
3656 }
3657
dee32d0a
GAP
3658 /* stripes filter */
3659 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3660 chunk_stripes_range_filter(leaf, chunk, bargs)) {
3661 return 0;
3662 }
3663
cfa4c961
ID
3664 /* soft profile changing mode */
3665 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3666 chunk_soft_convert_filter(chunk_type, bargs)) {
3667 return 0;
3668 }
3669
7d824b6f
DS
3670 /*
3671 * limited by count, must be the last filter
3672 */
3673 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3674 if (bargs->limit == 0)
3675 return 0;
3676 else
3677 bargs->limit--;
12907fc7
DS
3678 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3679 /*
3680 * Same logic as the 'limit' filter; the minimum cannot be
01327610 3681 * determined here because we do not have the global information
12907fc7
DS
3682 * about the count of all chunks that satisfy the filters.
3683 */
3684 if (bargs->limit_max == 0)
3685 return 0;
3686 else
3687 bargs->limit_max--;
7d824b6f
DS
3688 }
3689
f43ffb60
ID
3690 return 1;
3691}
3692
/*
 * Run the balance described by fs_info->balance_ctl over the chunk tree.
 *
 * Chunk items are visited from the highest offset downwards, in two passes:
 * a counting pass (counting == true) that only tallies the chunks accepted by
 * should_balance_chunk(), then a second pass that relocates them.
 *
 * Returns 0 on success or a negative errno: -ECANCELED when a pause or cancel
 * request is observed, -ENOSPC when at least one relocation ran out of space
 * and the final status would otherwise be 0, or the error returned by the
 * helper that failed.
 */
c9e9f97b 3693static int __btrfs_balance(struct btrfs_fs_info *fs_info)
ec44a35c 3694{
19a39dce 3695 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
c9e9f97b 3696 struct btrfs_root *chunk_root = fs_info->chunk_root;
12907fc7 3697 u64 chunk_type;
f43ffb60 3698 struct btrfs_chunk *chunk;
5a488b9d 3699 struct btrfs_path *path = NULL;
ec44a35c 3700 struct btrfs_key key;
ec44a35c 3701 struct btrfs_key found_key;
f43ffb60
ID
3702 struct extent_buffer *leaf;
3703 int slot;
c9e9f97b
ID
3704 int ret;
3705 int enospc_errors = 0;
19a39dce 3706 bool counting = true;
12907fc7 3707 /* The single value limit and min/max limits use the same bytes (presumably a union in struct btrfs_balance_args - TODO confirm), so the u64 copies saved below cover both forms */
7d824b6f
DS
3708 u64 limit_data = bctl->data.limit;
3709 u64 limit_meta = bctl->meta.limit;
3710 u64 limit_sys = bctl->sys.limit;
12907fc7
DS
/* Per-type number of chunks accepted during the counting pass */
3711 u32 count_data = 0;
3712 u32 count_meta = 0;
3713 u32 count_sys = 0;
/* Set once btrfs_may_alloc_data_chunk() returned 1, so it is not called again */
2c9fe835 3714 int chunk_reserved = 0;
ec44a35c 3715
ec44a35c 3716 path = btrfs_alloc_path();
17e9f796
MF
3717 if (!path) {
3718 ret = -ENOMEM;
3719 goto error;
3720 }
19a39dce
ID
3721
3722 /* zero out stat counters */
3723 spin_lock(&fs_info->balance_lock);
3724 memset(&bctl->stat, 0, sizeof(bctl->stat));
3725 spin_unlock(&fs_info->balance_lock);
/* Entry point of each pass; the second pass jumps back here with counting == false */
3726again:
7d824b6f 3727 if (!counting) {
12907fc7
DS
3728 /*
3729 * The single value limit and min/max limits use the same bytes
3730 * (presumably a union - TODO confirm). Put back the values saved
 * at function entry before the relocation pass; NOTE(review): the
 * counting pass appears to consume them in the filter code - confirm.
3731 */
7d824b6f
DS
3732 bctl->data.limit = limit_data;
3733 bctl->meta.limit = limit_meta;
3734 bctl->sys.limit = limit_sys;
3735 }
ec44a35c
CM
/* Start from the largest possible offset and walk towards smaller ones */
3736 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3737 key.offset = (u64)-1;
3738 key.type = BTRFS_CHUNK_ITEM_KEY;
3739
d397712b 3740 while (1) {
/* A pause request is only honoured in the relocation pass, a cancel request in both */
19a39dce 3741 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
a7e99c69 3742 atomic_read(&fs_info->balance_cancel_req)) {
837d5b6e
ID
3743 ret = -ECANCELED;
3744 goto error;
3745 }
3746
/* delete_unused_bgs_mutex is held from the search until this chunk is skipped or relocated */
67c5e7d4 3747 mutex_lock(&fs_info->delete_unused_bgs_mutex);
ec44a35c 3748 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
67c5e7d4
FM
3749 if (ret < 0) {
3750 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ec44a35c 3751 goto error;
67c5e7d4 3752 }
ec44a35c
CM
3753
3754 /*
3755 * this shouldn't happen, it means the last relocate
3756 * failed
3757 */
3758 if (ret == 0)
c9e9f97b 3759 BUG(); /* FIXME break ? */
ec44a35c
CM
3760
3761 ret = btrfs_previous_item(chunk_root, path, 0,
3762 BTRFS_CHUNK_ITEM_KEY);
/* Any non-zero return ends the pass and is reported as success (ret reset to 0) */
c9e9f97b 3763 if (ret) {
67c5e7d4 3764 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
c9e9f97b 3765 ret = 0;
ec44a35c 3766 break;
c9e9f97b 3767 }
7d9eb12c 3768
f43ffb60
ID
3769 leaf = path->nodes[0];
3770 slot = path->slots[0];
3771 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7d9eb12c 3772
67c5e7d4
FM
3773 if (found_key.objectid != key.objectid) {
3774 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ec44a35c 3775 break;
67c5e7d4 3776 }
7d9eb12c 3777
/* chunk_type is read here because the path is released right after the filter check */
f43ffb60 3778 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
12907fc7 3779 chunk_type = btrfs_chunk_type(leaf, chunk);
f43ffb60 3780
19a39dce
ID
3781 if (!counting) {
3782 spin_lock(&fs_info->balance_lock);
3783 bctl->stat.considered++;
3784 spin_unlock(&fs_info->balance_lock);
3785 }
3786
6ec0896c 3787 ret = should_balance_chunk(leaf, chunk, found_key.offset);
2c9fe835 3788
b3b4aa74 3789 btrfs_release_path(path);
/* A zero return from the filter means this chunk is skipped */
67c5e7d4
FM
3790 if (!ret) {
3791 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
f43ffb60 3792 goto loop;
67c5e7d4 3793 }
f43ffb60 3794
/* Counting pass: record the chunk in the statistics and move on without relocating */
19a39dce 3795 if (counting) {
67c5e7d4 3796 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
19a39dce
ID
3797 spin_lock(&fs_info->balance_lock);
3798 bctl->stat.expected++;
3799 spin_unlock(&fs_info->balance_lock);
12907fc7
DS
3800
3801 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3802 count_data++;
3803 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3804 count_sys++;
3805 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3806 count_meta++;
3807
3808 goto loop;
3809 }
3810
3811 /*
3812 * Apply limit_min filter, no need to check if the LIMITS
3813 * filter is used, limit_min is 0 by default
3814 */
3815 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3816 count_data < bctl->data.limit_min)
3817 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3818 count_meta < bctl->meta.limit_min)
3819 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3820 count_sys < bctl->sys.limit_min)) {
3821 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
19a39dce
ID
3822 goto loop;
3823 }
3824
a6f93c71
LB
3825 if (!chunk_reserved) {
3826 /*
3827 * We may be relocating the only data chunk we have,
3828 * which could potentially end up with losing data's
3829 * raid profile, so lets allocate an empty one in
3830 * advance.
3831 */
3832 ret = btrfs_may_alloc_data_chunk(fs_info,
3833 found_key.offset);
2c9fe835
ZL
3834 if (ret < 0) {
3835 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3836 goto error;
a6f93c71
LB
3837 } else if (ret == 1) {
3838 chunk_reserved = 1;
2c9fe835 3839 }
2c9fe835
ZL
3840 }
3841
5b4aacef 3842 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
67c5e7d4 3843 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
/* -ENOSPC is only counted and -ETXTBSY is logged and ignored; other errors abort the balance */
19a39dce 3844 if (ret == -ENOSPC) {
c9e9f97b 3845 enospc_errors++;
eede2bf3
OS
3846 } else if (ret == -ETXTBSY) {
3847 btrfs_info(fs_info,
3848 "skipping relocation of block group %llu due to active swapfile",
3849 found_key.offset);
3850 ret = 0;
3851 } else if (ret) {
3852 goto error;
19a39dce
ID
3853 } else {
3854 spin_lock(&fs_info->balance_lock);
3855 bctl->stat.completed++;
3856 spin_unlock(&fs_info->balance_lock);
3857 }
/* Continue just below the chunk handled in this iteration; offset 0 is the end */
f43ffb60 3858loop:
795a3321
ID
3859 if (found_key.offset == 0)
3860 break;
ba1bf481 3861 key.offset = found_key.offset - 1;
ec44a35c 3862 }
c9e9f97b 3863
/* The counting pass is done, go around once more to actually relocate */
19a39dce
ID
3864 if (counting) {
3865 btrfs_release_path(path);
3866 counting = false;
3867 goto again;
3868 }
ec44a35c
CM
/* Common exit, also taken on success */
3869error:
3870 btrfs_free_path(path);
c9e9f97b 3871 if (enospc_errors) {
efe120a0 3872 btrfs_info(fs_info, "%d enospc errors during balance",
5d163e0e 3873 enospc_errors);
c9e9f97b
ID
/* Report -ENOSPC only when no other error is being returned */
3874 if (!ret)
3875 ret = -ENOSPC;
3876 }
3877
ec44a35c
CM
3878 return ret;
3879}
3880
0c460c0d
ID
3881/**
3882 * alloc_profile_is_valid - see if a given profile is valid and reduced
3883 * @flags: profile to validate
3884 * @extended: if true @flags is treated as an extended profile
3885 */
3886static int alloc_profile_is_valid(u64 flags, int extended)
3887{
3888 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3889 BTRFS_BLOCK_GROUP_PROFILE_MASK);
3890
3891 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3892
3893 /* 1) check that all other bits are zeroed */
3894 if (flags & ~mask)
3895 return 0;
3896
3897 /* 2) see if profile is reduced */
3898 if (flags == 0)
3899 return !extended; /* "0" is valid for usual profiles */
3900
c1499166 3901 return has_single_bit_set(flags);
0c460c0d
ID
3902}
3903
837d5b6e
ID
3904static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3905{
a7e99c69
ID
3906 /* cancel requested || normal exit path */
3907 return atomic_read(&fs_info->balance_cancel_req) ||
3908 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3909 atomic_read(&fs_info->balance_cancel_req) == 0);
837d5b6e
ID
3910}
3911
5ba366c3
DS
3912/*
3913 * Validate target profile against allowed profiles and return true if it's OK.
3914 * Otherwise print the error message and return false.
3915 */
3916static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3917 const struct btrfs_balance_args *bargs,
3918 u64 allowed, const char *type)
bdcd3c97 3919{
5ba366c3
DS
3920 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3921 return true;
3922
3923 /* Profile is valid and does not have bits outside of the allowed set */
3924 if (alloc_profile_is_valid(bargs->target, 1) &&
3925 (bargs->target & ~allowed) == 0)
3926 return true;
3927
3928 btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3929 type, btrfs_bg_type_to_raid_name(bargs->target));
3930 return false;
bdcd3c97
AM
3931}
3932
56fc37d9
AJ
3933/*
3934 * Fill @buf with textual description of balance filter flags @bargs, up to
3935 * @size_buf including the terminating null. The output may be trimmed if it
3936 * does not fit into the provided buffer.
3937 */
3938static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3939 u32 size_buf)
3940{
3941 int ret;
3942 u32 size_bp = size_buf;
3943 char *bp = buf;
3944 u64 flags = bargs->flags;
3945 char tmp_buf[128] = {'\0'};
3946
3947 if (!flags)
3948 return;
3949
3950#define CHECK_APPEND_NOARG(a) \
3951 do { \
3952 ret = snprintf(bp, size_bp, (a)); \
3953 if (ret < 0 || ret >= size_bp) \
3954 goto out_overflow; \
3955 size_bp -= ret; \
3956 bp += ret; \
3957 } while (0)
3958
3959#define CHECK_APPEND_1ARG(a, v1) \
3960 do { \
3961 ret = snprintf(bp, size_bp, (a), (v1)); \
3962 if (ret < 0 || ret >= size_bp) \
3963 goto out_overflow; \
3964 size_bp -= ret; \
3965 bp += ret; \
3966 } while (0)
3967
3968#define CHECK_APPEND_2ARG(a, v1, v2) \
3969 do { \
3970 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
3971 if (ret < 0 || ret >= size_bp) \
3972 goto out_overflow; \
3973 size_bp -= ret; \
3974 bp += ret; \
3975 } while (0)
3976
158da513
DS
3977 if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3978 CHECK_APPEND_1ARG("convert=%s,",
3979 btrfs_bg_type_to_raid_name(bargs->target));
56fc37d9
AJ
3980
3981 if (flags & BTRFS_BALANCE_ARGS_SOFT)
3982 CHECK_APPEND_NOARG("soft,");
3983
3984 if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3985 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3986 sizeof(tmp_buf));
3987 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3988 }
3989
3990 if (flags & BTRFS_BALANCE_ARGS_USAGE)
3991 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3992
3993 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3994 CHECK_APPEND_2ARG("usage=%u..%u,",
3995 bargs->usage_min, bargs->usage_max);
3996
3997 if (flags & BTRFS_BALANCE_ARGS_DEVID)
3998 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3999
4000 if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4001 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4002 bargs->pstart, bargs->pend);
4003
4004 if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4005 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4006 bargs->vstart, bargs->vend);
4007
4008 if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4009 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4010
4011 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4012 CHECK_APPEND_2ARG("limit=%u..%u,",
4013 bargs->limit_min, bargs->limit_max);
4014
4015 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4016 CHECK_APPEND_2ARG("stripes=%u..%u,",
4017 bargs->stripes_min, bargs->stripes_max);
4018
4019#undef CHECK_APPEND_2ARG
4020#undef CHECK_APPEND_1ARG
4021#undef CHECK_APPEND_NOARG
4022
4023out_overflow:
4024
4025 if (size_bp < size_buf)
4026 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4027 else
4028 buf[0] = '\0';
4029}
4030
4031static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4032{
4033 u32 size_buf = 1024;
4034 char tmp_buf[192] = {'\0'};
4035 char *buf;
4036 char *bp;
4037 u32 size_bp = size_buf;
4038 int ret;
4039 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4040
4041 buf = kzalloc(size_buf, GFP_KERNEL);
4042 if (!buf)
4043 return;
4044
4045 bp = buf;
4046
4047#define CHECK_APPEND_1ARG(a, v1) \
4048 do { \
4049 ret = snprintf(bp, size_bp, (a), (v1)); \
4050 if (ret < 0 || ret >= size_bp) \
4051 goto out_overflow; \
4052 size_bp -= ret; \
4053 bp += ret; \
4054 } while (0)
4055
4056 if (bctl->flags & BTRFS_BALANCE_FORCE)
4057 CHECK_APPEND_1ARG("%s", "-f ");
4058
4059 if (bctl->flags & BTRFS_BALANCE_DATA) {
4060 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4061 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4062 }
4063
4064 if (bctl->flags & BTRFS_BALANCE_METADATA) {
4065 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4066 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4067 }
4068
4069 if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4070 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4071 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4072 }
4073
4074#undef CHECK_APPEND_1ARG
4075
4076out_overflow:
4077
4078 if (size_bp < size_buf)
4079 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4080 btrfs_info(fs_info, "balance: %s %s",
4081 (bctl->flags & BTRFS_BALANCE_RESUME) ?
4082 "resume" : "start", buf);
4083
4084 kfree(buf);
4085}
4086
c9e9f97b 4087/*
dccdb07b 4088 * Should be called with balance mutex held
 *
 * Validate the balance request in @bctl, store the balance item and run
 * the balance.
 *
 * @fs_info: filesystem to balance
 * @bctl:    balance control. On the early failure path (label "out") it is
 *           released here: reset_balance_state() is called when
 *           BTRFS_BALANCE_RESUME is set, kfree(@bctl) otherwise.
 * @bargs:   optional; when not NULL it is zeroed and refilled through
 *           btrfs_update_ioctl_balance_args() once the balance has run.
 *
 * fs_info->balance_mutex is dropped around __btrfs_balance() and taken again
 * afterwards, so it is held on return.
 *
 * Returns 0 or a negative errno: -EINVAL for a refused request, -EAGAIN
 * while send operations are in progress, otherwise the status of
 * insert_balance_item() or __btrfs_balance().
c9e9f97b 4089 */
6fcf6e2b
DS
4090int btrfs_balance(struct btrfs_fs_info *fs_info,
4091 struct btrfs_balance_control *bctl,
c9e9f97b
ID
4092 struct btrfs_ioctl_balance_args *bargs)
4093{
14506127 4094 u64 meta_target, data_target;
/* Scratch bit mask, reused for several unrelated checks below */
f43ffb60 4095 u64 allowed;
e4837f8f 4096 int mixed = 0;
c9e9f97b 4097 int ret;
8dabb742 4098 u64 num_devices;
de98ced9 4099 unsigned seq;
e62869be 4100 bool reducing_redundancy;
081db89b 4101 int i;
c9e9f97b 4102
/* Refuse to start when the fs is closing or a pause/cancel is already pending */
837d5b6e 4103 if (btrfs_fs_closing(fs_info) ||
a7e99c69 4104 atomic_read(&fs_info->balance_pause_req) ||
726a3421 4105 btrfs_should_cancel_balance(fs_info)) {
c9e9f97b
ID
4106 ret = -EINVAL;
4107 goto out;
4108 }
4109
e4837f8f
ID
4110 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4111 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4112 mixed = 1;
4113
f43ffb60
ID
4114 /*
4115 * In case of mixed groups both data and meta should be picked,
4116 * and identical options should be given for both of them.
4117 */
e4837f8f
ID
4118 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4119 if (mixed && (bctl->flags & allowed)) {
f43ffb60
ID
4120 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4121 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4122 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
5d163e0e 4123 btrfs_err(fs_info,
6dac13f8 4124 "balance: mixed groups data and metadata options must be the same");
f43ffb60
ID
4125 ret = -EINVAL;
4126 goto out;
4127 }
4128 }
4129
b35cf1f0
JB
4130 /*
4131 * rw_devices will not change at the moment, device add/delete/replace
c3e1f96c 4132 * are exclusive
b35cf1f0
JB
4133 */
4134 num_devices = fs_info->fs_devices->rw_devices;
fab27359
QW
4135
4136 /*
4137 * SINGLE profile on-disk has no profile bit, but in-memory we have a
4138 * special bit for it, to make it easier to distinguish. Thus we need
4139 * to set it manually, or balance would refuse the profile.
4140 */
4141 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
/* Add every profile whose minimum device count is met by the writable devices */
081db89b
DS
4142 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4143 if (num_devices >= btrfs_raid_array[i].devs_min)
4144 allowed |= btrfs_raid_array[i].bg_flag;
1da73967 4145
5ba366c3
DS
4146 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4147 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4148 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
e4d8ec0f
ID
4149 ret = -EINVAL;
4150 goto out;
4151 }
4152
6079e12c
DS
4153 /*
4154 * Allow to reduce metadata or system integrity only if force set for
4155 * profiles with redundancy (copies, parity)
4156 */
4157 allowed = 0;
4158 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4159 if (btrfs_raid_array[i].ncopies >= 2 ||
4160 btrfs_raid_array[i].tolerated_failures >= 1)
4161 allowed |= btrfs_raid_array[i].bg_flag;
4162 }
/* Sample the avail_*_alloc_bits consistently, retrying if profiles_lock changed meanwhile */
de98ced9
MX
4163 do {
4164 seq = read_seqbegin(&fs_info->profiles_lock);
4165
/* Converting sys or meta from a redundant profile to a target without redundancy */
4166 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4167 (fs_info->avail_system_alloc_bits & allowed) &&
4168 !(bctl->sys.target & allowed)) ||
4169 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4170 (fs_info->avail_metadata_alloc_bits & allowed) &&
5a8067c0 4171 !(bctl->meta.target & allowed)))
e62869be 4172 reducing_redundancy = true;
5a8067c0 4173 else
e62869be 4174 reducing_redundancy = false;
5a8067c0
FM
4175
4176 /* if we're not converting, the target field is uninitialized */
4177 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4178 bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4179 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4180 bctl->data.target : fs_info->avail_data_alloc_bits;
de98ced9 4181 } while (read_seqretry(&fs_info->profiles_lock, seq));
e4d8ec0f 4182
e62869be 4183 if (reducing_redundancy) {
5a8067c0
FM
4184 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4185 btrfs_info(fs_info,
e62869be 4186 "balance: force reducing metadata redundancy");
5a8067c0
FM
4187 } else {
4188 btrfs_err(fs_info,
e62869be 4189 "balance: reduces metadata redundancy, use --force if you want this");
5a8067c0
FM
4190 ret = -EINVAL;
4191 goto out;
4192 }
4193 }
4194
/* Only a warning: the balance still proceeds with the weaker metadata profile */
14506127
AB
4195 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4196 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
ee592d07 4197 btrfs_warn(fs_info,
6dac13f8 4198 "balance: metadata profile %s has lower redundancy than data profile %s",
158da513
DS
4199 btrfs_bg_type_to_raid_name(meta_target),
4200 btrfs_bg_type_to_raid_name(data_target));
ee592d07
ST
4201 }
4202
9e967495
FM
4203 if (fs_info->send_in_progress) {
4204 btrfs_warn_rl(fs_info,
4205"cannot run balance while send operations are in progress (%d in progress)",
4206 fs_info->send_in_progress);
4207 ret = -EAGAIN;
4208 goto out;
4209 }
4210
/* -EEXIST is tolerated here; the BUG_ON()s below require it exactly when resuming */
6bccf3ab 4211 ret = insert_balance_item(fs_info, bctl);
59641015 4212 if (ret && ret != -EEXIST)
0940ebf6
ID
4213 goto out;
4214
59641015
ID
4215 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4216 BUG_ON(ret == -EEXIST);
833aae18
DS
4217 BUG_ON(fs_info->balance_ctl);
4218 spin_lock(&fs_info->balance_lock);
4219 fs_info->balance_ctl = bctl;
4220 spin_unlock(&fs_info->balance_lock);
59641015
ID
4221 } else {
4222 BUG_ON(ret != -EEXIST);
4223 spin_lock(&fs_info->balance_lock);
4224 update_balance_args(bctl);
4225 spin_unlock(&fs_info->balance_lock);
4226 }
c9e9f97b 4227
3009a62f
DS
4228 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4229 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
56fc37d9 4230 describe_balance_start_or_resume(fs_info);
/* The balance itself runs without balance_mutex, which is re-taken right after */
c9e9f97b
ID
4231 mutex_unlock(&fs_info->balance_mutex);
4232
4233 ret = __btrfs_balance(fs_info);
4234
4235 mutex_lock(&fs_info->balance_mutex);
7333bd02
AJ
4236 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4237 btrfs_info(fs_info, "balance: paused");
44d354ab
QW
4238 /*
4239 * Balance can be canceled by:
4240 *
4241 * - Regular cancel request
4242 * Then ret == -ECANCELED and balance_cancel_req > 0
4243 *
4244 * - Fatal signal to "btrfs" process
4245 * Either the signal caught by wait_reserve_ticket() and callers
4246 * got -EINTR, or caught by btrfs_should_cancel_balance() and
4247 * got -ECANCELED.
4248 * Either way, in this case balance_cancel_req = 0, and
4249 * ret == -EINTR or ret == -ECANCELED.
4250 *
4251 * So here we only check the return value to catch canceled balance.
4252 */
4253 else if (ret == -ECANCELED || ret == -EINTR)
7333bd02
AJ
4254 btrfs_info(fs_info, "balance: canceled");
4255 else
4256 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4257
3009a62f 4258 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
c9e9f97b
ID
4259
4260 if (bargs) {
4261 memset(bargs, 0, sizeof(*bargs));
008ef096 4262 btrfs_update_ioctl_balance_args(fs_info, bargs);
c9e9f97b
ID
4263 }
4264
/*
 * Tear the balance state down on errors other than -ECANCELED/-ENOSPC, or
 * when balance_need_close() reports a cancel or a normal exit. Otherwise
 * the state is left in place.
 */
3a01aa7a
ID
4265 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4266 balance_need_close(fs_info)) {
149196a2 4267 reset_balance_state(fs_info);
c3e1f96c 4268 btrfs_exclop_finish(fs_info);
3a01aa7a
ID
4269 }
4270
/* Wake the waiters on balance_wait_q now that BTRFS_FS_BALANCE_RUNNING is clear */
837d5b6e 4271 wake_up(&fs_info->balance_wait_q);
c9e9f97b
ID
4272
4273 return ret;
/* Failure before the balance was started: release @bctl and the exclusive operation */
4274out:
59641015 4275 if (bctl->flags & BTRFS_BALANCE_RESUME)
149196a2 4276 reset_balance_state(fs_info);
a17c95df 4277 else
59641015 4278 kfree(bctl);
c3e1f96c 4279 btrfs_exclop_finish(fs_info);
a17c95df 4280
59641015
ID
4281 return ret;
4282}
4283
4284static int balance_kthread(void *data)
4285{
2b6ba629 4286 struct btrfs_fs_info *fs_info = data;
9555c6c1 4287 int ret = 0;
59641015 4288
59641015 4289 mutex_lock(&fs_info->balance_mutex);
56fc37d9 4290 if (fs_info->balance_ctl)
6fcf6e2b 4291 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
59641015 4292 mutex_unlock(&fs_info->balance_mutex);
2b6ba629 4293
59641015
ID
4294 return ret;
4295}
4296
2b6ba629
ID
4297int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4298{
4299 struct task_struct *tsk;
4300
1354e1a1 4301 mutex_lock(&fs_info->balance_mutex);
2b6ba629 4302 if (!fs_info->balance_ctl) {
1354e1a1 4303 mutex_unlock(&fs_info->balance_mutex);
2b6ba629
ID
4304 return 0;
4305 }
1354e1a1 4306 mutex_unlock(&fs_info->balance_mutex);
2b6ba629 4307
3cdde224 4308 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
6dac13f8 4309 btrfs_info(fs_info, "balance: resume skipped");
2b6ba629
ID
4310 return 0;
4311 }
4312
02ee654d
AJ
4313 /*
4314 * A ro->rw remount sequence should continue with the paused balance
4315 * regardless of who pauses it, system or the user as of now, so set
4316 * the resume flag.
4317 */
4318 spin_lock(&fs_info->balance_lock);
4319 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4320 spin_unlock(&fs_info->balance_lock);
4321
2b6ba629 4322 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
cd633972 4323 return PTR_ERR_OR_ZERO(tsk);
2b6ba629
ID
4324}
4325
68310a5e 4326int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
59641015 4327{
59641015
ID
4328 struct btrfs_balance_control *bctl;
4329 struct btrfs_balance_item *item;
4330 struct btrfs_disk_balance_args disk_bargs;
4331 struct btrfs_path *path;
4332 struct extent_buffer *leaf;
4333 struct btrfs_key key;
4334 int ret;
4335
4336 path = btrfs_alloc_path();
4337 if (!path)
4338 return -ENOMEM;
4339
59641015 4340 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 4341 key.type = BTRFS_TEMPORARY_ITEM_KEY;
59641015
ID
4342 key.offset = 0;
4343
68310a5e 4344 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
59641015 4345 if (ret < 0)
68310a5e 4346 goto out;
59641015
ID
4347 if (ret > 0) { /* ret = -ENOENT; */
4348 ret = 0;
68310a5e
ID
4349 goto out;
4350 }
4351
4352 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4353 if (!bctl) {
4354 ret = -ENOMEM;
4355 goto out;
59641015
ID
4356 }
4357
4358 leaf = path->nodes[0];
4359 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4360
68310a5e
ID
4361 bctl->flags = btrfs_balance_flags(leaf, item);
4362 bctl->flags |= BTRFS_BALANCE_RESUME;
59641015
ID
4363
4364 btrfs_balance_data(leaf, item, &disk_bargs);
4365 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4366 btrfs_balance_meta(leaf, item, &disk_bargs);
4367 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4368 btrfs_balance_sys(leaf, item, &disk_bargs);
4369 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4370
eee95e3f
DS
4371 /*
4372 * This should never happen, as the paused balance state is recovered
4373 * during mount without any chance of other exclusive ops to collide.
4374 *
4375 * This gives the exclusive op status to balance and keeps in paused
4376 * state until user intervention (cancel or umount). If the ownership
4377 * cannot be assigned, show a message but do not fail. The balance
4378 * is in a paused state and must have fs_info::balance_ctl properly
4379 * set up.
4380 */
c3e1f96c 4381 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
eee95e3f 4382 btrfs_warn(fs_info,
6dac13f8 4383 "balance: cannot set exclusive op status, resume manually");
ed0fb78f 4384
fb286100
JB
4385 btrfs_release_path(path);
4386
68310a5e 4387 mutex_lock(&fs_info->balance_mutex);
833aae18
DS
4388 BUG_ON(fs_info->balance_ctl);
4389 spin_lock(&fs_info->balance_lock);
4390 fs_info->balance_ctl = bctl;
4391 spin_unlock(&fs_info->balance_lock);
68310a5e 4392 mutex_unlock(&fs_info->balance_mutex);
59641015
ID
4393out:
4394 btrfs_free_path(path);
ec44a35c
CM
4395 return ret;
4396}
4397
837d5b6e
ID
4398int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4399{
4400 int ret = 0;
4401
4402 mutex_lock(&fs_info->balance_mutex);
4403 if (!fs_info->balance_ctl) {
4404 mutex_unlock(&fs_info->balance_mutex);
4405 return -ENOTCONN;
4406 }
4407
3009a62f 4408 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
837d5b6e
ID
4409 atomic_inc(&fs_info->balance_pause_req);
4410 mutex_unlock(&fs_info->balance_mutex);
4411
4412 wait_event(fs_info->balance_wait_q,
3009a62f 4413 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
837d5b6e
ID
4414
4415 mutex_lock(&fs_info->balance_mutex);
4416 /* we are good with balance_ctl ripped off from under us */
3009a62f 4417 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
837d5b6e
ID
4418 atomic_dec(&fs_info->balance_pause_req);
4419 } else {
4420 ret = -ENOTCONN;
4421 }
4422
4423 mutex_unlock(&fs_info->balance_mutex);
4424 return ret;
4425}
4426
a7e99c69
ID
4427int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4428{
4429 mutex_lock(&fs_info->balance_mutex);
4430 if (!fs_info->balance_ctl) {
4431 mutex_unlock(&fs_info->balance_mutex);
4432 return -ENOTCONN;
4433 }
4434
cf7d20f4
DS
4435 /*
4436 * A paused balance with the item stored on disk can be resumed at
4437 * mount time if the mount is read-write. Otherwise it's still paused
4438 * and we must not allow cancelling as it deletes the item.
4439 */
4440 if (sb_rdonly(fs_info->sb)) {
4441 mutex_unlock(&fs_info->balance_mutex);
4442 return -EROFS;
4443 }
4444
a7e99c69
ID
4445 atomic_inc(&fs_info->balance_cancel_req);
4446 /*
4447 * if we are running just wait and return, balance item is
4448 * deleted in btrfs_balance in this case
4449 */
3009a62f 4450 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
a7e99c69
ID
4451 mutex_unlock(&fs_info->balance_mutex);
4452 wait_event(fs_info->balance_wait_q,
3009a62f 4453 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
a7e99c69
ID
4454 mutex_lock(&fs_info->balance_mutex);
4455 } else {
a7e99c69 4456 mutex_unlock(&fs_info->balance_mutex);
dccdb07b
DS
4457 /*
4458 * Lock released to allow other waiters to continue, we'll
4459 * reexamine the status again.
4460 */
a7e99c69
ID
4461 mutex_lock(&fs_info->balance_mutex);
4462
a17c95df 4463 if (fs_info->balance_ctl) {
149196a2 4464 reset_balance_state(fs_info);
c3e1f96c 4465 btrfs_exclop_finish(fs_info);
6dac13f8 4466 btrfs_info(fs_info, "balance: canceled");
a17c95df 4467 }
a7e99c69
ID
4468 }
4469
3009a62f
DS
4470 BUG_ON(fs_info->balance_ctl ||
4471 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
a7e99c69
ID
4472 atomic_dec(&fs_info->balance_cancel_req);
4473 mutex_unlock(&fs_info->balance_mutex);
4474 return 0;
4475}
4476
/*
 * Thread function (@data is the fs_info) that scans the root items of the
 * tree root and adds the uuid and received_uuid of each subvolume root to the
 * uuid tree through btrfs_uuid_tree_add().
 *
 * When the scan completes without error and the filesystem is not closing,
 * BTRFS_FS_UPDATE_UUID_TREE_GEN is set. fs_info->uuid_tree_rescan_sem is
 * released on every path. Always returns 0; failures are only logged.
 */
97f4dd09 4477int btrfs_uuid_scan_kthread(void *data)
803b2f54
SB
4478{
4479 struct btrfs_fs_info *fs_info = data;
4480 struct btrfs_root *root = fs_info->tree_root;
4481 struct btrfs_key key;
803b2f54
SB
4482 struct btrfs_path *path = NULL;
4483 int ret = 0;
4484 struct extent_buffer *eb;
4485 int slot;
4486 struct btrfs_root_item root_item;
4487 u32 item_size;
/* Non-NULL only while a transaction is open for the item being processed */
f45388f3 4488 struct btrfs_trans_handle *trans = NULL;
c94bec2c 4489 bool closing = false;
803b2f54
SB
4490
4491 path = btrfs_alloc_path();
4492 if (!path) {
4493 ret = -ENOMEM;
4494 goto out;
4495 }
4496
4497 key.objectid = 0;
4498 key.type = BTRFS_ROOT_ITEM_KEY;
4499 key.offset = 0;
4500
803b2f54 4501 while (1) {
/* Stop early on shutdown; the scan is then not marked as complete */
c94bec2c
JB
4502 if (btrfs_fs_closing(fs_info)) {
4503 closing = true;
4504 break;
4505 }
7c829b72
AJ
4506 ret = btrfs_search_forward(root, &key, path,
4507 BTRFS_OLDEST_GENERATION);
/* A positive return ends the scan normally, a negative one is an error */
803b2f54
SB
4508 if (ret) {
4509 if (ret > 0)
4510 ret = 0;
4511 break;
4512 }
4513
/* Only root items of the fs tree or of objectids in the FIRST_FREE..LAST_FREE range are processed */
4514 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4515 (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4516 key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4517 key.objectid > BTRFS_LAST_FREE_OBJECTID)
4518 goto skip;
4519
4520 eb = path->nodes[0];
4521 slot = path->slots[0];
4522 item_size = btrfs_item_size_nr(eb, slot);
/* Items smaller than the current root item structure are skipped */
4523 if (item_size < sizeof(root_item))
4524 goto skip;
4525
803b2f54
SB
4526 read_extent_buffer(eb, &root_item,
4527 btrfs_item_ptr_offset(eb, slot),
4528 (int)sizeof(root_item));
4529 if (btrfs_root_refs(&root_item) == 0)
4530 goto skip;
f45388f3
FDBM
4531
4532 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4533 !btrfs_is_empty_uuid(root_item.received_uuid)) {
/* Second visit of this item: the transaction is already open */
4534 if (trans)
4535 goto update_tree;
4536
4537 btrfs_release_path(path);
803b2f54
SB
4538 /*
4539 * 1 - subvol uuid item
4540 * 1 - received_subvol uuid item
4541 */
4542 trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4543 if (IS_ERR(trans)) {
4544 ret = PTR_ERR(trans);
4545 break;
4546 }
/* The path was released above, so redo the search for the same key with trans set */
f45388f3
FDBM
4547 continue;
4548 } else {
4549 goto skip;
4550 }
4551update_tree:
9771a5cf 4552 btrfs_release_path(path);
f45388f3 4553 if (!btrfs_is_empty_uuid(root_item.uuid)) {
cdb345a8 4554 ret = btrfs_uuid_tree_add(trans, root_item.uuid,
803b2f54
SB
4555 BTRFS_UUID_KEY_SUBVOL,
4556 key.objectid);
4557 if (ret < 0) {
efe120a0 4558 btrfs_warn(fs_info, "uuid_tree_add failed %d",
803b2f54 4559 ret);
803b2f54
SB
4560 break;
4561 }
4562 }
4563
4564 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
cdb345a8 4565 ret = btrfs_uuid_tree_add(trans,
803b2f54
SB
4566 root_item.received_uuid,
4567 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4568 key.objectid);
4569 if (ret < 0) {
efe120a0 4570 btrfs_warn(fs_info, "uuid_tree_add failed %d",
803b2f54 4571 ret);
803b2f54
SB
4572 break;
4573 }
4574 }
4575
f45388f3 4576skip:
9771a5cf 4577 btrfs_release_path(path);
/* End the per-item transaction, if any, before moving to the next key */
803b2f54 4578 if (trans) {
3a45bb20 4579 ret = btrfs_end_transaction(trans);
f45388f3 4580 trans = NULL;
803b2f54
SB
4581 if (ret)
4582 break;
4583 }
4584
/* Advance to the next possible key: bump offset first, then type, then objectid */
803b2f54
SB
4585 if (key.offset < (u64)-1) {
4586 key.offset++;
4587 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4588 key.offset = 0;
4589 key.type = BTRFS_ROOT_ITEM_KEY;
4590 } else if (key.objectid < (u64)-1) {
4591 key.offset = 0;
4592 key.type = BTRFS_ROOT_ITEM_KEY;
4593 key.objectid++;
4594 } else {
4595 break;
4596 }
4597 cond_resched();
4598 }
4599
4600out:
4601 btrfs_free_path(path);
/* trans holds an error pointer when btrfs_start_transaction() failed, hence the IS_ERR() check */
f45388f3 4602 if (trans && !IS_ERR(trans))
3a45bb20 4603 btrfs_end_transaction(trans);
803b2f54 4604 if (ret)
efe120a0 4605 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
c94bec2c 4606 else if (!closing)
afcdd129 4607 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
803b2f54
SB
4608 up(&fs_info->uuid_tree_rescan_sem);
4609 return 0;
4610}
4611
f7a81ea4
SB
4612int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4613{
4614 struct btrfs_trans_handle *trans;
4615 struct btrfs_root *tree_root = fs_info->tree_root;
4616 struct btrfs_root *uuid_root;
803b2f54
SB
4617 struct task_struct *task;
4618 int ret;
f7a81ea4
SB
4619
4620 /*
4621 * 1 - root node
4622 * 1 - root item
4623 */
4624 trans = btrfs_start_transaction(tree_root, 2);
4625 if (IS_ERR(trans))
4626 return PTR_ERR(trans);
4627
9b7a2440 4628 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
f7a81ea4 4629 if (IS_ERR(uuid_root)) {
6d13f549 4630 ret = PTR_ERR(uuid_root);
66642832 4631 btrfs_abort_transaction(trans, ret);
3a45bb20 4632 btrfs_end_transaction(trans);
6d13f549 4633 return ret;
f7a81ea4
SB
4634 }
4635
4636 fs_info->uuid_root = uuid_root;
4637
3a45bb20 4638 ret = btrfs_commit_transaction(trans);
803b2f54
SB
4639 if (ret)
4640 return ret;
4641
4642 down(&fs_info->uuid_tree_rescan_sem);
4643 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4644 if (IS_ERR(task)) {
70f80175 4645 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
efe120a0 4646 btrfs_warn(fs_info, "failed to start uuid_scan task");
803b2f54
SB
4647 up(&fs_info->uuid_tree_rescan_sem);
4648 return PTR_ERR(task);
4649 }
4650
4651 return 0;
f7a81ea4 4652}
803b2f54 4653
8f18cf13
CM
4654/*
4655 * shrinking a device means finding all of the device extents past
4656 * the new size, and then following the back refs to the chunks.
4657 * The chunk relocation code actually frees the device extent
4658 */
4659int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4660{
0b246afa
JM
4661 struct btrfs_fs_info *fs_info = device->fs_info;
4662 struct btrfs_root *root = fs_info->dev_root;
8f18cf13 4663 struct btrfs_trans_handle *trans;
8f18cf13
CM
4664 struct btrfs_dev_extent *dev_extent = NULL;
4665 struct btrfs_path *path;
4666 u64 length;
8f18cf13
CM
4667 u64 chunk_offset;
4668 int ret;
4669 int slot;
ba1bf481
JB
4670 int failed = 0;
4671 bool retried = false;
8f18cf13
CM
4672 struct extent_buffer *l;
4673 struct btrfs_key key;
0b246afa 4674 struct btrfs_super_block *super_copy = fs_info->super_copy;
8f18cf13 4675 u64 old_total = btrfs_super_total_bytes(super_copy);
7cc8e58d 4676 u64 old_size = btrfs_device_get_total_bytes(device);
7dfb8be1 4677 u64 diff;
61d0d0d2 4678 u64 start;
7dfb8be1
NB
4679
4680 new_size = round_down(new_size, fs_info->sectorsize);
61d0d0d2 4681 start = new_size;
0e4324a4 4682 diff = round_down(old_size - new_size, fs_info->sectorsize);
8f18cf13 4683
401e29c1 4684 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
63a212ab
SB
4685 return -EINVAL;
4686
8f18cf13
CM
4687 path = btrfs_alloc_path();
4688 if (!path)
4689 return -ENOMEM;
4690
0338dff6 4691 path->reada = READA_BACK;
8f18cf13 4692
61d0d0d2
NB
4693 trans = btrfs_start_transaction(root, 0);
4694 if (IS_ERR(trans)) {
4695 btrfs_free_path(path);
4696 return PTR_ERR(trans);
4697 }
4698
34441361 4699 mutex_lock(&fs_info->chunk_mutex);
7d9eb12c 4700
7cc8e58d 4701 btrfs_device_set_total_bytes(device, new_size);
ebbede42 4702 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2b82032c 4703 device->fs_devices->total_rw_bytes -= diff;
a5ed45f8 4704 atomic64_sub(diff, &fs_info->free_chunk_space);
2bf64758 4705 }
61d0d0d2
NB
4706
4707 /*
4708 * Once the device's size has been set to the new size, ensure all
4709 * in-memory chunks are synced to disk so that the loop below sees them
4710 * and relocates them accordingly.
4711 */
1c11b63e 4712 if (contains_pending_extent(device, &start, diff)) {
61d0d0d2
NB
4713 mutex_unlock(&fs_info->chunk_mutex);
4714 ret = btrfs_commit_transaction(trans);
4715 if (ret)
4716 goto done;
4717 } else {
4718 mutex_unlock(&fs_info->chunk_mutex);
4719 btrfs_end_transaction(trans);
4720 }
8f18cf13 4721
ba1bf481 4722again:
8f18cf13
CM
4723 key.objectid = device->devid;
4724 key.offset = (u64)-1;
4725 key.type = BTRFS_DEV_EXTENT_KEY;
4726
213e64da 4727 do {
0b246afa 4728 mutex_lock(&fs_info->delete_unused_bgs_mutex);
8f18cf13 4729 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
67c5e7d4 4730 if (ret < 0) {
0b246afa 4731 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
8f18cf13 4732 goto done;
67c5e7d4 4733 }
8f18cf13
CM
4734
4735 ret = btrfs_previous_item(root, path, 0, key.type);
8f18cf13 4736 if (ret) {
7056bf69
NB
4737 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4738 if (ret < 0)
4739 goto done;
8f18cf13 4740 ret = 0;
b3b4aa74 4741 btrfs_release_path(path);
bf1fb512 4742 break;
8f18cf13
CM
4743 }
4744
4745 l = path->nodes[0];
4746 slot = path->slots[0];
4747 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4748
ba1bf481 4749 if (key.objectid != device->devid) {
0b246afa 4750 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
b3b4aa74 4751 btrfs_release_path(path);
bf1fb512 4752 break;
ba1bf481 4753 }
8f18cf13
CM
4754
4755 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4756 length = btrfs_dev_extent_length(l, dev_extent);
4757
ba1bf481 4758 if (key.offset + length <= new_size) {
0b246afa 4759 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
b3b4aa74 4760 btrfs_release_path(path);
d6397bae 4761 break;
ba1bf481 4762 }
8f18cf13 4763
8f18cf13 4764 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
b3b4aa74 4765 btrfs_release_path(path);
8f18cf13 4766
a6f93c71
LB
4767 /*
4768 * We may be relocating the only data chunk we have,
4769 * which could potentially end up with losing data's
4770 * raid profile, so lets allocate an empty one in
4771 * advance.
4772 */
4773 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4774 if (ret < 0) {
4775 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4776 goto done;
4777 }
4778
0b246afa
JM
4779 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4780 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
eede2bf3 4781 if (ret == -ENOSPC) {
ba1bf481 4782 failed++;
eede2bf3
OS
4783 } else if (ret) {
4784 if (ret == -ETXTBSY) {
4785 btrfs_warn(fs_info,
4786 "could not shrink block group %llu due to active swapfile",
4787 chunk_offset);
4788 }
4789 goto done;
4790 }
213e64da 4791 } while (key.offset-- > 0);
ba1bf481
JB
4792
4793 if (failed && !retried) {
4794 failed = 0;
4795 retried = true;
4796 goto again;
4797 } else if (failed && retried) {
4798 ret = -ENOSPC;
ba1bf481 4799 goto done;
8f18cf13
CM
4800 }
4801
d6397bae 4802 /* Shrinking succeeded, else we would be at "done". */
a22285a6 4803 trans = btrfs_start_transaction(root, 0);
98d5dc13
TI
4804 if (IS_ERR(trans)) {
4805 ret = PTR_ERR(trans);
4806 goto done;
4807 }
4808
34441361 4809 mutex_lock(&fs_info->chunk_mutex);
c57dd1f2
QW
4810 /* Clear all state bits beyond the shrunk device size */
4811 clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4812 CHUNK_STATE_MASK);
4813
7cc8e58d 4814 btrfs_device_set_disk_total_bytes(device, new_size);
bbbf7243
NB
4815 if (list_empty(&device->post_commit_list))
4816 list_add_tail(&device->post_commit_list,
4817 &trans->transaction->dev_update_list);
d6397bae 4818
d6397bae 4819 WARN_ON(diff > old_total);
7dfb8be1
NB
4820 btrfs_set_super_total_bytes(super_copy,
4821 round_down(old_total - diff, fs_info->sectorsize));
34441361 4822 mutex_unlock(&fs_info->chunk_mutex);
2196d6e8
MX
4823
4824 /* Now btrfs_update_device() will change the on-disk size. */
4825 ret = btrfs_update_device(trans, device);
801660b0
AJ
4826 if (ret < 0) {
4827 btrfs_abort_transaction(trans, ret);
4828 btrfs_end_transaction(trans);
4829 } else {
4830 ret = btrfs_commit_transaction(trans);
4831 }
8f18cf13
CM
4832done:
4833 btrfs_free_path(path);
53e489bc 4834 if (ret) {
34441361 4835 mutex_lock(&fs_info->chunk_mutex);
53e489bc 4836 btrfs_device_set_total_bytes(device, old_size);
ebbede42 4837 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
53e489bc 4838 device->fs_devices->total_rw_bytes += diff;
a5ed45f8 4839 atomic64_add(diff, &fs_info->free_chunk_space);
34441361 4840 mutex_unlock(&fs_info->chunk_mutex);
53e489bc 4841 }
8f18cf13
CM
4842 return ret;
4843}
4844
2ff7e61e 4845static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
0b86a832
CM
4846 struct btrfs_key *key,
4847 struct btrfs_chunk *chunk, int item_size)
4848{
0b246afa 4849 struct btrfs_super_block *super_copy = fs_info->super_copy;
0b86a832
CM
4850 struct btrfs_disk_key disk_key;
4851 u32 array_size;
4852 u8 *ptr;
4853
34441361 4854 mutex_lock(&fs_info->chunk_mutex);
0b86a832 4855 array_size = btrfs_super_sys_array_size(super_copy);
5f43f86e 4856 if (array_size + item_size + sizeof(disk_key)
fe48a5c0 4857 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
34441361 4858 mutex_unlock(&fs_info->chunk_mutex);
0b86a832 4859 return -EFBIG;
fe48a5c0 4860 }
0b86a832
CM
4861
4862 ptr = super_copy->sys_chunk_array + array_size;
4863 btrfs_cpu_key_to_disk(&disk_key, key);
4864 memcpy(ptr, &disk_key, sizeof(disk_key));
4865 ptr += sizeof(disk_key);
4866 memcpy(ptr, chunk, item_size);
4867 item_size += sizeof(disk_key);
4868 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
34441361 4869 mutex_unlock(&fs_info->chunk_mutex);
fe48a5c0 4870
0b86a832
CM
4871 return 0;
4872}
4873
73c5de00
AJ
4874/*
4875 * sort the devices in descending order by max_avail, total_avail
4876 */
4877static int btrfs_cmp_device_info(const void *a, const void *b)
9b3f68b9 4878{
73c5de00
AJ
4879 const struct btrfs_device_info *di_a = a;
4880 const struct btrfs_device_info *di_b = b;
9b3f68b9 4881
73c5de00 4882 if (di_a->max_avail > di_b->max_avail)
b2117a39 4883 return -1;
73c5de00 4884 if (di_a->max_avail < di_b->max_avail)
b2117a39 4885 return 1;
73c5de00
AJ
4886 if (di_a->total_avail > di_b->total_avail)
4887 return -1;
4888 if (di_a->total_avail < di_b->total_avail)
4889 return 1;
4890 return 0;
b2117a39 4891}
0b86a832 4892
53b381b3
DW
4893static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4894{
ffe2d203 4895 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
53b381b3
DW
4896 return;
4897
ceda0864 4898 btrfs_set_fs_incompat(info, RAID56);
53b381b3
DW
4899}
4900
cfbb825c
DS
4901static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4902{
4903 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4904 return;
4905
4906 btrfs_set_fs_incompat(info, RAID1C34);
4907}
4908
4f2bafe8
NA
4909/*
4910 * Structure used internally for __btrfs_alloc_chunk() function.
4911 * Wraps needed parameters.
4912 */
4913struct alloc_chunk_ctl {
4914 u64 start;
4915 u64 type;
4916 /* Total number of stripes to allocate */
4917 int num_stripes;
4918 /* sub_stripes info for map */
4919 int sub_stripes;
4920 /* Stripes per device */
4921 int dev_stripes;
4922 /* Maximum number of devices to use */
4923 int devs_max;
4924 /* Minimum number of devices to use */
4925 int devs_min;
4926 /* ndevs has to be a multiple of this */
4927 int devs_increment;
4928 /* Number of copies */
4929 int ncopies;
4930 /* Number of stripes worth of bytes to store parity information */
4931 int nparity;
4932 u64 max_stripe_size;
4933 u64 max_chunk_size;
6aafb303 4934 u64 dev_extent_min;
4f2bafe8
NA
4935 u64 stripe_size;
4936 u64 chunk_size;
4937 int ndevs;
4938};
4939
27c314d5
NA
4940static void init_alloc_chunk_ctl_policy_regular(
4941 struct btrfs_fs_devices *fs_devices,
4942 struct alloc_chunk_ctl *ctl)
4943{
4944 u64 type = ctl->type;
4945
4946 if (type & BTRFS_BLOCK_GROUP_DATA) {
4947 ctl->max_stripe_size = SZ_1G;
4948 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4949 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4950 /* For larger filesystems, use larger metadata chunks */
4951 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4952 ctl->max_stripe_size = SZ_1G;
4953 else
4954 ctl->max_stripe_size = SZ_256M;
4955 ctl->max_chunk_size = ctl->max_stripe_size;
4956 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4957 ctl->max_stripe_size = SZ_32M;
4958 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4959 ctl->devs_max = min_t(int, ctl->devs_max,
4960 BTRFS_MAX_DEVS_SYS_CHUNK);
4961 } else {
4962 BUG();
4963 }
4964
4965 /* We don't want a chunk larger than 10% of writable space */
4966 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4967 ctl->max_chunk_size);
6aafb303 4968 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
27c314d5
NA
4969}
4970
1cd6121f
NA
4971static void init_alloc_chunk_ctl_policy_zoned(
4972 struct btrfs_fs_devices *fs_devices,
4973 struct alloc_chunk_ctl *ctl)
4974{
4975 u64 zone_size = fs_devices->fs_info->zone_size;
4976 u64 limit;
4977 int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
4978 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
4979 u64 min_chunk_size = min_data_stripes * zone_size;
4980 u64 type = ctl->type;
4981
4982 ctl->max_stripe_size = zone_size;
4983 if (type & BTRFS_BLOCK_GROUP_DATA) {
4984 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
4985 zone_size);
4986 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4987 ctl->max_chunk_size = ctl->max_stripe_size;
4988 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4989 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4990 ctl->devs_max = min_t(int, ctl->devs_max,
4991 BTRFS_MAX_DEVS_SYS_CHUNK);
4992 }
4993
4994 /* We don't want a chunk larger than 10% of writable space */
4995 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
4996 zone_size),
4997 min_chunk_size);
4998 ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
4999 ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5000}
5001
27c314d5
NA
5002static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5003 struct alloc_chunk_ctl *ctl)
5004{
5005 int index = btrfs_bg_flags_to_raid_index(ctl->type);
5006
5007 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5008 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5009 ctl->devs_max = btrfs_raid_array[index].devs_max;
5010 if (!ctl->devs_max)
5011 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5012 ctl->devs_min = btrfs_raid_array[index].devs_min;
5013 ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5014 ctl->ncopies = btrfs_raid_array[index].ncopies;
5015 ctl->nparity = btrfs_raid_array[index].nparity;
5016 ctl->ndevs = 0;
5017
5018 switch (fs_devices->chunk_alloc_policy) {
5019 case BTRFS_CHUNK_ALLOC_REGULAR:
5020 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5021 break;
1cd6121f
NA
5022 case BTRFS_CHUNK_ALLOC_ZONED:
5023 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5024 break;
27c314d5
NA
5025 default:
5026 BUG();
5027 }
5028}
5029
560156cb
NA
5030static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5031 struct alloc_chunk_ctl *ctl,
5032 struct btrfs_device_info *devices_info)
b2117a39 5033{
560156cb 5034 struct btrfs_fs_info *info = fs_devices->fs_info;
ebcc9301 5035 struct btrfs_device *device;
73c5de00 5036 u64 total_avail;
560156cb 5037 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
73c5de00 5038 int ret;
560156cb
NA
5039 int ndevs = 0;
5040 u64 max_avail;
5041 u64 dev_offset;
0cad8a11 5042
9f680ce0 5043 /*
73c5de00
AJ
5044 * in the first pass through the devices list, we gather information
5045 * about the available holes on each device.
9f680ce0 5046 */
ebcc9301 5047 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
ebbede42 5048 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
31b1a2bd 5049 WARN(1, KERN_ERR
efe120a0 5050 "BTRFS: read-only device in alloc_list\n");
73c5de00
AJ
5051 continue;
5052 }
b2117a39 5053
e12c9621
AJ
5054 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5055 &device->dev_state) ||
401e29c1 5056 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
73c5de00 5057 continue;
b2117a39 5058
73c5de00
AJ
5059 if (device->total_bytes > device->bytes_used)
5060 total_avail = device->total_bytes - device->bytes_used;
5061 else
5062 total_avail = 0;
38c01b96 5063
5064 /* If there is no space on this device, skip it. */
6aafb303 5065 if (total_avail < ctl->dev_extent_min)
38c01b96 5066 continue;
b2117a39 5067
560156cb
NA
5068 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5069 &max_avail);
73c5de00 5070 if (ret && ret != -ENOSPC)
560156cb 5071 return ret;
b2117a39 5072
73c5de00 5073 if (ret == 0)
560156cb 5074 max_avail = dev_extent_want;
b2117a39 5075
6aafb303 5076 if (max_avail < ctl->dev_extent_min) {
4117f207
QW
5077 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5078 btrfs_debug(info,
560156cb 5079 "%s: devid %llu has no free space, have=%llu want=%llu",
4117f207 5080 __func__, device->devid, max_avail,
6aafb303 5081 ctl->dev_extent_min);
73c5de00 5082 continue;
4117f207 5083 }
b2117a39 5084
063d006f
ES
5085 if (ndevs == fs_devices->rw_devices) {
5086 WARN(1, "%s: found more than %llu devices\n",
5087 __func__, fs_devices->rw_devices);
5088 break;
5089 }
73c5de00
AJ
5090 devices_info[ndevs].dev_offset = dev_offset;
5091 devices_info[ndevs].max_avail = max_avail;
5092 devices_info[ndevs].total_avail = total_avail;
5093 devices_info[ndevs].dev = device;
5094 ++ndevs;
5095 }
560156cb 5096 ctl->ndevs = ndevs;
b2117a39 5097
73c5de00
AJ
5098 /*
5099 * now sort the devices by hole size / available space
5100 */
560156cb 5101 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
73c5de00 5102 btrfs_cmp_device_info, NULL);
b2117a39 5103
560156cb
NA
5104 return 0;
5105}
5106
5badf512
NA
5107static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5108 struct btrfs_device_info *devices_info)
5109{
5110 /* Number of stripes that count for block group size */
5111 int data_stripes;
5112
5113 /*
5114 * The primary goal is to maximize the number of stripes, so use as
5115 * many devices as possible, even if the stripes are not maximum sized.
5116 *
5117 * The DUP profile stores more than one stripe per device, the
5118 * max_avail is the total size so we have to adjust.
5119 */
5120 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5121 ctl->dev_stripes);
5122 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5123
5124 /* This will have to be fixed for RAID1 and RAID10 over more drives */
5125 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5126
5127 /*
5128 * Use the number of data stripes to figure out how big this chunk is
5129 * really going to be in terms of logical address space, and compare
5130 * that answer with the max chunk size. If it's higher, we try to
5131 * reduce stripe_size.
5132 */
5133 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5134 /*
5135 * Reduce stripe_size, round it up to a 16MB boundary again and
5136 * then use it, unless it ends up being even bigger than the
5137 * previous value we had already.
5138 */
5139 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5140 data_stripes), SZ_16M),
5141 ctl->stripe_size);
5142 }
5143
5144 /* Align to BTRFS_STRIPE_LEN */
5145 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5146 ctl->chunk_size = ctl->stripe_size * data_stripes;
5147
5148 return 0;
5149}
5150
1cd6121f
NA
5151static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5152 struct btrfs_device_info *devices_info)
5153{
5154 u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5155 /* Number of stripes that count for block group size */
5156 int data_stripes;
5157
5158 /*
5159 * It should hold because:
5160 * dev_extent_min == dev_extent_want == zone_size * dev_stripes
5161 */
5162 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5163
5164 ctl->stripe_size = zone_size;
5165 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5166 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5167
5168 /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
5169 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5170 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5171 ctl->stripe_size) + ctl->nparity,
5172 ctl->dev_stripes);
5173 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5174 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5175 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5176 }
5177
5178 ctl->chunk_size = ctl->stripe_size * data_stripes;
5179
5180 return 0;
5181}
5182
5badf512
NA
5183static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5184 struct alloc_chunk_ctl *ctl,
5185 struct btrfs_device_info *devices_info)
5186{
5187 struct btrfs_fs_info *info = fs_devices->fs_info;
5188
5189 /*
5190 * Round down to number of usable stripes, devs_increment can be any
5191 * number so we can't use round_down() that requires power of 2, while
5192 * rounddown is safe.
5193 */
5194 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5195
5196 if (ctl->ndevs < ctl->devs_min) {
5197 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5198 btrfs_debug(info,
5199 "%s: not enough devices with free space: have=%d minimum required=%d",
5200 __func__, ctl->ndevs, ctl->devs_min);
5201 }
5202 return -ENOSPC;
5203 }
5204
5205 ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5206
5207 switch (fs_devices->chunk_alloc_policy) {
5208 case BTRFS_CHUNK_ALLOC_REGULAR:
5209 return decide_stripe_size_regular(ctl, devices_info);
1cd6121f
NA
5210 case BTRFS_CHUNK_ALLOC_ZONED:
5211 return decide_stripe_size_zoned(ctl, devices_info);
5badf512
NA
5212 default:
5213 BUG();
5214 }
5215}
5216
dce580ca
NA
5217static int create_chunk(struct btrfs_trans_handle *trans,
5218 struct alloc_chunk_ctl *ctl,
5219 struct btrfs_device_info *devices_info)
560156cb
NA
5220{
5221 struct btrfs_fs_info *info = trans->fs_info;
560156cb
NA
5222 struct map_lookup *map = NULL;
5223 struct extent_map_tree *em_tree;
5224 struct extent_map *em;
dce580ca
NA
5225 u64 start = ctl->start;
5226 u64 type = ctl->type;
560156cb
NA
5227 int ret;
5228 int i;
5229 int j;
5230
dce580ca
NA
5231 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5232 if (!map)
560156cb 5233 return -ENOMEM;
dce580ca 5234 map->num_stripes = ctl->num_stripes;
560156cb 5235
dce580ca
NA
5236 for (i = 0; i < ctl->ndevs; ++i) {
5237 for (j = 0; j < ctl->dev_stripes; ++j) {
5238 int s = i * ctl->dev_stripes + j;
73c5de00
AJ
5239 map->stripes[s].dev = devices_info[i].dev;
5240 map->stripes[s].physical = devices_info[i].dev_offset +
dce580ca 5241 j * ctl->stripe_size;
6324fbf3 5242 }
6324fbf3 5243 }
500ceed8
NB
5244 map->stripe_len = BTRFS_STRIPE_LEN;
5245 map->io_align = BTRFS_STRIPE_LEN;
5246 map->io_width = BTRFS_STRIPE_LEN;
2b82032c 5247 map->type = type;
dce580ca 5248 map->sub_stripes = ctl->sub_stripes;
0b86a832 5249
dce580ca 5250 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
1abe9b8a 5251
172ddd60 5252 em = alloc_extent_map();
2b82032c 5253 if (!em) {
298a8f9c 5254 kfree(map);
dce580ca 5255 return -ENOMEM;
593060d7 5256 }
298a8f9c 5257 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
95617d69 5258 em->map_lookup = map;
2b82032c 5259 em->start = start;
dce580ca 5260 em->len = ctl->chunk_size;
2b82032c
YZ
5261 em->block_start = 0;
5262 em->block_len = em->len;
dce580ca 5263 em->orig_block_len = ctl->stripe_size;
593060d7 5264
c8bf1b67 5265 em_tree = &info->mapping_tree;
890871be 5266 write_lock(&em_tree->lock);
09a2a8f9 5267 ret = add_extent_mapping(em_tree, em, 0);
0f5d42b2 5268 if (ret) {
1efb72a3 5269 write_unlock(&em_tree->lock);
0f5d42b2 5270 free_extent_map(em);
dce580ca 5271 return ret;
0f5d42b2 5272 }
1efb72a3
NB
5273 write_unlock(&em_tree->lock);
5274
dce580ca 5275 ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
6df9a95e
JB
5276 if (ret)
5277 goto error_del_extent;
2b82032c 5278
bbbf7243
NB
5279 for (i = 0; i < map->num_stripes; i++) {
5280 struct btrfs_device *dev = map->stripes[i].dev;
5281
4f2bafe8 5282 btrfs_device_set_bytes_used(dev,
dce580ca 5283 dev->bytes_used + ctl->stripe_size);
bbbf7243
NB
5284 if (list_empty(&dev->post_commit_list))
5285 list_add_tail(&dev->post_commit_list,
5286 &trans->transaction->dev_update_list);
5287 }
43530c46 5288
dce580ca 5289 atomic64_sub(ctl->stripe_size * map->num_stripes,
4f2bafe8 5290 &info->free_chunk_space);
1c116187 5291
0f5d42b2 5292 free_extent_map(em);
0b246afa 5293 check_raid56_incompat_flag(info, type);
cfbb825c 5294 check_raid1c34_incompat_flag(info, type);
53b381b3 5295
2b82032c 5296 return 0;
b2117a39 5297
6df9a95e 5298error_del_extent:
0f5d42b2
JB
5299 write_lock(&em_tree->lock);
5300 remove_extent_mapping(em_tree, em);
5301 write_unlock(&em_tree->lock);
5302
5303 /* One for our allocation */
5304 free_extent_map(em);
5305 /* One for the tree reference */
5306 free_extent_map(em);
dce580ca
NA
5307
5308 return ret;
5309}
5310
11c67b1a 5311int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
dce580ca
NA
5312{
5313 struct btrfs_fs_info *info = trans->fs_info;
5314 struct btrfs_fs_devices *fs_devices = info->fs_devices;
5315 struct btrfs_device_info *devices_info = NULL;
5316 struct alloc_chunk_ctl ctl;
5317 int ret;
5318
11c67b1a
NB
5319 lockdep_assert_held(&info->chunk_mutex);
5320
dce580ca
NA
5321 if (!alloc_profile_is_valid(type, 0)) {
5322 ASSERT(0);
5323 return -EINVAL;
5324 }
5325
5326 if (list_empty(&fs_devices->alloc_list)) {
5327 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5328 btrfs_debug(info, "%s: no writable device", __func__);
5329 return -ENOSPC;
5330 }
5331
5332 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5333 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5334 ASSERT(0);
5335 return -EINVAL;
5336 }
5337
11c67b1a 5338 ctl.start = find_next_chunk(info);
dce580ca
NA
5339 ctl.type = type;
5340 init_alloc_chunk_ctl(fs_devices, &ctl);
5341
5342 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5343 GFP_NOFS);
5344 if (!devices_info)
5345 return -ENOMEM;
5346
5347 ret = gather_device_info(fs_devices, &ctl, devices_info);
5348 if (ret < 0)
5349 goto out;
5350
5351 ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5352 if (ret < 0)
5353 goto out;
5354
5355 ret = create_chunk(trans, &ctl, devices_info);
5356
5357out:
b2117a39
MX
5358 kfree(devices_info);
5359 return ret;
2b82032c
YZ
5360}
5361
11c67b1a
NB
5362/*
5363 * Chunk allocation falls into two parts. The first part does work
5364 * that makes the new allocated chunk usable, but does not do any operation
5365 * that modifies the chunk tree. The second part does the work that
5366 * requires modifying the chunk tree. This division is important for the
5367 * bootstrap process of adding storage to a seed btrfs.
5368 */
6df9a95e 5369int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
97aff912 5370 u64 chunk_offset, u64 chunk_size)
2b82032c 5371{
97aff912 5372 struct btrfs_fs_info *fs_info = trans->fs_info;
6bccf3ab
JM
5373 struct btrfs_root *extent_root = fs_info->extent_root;
5374 struct btrfs_root *chunk_root = fs_info->chunk_root;
2b82032c 5375 struct btrfs_key key;
2b82032c
YZ
5376 struct btrfs_device *device;
5377 struct btrfs_chunk *chunk;
5378 struct btrfs_stripe *stripe;
6df9a95e
JB
5379 struct extent_map *em;
5380 struct map_lookup *map;
5381 size_t item_size;
5382 u64 dev_offset;
5383 u64 stripe_size;
5384 int i = 0;
140e639f 5385 int ret = 0;
2b82032c 5386
60ca842e 5387 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
592d92ee
LB
5388 if (IS_ERR(em))
5389 return PTR_ERR(em);
6df9a95e 5390
95617d69 5391 map = em->map_lookup;
6df9a95e
JB
5392 item_size = btrfs_chunk_item_size(map->num_stripes);
5393 stripe_size = em->orig_block_len;
5394
2b82032c 5395 chunk = kzalloc(item_size, GFP_NOFS);
6df9a95e
JB
5396 if (!chunk) {
5397 ret = -ENOMEM;
5398 goto out;
5399 }
5400
50460e37
FM
5401 /*
5402 * Take the device list mutex to prevent races with the final phase of
5403 * a device replace operation that replaces the device object associated
5404 * with the map's stripes, because the device object's id can change
5405 * at any time during that final phase of the device replace operation
5406 * (dev-replace.c:btrfs_dev_replace_finishing()).
5407 */
0b246afa 5408 mutex_lock(&fs_info->fs_devices->device_list_mutex);
6df9a95e
JB
5409 for (i = 0; i < map->num_stripes; i++) {
5410 device = map->stripes[i].dev;
5411 dev_offset = map->stripes[i].physical;
2b82032c 5412
0b86a832 5413 ret = btrfs_update_device(trans, device);
3acd3953 5414 if (ret)
50460e37 5415 break;
b5d9071c
NB
5416 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5417 dev_offset, stripe_size);
6df9a95e 5418 if (ret)
50460e37
FM
5419 break;
5420 }
5421 if (ret) {
0b246afa 5422 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
50460e37 5423 goto out;
2b82032c
YZ
5424 }
5425
2b82032c 5426 stripe = &chunk->stripe;
6df9a95e
JB
5427 for (i = 0; i < map->num_stripes; i++) {
5428 device = map->stripes[i].dev;
5429 dev_offset = map->stripes[i].physical;
0b86a832 5430
e17cade2
CM
5431 btrfs_set_stack_stripe_devid(stripe, device->devid);
5432 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5433 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2b82032c 5434 stripe++;
0b86a832 5435 }
0b246afa 5436 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
0b86a832 5437
2b82032c 5438 btrfs_set_stack_chunk_length(chunk, chunk_size);
0b86a832 5439 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2b82032c
YZ
5440 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5441 btrfs_set_stack_chunk_type(chunk, map->type);
5442 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5443 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5444 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
0b246afa 5445 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
2b82032c 5446 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
0b86a832 5447
2b82032c
YZ
5448 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5449 key.type = BTRFS_CHUNK_ITEM_KEY;
5450 key.offset = chunk_offset;
0b86a832 5451
2b82032c 5452 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4ed1d16e
MF
5453 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5454 /*
5455 * TODO: Cleanup of inserted chunk root in case of
5456 * failure.
5457 */
2ff7e61e 5458 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
8f18cf13 5459 }
1abe9b8a 5460
6df9a95e 5461out:
0b86a832 5462 kfree(chunk);
6df9a95e 5463 free_extent_map(em);
4ed1d16e 5464 return ret;
2b82032c 5465}
0b86a832 5466
6f8e0fc7 5467static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
2b82032c 5468{
6f8e0fc7 5469 struct btrfs_fs_info *fs_info = trans->fs_info;
2b82032c 5470 u64 alloc_profile;
2b82032c
YZ
5471 int ret;
5472
1b86826d 5473 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
11c67b1a 5474 ret = btrfs_alloc_chunk(trans, alloc_profile);
79787eaa
JM
5475 if (ret)
5476 return ret;
2b82032c 5477
1b86826d 5478 alloc_profile = btrfs_system_alloc_profile(fs_info);
11c67b1a 5479 ret = btrfs_alloc_chunk(trans, alloc_profile);
79787eaa 5480 return ret;
2b82032c
YZ
5481}
5482
d20983b4
MX
5483static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5484{
fc9a2ac7 5485 const int index = btrfs_bg_flags_to_raid_index(map->type);
2b82032c 5486
fc9a2ac7 5487 return btrfs_raid_array[index].tolerated_failures;
2b82032c
YZ
5488}
5489
2ff7e61e 5490int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2b82032c
YZ
5491{
5492 struct extent_map *em;
5493 struct map_lookup *map;
2b82032c 5494 int readonly = 0;
d20983b4 5495 int miss_ndevs = 0;
2b82032c
YZ
5496 int i;
5497
60ca842e 5498 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
592d92ee 5499 if (IS_ERR(em))
2b82032c
YZ
5500 return 1;
5501
95617d69 5502 map = em->map_lookup;
2b82032c 5503 for (i = 0; i < map->num_stripes; i++) {
e6e674bd
AJ
5504 if (test_bit(BTRFS_DEV_STATE_MISSING,
5505 &map->stripes[i].dev->dev_state)) {
d20983b4
MX
5506 miss_ndevs++;
5507 continue;
5508 }
ebbede42
AJ
5509 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5510 &map->stripes[i].dev->dev_state)) {
2b82032c 5511 readonly = 1;
d20983b4 5512 goto end;
2b82032c
YZ
5513 }
5514 }
d20983b4
MX
5515
5516 /*
5517 * If the number of missing devices is larger than max errors,
5518 * we can not write the data into that chunk successfully, so
5519 * set it readonly.
5520 */
5521 if (miss_ndevs > btrfs_chunk_max_errors(map))
5522 readonly = 1;
5523end:
0b86a832 5524 free_extent_map(em);
2b82032c 5525 return readonly;
0b86a832
CM
5526}
5527
c8bf1b67 5528void btrfs_mapping_tree_free(struct extent_map_tree *tree)
0b86a832
CM
5529{
5530 struct extent_map *em;
5531
d397712b 5532 while (1) {
c8bf1b67
DS
5533 write_lock(&tree->lock);
5534 em = lookup_extent_mapping(tree, 0, (u64)-1);
0b86a832 5535 if (em)
c8bf1b67
DS
5536 remove_extent_mapping(tree, em);
5537 write_unlock(&tree->lock);
0b86a832
CM
5538 if (!em)
5539 break;
0b86a832
CM
5540 /* once for us */
5541 free_extent_map(em);
5542 /* once for the tree */
5543 free_extent_map(em);
5544 }
5545}
5546
5d964051 5547int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
f188591e
CM
5548{
5549 struct extent_map *em;
5550 struct map_lookup *map;
f188591e
CM
5551 int ret;
5552
60ca842e 5553 em = btrfs_get_chunk_map(fs_info, logical, len);
592d92ee
LB
5554 if (IS_ERR(em))
5555 /*
5556 * We could return errors for these cases, but that could get
5557 * ugly and we'd probably do the same thing which is just not do
5558 * anything else and exit, so return 1 so the callers don't try
5559 * to use other copies.
5560 */
fb7669b5 5561 return 1;
fb7669b5 5562
95617d69 5563 map = em->map_lookup;
c7369b3f 5564 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
f188591e 5565 ret = map->num_stripes;
321aecc6
CM
5566 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5567 ret = map->sub_stripes;
53b381b3
DW
5568 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5569 ret = 2;
5570 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
8810f751
LB
5571 /*
5572 * There could be two corrupted data stripes, we need
5573 * to loop retry in order to rebuild the correct data.
e7e02096 5574 *
8810f751
LB
5575 * Fail a stripe at a time on every retry except the
5576 * stripe under reconstruction.
5577 */
5578 ret = map->num_stripes;
f188591e
CM
5579 else
5580 ret = 1;
5581 free_extent_map(em);
ad6d620e 5582
cb5583dd 5583 down_read(&fs_info->dev_replace.rwsem);
6fad823f
LB
5584 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5585 fs_info->dev_replace.tgtdev)
ad6d620e 5586 ret++;
cb5583dd 5587 up_read(&fs_info->dev_replace.rwsem);
ad6d620e 5588
f188591e
CM
5589 return ret;
5590}
5591
2ff7e61e 5592unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
53b381b3
DW
5593 u64 logical)
5594{
5595 struct extent_map *em;
5596 struct map_lookup *map;
0b246afa 5597 unsigned long len = fs_info->sectorsize;
53b381b3 5598
60ca842e 5599 em = btrfs_get_chunk_map(fs_info, logical, len);
53b381b3 5600
69f03f13
NB
5601 if (!WARN_ON(IS_ERR(em))) {
5602 map = em->map_lookup;
5603 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5604 len = map->stripe_len * nr_data_stripes(map);
5605 free_extent_map(em);
5606 }
53b381b3
DW
5607 return len;
5608}
5609
e4ff5fb5 5610int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
53b381b3
DW
5611{
5612 struct extent_map *em;
5613 struct map_lookup *map;
53b381b3
DW
5614 int ret = 0;
5615
60ca842e 5616 em = btrfs_get_chunk_map(fs_info, logical, len);
53b381b3 5617
69f03f13
NB
5618 if(!WARN_ON(IS_ERR(em))) {
5619 map = em->map_lookup;
5620 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5621 ret = 1;
5622 free_extent_map(em);
5623 }
53b381b3
DW
5624 return ret;
5625}
5626
30d9861f 5627static int find_live_mirror(struct btrfs_fs_info *fs_info,
99f92a7c 5628 struct map_lookup *map, int first,
8ba0ae78 5629 int dev_replace_is_ongoing)
dfe25020
CM
5630{
5631 int i;
99f92a7c 5632 int num_stripes;
8ba0ae78 5633 int preferred_mirror;
30d9861f
SB
5634 int tolerance;
5635 struct btrfs_device *srcdev;
5636
99f92a7c 5637 ASSERT((map->type &
c7369b3f 5638 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
99f92a7c
AJ
5639
5640 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5641 num_stripes = map->sub_stripes;
5642 else
5643 num_stripes = map->num_stripes;
5644
33fd2f71
AJ
5645 switch (fs_info->fs_devices->read_policy) {
5646 default:
5647 /* Shouldn't happen, just warn and use pid instead of failing */
5648 btrfs_warn_rl(fs_info,
5649 "unknown read_policy type %u, reset to pid",
5650 fs_info->fs_devices->read_policy);
5651 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5652 fallthrough;
5653 case BTRFS_READ_POLICY_PID:
5654 preferred_mirror = first + (current->pid % num_stripes);
5655 break;
5656 }
8ba0ae78 5657
30d9861f
SB
5658 if (dev_replace_is_ongoing &&
5659 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5660 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5661 srcdev = fs_info->dev_replace.srcdev;
5662 else
5663 srcdev = NULL;
5664
5665 /*
5666 * try to avoid the drive that is the source drive for a
5667 * dev-replace procedure, only choose it if no other non-missing
5668 * mirror is available
5669 */
5670 for (tolerance = 0; tolerance < 2; tolerance++) {
8ba0ae78
AJ
5671 if (map->stripes[preferred_mirror].dev->bdev &&
5672 (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5673 return preferred_mirror;
99f92a7c 5674 for (i = first; i < first + num_stripes; i++) {
30d9861f
SB
5675 if (map->stripes[i].dev->bdev &&
5676 (tolerance || map->stripes[i].dev != srcdev))
5677 return i;
5678 }
dfe25020 5679 }
30d9861f 5680
dfe25020
CM
5681 /* we couldn't find one that doesn't fail. Just return something
5682 * and the io error handling code will clean up eventually
5683 */
8ba0ae78 5684 return preferred_mirror;
dfe25020
CM
5685}
5686
53b381b3 5687/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
8e5cfb55 5688static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
53b381b3 5689{
53b381b3 5690 int i;
53b381b3
DW
5691 int again = 1;
5692
5693 while (again) {
5694 again = 0;
cc7539ed 5695 for (i = 0; i < num_stripes - 1; i++) {
eeb6f172
DS
5696 /* Swap if parity is on a smaller index */
5697 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5698 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5699 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
53b381b3
DW
5700 again = 1;
5701 }
5702 }
5703 }
5704}
5705
6e9606d2
ZL
5706static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5707{
5708 struct btrfs_bio *bbio = kzalloc(
e57cf21e 5709 /* the size of the btrfs_bio */
6e9606d2 5710 sizeof(struct btrfs_bio) +
e57cf21e 5711 /* plus the variable array for the stripes */
6e9606d2 5712 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
e57cf21e 5713 /* plus the variable array for the tgt dev */
6e9606d2 5714 sizeof(int) * (real_stripes) +
e57cf21e
CM
5715 /*
5716 * plus the raid_map, which includes both the tgt dev
5717 * and the stripes
5718 */
5719 sizeof(u64) * (total_stripes),
277fb5fc 5720 GFP_NOFS|__GFP_NOFAIL);
6e9606d2
ZL
5721
5722 atomic_set(&bbio->error, 0);
140475ae 5723 refcount_set(&bbio->refs, 1);
6e9606d2 5724
608769a4
NB
5725 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5726 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5727
6e9606d2
ZL
5728 return bbio;
5729}
5730
5731void btrfs_get_bbio(struct btrfs_bio *bbio)
5732{
140475ae
ER
5733 WARN_ON(!refcount_read(&bbio->refs));
5734 refcount_inc(&bbio->refs);
6e9606d2
ZL
5735}
5736
5737void btrfs_put_bbio(struct btrfs_bio *bbio)
5738{
5739 if (!bbio)
5740 return;
140475ae 5741 if (refcount_dec_and_test(&bbio->refs))
6e9606d2
ZL
5742 kfree(bbio);
5743}
5744
0b3d4cd3
LB
5745/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5746/*
5747 * Please note that, discard won't be sent to target device of device
5748 * replace.
5749 */
5750static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
6b7faadd 5751 u64 logical, u64 *length_ret,
0b3d4cd3
LB
5752 struct btrfs_bio **bbio_ret)
5753{
5754 struct extent_map *em;
5755 struct map_lookup *map;
5756 struct btrfs_bio *bbio;
6b7faadd 5757 u64 length = *length_ret;
0b3d4cd3
LB
5758 u64 offset;
5759 u64 stripe_nr;
5760 u64 stripe_nr_end;
5761 u64 stripe_end_offset;
5762 u64 stripe_cnt;
5763 u64 stripe_len;
5764 u64 stripe_offset;
5765 u64 num_stripes;
5766 u32 stripe_index;
5767 u32 factor = 0;
5768 u32 sub_stripes = 0;
5769 u64 stripes_per_dev = 0;
5770 u32 remaining_stripes = 0;
5771 u32 last_stripe = 0;
5772 int ret = 0;
5773 int i;
5774
5775 /* discard always return a bbio */
5776 ASSERT(bbio_ret);
5777
60ca842e 5778 em = btrfs_get_chunk_map(fs_info, logical, length);
0b3d4cd3
LB
5779 if (IS_ERR(em))
5780 return PTR_ERR(em);
5781
5782 map = em->map_lookup;
5783 /* we don't discard raid56 yet */
5784 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5785 ret = -EOPNOTSUPP;
5786 goto out;
5787 }
5788
5789 offset = logical - em->start;
2d974619 5790 length = min_t(u64, em->start + em->len - logical, length);
6b7faadd 5791 *length_ret = length;
0b3d4cd3
LB
5792
5793 stripe_len = map->stripe_len;
5794 /*
5795 * stripe_nr counts the total number of stripes we have to stride
5796 * to get to this block
5797 */
5798 stripe_nr = div64_u64(offset, stripe_len);
5799
5800 /* stripe_offset is the offset of this block in its stripe */
5801 stripe_offset = offset - stripe_nr * stripe_len;
5802
5803 stripe_nr_end = round_up(offset + length, map->stripe_len);
42c61ab6 5804 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
0b3d4cd3
LB
5805 stripe_cnt = stripe_nr_end - stripe_nr;
5806 stripe_end_offset = stripe_nr_end * map->stripe_len -
5807 (offset + length);
5808 /*
5809 * after this, stripe_nr is the number of stripes on this
5810 * device we have to walk to find the data, and stripe_index is
5811 * the number of our device in the stripe array
5812 */
5813 num_stripes = 1;
5814 stripe_index = 0;
5815 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5816 BTRFS_BLOCK_GROUP_RAID10)) {
5817 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5818 sub_stripes = 1;
5819 else
5820 sub_stripes = map->sub_stripes;
5821
5822 factor = map->num_stripes / sub_stripes;
5823 num_stripes = min_t(u64, map->num_stripes,
5824 sub_stripes * stripe_cnt);
5825 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5826 stripe_index *= sub_stripes;
5827 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5828 &remaining_stripes);
5829 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5830 last_stripe *= sub_stripes;
c7369b3f 5831 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
0b3d4cd3
LB
5832 BTRFS_BLOCK_GROUP_DUP)) {
5833 num_stripes = map->num_stripes;
5834 } else {
5835 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5836 &stripe_index);
5837 }
5838
5839 bbio = alloc_btrfs_bio(num_stripes, 0);
5840 if (!bbio) {
5841 ret = -ENOMEM;
5842 goto out;
5843 }
5844
5845 for (i = 0; i < num_stripes; i++) {
5846 bbio->stripes[i].physical =
5847 map->stripes[stripe_index].physical +
5848 stripe_offset + stripe_nr * map->stripe_len;
5849 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5850
5851 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5852 BTRFS_BLOCK_GROUP_RAID10)) {
5853 bbio->stripes[i].length = stripes_per_dev *
5854 map->stripe_len;
5855
5856 if (i / sub_stripes < remaining_stripes)
5857 bbio->stripes[i].length +=
5858 map->stripe_len;
5859
5860 /*
5861 * Special for the first stripe and
5862 * the last stripe:
5863 *
5864 * |-------|...|-------|
5865 * |----------|
5866 * off end_off
5867 */
5868 if (i < sub_stripes)
5869 bbio->stripes[i].length -=
5870 stripe_offset;
5871
5872 if (stripe_index >= last_stripe &&
5873 stripe_index <= (last_stripe +
5874 sub_stripes - 1))
5875 bbio->stripes[i].length -=
5876 stripe_end_offset;
5877
5878 if (i == sub_stripes - 1)
5879 stripe_offset = 0;
5880 } else {
5881 bbio->stripes[i].length = length;
5882 }
5883
5884 stripe_index++;
5885 if (stripe_index == map->num_stripes) {
5886 stripe_index = 0;
5887 stripe_nr++;
5888 }
5889 }
5890
5891 *bbio_ret = bbio;
5892 bbio->map_type = map->type;
5893 bbio->num_stripes = num_stripes;
5894out:
5895 free_extent_map(em);
5896 return ret;
5897}
5898
5ab56090
LB
5899/*
5900 * In dev-replace case, for repair case (that's the only case where the mirror
5901 * is selected explicitly when calling btrfs_map_block), blocks left of the
5902 * left cursor can also be read from the target drive.
5903 *
5904 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5905 * array of stripes.
5906 * For READ, it also needs to be supported using the same mirror number.
5907 *
5908 * If the requested block is not left of the left cursor, EIO is returned. This
5909 * can happen because btrfs_num_copies() returns one more in the dev-replace
5910 * case.
5911 */
5912static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5913 u64 logical, u64 length,
5914 u64 srcdev_devid, int *mirror_num,
5915 u64 *physical)
5916{
5917 struct btrfs_bio *bbio = NULL;
5918 int num_stripes;
5919 int index_srcdev = 0;
5920 int found = 0;
5921 u64 physical_of_found = 0;
5922 int i;
5923 int ret = 0;
5924
5925 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5926 logical, &length, &bbio, 0, 0);
5927 if (ret) {
5928 ASSERT(bbio == NULL);
5929 return ret;
5930 }
5931
5932 num_stripes = bbio->num_stripes;
5933 if (*mirror_num > num_stripes) {
5934 /*
5935 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5936 * that means that the requested area is not left of the left
5937 * cursor
5938 */
5939 btrfs_put_bbio(bbio);
5940 return -EIO;
5941 }
5942
5943 /*
5944 * process the rest of the function using the mirror_num of the source
5945 * drive. Therefore look it up first. At the end, patch the device
5946 * pointer to the one of the target drive.
5947 */
5948 for (i = 0; i < num_stripes; i++) {
5949 if (bbio->stripes[i].dev->devid != srcdev_devid)
5950 continue;
5951
5952 /*
5953 * In case of DUP, in order to keep it simple, only add the
5954 * mirror with the lowest physical address
5955 */
5956 if (found &&
5957 physical_of_found <= bbio->stripes[i].physical)
5958 continue;
5959
5960 index_srcdev = i;
5961 found = 1;
5962 physical_of_found = bbio->stripes[i].physical;
5963 }
5964
5965 btrfs_put_bbio(bbio);
5966
5967 ASSERT(found);
5968 if (!found)
5969 return -EIO;
5970
5971 *mirror_num = index_srcdev + 1;
5972 *physical = physical_of_found;
5973 return ret;
5974}
5975
73c0f228
LB
5976static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5977 struct btrfs_bio **bbio_ret,
5978 struct btrfs_dev_replace *dev_replace,
5979 int *num_stripes_ret, int *max_errors_ret)
5980{
5981 struct btrfs_bio *bbio = *bbio_ret;
5982 u64 srcdev_devid = dev_replace->srcdev->devid;
5983 int tgtdev_indexes = 0;
5984 int num_stripes = *num_stripes_ret;
5985 int max_errors = *max_errors_ret;
5986 int i;
5987
5988 if (op == BTRFS_MAP_WRITE) {
5989 int index_where_to_add;
5990
5991 /*
5992 * duplicate the write operations while the dev replace
5993 * procedure is running. Since the copying of the old disk to
5994 * the new disk takes place at run time while the filesystem is
5995 * mounted writable, the regular write operations to the old
5996 * disk have to be duplicated to go to the new disk as well.
5997 *
5998 * Note that device->missing is handled by the caller, and that
5999 * the write to the old disk is already set up in the stripes
6000 * array.
6001 */
6002 index_where_to_add = num_stripes;
6003 for (i = 0; i < num_stripes; i++) {
6004 if (bbio->stripes[i].dev->devid == srcdev_devid) {
6005 /* write to new disk, too */
6006 struct btrfs_bio_stripe *new =
6007 bbio->stripes + index_where_to_add;
6008 struct btrfs_bio_stripe *old =
6009 bbio->stripes + i;
6010
6011 new->physical = old->physical;
6012 new->length = old->length;
6013 new->dev = dev_replace->tgtdev;
6014 bbio->tgtdev_map[i] = index_where_to_add;
6015 index_where_to_add++;
6016 max_errors++;
6017 tgtdev_indexes++;
6018 }
6019 }
6020 num_stripes = index_where_to_add;
6021 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6022 int index_srcdev = 0;
6023 int found = 0;
6024 u64 physical_of_found = 0;
6025
6026 /*
6027 * During the dev-replace procedure, the target drive can also
6028 * be used to read data in case it is needed to repair a corrupt
6029 * block elsewhere. This is possible if the requested area is
6030 * left of the left cursor. In this area, the target drive is a
6031 * full copy of the source drive.
6032 */
6033 for (i = 0; i < num_stripes; i++) {
6034 if (bbio->stripes[i].dev->devid == srcdev_devid) {
6035 /*
6036 * In case of DUP, in order to keep it simple,
6037 * only add the mirror with the lowest physical
6038 * address
6039 */
6040 if (found &&
6041 physical_of_found <=
6042 bbio->stripes[i].physical)
6043 continue;
6044 index_srcdev = i;
6045 found = 1;
6046 physical_of_found = bbio->stripes[i].physical;
6047 }
6048 }
6049 if (found) {
6050 struct btrfs_bio_stripe *tgtdev_stripe =
6051 bbio->stripes + num_stripes;
6052
6053 tgtdev_stripe->physical = physical_of_found;
6054 tgtdev_stripe->length =
6055 bbio->stripes[index_srcdev].length;
6056 tgtdev_stripe->dev = dev_replace->tgtdev;
6057 bbio->tgtdev_map[index_srcdev] = num_stripes;
6058
6059 tgtdev_indexes++;
6060 num_stripes++;
6061 }
6062 }
6063
6064 *num_stripes_ret = num_stripes;
6065 *max_errors_ret = max_errors;
6066 bbio->num_tgtdevs = tgtdev_indexes;
6067 *bbio_ret = bbio;
6068}
6069
2b19a1fe
LB
6070static bool need_full_stripe(enum btrfs_map_op op)
6071{
6072 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6073}
6074
5f141126 6075/*
42034313
MR
6076 * Calculate the geometry of a particular (address, len) tuple. This
6077 * information is used to calculate how big a particular bio can get before it
6078 * straddles a stripe.
5f141126 6079 *
42034313
MR
6080 * @fs_info: the filesystem
6081 * @em: mapping containing the logical extent
6082 * @op: type of operation - write or read
6083 * @logical: address that we want to figure out the geometry of
6084 * @len: the length of IO we are going to perform, starting at @logical
6085 * @io_geom: pointer used to return values
5f141126
NB
6086 *
6087 * Returns < 0 in case a chunk for the given logical address cannot be found,
6088 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
6089 */
42034313
MR
6090int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6091 enum btrfs_map_op op, u64 logical, u64 len,
6092 struct btrfs_io_geometry *io_geom)
5f141126 6093{
5f141126
NB
6094 struct map_lookup *map;
6095 u64 offset;
6096 u64 stripe_offset;
6097 u64 stripe_nr;
6098 u64 stripe_len;
6099 u64 raid56_full_stripe_start = (u64)-1;
6100 int data_stripes;
6101
6102 ASSERT(op != BTRFS_MAP_DISCARD);
6103
5f141126
NB
6104 map = em->map_lookup;
6105 /* Offset of this logical address in the chunk */
6106 offset = logical - em->start;
6107 /* Len of a stripe in a chunk */
6108 stripe_len = map->stripe_len;
6109 /* Stripe wher this block falls in */
6110 stripe_nr = div64_u64(offset, stripe_len);
6111 /* Offset of stripe in the chunk */
6112 stripe_offset = stripe_nr * stripe_len;
6113 if (offset < stripe_offset) {
6114 btrfs_crit(fs_info,
6115"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6116 stripe_offset, offset, em->start, logical, stripe_len);
42034313 6117 return -EINVAL;
5f141126
NB
6118 }
6119
6120 /* stripe_offset is the offset of this block in its stripe */
6121 stripe_offset = offset - stripe_offset;
6122 data_stripes = nr_data_stripes(map);
6123
6124 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6125 u64 max_len = stripe_len - stripe_offset;
6126
6127 /*
6128 * In case of raid56, we need to know the stripe aligned start
6129 */
6130 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6131 unsigned long full_stripe_len = stripe_len * data_stripes;
6132 raid56_full_stripe_start = offset;
6133
6134 /*
6135 * Allow a write of a full stripe, but make sure we
6136 * don't allow straddling of stripes
6137 */
6138 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6139 full_stripe_len);
6140 raid56_full_stripe_start *= full_stripe_len;
6141
6142 /*
6143 * For writes to RAID[56], allow a full stripeset across
6144 * all disks. For other RAID types and for RAID[56]
6145 * reads, just allow a single stripe (on a single disk).
6146 */
6147 if (op == BTRFS_MAP_WRITE) {
6148 max_len = stripe_len * data_stripes -
6149 (offset - raid56_full_stripe_start);
6150 }
6151 }
6152 len = min_t(u64, em->len - offset, max_len);
6153 } else {
6154 len = em->len - offset;
6155 }
6156
6157 io_geom->len = len;
6158 io_geom->offset = offset;
6159 io_geom->stripe_len = stripe_len;
6160 io_geom->stripe_nr = stripe_nr;
6161 io_geom->stripe_offset = stripe_offset;
6162 io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6163
42034313 6164 return 0;
5f141126
NB
6165}
6166
cf8cddd3
CH
6167static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6168 enum btrfs_map_op op,
f2d8d74d 6169 u64 logical, u64 *length,
a1d3c478 6170 struct btrfs_bio **bbio_ret,
8e5cfb55 6171 int mirror_num, int need_raid_map)
0b86a832
CM
6172{
6173 struct extent_map *em;
6174 struct map_lookup *map;
593060d7
CM
6175 u64 stripe_offset;
6176 u64 stripe_nr;
53b381b3 6177 u64 stripe_len;
9d644a62 6178 u32 stripe_index;
cff82672 6179 int data_stripes;
cea9e445 6180 int i;
de11cc12 6181 int ret = 0;
f2d8d74d 6182 int num_stripes;
a236aed1 6183 int max_errors = 0;
2c8cdd6e 6184 int tgtdev_indexes = 0;
a1d3c478 6185 struct btrfs_bio *bbio = NULL;
472262f3
SB
6186 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6187 int dev_replace_is_ongoing = 0;
6188 int num_alloc_stripes;
ad6d620e
SB
6189 int patch_the_first_stripe_for_dev_replace = 0;
6190 u64 physical_to_patch_in_first_stripe = 0;
53b381b3 6191 u64 raid56_full_stripe_start = (u64)-1;
89b798ad
NB
6192 struct btrfs_io_geometry geom;
6193
6194 ASSERT(bbio_ret);
75fb2e9e 6195 ASSERT(op != BTRFS_MAP_DISCARD);
0b3d4cd3 6196
42034313
MR
6197 em = btrfs_get_chunk_map(fs_info, logical, *length);
6198 ASSERT(!IS_ERR(em));
6199
6200 ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom);
89b798ad
NB
6201 if (ret < 0)
6202 return ret;
0b86a832 6203
95617d69 6204 map = em->map_lookup;
593060d7 6205
89b798ad 6206 *length = geom.len;
89b798ad
NB
6207 stripe_len = geom.stripe_len;
6208 stripe_nr = geom.stripe_nr;
6209 stripe_offset = geom.stripe_offset;
6210 raid56_full_stripe_start = geom.raid56_stripe_offset;
cff82672 6211 data_stripes = nr_data_stripes(map);
593060d7 6212
cb5583dd 6213 down_read(&dev_replace->rwsem);
472262f3 6214 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
53176dde
DS
6215 /*
6216 * Hold the semaphore for read during the whole operation, write is
6217 * requested at commit time but must wait.
6218 */
472262f3 6219 if (!dev_replace_is_ongoing)
cb5583dd 6220 up_read(&dev_replace->rwsem);
472262f3 6221
ad6d620e 6222 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
2b19a1fe 6223 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
5ab56090
LB
6224 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6225 dev_replace->srcdev->devid,
6226 &mirror_num,
6227 &physical_to_patch_in_first_stripe);
6228 if (ret)
ad6d620e 6229 goto out;
5ab56090
LB
6230 else
6231 patch_the_first_stripe_for_dev_replace = 1;
ad6d620e
SB
6232 } else if (mirror_num > map->num_stripes) {
6233 mirror_num = 0;
6234 }
6235
f2d8d74d 6236 num_stripes = 1;
cea9e445 6237 stripe_index = 0;
fce3bb9a 6238 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
47c5713f
DS
6239 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6240 &stripe_index);
de483734 6241 if (!need_full_stripe(op))
28e1cc7d 6242 mirror_num = 1;
c7369b3f 6243 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
de483734 6244 if (need_full_stripe(op))
f2d8d74d 6245 num_stripes = map->num_stripes;
2fff734f 6246 else if (mirror_num)
f188591e 6247 stripe_index = mirror_num - 1;
dfe25020 6248 else {
30d9861f 6249 stripe_index = find_live_mirror(fs_info, map, 0,
30d9861f 6250 dev_replace_is_ongoing);
a1d3c478 6251 mirror_num = stripe_index + 1;
dfe25020 6252 }
2fff734f 6253
611f0e00 6254 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
de483734 6255 if (need_full_stripe(op)) {
f2d8d74d 6256 num_stripes = map->num_stripes;
a1d3c478 6257 } else if (mirror_num) {
f188591e 6258 stripe_index = mirror_num - 1;
a1d3c478
JS
6259 } else {
6260 mirror_num = 1;
6261 }
2fff734f 6262
321aecc6 6263 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
9d644a62 6264 u32 factor = map->num_stripes / map->sub_stripes;
321aecc6 6265
47c5713f 6266 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
321aecc6
CM
6267 stripe_index *= map->sub_stripes;
6268
de483734 6269 if (need_full_stripe(op))
f2d8d74d 6270 num_stripes = map->sub_stripes;
321aecc6
CM
6271 else if (mirror_num)
6272 stripe_index += mirror_num - 1;
dfe25020 6273 else {
3e74317a 6274 int old_stripe_index = stripe_index;
30d9861f
SB
6275 stripe_index = find_live_mirror(fs_info, map,
6276 stripe_index,
30d9861f 6277 dev_replace_is_ongoing);
3e74317a 6278 mirror_num = stripe_index - old_stripe_index + 1;
dfe25020 6279 }
53b381b3 6280
ffe2d203 6281 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
de483734 6282 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
53b381b3 6283 /* push stripe_nr back to the start of the full stripe */
42c61ab6 6284 stripe_nr = div64_u64(raid56_full_stripe_start,
cff82672 6285 stripe_len * data_stripes);
53b381b3
DW
6286
6287 /* RAID[56] write or recovery. Return all stripes */
6288 num_stripes = map->num_stripes;
6289 max_errors = nr_parity_stripes(map);
6290
53b381b3
DW
6291 *length = map->stripe_len;
6292 stripe_index = 0;
6293 stripe_offset = 0;
6294 } else {
6295 /*
6296 * Mirror #0 or #1 means the original data block.
6297 * Mirror #2 is RAID5 parity block.
6298 * Mirror #3 is RAID6 Q block.
6299 */
47c5713f 6300 stripe_nr = div_u64_rem(stripe_nr,
cff82672 6301 data_stripes, &stripe_index);
53b381b3 6302 if (mirror_num > 1)
cff82672 6303 stripe_index = data_stripes + mirror_num - 2;
53b381b3
DW
6304
6305 /* We distribute the parity blocks across stripes */
47c5713f
DS
6306 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6307 &stripe_index);
de483734 6308 if (!need_full_stripe(op) && mirror_num <= 1)
28e1cc7d 6309 mirror_num = 1;
53b381b3 6310 }
8790d502
CM
6311 } else {
6312 /*
47c5713f
DS
6313 * after this, stripe_nr is the number of stripes on this
6314 * device we have to walk to find the data, and stripe_index is
6315 * the number of our device in the stripe array
8790d502 6316 */
47c5713f
DS
6317 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6318 &stripe_index);
a1d3c478 6319 mirror_num = stripe_index + 1;
8790d502 6320 }
e042d1ec 6321 if (stripe_index >= map->num_stripes) {
5d163e0e
JM
6322 btrfs_crit(fs_info,
6323 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
e042d1ec
JB
6324 stripe_index, map->num_stripes);
6325 ret = -EINVAL;
6326 goto out;
6327 }
cea9e445 6328
472262f3 6329 num_alloc_stripes = num_stripes;
6fad823f 6330 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
0b3d4cd3 6331 if (op == BTRFS_MAP_WRITE)
ad6d620e 6332 num_alloc_stripes <<= 1;
cf8cddd3 6333 if (op == BTRFS_MAP_GET_READ_MIRRORS)
ad6d620e 6334 num_alloc_stripes++;
2c8cdd6e 6335 tgtdev_indexes = num_stripes;
ad6d620e 6336 }
2c8cdd6e 6337
6e9606d2 6338 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
de11cc12
LZ
6339 if (!bbio) {
6340 ret = -ENOMEM;
6341 goto out;
6342 }
608769a4
NB
6343
6344 for (i = 0; i < num_stripes; i++) {
6345 bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6346 stripe_offset + stripe_nr * map->stripe_len;
6347 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6348 stripe_index++;
6349 }
de11cc12 6350
8e5cfb55 6351 /* build raid_map */
2b19a1fe
LB
6352 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6353 (need_full_stripe(op) || mirror_num > 1)) {
8e5cfb55 6354 u64 tmp;
9d644a62 6355 unsigned rot;
8e5cfb55 6356
8e5cfb55 6357 /* Work out the disk rotation on this stripe-set */
47c5713f 6358 div_u64_rem(stripe_nr, num_stripes, &rot);
8e5cfb55
ZL
6359
6360 /* Fill in the logical address of each stripe */
cff82672
DS
6361 tmp = stripe_nr * data_stripes;
6362 for (i = 0; i < data_stripes; i++)
8e5cfb55
ZL
6363 bbio->raid_map[(i+rot) % num_stripes] =
6364 em->start + (tmp + i) * map->stripe_len;
6365
6366 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6367 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6368 bbio->raid_map[(i+rot+1) % num_stripes] =
6369 RAID6_Q_STRIPE;
8e5cfb55 6370
608769a4 6371 sort_parity_stripes(bbio, num_stripes);
593060d7 6372 }
de11cc12 6373
2b19a1fe 6374 if (need_full_stripe(op))
d20983b4 6375 max_errors = btrfs_chunk_max_errors(map);
de11cc12 6376
73c0f228 6377 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
2b19a1fe 6378 need_full_stripe(op)) {
73c0f228
LB
6379 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6380 &max_errors);
472262f3
SB
6381 }
6382
de11cc12 6383 *bbio_ret = bbio;
10f11900 6384 bbio->map_type = map->type;
de11cc12
LZ
6385 bbio->num_stripes = num_stripes;
6386 bbio->max_errors = max_errors;
6387 bbio->mirror_num = mirror_num;
ad6d620e
SB
6388
6389 /*
6390 * this is the case that REQ_READ && dev_replace_is_ongoing &&
6391 * mirror_num == num_stripes + 1 && dev_replace target drive is
6392 * available as a mirror
6393 */
6394 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6395 WARN_ON(num_stripes > 1);
6396 bbio->stripes[0].dev = dev_replace->tgtdev;
6397 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6398 bbio->mirror_num = map->num_stripes + 1;
6399 }
cea9e445 6400out:
73beece9 6401 if (dev_replace_is_ongoing) {
53176dde
DS
6402 lockdep_assert_held(&dev_replace->rwsem);
6403 /* Unlock and let waiting writers proceed */
cb5583dd 6404 up_read(&dev_replace->rwsem);
73beece9 6405 }
0b86a832 6406 free_extent_map(em);
de11cc12 6407 return ret;
0b86a832
CM
6408}
6409
cf8cddd3 6410int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
f2d8d74d 6411 u64 logical, u64 *length,
a1d3c478 6412 struct btrfs_bio **bbio_ret, int mirror_num)
f2d8d74d 6413{
75fb2e9e
DS
6414 if (op == BTRFS_MAP_DISCARD)
6415 return __btrfs_map_block_for_discard(fs_info, logical,
6416 length, bbio_ret);
6417
b3d3fa51 6418 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
8e5cfb55 6419 mirror_num, 0);
f2d8d74d
CM
6420}
6421
af8e2d1d 6422/* For Scrub/replace */
cf8cddd3 6423int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
af8e2d1d 6424 u64 logical, u64 *length,
825ad4c9 6425 struct btrfs_bio **bbio_ret)
af8e2d1d 6426{
825ad4c9 6427 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
af8e2d1d
MX
6428}
6429
4246a0b6 6430static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
8408c716 6431{
326e1dbb
MS
6432 bio->bi_private = bbio->private;
6433 bio->bi_end_io = bbio->end_io;
4246a0b6 6434 bio_endio(bio);
326e1dbb 6435
6e9606d2 6436 btrfs_put_bbio(bbio);
8408c716
MX
6437}
6438
4246a0b6 6439static void btrfs_end_bio(struct bio *bio)
8790d502 6440{
9be3395b 6441 struct btrfs_bio *bbio = bio->bi_private;
7d2b4daa 6442 int is_orig_bio = 0;
8790d502 6443
4e4cbee9 6444 if (bio->bi_status) {
a1d3c478 6445 atomic_inc(&bbio->error);
4e4cbee9
CH
6446 if (bio->bi_status == BLK_STS_IOERR ||
6447 bio->bi_status == BLK_STS_TARGET) {
c31efbdf 6448 struct btrfs_device *dev = btrfs_io_bio(bio)->device;
442a4f63 6449
3eee86c8 6450 ASSERT(dev->bdev);
cfe94440 6451 if (btrfs_op(bio) == BTRFS_MAP_WRITE)
3eee86c8 6452 btrfs_dev_stat_inc_and_print(dev,
597a60fa 6453 BTRFS_DEV_STAT_WRITE_ERRS);
3eee86c8
NB
6454 else if (!(bio->bi_opf & REQ_RAHEAD))
6455 btrfs_dev_stat_inc_and_print(dev,
597a60fa 6456 BTRFS_DEV_STAT_READ_ERRS);
3eee86c8
NB
6457 if (bio->bi_opf & REQ_PREFLUSH)
6458 btrfs_dev_stat_inc_and_print(dev,
597a60fa 6459 BTRFS_DEV_STAT_FLUSH_ERRS);
442a4f63
SB
6460 }
6461 }
8790d502 6462
a1d3c478 6463 if (bio == bbio->orig_bio)
7d2b4daa
CM
6464 is_orig_bio = 1;
6465
c404e0dc
MX
6466 btrfs_bio_counter_dec(bbio->fs_info);
6467
a1d3c478 6468 if (atomic_dec_and_test(&bbio->stripes_pending)) {
7d2b4daa
CM
6469 if (!is_orig_bio) {
6470 bio_put(bio);
a1d3c478 6471 bio = bbio->orig_bio;
7d2b4daa 6472 }
c7b22bb1 6473
9be3395b 6474 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
a236aed1 6475 /* only send an error to the higher layers if it is
53b381b3 6476 * beyond the tolerance of the btrfs bio
a236aed1 6477 */
a1d3c478 6478 if (atomic_read(&bbio->error) > bbio->max_errors) {
4e4cbee9 6479 bio->bi_status = BLK_STS_IOERR;
5dbc8fca 6480 } else {
1259ab75
CM
6481 /*
6482 * this bio is actually up to date, we didn't
6483 * go over the max number of errors
6484 */
2dbe0c77 6485 bio->bi_status = BLK_STS_OK;
1259ab75 6486 }
c55f1396 6487
4246a0b6 6488 btrfs_end_bbio(bbio, bio);
7d2b4daa 6489 } else if (!is_orig_bio) {
8790d502
CM
6490 bio_put(bio);
6491 }
8790d502
CM
6492}
6493
2ff7e61e 6494static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
c31efbdf 6495 u64 physical, struct btrfs_device *dev)
de1ee92a 6496{
2ff7e61e 6497 struct btrfs_fs_info *fs_info = bbio->fs_info;
de1ee92a
JB
6498
6499 bio->bi_private = bbio;
c31efbdf 6500 btrfs_io_bio(bio)->device = dev;
de1ee92a 6501 bio->bi_end_io = btrfs_end_bio;
4f024f37 6502 bio->bi_iter.bi_sector = physical >> 9;
d8e3fb10
NA
6503 /*
6504 * For zone append writing, bi_sector must point the beginning of the
6505 * zone
6506 */
6507 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6508 if (btrfs_dev_is_sequential(dev, physical)) {
6509 u64 zone_start = round_down(physical, fs_info->zone_size);
6510
6511 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6512 } else {
6513 bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6514 bio->bi_opf |= REQ_OP_WRITE;
6515 }
6516 }
672d5990
MT
6517 btrfs_debug_in_rcu(fs_info,
6518 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
1201b58b 6519 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
1db45a35
DS
6520 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6521 dev->devid, bio->bi_iter.bi_size);
74d46992 6522 bio_set_dev(bio, dev->bdev);
c404e0dc 6523
2ff7e61e 6524 btrfs_bio_counter_inc_noblocked(fs_info);
c404e0dc 6525
08635bae 6526 btrfsic_submit_bio(bio);
de1ee92a
JB
6527}
6528
de1ee92a
JB
6529static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6530{
6531 atomic_inc(&bbio->error);
6532 if (atomic_dec_and_test(&bbio->stripes_pending)) {
01327610 6533 /* Should be the original bio. */
8408c716
MX
6534 WARN_ON(bio != bbio->orig_bio);
6535
9be3395b 6536 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
4f024f37 6537 bio->bi_iter.bi_sector = logical >> 9;
102ed2c5
AJ
6538 if (atomic_read(&bbio->error) > bbio->max_errors)
6539 bio->bi_status = BLK_STS_IOERR;
6540 else
6541 bio->bi_status = BLK_STS_OK;
4246a0b6 6542 btrfs_end_bbio(bbio, bio);
de1ee92a
JB
6543 }
6544}
6545
58efbc9f 6546blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
08635bae 6547 int mirror_num)
0b86a832 6548{
0b86a832 6549 struct btrfs_device *dev;
8790d502 6550 struct bio *first_bio = bio;
1201b58b 6551 u64 logical = bio->bi_iter.bi_sector << 9;
0b86a832
CM
6552 u64 length = 0;
6553 u64 map_length;
0b86a832 6554 int ret;
08da757d
ZL
6555 int dev_nr;
6556 int total_devs;
a1d3c478 6557 struct btrfs_bio *bbio = NULL;
0b86a832 6558
4f024f37 6559 length = bio->bi_iter.bi_size;
0b86a832 6560 map_length = length;
cea9e445 6561
0b246afa 6562 btrfs_bio_counter_inc_blocked(fs_info);
bd7d63c2 6563 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
37226b21 6564 &map_length, &bbio, mirror_num, 1);
c404e0dc 6565 if (ret) {
0b246afa 6566 btrfs_bio_counter_dec(fs_info);
58efbc9f 6567 return errno_to_blk_status(ret);
c404e0dc 6568 }
cea9e445 6569
a1d3c478 6570 total_devs = bbio->num_stripes;
53b381b3
DW
6571 bbio->orig_bio = first_bio;
6572 bbio->private = first_bio->bi_private;
6573 bbio->end_io = first_bio->bi_end_io;
0b246afa 6574 bbio->fs_info = fs_info;
53b381b3
DW
6575 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6576
ad1ba2a0 6577 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
cfe94440 6578 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
53b381b3
DW
6579 /* In this case, map_length has been set to the length of
6580 a single stripe; not the whole write */
cfe94440 6581 if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
2ff7e61e
JM
6582 ret = raid56_parity_write(fs_info, bio, bbio,
6583 map_length);
53b381b3 6584 } else {
2ff7e61e
JM
6585 ret = raid56_parity_recover(fs_info, bio, bbio,
6586 map_length, mirror_num, 1);
53b381b3 6587 }
4245215d 6588
0b246afa 6589 btrfs_bio_counter_dec(fs_info);
58efbc9f 6590 return errno_to_blk_status(ret);
53b381b3
DW
6591 }
6592
cea9e445 6593 if (map_length < length) {
0b246afa 6594 btrfs_crit(fs_info,
5d163e0e
JM
6595 "mapping failed logical %llu bio len %llu len %llu",
6596 logical, length, map_length);
cea9e445
CM
6597 BUG();
6598 }
a1d3c478 6599
08da757d 6600 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
de1ee92a 6601 dev = bbio->stripes[dev_nr].dev;
fc8a168a
NB
6602 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6603 &dev->dev_state) ||
cfe94440 6604 (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
ebbede42 6605 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
de1ee92a 6606 bbio_error(bbio, first_bio, logical);
de1ee92a
JB
6607 continue;
6608 }
6609
3aa8e074 6610 if (dev_nr < total_devs - 1)
8b6c1d56 6611 bio = btrfs_bio_clone(first_bio);
3aa8e074 6612 else
a1d3c478 6613 bio = first_bio;
de1ee92a 6614
c31efbdf 6615 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
8790d502 6616 }
0b246afa 6617 btrfs_bio_counter_dec(fs_info);
58efbc9f 6618 return BLK_STS_OK;
0b86a832
CM
6619}
6620
09ba3bc9
AJ
6621/*
6622 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6623 * return NULL.
6624 *
6625 * If devid and uuid are both specified, the match must be exact, otherwise
6626 * only devid is used.
6627 *
6628 * If @seed is true, traverse through the seed devices.
6629 */
e4319cd9 6630struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
b2598edf 6631 u64 devid, u8 *uuid, u8 *fsid)
0b86a832 6632{
2b82032c 6633 struct btrfs_device *device;
944d3f9f
NB
6634 struct btrfs_fs_devices *seed_devs;
6635
6636 if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6637 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6638 if (device->devid == devid &&
6639 (!uuid || memcmp(device->uuid, uuid,
6640 BTRFS_UUID_SIZE) == 0))
6641 return device;
6642 }
6643 }
2b82032c 6644
944d3f9f 6645 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
2b82032c 6646 if (!fsid ||
944d3f9f
NB
6647 !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6648 list_for_each_entry(device, &seed_devs->devices,
09ba3bc9
AJ
6649 dev_list) {
6650 if (device->devid == devid &&
6651 (!uuid || memcmp(device->uuid, uuid,
6652 BTRFS_UUID_SIZE) == 0))
6653 return device;
6654 }
2b82032c 6655 }
2b82032c 6656 }
944d3f9f 6657
2b82032c 6658 return NULL;
0b86a832
CM
6659}
6660
2ff7e61e 6661static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
dfe25020
CM
6662 u64 devid, u8 *dev_uuid)
6663{
6664 struct btrfs_device *device;
fccc0007 6665 unsigned int nofs_flag;
dfe25020 6666
fccc0007
JB
6667 /*
6668 * We call this under the chunk_mutex, so we want to use NOFS for this
6669 * allocation, however we don't want to change btrfs_alloc_device() to
6670 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6671 * places.
6672 */
6673 nofs_flag = memalloc_nofs_save();
12bd2fc0 6674 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
fccc0007 6675 memalloc_nofs_restore(nofs_flag);
12bd2fc0 6676 if (IS_ERR(device))
adfb69af 6677 return device;
12bd2fc0
ID
6678
6679 list_add(&device->dev_list, &fs_devices->devices);
e4404d6e 6680 device->fs_devices = fs_devices;
dfe25020 6681 fs_devices->num_devices++;
12bd2fc0 6682
e6e674bd 6683 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
cd02dca5 6684 fs_devices->missing_devices++;
12bd2fc0 6685
dfe25020
CM
6686 return device;
6687}
6688
12bd2fc0
ID
6689/**
6690 * btrfs_alloc_device - allocate struct btrfs_device
6691 * @fs_info: used only for generating a new devid, can be NULL if
6692 * devid is provided (i.e. @devid != NULL).
6693 * @devid: a pointer to devid for this device. If NULL a new devid
6694 * is generated.
6695 * @uuid: a pointer to UUID for this device. If NULL a new UUID
6696 * is generated.
6697 *
6698 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
48dae9cf 6699 * on error. Returned struct is not linked onto any lists and must be
a425f9d4 6700 * destroyed with btrfs_free_device.
12bd2fc0
ID
6701 */
6702struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6703 const u64 *devid,
6704 const u8 *uuid)
6705{
6706 struct btrfs_device *dev;
6707 u64 tmp;
6708
fae7f21c 6709 if (WARN_ON(!devid && !fs_info))
12bd2fc0 6710 return ERR_PTR(-EINVAL);
12bd2fc0 6711
154f7cb8 6712 dev = __alloc_device(fs_info);
12bd2fc0
ID
6713 if (IS_ERR(dev))
6714 return dev;
6715
6716 if (devid)
6717 tmp = *devid;
6718 else {
6719 int ret;
6720
6721 ret = find_next_devid(fs_info, &tmp);
6722 if (ret) {
a425f9d4 6723 btrfs_free_device(dev);
12bd2fc0
ID
6724 return ERR_PTR(ret);
6725 }
6726 }
6727 dev->devid = tmp;
6728
6729 if (uuid)
6730 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6731 else
6732 generate_random_uuid(dev->uuid);
6733
12bd2fc0
ID
6734 return dev;
6735}
6736
5a2b8e60 6737static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
2b902dfc 6738 u64 devid, u8 *uuid, bool error)
5a2b8e60 6739{
2b902dfc
AJ
6740 if (error)
6741 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6742 devid, uuid);
6743 else
6744 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6745 devid, uuid);
5a2b8e60
AJ
6746}
6747
39e264a4
NB
6748static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6749{
6750 int index = btrfs_bg_flags_to_raid_index(type);
6751 int ncopies = btrfs_raid_array[index].ncopies;
e4f6c6be 6752 const int nparity = btrfs_raid_array[index].nparity;
39e264a4
NB
6753 int data_stripes;
6754
e4f6c6be
DS
6755 if (nparity)
6756 data_stripes = num_stripes - nparity;
6757 else
39e264a4 6758 data_stripes = num_stripes / ncopies;
e4f6c6be 6759
39e264a4
NB
6760 return div_u64(chunk_len, data_stripes);
6761}
6762
9690ac09 6763static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
e06cd3dd
LB
6764 struct btrfs_chunk *chunk)
6765{
9690ac09 6766 struct btrfs_fs_info *fs_info = leaf->fs_info;
c8bf1b67 6767 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
e06cd3dd
LB
6768 struct map_lookup *map;
6769 struct extent_map *em;
6770 u64 logical;
6771 u64 length;
e06cd3dd
LB
6772 u64 devid;
6773 u8 uuid[BTRFS_UUID_SIZE];
6774 int num_stripes;
6775 int ret;
6776 int i;
6777
6778 logical = key->offset;
6779 length = btrfs_chunk_length(leaf, chunk);
e06cd3dd
LB
6780 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6781
075cb3c7
QW
6782 /*
6783 * Only need to verify chunk item if we're reading from sys chunk array,
6784 * as chunk item in tree block is already verified by tree-checker.
6785 */
6786 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
ddaf1d5a 6787 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
075cb3c7
QW
6788 if (ret)
6789 return ret;
6790 }
a061fc8d 6791
c8bf1b67
DS
6792 read_lock(&map_tree->lock);
6793 em = lookup_extent_mapping(map_tree, logical, 1);
6794 read_unlock(&map_tree->lock);
0b86a832
CM
6795
6796 /* already mapped? */
6797 if (em && em->start <= logical && em->start + em->len > logical) {
6798 free_extent_map(em);
0b86a832
CM
6799 return 0;
6800 } else if (em) {
6801 free_extent_map(em);
6802 }
0b86a832 6803
172ddd60 6804 em = alloc_extent_map();
0b86a832
CM
6805 if (!em)
6806 return -ENOMEM;
593060d7 6807 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
0b86a832
CM
6808 if (!map) {
6809 free_extent_map(em);
6810 return -ENOMEM;
6811 }
6812
298a8f9c 6813 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
95617d69 6814 em->map_lookup = map;
0b86a832
CM
6815 em->start = logical;
6816 em->len = length;
70c8a91c 6817 em->orig_start = 0;
0b86a832 6818 em->block_start = 0;
c8b97818 6819 em->block_len = em->len;
0b86a832 6820
593060d7
CM
6821 map->num_stripes = num_stripes;
6822 map->io_width = btrfs_chunk_io_width(leaf, chunk);
6823 map->io_align = btrfs_chunk_io_align(leaf, chunk);
593060d7
CM
6824 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6825 map->type = btrfs_chunk_type(leaf, chunk);
321aecc6 6826 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
cf90d884 6827 map->verified_stripes = 0;
39e264a4
NB
6828 em->orig_block_len = calc_stripe_length(map->type, em->len,
6829 map->num_stripes);
593060d7
CM
6830 for (i = 0; i < num_stripes; i++) {
6831 map->stripes[i].physical =
6832 btrfs_stripe_offset_nr(leaf, chunk, i);
6833 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
a443755f
CM
6834 read_extent_buffer(leaf, uuid, (unsigned long)
6835 btrfs_stripe_dev_uuid_nr(chunk, i),
6836 BTRFS_UUID_SIZE);
e4319cd9 6837 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
b2598edf 6838 devid, uuid, NULL);
3cdde224 6839 if (!map->stripes[i].dev &&
0b246afa 6840 !btrfs_test_opt(fs_info, DEGRADED)) {
593060d7 6841 free_extent_map(em);
2b902dfc 6842 btrfs_report_missing_device(fs_info, devid, uuid, true);
45dbdbc9 6843 return -ENOENT;
593060d7 6844 }
dfe25020
CM
6845 if (!map->stripes[i].dev) {
6846 map->stripes[i].dev =
2ff7e61e
JM
6847 add_missing_dev(fs_info->fs_devices, devid,
6848 uuid);
adfb69af 6849 if (IS_ERR(map->stripes[i].dev)) {
dfe25020 6850 free_extent_map(em);
adfb69af
AJ
6851 btrfs_err(fs_info,
6852 "failed to init missing dev %llu: %ld",
6853 devid, PTR_ERR(map->stripes[i].dev));
6854 return PTR_ERR(map->stripes[i].dev);
dfe25020 6855 }
2b902dfc 6856 btrfs_report_missing_device(fs_info, devid, uuid, false);
dfe25020 6857 }
e12c9621
AJ
6858 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6859 &(map->stripes[i].dev->dev_state));
6860
0b86a832
CM
6861 }
6862
c8bf1b67
DS
6863 write_lock(&map_tree->lock);
6864 ret = add_extent_mapping(map_tree, em, 0);
6865 write_unlock(&map_tree->lock);
64f64f43
QW
6866 if (ret < 0) {
6867 btrfs_err(fs_info,
6868 "failed to add chunk map, start=%llu len=%llu: %d",
6869 em->start, em->len, ret);
6870 }
0b86a832
CM
6871 free_extent_map(em);
6872
64f64f43 6873 return ret;
0b86a832
CM
6874}
6875
143bede5 6876static void fill_device_from_item(struct extent_buffer *leaf,
0b86a832
CM
6877 struct btrfs_dev_item *dev_item,
6878 struct btrfs_device *device)
6879{
6880 unsigned long ptr;
0b86a832
CM
6881
6882 device->devid = btrfs_device_id(leaf, dev_item);
d6397bae
CB
6883 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6884 device->total_bytes = device->disk_total_bytes;
935e5cc9 6885 device->commit_total_bytes = device->disk_total_bytes;
0b86a832 6886 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
ce7213c7 6887 device->commit_bytes_used = device->bytes_used;
0b86a832
CM
6888 device->type = btrfs_device_type(leaf, dev_item);
6889 device->io_align = btrfs_device_io_align(leaf, dev_item);
6890 device->io_width = btrfs_device_io_width(leaf, dev_item);
6891 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
8dabb742 6892 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
401e29c1 6893 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
0b86a832 6894
410ba3a2 6895 ptr = btrfs_device_uuid(dev_item);
e17cade2 6896 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
0b86a832
CM
6897}
6898
2ff7e61e 6899static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
5f375835 6900 u8 *fsid)
2b82032c
YZ
6901{
6902 struct btrfs_fs_devices *fs_devices;
6903 int ret;
6904
a32bf9a3 6905 lockdep_assert_held(&uuid_mutex);
2dfeca9b 6906 ASSERT(fsid);
2b82032c 6907
427c8fdd 6908 /* This will match only for multi-device seed fs */
944d3f9f 6909 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
44880fdc 6910 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
5f375835
MX
6911 return fs_devices;
6912
2b82032c 6913
7239ff4b 6914 fs_devices = find_fsid(fsid, NULL);
2b82032c 6915 if (!fs_devices) {
0b246afa 6916 if (!btrfs_test_opt(fs_info, DEGRADED))
5f375835
MX
6917 return ERR_PTR(-ENOENT);
6918
7239ff4b 6919 fs_devices = alloc_fs_devices(fsid, NULL);
5f375835
MX
6920 if (IS_ERR(fs_devices))
6921 return fs_devices;
6922
0395d84f 6923 fs_devices->seeding = true;
5f375835
MX
6924 fs_devices->opened = 1;
6925 return fs_devices;
2b82032c 6926 }
e4404d6e 6927
427c8fdd
NB
6928 /*
6929 * Upon first call for a seed fs fsid, just create a private copy of the
6930 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6931 */
e4404d6e 6932 fs_devices = clone_fs_devices(fs_devices);
5f375835
MX
6933 if (IS_ERR(fs_devices))
6934 return fs_devices;
2b82032c 6935
897fb573 6936 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
48d28232
JL
6937 if (ret) {
6938 free_fs_devices(fs_devices);
c83b60c0 6939 return ERR_PTR(ret);
48d28232 6940 }
2b82032c
YZ
6941
6942 if (!fs_devices->seeding) {
0226e0eb 6943 close_fs_devices(fs_devices);
e4404d6e 6944 free_fs_devices(fs_devices);
c83b60c0 6945 return ERR_PTR(-EINVAL);
2b82032c
YZ
6946 }
6947
944d3f9f 6948 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
c83b60c0 6949
5f375835 6950 return fs_devices;
2b82032c
YZ
6951}
6952
17850759 6953static int read_one_dev(struct extent_buffer *leaf,
0b86a832
CM
6954 struct btrfs_dev_item *dev_item)
6955{
17850759 6956 struct btrfs_fs_info *fs_info = leaf->fs_info;
0b246afa 6957 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
0b86a832
CM
6958 struct btrfs_device *device;
6959 u64 devid;
6960 int ret;
44880fdc 6961 u8 fs_uuid[BTRFS_FSID_SIZE];
a443755f
CM
6962 u8 dev_uuid[BTRFS_UUID_SIZE];
6963
0b86a832 6964 devid = btrfs_device_id(leaf, dev_item);
410ba3a2 6965 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
a443755f 6966 BTRFS_UUID_SIZE);
1473b24e 6967 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
44880fdc 6968 BTRFS_FSID_SIZE);
2b82032c 6969
de37aa51 6970 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
2ff7e61e 6971 fs_devices = open_seed_devices(fs_info, fs_uuid);
5f375835
MX
6972 if (IS_ERR(fs_devices))
6973 return PTR_ERR(fs_devices);
2b82032c
YZ
6974 }
6975
e4319cd9 6976 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
b2598edf 6977 fs_uuid);
5f375835 6978 if (!device) {
c5502451 6979 if (!btrfs_test_opt(fs_info, DEGRADED)) {
2b902dfc
AJ
6980 btrfs_report_missing_device(fs_info, devid,
6981 dev_uuid, true);
45dbdbc9 6982 return -ENOENT;
c5502451 6983 }
2b82032c 6984
2ff7e61e 6985 device = add_missing_dev(fs_devices, devid, dev_uuid);
adfb69af
AJ
6986 if (IS_ERR(device)) {
6987 btrfs_err(fs_info,
6988 "failed to add missing dev %llu: %ld",
6989 devid, PTR_ERR(device));
6990 return PTR_ERR(device);
6991 }
2b902dfc 6992 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
5f375835 6993 } else {
c5502451 6994 if (!device->bdev) {
2b902dfc
AJ
6995 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6996 btrfs_report_missing_device(fs_info,
6997 devid, dev_uuid, true);
45dbdbc9 6998 return -ENOENT;
2b902dfc
AJ
6999 }
7000 btrfs_report_missing_device(fs_info, devid,
7001 dev_uuid, false);
c5502451 7002 }
5f375835 7003
e6e674bd
AJ
7004 if (!device->bdev &&
7005 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
cd02dca5
CM
7006 /*
7007 * this happens when a device that was properly setup
7008 * in the device info lists suddenly goes bad.
7009 * device->bdev is NULL, and so we have to set
7010 * device->missing to one here
7011 */
5f375835 7012 device->fs_devices->missing_devices++;
e6e674bd 7013 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
2b82032c 7014 }
5f375835
MX
7015
7016 /* Move the device to its own fs_devices */
7017 if (device->fs_devices != fs_devices) {
e6e674bd
AJ
7018 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7019 &device->dev_state));
5f375835
MX
7020
7021 list_move(&device->dev_list, &fs_devices->devices);
7022 device->fs_devices->num_devices--;
7023 fs_devices->num_devices++;
7024
7025 device->fs_devices->missing_devices--;
7026 fs_devices->missing_devices++;
7027
7028 device->fs_devices = fs_devices;
7029 }
2b82032c
YZ
7030 }
7031
0b246afa 7032 if (device->fs_devices != fs_info->fs_devices) {
ebbede42 7033 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
2b82032c
YZ
7034 if (device->generation !=
7035 btrfs_device_generation(leaf, dev_item))
7036 return -EINVAL;
6324fbf3 7037 }
0b86a832
CM
7038
7039 fill_device_from_item(leaf, dev_item, device);
3a160a93
AJ
7040 if (device->bdev) {
7041 u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
7042
7043 if (device->total_bytes > max_total_bytes) {
7044 btrfs_err(fs_info,
7045 "device total_bytes should be at most %llu but found %llu",
7046 max_total_bytes, device->total_bytes);
7047 return -EINVAL;
7048 }
7049 }
e12c9621 7050 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
ebbede42 7051 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
401e29c1 7052 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2b82032c 7053 device->fs_devices->total_rw_bytes += device->total_bytes;
a5ed45f8
NB
7054 atomic64_add(device->total_bytes - device->bytes_used,
7055 &fs_info->free_chunk_space);
2bf64758 7056 }
0b86a832 7057 ret = 0;
0b86a832
CM
7058 return ret;
7059}
7060
6bccf3ab 7061int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
0b86a832 7062{
6bccf3ab 7063 struct btrfs_root *root = fs_info->tree_root;
ab8d0fc4 7064 struct btrfs_super_block *super_copy = fs_info->super_copy;
a061fc8d 7065 struct extent_buffer *sb;
0b86a832 7066 struct btrfs_disk_key *disk_key;
0b86a832 7067 struct btrfs_chunk *chunk;
1ffb22cf
DS
7068 u8 *array_ptr;
7069 unsigned long sb_array_offset;
84eed90f 7070 int ret = 0;
0b86a832
CM
7071 u32 num_stripes;
7072 u32 array_size;
7073 u32 len = 0;
1ffb22cf 7074 u32 cur_offset;
e06cd3dd 7075 u64 type;
84eed90f 7076 struct btrfs_key key;
0b86a832 7077
0b246afa 7078 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
a83fffb7
DS
7079 /*
7080 * This will create extent buffer of nodesize, superblock size is
7081 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
7082 * overallocate but we can keep it as-is, only the first page is used.
7083 */
3fbaf258
JB
7084 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
7085 root->root_key.objectid, 0);
c871b0f2
LB
7086 if (IS_ERR(sb))
7087 return PTR_ERR(sb);
4db8c528 7088 set_extent_buffer_uptodate(sb);
8a334426 7089 /*
01327610 7090 * The sb extent buffer is artificial and just used to read the system array.
4db8c528 7091 * set_extent_buffer_uptodate() call does not properly mark all it's
8a334426
DS
7092 * pages up-to-date when the page is larger: extent does not cover the
7093 * whole page and consequently check_page_uptodate does not find all
7094 * the page's extents up-to-date (the hole beyond sb),
7095 * write_extent_buffer then triggers a WARN_ON.
7096 *
7097 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
7098 * but sb spans only this function. Add an explicit SetPageUptodate call
7099 * to silence the warning eg. on PowerPC 64.
7100 */
09cbfeaf 7101 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
727011e0 7102 SetPageUptodate(sb->pages[0]);
4008c04a 7103
a061fc8d 7104 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
0b86a832
CM
7105 array_size = btrfs_super_sys_array_size(super_copy);
7106
1ffb22cf
DS
7107 array_ptr = super_copy->sys_chunk_array;
7108 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7109 cur_offset = 0;
0b86a832 7110
1ffb22cf
DS
7111 while (cur_offset < array_size) {
7112 disk_key = (struct btrfs_disk_key *)array_ptr;
e3540eab
DS
7113 len = sizeof(*disk_key);
7114 if (cur_offset + len > array_size)
7115 goto out_short_read;
7116
0b86a832
CM
7117 btrfs_disk_key_to_cpu(&key, disk_key);
7118
1ffb22cf
DS
7119 array_ptr += len;
7120 sb_array_offset += len;
7121 cur_offset += len;
0b86a832 7122
32ab3d1b
JT
7123 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7124 btrfs_err(fs_info,
7125 "unexpected item type %u in sys_array at offset %u",
7126 (u32)key.type, cur_offset);
7127 ret = -EIO;
7128 break;
7129 }
f5cdedd7 7130
32ab3d1b
JT
7131 chunk = (struct btrfs_chunk *)sb_array_offset;
7132 /*
7133 * At least one btrfs_chunk with one stripe must be present,
7134 * exact stripe count check comes afterwards
7135 */
7136 len = btrfs_chunk_item_size(1);
7137 if (cur_offset + len > array_size)
7138 goto out_short_read;
e06cd3dd 7139
32ab3d1b
JT
7140 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7141 if (!num_stripes) {
7142 btrfs_err(fs_info,
7143 "invalid number of stripes %u in sys_array at offset %u",
7144 num_stripes, cur_offset);
7145 ret = -EIO;
7146 break;
7147 }
e3540eab 7148
32ab3d1b
JT
7149 type = btrfs_chunk_type(sb, chunk);
7150 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
ab8d0fc4 7151 btrfs_err(fs_info,
32ab3d1b
JT
7152 "invalid chunk type %llu in sys_array at offset %u",
7153 type, cur_offset);
84eed90f
CM
7154 ret = -EIO;
7155 break;
0b86a832 7156 }
32ab3d1b
JT
7157
7158 len = btrfs_chunk_item_size(num_stripes);
7159 if (cur_offset + len > array_size)
7160 goto out_short_read;
7161
7162 ret = read_one_chunk(&key, sb, chunk);
7163 if (ret)
7164 break;
7165
1ffb22cf
DS
7166 array_ptr += len;
7167 sb_array_offset += len;
7168 cur_offset += len;
0b86a832 7169 }
d865177a 7170 clear_extent_buffer_uptodate(sb);
1c8b5b6e 7171 free_extent_buffer_stale(sb);
84eed90f 7172 return ret;
e3540eab
DS
7173
7174out_short_read:
ab8d0fc4 7175 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
e3540eab 7176 len, cur_offset);
d865177a 7177 clear_extent_buffer_uptodate(sb);
1c8b5b6e 7178 free_extent_buffer_stale(sb);
e3540eab 7179 return -EIO;
0b86a832
CM
7180}
7181
21634a19
QW
7182/*
7183 * Check if all chunks in the fs are OK for read-write degraded mount
7184 *
6528b99d
AJ
7185 * If the @failing_dev is specified, it's accounted as missing.
7186 *
21634a19
QW
7187 * Return true if all chunks meet the minimal RW mount requirements.
7188 * Return false if any chunk doesn't meet the minimal RW mount requirements.
7189 */
6528b99d
AJ
7190bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7191 struct btrfs_device *failing_dev)
21634a19 7192{
c8bf1b67 7193 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
21634a19
QW
7194 struct extent_map *em;
7195 u64 next_start = 0;
7196 bool ret = true;
7197
c8bf1b67
DS
7198 read_lock(&map_tree->lock);
7199 em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7200 read_unlock(&map_tree->lock);
21634a19
QW
7201 /* No chunk at all? Return false anyway */
7202 if (!em) {
7203 ret = false;
7204 goto out;
7205 }
7206 while (em) {
7207 struct map_lookup *map;
7208 int missing = 0;
7209 int max_tolerated;
7210 int i;
7211
7212 map = em->map_lookup;
7213 max_tolerated =
7214 btrfs_get_num_tolerated_disk_barrier_failures(
7215 map->type);
7216 for (i = 0; i < map->num_stripes; i++) {
7217 struct btrfs_device *dev = map->stripes[i].dev;
7218
e6e674bd
AJ
7219 if (!dev || !dev->bdev ||
7220 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
21634a19
QW
7221 dev->last_flush_error)
7222 missing++;
6528b99d
AJ
7223 else if (failing_dev && failing_dev == dev)
7224 missing++;
21634a19
QW
7225 }
7226 if (missing > max_tolerated) {
6528b99d
AJ
7227 if (!failing_dev)
7228 btrfs_warn(fs_info,
52042d8e 7229 "chunk %llu missing %d devices, max tolerance is %d for writable mount",
21634a19
QW
7230 em->start, missing, max_tolerated);
7231 free_extent_map(em);
7232 ret = false;
7233 goto out;
7234 }
7235 next_start = extent_map_end(em);
7236 free_extent_map(em);
7237
c8bf1b67
DS
7238 read_lock(&map_tree->lock);
7239 em = lookup_extent_mapping(map_tree, next_start,
21634a19 7240 (u64)(-1) - next_start);
c8bf1b67 7241 read_unlock(&map_tree->lock);
21634a19
QW
7242 }
7243out:
7244 return ret;
7245}
7246
d85327b1
DS
/* Issue readahead for every child pointed to by the items of @node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nr_children = btrfs_header_nritems(node);
	int slot;

	for (slot = 0; slot < nr_children; slot++)
		btrfs_readahead_node_child(node, slot);
}
7255
5b4aacef 7256int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
0b86a832 7257{
5b4aacef 7258 struct btrfs_root *root = fs_info->chunk_root;
0b86a832
CM
7259 struct btrfs_path *path;
7260 struct extent_buffer *leaf;
7261 struct btrfs_key key;
7262 struct btrfs_key found_key;
7263 int ret;
7264 int slot;
99e3ecfc 7265 u64 total_dev = 0;
d85327b1 7266 u64 last_ra_node = 0;
0b86a832 7267
0b86a832
CM
7268 path = btrfs_alloc_path();
7269 if (!path)
7270 return -ENOMEM;
7271
3dd0f7a3
AJ
7272 /*
7273 * uuid_mutex is needed only if we are mounting a sprout FS
7274 * otherwise we don't need it.
7275 */
b367e47f 7276 mutex_lock(&uuid_mutex);
b367e47f 7277
48cfa61b
BB
7278 /*
7279 * It is possible for mount and umount to race in such a way that
7280 * we execute this code path, but open_fs_devices failed to clear
7281 * total_rw_bytes. We certainly want it cleared before reading the
7282 * device items, so clear it here.
7283 */
7284 fs_info->fs_devices->total_rw_bytes = 0;
7285
395927a9
FDBM
7286 /*
7287 * Read all device items, and then all the chunk items. All
7288 * device items are found before any chunk item (their object id
7289 * is smaller than the lowest possible object id for a chunk
7290 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
0b86a832
CM
7291 */
7292 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7293 key.offset = 0;
7294 key.type = 0;
0b86a832 7295 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
ab59381e
ZL
7296 if (ret < 0)
7297 goto error;
d397712b 7298 while (1) {
d85327b1
DS
7299 struct extent_buffer *node;
7300
0b86a832
CM
7301 leaf = path->nodes[0];
7302 slot = path->slots[0];
7303 if (slot >= btrfs_header_nritems(leaf)) {
7304 ret = btrfs_next_leaf(root, path);
7305 if (ret == 0)
7306 continue;
7307 if (ret < 0)
7308 goto error;
7309 break;
7310 }
d85327b1
DS
7311 /*
7312 * The nodes on level 1 are not locked but we don't need to do
7313 * that during mount time as nothing else can access the tree
7314 */
7315 node = path->nodes[1];
7316 if (node) {
7317 if (last_ra_node != node->start) {
7318 readahead_tree_node_children(node);
7319 last_ra_node = node->start;
7320 }
7321 }
0b86a832 7322 btrfs_item_key_to_cpu(leaf, &found_key, slot);
395927a9
FDBM
7323 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7324 struct btrfs_dev_item *dev_item;
7325 dev_item = btrfs_item_ptr(leaf, slot,
0b86a832 7326 struct btrfs_dev_item);
17850759 7327 ret = read_one_dev(leaf, dev_item);
395927a9
FDBM
7328 if (ret)
7329 goto error;
99e3ecfc 7330 total_dev++;
0b86a832
CM
7331 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7332 struct btrfs_chunk *chunk;
7333 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
01d01caf 7334 mutex_lock(&fs_info->chunk_mutex);
9690ac09 7335 ret = read_one_chunk(&found_key, leaf, chunk);
01d01caf 7336 mutex_unlock(&fs_info->chunk_mutex);
2b82032c
YZ
7337 if (ret)
7338 goto error;
0b86a832
CM
7339 }
7340 path->slots[0]++;
7341 }
99e3ecfc
LB
7342
7343 /*
7344 * After loading chunk tree, we've got all device information,
7345 * do another round of validation checks.
7346 */
0b246afa
JM
7347 if (total_dev != fs_info->fs_devices->total_devices) {
7348 btrfs_err(fs_info,
99e3ecfc 7349 "super_num_devices %llu mismatch with num_devices %llu found here",
0b246afa 7350 btrfs_super_num_devices(fs_info->super_copy),
99e3ecfc
LB
7351 total_dev);
7352 ret = -EINVAL;
7353 goto error;
7354 }
0b246afa
JM
7355 if (btrfs_super_total_bytes(fs_info->super_copy) <
7356 fs_info->fs_devices->total_rw_bytes) {
7357 btrfs_err(fs_info,
99e3ecfc 7358 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
0b246afa
JM
7359 btrfs_super_total_bytes(fs_info->super_copy),
7360 fs_info->fs_devices->total_rw_bytes);
99e3ecfc
LB
7361 ret = -EINVAL;
7362 goto error;
7363 }
0b86a832
CM
7364 ret = 0;
7365error:
b367e47f
LZ
7366 mutex_unlock(&uuid_mutex);
7367
2b82032c 7368 btrfs_free_path(path);
0b86a832
CM
7369 return ret;
7370}
442a4f63 7371
cb517eab
MX
7372void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7373{
944d3f9f 7374 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
cb517eab
MX
7375 struct btrfs_device *device;
7376
944d3f9f
NB
7377 fs_devices->fs_info = fs_info;
7378
7379 mutex_lock(&fs_devices->device_list_mutex);
7380 list_for_each_entry(device, &fs_devices->devices, dev_list)
7381 device->fs_info = fs_info;
944d3f9f
NB
7382
7383 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
944d3f9f 7384 list_for_each_entry(device, &seed_devs->devices, dev_list)
fb456252 7385 device->fs_info = fs_info;
29cc83f6 7386
944d3f9f 7387 seed_devs->fs_info = fs_info;
29cc83f6 7388 }
e17125b5 7389 mutex_unlock(&fs_devices->device_list_mutex);
cb517eab
MX
7390}
7391
1dc990df
DS
7392static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7393 const struct btrfs_dev_stats_item *ptr,
7394 int index)
7395{
7396 u64 val;
7397
7398 read_extent_buffer(eb, &val,
7399 offsetof(struct btrfs_dev_stats_item, values) +
7400 ((unsigned long)ptr) + (index * sizeof(u64)),
7401 sizeof(val));
7402 return val;
7403}
7404
7405static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7406 struct btrfs_dev_stats_item *ptr,
7407 int index, u64 val)
7408{
7409 write_extent_buffer(eb, &val,
7410 offsetof(struct btrfs_dev_stats_item, values) +
7411 ((unsigned long)ptr) + (index * sizeof(u64)),
7412 sizeof(val));
7413}
7414
92e26df4
JB
7415static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7416 struct btrfs_path *path)
733f4fbb 7417{
124604eb 7418 struct btrfs_dev_stats_item *ptr;
733f4fbb 7419 struct extent_buffer *eb;
124604eb
JB
7420 struct btrfs_key key;
7421 int item_size;
7422 int i, ret, slot;
7423
7424 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7425 key.type = BTRFS_PERSISTENT_ITEM_KEY;
7426 key.offset = device->devid;
7427 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7428 if (ret) {
7429 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7430 btrfs_dev_stat_set(device, i, 0);
7431 device->dev_stats_valid = 1;
7432 btrfs_release_path(path);
92e26df4 7433 return ret < 0 ? ret : 0;
124604eb
JB
7434 }
7435 slot = path->slots[0];
7436 eb = path->nodes[0];
7437 item_size = btrfs_item_size_nr(eb, slot);
7438
7439 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7440
7441 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7442 if (item_size >= (1 + i) * sizeof(__le64))
7443 btrfs_dev_stat_set(device, i,
7444 btrfs_dev_stats_value(eb, ptr, i));
7445 else
7446 btrfs_dev_stat_set(device, i, 0);
7447 }
7448
7449 device->dev_stats_valid = 1;
7450 btrfs_dev_stat_print_on_load(device);
7451 btrfs_release_path(path);
92e26df4
JB
7452
7453 return 0;
124604eb
JB
7454}
7455
7456int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7457{
7458 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
733f4fbb
SB
7459 struct btrfs_device *device;
7460 struct btrfs_path *path = NULL;
92e26df4 7461 int ret = 0;
733f4fbb
SB
7462
7463 path = btrfs_alloc_path();
3b80a984
AJ
7464 if (!path)
7465 return -ENOMEM;
733f4fbb
SB
7466
7467 mutex_lock(&fs_devices->device_list_mutex);
92e26df4
JB
7468 list_for_each_entry(device, &fs_devices->devices, dev_list) {
7469 ret = btrfs_device_init_dev_stats(device, path);
7470 if (ret)
7471 goto out;
7472 }
124604eb 7473 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
92e26df4
JB
7474 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7475 ret = btrfs_device_init_dev_stats(device, path);
7476 if (ret)
7477 goto out;
7478 }
733f4fbb 7479 }
92e26df4 7480out:
733f4fbb
SB
7481 mutex_unlock(&fs_devices->device_list_mutex);
7482
733f4fbb 7483 btrfs_free_path(path);
92e26df4 7484 return ret;
733f4fbb
SB
7485}
7486
7487static int update_dev_stat_item(struct btrfs_trans_handle *trans,
733f4fbb
SB
7488 struct btrfs_device *device)
7489{
5495f195 7490 struct btrfs_fs_info *fs_info = trans->fs_info;
6bccf3ab 7491 struct btrfs_root *dev_root = fs_info->dev_root;
733f4fbb
SB
7492 struct btrfs_path *path;
7493 struct btrfs_key key;
7494 struct extent_buffer *eb;
7495 struct btrfs_dev_stats_item *ptr;
7496 int ret;
7497 int i;
7498
242e2956
DS
7499 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7500 key.type = BTRFS_PERSISTENT_ITEM_KEY;
733f4fbb
SB
7501 key.offset = device->devid;
7502
7503 path = btrfs_alloc_path();
fa252992
DS
7504 if (!path)
7505 return -ENOMEM;
733f4fbb
SB
7506 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7507 if (ret < 0) {
0b246afa 7508 btrfs_warn_in_rcu(fs_info,
ecaeb14b 7509 "error %d while searching for dev_stats item for device %s",
606686ee 7510 ret, rcu_str_deref(device->name));
733f4fbb
SB
7511 goto out;
7512 }
7513
7514 if (ret == 0 &&
7515 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7516 /* need to delete old one and insert a new one */
7517 ret = btrfs_del_item(trans, dev_root, path);
7518 if (ret != 0) {
0b246afa 7519 btrfs_warn_in_rcu(fs_info,
ecaeb14b 7520 "delete too small dev_stats item for device %s failed %d",
606686ee 7521 rcu_str_deref(device->name), ret);
733f4fbb
SB
7522 goto out;
7523 }
7524 ret = 1;
7525 }
7526
7527 if (ret == 1) {
7528 /* need to insert a new item */
7529 btrfs_release_path(path);
7530 ret = btrfs_insert_empty_item(trans, dev_root, path,
7531 &key, sizeof(*ptr));
7532 if (ret < 0) {
0b246afa 7533 btrfs_warn_in_rcu(fs_info,
ecaeb14b
DS
7534 "insert dev_stats item for device %s failed %d",
7535 rcu_str_deref(device->name), ret);
733f4fbb
SB
7536 goto out;
7537 }
7538 }
7539
7540 eb = path->nodes[0];
7541 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7542 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7543 btrfs_set_dev_stats_value(eb, ptr, i,
7544 btrfs_dev_stat_read(device, i));
7545 btrfs_mark_buffer_dirty(eb);
7546
7547out:
7548 btrfs_free_path(path);
7549 return ret;
7550}
7551
7552/*
7553 * called from commit_transaction. Writes all changed device stats to disk.
7554 */
196c9d8d 7555int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
733f4fbb 7556{
196c9d8d 7557 struct btrfs_fs_info *fs_info = trans->fs_info;
733f4fbb
SB
7558 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7559 struct btrfs_device *device;
addc3fa7 7560 int stats_cnt;
733f4fbb
SB
7561 int ret = 0;
7562
7563 mutex_lock(&fs_devices->device_list_mutex);
7564 list_for_each_entry(device, &fs_devices->devices, dev_list) {
9deae968
NB
7565 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7566 if (!device->dev_stats_valid || stats_cnt == 0)
733f4fbb
SB
7567 continue;
7568
9deae968
NB
7569
7570 /*
7571 * There is a LOAD-LOAD control dependency between the value of
7572 * dev_stats_ccnt and updating the on-disk values which requires
7573 * reading the in-memory counters. Such control dependencies
7574 * require explicit read memory barriers.
7575 *
7576 * This memory barriers pairs with smp_mb__before_atomic in
7577 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7578 * barrier implied by atomic_xchg in
7579 * btrfs_dev_stats_read_and_reset
7580 */
7581 smp_rmb();
7582
5495f195 7583 ret = update_dev_stat_item(trans, device);
733f4fbb 7584 if (!ret)
addc3fa7 7585 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
733f4fbb
SB
7586 }
7587 mutex_unlock(&fs_devices->device_list_mutex);
7588
7589 return ret;
7590}
7591
442a4f63
SB
/*
 * Increment the statistics counter @index of @dev, then report the device's
 * counters via btrfs_dev_stat_print_on_error().
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}
7597
48a3b636 7598static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
442a4f63 7599{
733f4fbb
SB
7600 if (!dev->dev_stats_valid)
7601 return;
fb456252 7602 btrfs_err_rl_in_rcu(dev->fs_info,
b14af3b4 7603 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
606686ee 7604 rcu_str_deref(dev->name),
442a4f63
SB
7605 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7606 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7607 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
efe120a0
FH
7608 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7609 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
442a4f63 7610}
c11d2c23 7611
733f4fbb
SB
7612static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7613{
a98cdb85
SB
7614 int i;
7615
7616 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7617 if (btrfs_dev_stat_read(dev, i) != 0)
7618 break;
7619 if (i == BTRFS_DEV_STAT_VALUES_MAX)
7620 return; /* all values == 0, suppress message */
7621
fb456252 7622 btrfs_info_in_rcu(dev->fs_info,
ecaeb14b 7623 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
606686ee 7624 rcu_str_deref(dev->name),
733f4fbb
SB
7625 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7626 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7627 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7628 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7629 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7630}
7631
2ff7e61e 7632int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
b27f7c0c 7633 struct btrfs_ioctl_get_dev_stats *stats)
c11d2c23
SB
7634{
7635 struct btrfs_device *dev;
0b246afa 7636 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
c11d2c23
SB
7637 int i;
7638
7639 mutex_lock(&fs_devices->device_list_mutex);
b2598edf 7640 dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
c11d2c23
SB
7641 mutex_unlock(&fs_devices->device_list_mutex);
7642
7643 if (!dev) {
0b246afa 7644 btrfs_warn(fs_info, "get dev_stats failed, device not found");
c11d2c23 7645 return -ENODEV;
733f4fbb 7646 } else if (!dev->dev_stats_valid) {
0b246afa 7647 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
733f4fbb 7648 return -ENODEV;
b27f7c0c 7649 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
c11d2c23
SB
7650 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7651 if (stats->nr_items > i)
7652 stats->values[i] =
7653 btrfs_dev_stat_read_and_reset(dev, i);
7654 else
4e411a7d 7655 btrfs_dev_stat_set(dev, i, 0);
c11d2c23 7656 }
a69976bc
AJ
7657 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7658 current->comm, task_pid_nr(current));
c11d2c23
SB
7659 } else {
7660 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7661 if (stats->nr_items > i)
7662 stats->values[i] = btrfs_dev_stat_read(dev, i);
7663 }
7664 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7665 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7666 return 0;
7667}
a8a6dab7 7668
935e5cc9 7669/*
bbbf7243
NB
7670 * Update the size and bytes used for each device where it changed. This is
7671 * delayed since we would otherwise get errors while writing out the
7672 * superblocks.
7673 *
7674 * Must be invoked during transaction commit.
935e5cc9 7675 */
bbbf7243 7676void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
935e5cc9 7677{
935e5cc9
MX
7678 struct btrfs_device *curr, *next;
7679
bbbf7243 7680 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
ce7213c7 7681
bbbf7243 7682 if (list_empty(&trans->dev_update_list))
ce7213c7
MX
7683 return;
7684
bbbf7243
NB
7685 /*
7686 * We don't need the device_list_mutex here. This list is owned by the
7687 * transaction and the transaction must complete before the device is
7688 * released.
7689 */
7690 mutex_lock(&trans->fs_info->chunk_mutex);
7691 list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7692 post_commit_list) {
7693 list_del_init(&curr->post_commit_list);
7694 curr->commit_total_bytes = curr->disk_total_bytes;
7695 curr->commit_bytes_used = curr->bytes_used;
ce7213c7 7696 }
bbbf7243 7697 mutex_unlock(&trans->fs_info->chunk_mutex);
ce7213c7 7698}
5a13f430 7699
46df06b8
DS
7700/*
7701 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7702 */
7703int btrfs_bg_type_to_factor(u64 flags)
7704{
44b28ada
DS
7705 const int index = btrfs_bg_flags_to_raid_index(flags);
7706
7707 return btrfs_raid_array[index].ncopies;
46df06b8 7708}
cf90d884
QW
7709
7710
cf90d884
QW
7711
7712static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7713 u64 chunk_offset, u64 devid,
7714 u64 physical_offset, u64 physical_len)
7715{
c8bf1b67 7716 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
cf90d884
QW
7717 struct extent_map *em;
7718 struct map_lookup *map;
05a37c48 7719 struct btrfs_device *dev;
cf90d884
QW
7720 u64 stripe_len;
7721 bool found = false;
7722 int ret = 0;
7723 int i;
7724
7725 read_lock(&em_tree->lock);
7726 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7727 read_unlock(&em_tree->lock);
7728
7729 if (!em) {
7730 btrfs_err(fs_info,
7731"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7732 physical_offset, devid);
7733 ret = -EUCLEAN;
7734 goto out;
7735 }
7736
7737 map = em->map_lookup;
7738 stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7739 if (physical_len != stripe_len) {
7740 btrfs_err(fs_info,
7741"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7742 physical_offset, devid, em->start, physical_len,
7743 stripe_len);
7744 ret = -EUCLEAN;
7745 goto out;
7746 }
7747
7748 for (i = 0; i < map->num_stripes; i++) {
7749 if (map->stripes[i].dev->devid == devid &&
7750 map->stripes[i].physical == physical_offset) {
7751 found = true;
7752 if (map->verified_stripes >= map->num_stripes) {
7753 btrfs_err(fs_info,
7754 "too many dev extents for chunk %llu found",
7755 em->start);
7756 ret = -EUCLEAN;
7757 goto out;
7758 }
7759 map->verified_stripes++;
7760 break;
7761 }
7762 }
7763 if (!found) {
7764 btrfs_err(fs_info,
7765 "dev extent physical offset %llu devid %llu has no corresponding chunk",
7766 physical_offset, devid);
7767 ret = -EUCLEAN;
7768 }
05a37c48
QW
7769
7770 /* Make sure no dev extent is beyond device bondary */
b2598edf 7771 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
05a37c48
QW
7772 if (!dev) {
7773 btrfs_err(fs_info, "failed to find devid %llu", devid);
7774 ret = -EUCLEAN;
7775 goto out;
7776 }
1b3922a8 7777
05a37c48
QW
7778 if (physical_offset + physical_len > dev->disk_total_bytes) {
7779 btrfs_err(fs_info,
7780"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7781 devid, physical_offset, physical_len,
7782 dev->disk_total_bytes);
7783 ret = -EUCLEAN;
7784 goto out;
7785 }
381a696e
NA
7786
7787 if (dev->zone_info) {
7788 u64 zone_size = dev->zone_info->zone_size;
7789
7790 if (!IS_ALIGNED(physical_offset, zone_size) ||
7791 !IS_ALIGNED(physical_len, zone_size)) {
7792 btrfs_err(fs_info,
7793"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
7794 devid, physical_offset, physical_len);
7795 ret = -EUCLEAN;
7796 goto out;
7797 }
7798 }
7799
cf90d884
QW
7800out:
7801 free_extent_map(em);
7802 return ret;
7803}
7804
7805static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7806{
c8bf1b67 7807 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
cf90d884
QW
7808 struct extent_map *em;
7809 struct rb_node *node;
7810 int ret = 0;
7811
7812 read_lock(&em_tree->lock);
07e1ce09 7813 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
cf90d884
QW
7814 em = rb_entry(node, struct extent_map, rb_node);
7815 if (em->map_lookup->num_stripes !=
7816 em->map_lookup->verified_stripes) {
7817 btrfs_err(fs_info,
7818 "chunk %llu has missing dev extent, have %d expect %d",
7819 em->start, em->map_lookup->verified_stripes,
7820 em->map_lookup->num_stripes);
7821 ret = -EUCLEAN;
7822 goto out;
7823 }
7824 }
7825out:
7826 read_unlock(&em_tree->lock);
7827 return ret;
7828}
7829
7830/*
7831 * Ensure that all dev extents are mapped to correct chunk, otherwise
7832 * later chunk allocation/free would cause unexpected behavior.
7833 *
7834 * NOTE: This will iterate through the whole device tree, which should be of
7835 * the same size level as the chunk tree. This slightly increases mount time.
7836 */
7837int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7838{
7839 struct btrfs_path *path;
7840 struct btrfs_root *root = fs_info->dev_root;
7841 struct btrfs_key key;
5eb19381
QW
7842 u64 prev_devid = 0;
7843 u64 prev_dev_ext_end = 0;
cf90d884
QW
7844 int ret = 0;
7845
42437a63
JB
7846 /*
7847 * We don't have a dev_root because we mounted with ignorebadroots and
7848 * failed to load the root, so we want to skip the verification in this
7849 * case for sure.
7850 *
7851 * However if the dev root is fine, but the tree itself is corrupted
7852 * we'd still fail to mount. This verification is only to make sure
7853 * writes can happen safely, so instead just bypass this check
7854 * completely in the case of IGNOREBADROOTS.
7855 */
7856 if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
7857 return 0;
7858
cf90d884
QW
7859 key.objectid = 1;
7860 key.type = BTRFS_DEV_EXTENT_KEY;
7861 key.offset = 0;
7862
7863 path = btrfs_alloc_path();
7864 if (!path)
7865 return -ENOMEM;
7866
7867 path->reada = READA_FORWARD;
7868 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7869 if (ret < 0)
7870 goto out;
7871
7872 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7873 ret = btrfs_next_item(root, path);
7874 if (ret < 0)
7875 goto out;
7876 /* No dev extents at all? Not good */
7877 if (ret > 0) {
7878 ret = -EUCLEAN;
7879 goto out;
7880 }
7881 }
7882 while (1) {
7883 struct extent_buffer *leaf = path->nodes[0];
7884 struct btrfs_dev_extent *dext;
7885 int slot = path->slots[0];
7886 u64 chunk_offset;
7887 u64 physical_offset;
7888 u64 physical_len;
7889 u64 devid;
7890
7891 btrfs_item_key_to_cpu(leaf, &key, slot);
7892 if (key.type != BTRFS_DEV_EXTENT_KEY)
7893 break;
7894 devid = key.objectid;
7895 physical_offset = key.offset;
7896
7897 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7898 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7899 physical_len = btrfs_dev_extent_length(leaf, dext);
7900
5eb19381
QW
7901 /* Check if this dev extent overlaps with the previous one */
7902 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7903 btrfs_err(fs_info,
7904"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7905 devid, physical_offset, prev_dev_ext_end);
7906 ret = -EUCLEAN;
7907 goto out;
7908 }
7909
cf90d884
QW
7910 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7911 physical_offset, physical_len);
7912 if (ret < 0)
7913 goto out;
5eb19381
QW
7914 prev_devid = devid;
7915 prev_dev_ext_end = physical_offset + physical_len;
7916
cf90d884
QW
7917 ret = btrfs_next_item(root, path);
7918 if (ret < 0)
7919 goto out;
7920 if (ret > 0) {
7921 ret = 0;
7922 break;
7923 }
7924 }
7925
7926 /* Ensure all chunks have corresponding dev extents */
7927 ret = verify_chunk_dev_extent_mapping(fs_info);
7928out:
7929 btrfs_free_path(path);
7930 return ret;
7931}
eede2bf3
OS
7932
7933/*
7934 * Check whether the given block group or device is pinned by any inode being
7935 * used as a swapfile.
7936 */
7937bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7938{
7939 struct btrfs_swapfile_pin *sp;
7940 struct rb_node *node;
7941
7942 spin_lock(&fs_info->swapfile_pins_lock);
7943 node = fs_info->swapfile_pins.rb_node;
7944 while (node) {
7945 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7946 if (ptr < sp->ptr)
7947 node = node->rb_left;
7948 else if (ptr > sp->ptr)
7949 node = node->rb_right;
7950 else
7951 break;
7952 }
7953 spin_unlock(&fs_info->swapfile_pins_lock);
7954 return node != NULL;
7955}