/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_FS_H
#define BTRFS_FS_H

#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/btrfs_tree.h>
#include <linux/sizes.h>
#include "extent-io-tree.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"

#define BTRFS_MAX_EXTENT_SIZE SZ_128M

#define BTRFS_OLDEST_GENERATION 0ULL

#define BTRFS_EMPTY_DIR_SIZE 0

#define BTRFS_DIRTY_METADATA_THRESH SZ_32M

#define BTRFS_SUPER_INFO_OFFSET SZ_64K
#define BTRFS_SUPER_INFO_SIZE 4096
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);

/*
 * Number of metadata items necessary for an unlink operation:
 *
 * 1 for the possible orphan item
 * 1 for the dir item
 * 1 for the dir index
 * 1 for the inode ref
 * 1 for the inode
 * 1 for the parent inode
 */
#define BTRFS_UNLINK_METADATA_UNITS 6
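
/*
 * Illustrative sketch (not part of this header): a caller sizing a
 * transaction for an unlink would reserve these units up front, e.g.:
 *
 *	trans = btrfs_start_transaction(root, BTRFS_UNLINK_METADATA_UNITS);
 *
 * so that enough metadata space is reserved for all six items listed above.
 */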

/*
 * The reserved space at the beginning of each device. It covers the primary
 * super block and leaves space for potential use by other tools like
 * bootloaders or to lower potential damage of accidental overwrite.
 */
#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
/*
 * Runtime (in-memory) states of filesystem
 */
enum {
	/* Global indicator of serious filesystem errors */
	BTRFS_FS_STATE_ERROR,
	/*
	 * Filesystem is being remounted, allow to skip some operations, like
	 * defrag
	 */
	BTRFS_FS_STATE_REMOUNTING,
	/* Filesystem in RO mode */
	BTRFS_FS_STATE_RO,
	/* Track if a transaction abort has been reported on this filesystem */
	BTRFS_FS_STATE_TRANS_ABORTED,
	/*
	 * Bio operations should be blocked on this filesystem because a source
	 * or target device is being destroyed as part of a device replace
	 */
	BTRFS_FS_STATE_DEV_REPLACING,
	/* The btrfs_fs_info created for self-tests */
	BTRFS_FS_STATE_DUMMY_FS_INFO,

	BTRFS_FS_STATE_NO_CSUMS,

	/* Indicates there was an error cleaning up a log tree. */
	BTRFS_FS_STATE_LOG_CLEANUP_ERROR,

	BTRFS_FS_STATE_COUNT
};

enum {
	BTRFS_FS_CLOSING_START,
	BTRFS_FS_CLOSING_DONE,
	BTRFS_FS_LOG_RECOVERING,
	BTRFS_FS_OPEN,
	BTRFS_FS_QUOTA_ENABLED,
	BTRFS_FS_UPDATE_UUID_TREE_GEN,
	BTRFS_FS_CREATING_FREE_SPACE_TREE,
	BTRFS_FS_BTREE_ERR,
	BTRFS_FS_LOG1_ERR,
	BTRFS_FS_LOG2_ERR,
	BTRFS_FS_QUOTA_OVERRIDE,
	/* Used to record internally whether fs has been frozen */
	BTRFS_FS_FROZEN,
	/*
	 * Indicate that balance has been set up from the ioctl and is in the
	 * main phase. The fs_info::balance_ctl is initialized.
	 */
	BTRFS_FS_BALANCE_RUNNING,

	/*
	 * Indicate that relocation of a chunk has started, it's set per chunk
	 * and is toggled between chunks.
	 */
	BTRFS_FS_RELOC_RUNNING,

	/* Indicate that the cleaner thread is awake and doing something. */
	BTRFS_FS_CLEANER_RUNNING,

	/*
	 * The checksumming has an optimized version and is considered fast,
	 * so we don't need to offload checksums to workqueues.
	 */
	BTRFS_FS_CSUM_IMPL_FAST,

	/* Indicate that the discard workqueue can service discards. */
	BTRFS_FS_DISCARD_RUNNING,

	/* Indicate that we need to cleanup space cache v1 */
	BTRFS_FS_CLEANUP_SPACE_CACHE_V1,

	/* Indicate that we can't trust the free space tree for caching yet */
	BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,

	/* Indicate whether there are any tree modification log users */
	BTRFS_FS_TREE_MOD_LOG_USERS,

	/* Indicate that we want the transaction kthread to commit right now. */
	BTRFS_FS_COMMIT_TRANS,

	/* Indicate we have half completed snapshot deletions pending. */
	BTRFS_FS_UNFINISHED_DROPS,

	/* Indicate we have to finish a zone to do next allocation. */
	BTRFS_FS_NEED_ZONE_FINISH,

	/* Indicate that we want to commit the transaction. */
	BTRFS_FS_NEED_TRANS_COMMIT,

	/* This is set when active zone tracking is needed. */
	BTRFS_FS_ACTIVE_ZONE_TRACKING,

	/*
	 * Indicate if we have some features changed, this is mostly for
	 * cleaner thread to update the sysfs interface.
	 */
	BTRFS_FS_FEATURE_CHANGED,

#if BITS_PER_LONG == 32
	/* Indicate if we have error/warn message printed on 32bit systems */
	BTRFS_FS_32BIT_ERROR,
	BTRFS_FS_32BIT_WARN,
#endif
};

/*
 * Flags for mount options.
 *
 * Note: don't forget to add new options to btrfs_show_options()
 */
enum {
	BTRFS_MOUNT_NODATASUM = (1UL << 0),
	BTRFS_MOUNT_NODATACOW = (1UL << 1),
	BTRFS_MOUNT_NOBARRIER = (1UL << 2),
	BTRFS_MOUNT_SSD = (1UL << 3),
	BTRFS_MOUNT_DEGRADED = (1UL << 4),
	BTRFS_MOUNT_COMPRESS = (1UL << 5),
	BTRFS_MOUNT_NOTREELOG = (1UL << 6),
	BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
	BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
	BTRFS_MOUNT_NOSSD = (1UL << 9),
	BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
	BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
	BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
	BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
	BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
	BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
	BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
	BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
	BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
	BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
	BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
	BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
	BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
	BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
	BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
	BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
	BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
	BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
	BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
	BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
	BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
	BTRFS_MOUNT_NODISCARD = (1UL << 31),
};

/*
 * Compat flags that we support. If any incompat flags are set other than the
 * ones specified below then we will fail to mount
 */
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL

#define BTRFS_FEATURE_COMPAT_RO_SUPP \
	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
	 BTRFS_FEATURE_COMPAT_RO_VERITY | \
	 BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)

#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL

#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \
	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
	 BTRFS_FEATURE_INCOMPAT_RAID56 | \
	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
	 BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
	 BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
	 BTRFS_FEATURE_INCOMPAT_ZONED)

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Features under development, like extent tree v2, are enabled only under
 * CONFIG_BTRFS_DEBUG.
 */
#define BTRFS_FEATURE_INCOMPAT_SUPP \
	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
	 BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)

#else

#define BTRFS_FEATURE_INCOMPAT_SUPP \
	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE)

#endif

#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL

#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)

struct btrfs_dev_replace {
	/* See #define above */
	u64 replace_state;
	/* Seconds since 1-Jan-1970 */
	time64_t time_started;
	/* Seconds since 1-Jan-1970 */
	time64_t time_stopped;
	atomic64_t num_write_errors;
	atomic64_t num_uncorrectable_read_errors;

	u64 cursor_left;
	u64 committed_cursor_left;
	u64 cursor_left_last_write_of_item;
	u64 cursor_right;

	/* See #define above */
	u64 cont_reading_from_srcdev_mode;

	int is_valid;
	int item_needs_writeback;
	struct btrfs_device *srcdev;
	struct btrfs_device *tgtdev;

	struct mutex lock_finishing_cancel_unmount;
	struct rw_semaphore rwsem;

	struct btrfs_scrub_progress scrub_progress;

	struct percpu_counter bio_counter;
	wait_queue_head_t replace_wait;
};

/*
 * Free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes. They are used for all metadata
 * allocations. In ssd_spread mode they are also used for data allocations.
 */
struct btrfs_free_cluster {
	spinlock_t lock;
	spinlock_t refill_lock;
	struct rb_root root;

	/* Largest extent in this cluster */
	u64 max_size;

	/* First extent starting offset */
	u64 window_start;

	/* We did a full search and couldn't create a cluster */
	bool fragmented;

	struct btrfs_block_group *block_group;
	/*
	 * When a cluster is allocated from a block group, we put the cluster
	 * onto a list in the block group so that it can be freed before the
	 * block group is freed.
	 */
	struct list_head block_group_list;
};

/* Discard control. */
/*
 * Async discard uses multiple lists to differentiate the discard filter
 * parameters. Index 0 is for completely free block groups where we need to
 * ensure the entire block group is trimmed without being lossy. Indices
 * afterwards represent monotonically decreasing discard filter sizes to
 * prioritize what should be discarded next.
 */
#define BTRFS_NR_DISCARD_LISTS 3
#define BTRFS_DISCARD_INDEX_UNUSED 0
#define BTRFS_DISCARD_INDEX_START 1

struct btrfs_discard_ctl {
	struct workqueue_struct *discard_workers;
	struct delayed_work work;
	spinlock_t lock;
	struct btrfs_block_group *block_group;
	struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
	u64 prev_discard;
	u64 prev_discard_time;
	atomic_t discardable_extents;
	atomic64_t discardable_bytes;
	u64 max_discard_size;
	u64 delay_ms;
	u32 iops_limit;
	u32 kbps_limit;
	u64 discard_extent_bytes;
	u64 discard_bitmap_bytes;
	atomic64_t discard_bytes_saved;
};

/*
 * Exclusive operations (device replace, resize, device add/remove, balance)
 */
enum btrfs_exclusive_operation {
	BTRFS_EXCLOP_NONE,
	BTRFS_EXCLOP_BALANCE_PAUSED,
	BTRFS_EXCLOP_BALANCE,
	BTRFS_EXCLOP_DEV_ADD,
	BTRFS_EXCLOP_DEV_REMOVE,
	BTRFS_EXCLOP_DEV_REPLACE,
	BTRFS_EXCLOP_RESIZE,
	BTRFS_EXCLOP_SWAP_ACTIVATE,
};

/* Store data about transaction commits, exported via sysfs. */
struct btrfs_commit_stats {
	/* Total number of commits */
	u64 commit_count;
	/* The maximum commit duration so far in ns */
	u64 max_commit_dur;
	/* The last commit duration in ns */
	u64 last_commit_dur;
	/* The total commit duration in ns */
	u64 total_commit_dur;
};

struct btrfs_fs_info {
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
	unsigned long flags;
	struct btrfs_root *tree_root;
	struct btrfs_root *chunk_root;
	struct btrfs_root *dev_root;
	struct btrfs_root *fs_root;
	struct btrfs_root *quota_root;
	struct btrfs_root *uuid_root;
	struct btrfs_root *data_reloc_root;
	struct btrfs_root *block_group_root;

	/* The log root tree is a directory of all the other log roots */
	struct btrfs_root *log_root_tree;

	/* The tree that holds the global roots (csum, extent, etc) */
	rwlock_t global_root_lock;
	struct rb_root global_root_tree;

	spinlock_t fs_roots_radix_lock;
	struct radix_tree_root fs_roots_radix;

	/* Block group cache stuff */
	rwlock_t block_group_cache_lock;
	struct rb_root_cached block_group_cache_tree;

	/* Keep track of unallocated space */
	atomic64_t free_chunk_space;

	/* Track ranges which are used by log trees blocks/logged data extents */
	struct extent_io_tree excluded_extents;

	/* logical->physical extent mapping */
	struct extent_map_tree mapping_tree;

	/*
	 * Block reservation for extent, checksum, root tree and delayed dir
	 * index item.
	 */
	struct btrfs_block_rsv global_block_rsv;
	/* Block reservation for metadata operations */
	struct btrfs_block_rsv trans_block_rsv;
	/* Block reservation for chunk tree */
	struct btrfs_block_rsv chunk_block_rsv;
	/* Block reservation for delayed operations */
	struct btrfs_block_rsv delayed_block_rsv;
	/* Block reservation for delayed refs */
	struct btrfs_block_rsv delayed_refs_rsv;

	struct btrfs_block_rsv empty_block_rsv;

	u64 generation;
	u64 last_trans_committed;
	/*
	 * Generation of the last transaction used for block group relocation
	 * since the filesystem was last mounted (or 0 if none happened yet).
	 * Must be written and read while holding btrfs_fs_info::commit_root_sem.
	 */
	u64 last_reloc_trans;

	/*
	 * This is updated to the current trans every time a full commit is
	 * required instead of the faster short fsync log commits
	 */
	u64 last_trans_log_full_commit;
	unsigned long mount_opt;

	unsigned long compress_type:4;
	unsigned int compress_level;
	u32 commit_interval;
	/*
	 * This is a hint, the read side is safe even if it gets a stale value
	 * because we will write out the data into a regular extent. The write
	 * side (mount/remount) is under the ->s_umount lock, so it is also
	 * safe.
	 */
	u64 max_inline;

	struct btrfs_transaction *running_transaction;
	wait_queue_head_t transaction_throttle;
	wait_queue_head_t transaction_wait;
	wait_queue_head_t transaction_blocked_wait;
	wait_queue_head_t async_submit_wait;

	/*
	 * Used to protect the incompat_flags, compat_flags, compat_ro_flags
	 * when they are updated.
	 *
	 * Because we never clear the flags, we don't need to take the lock on
	 * the read side.
	 *
	 * We also don't need the lock when we mount the fs, because there is
	 * no other task that could be updating the flags then.
	 */
	spinlock_t super_lock;
	struct btrfs_super_block *super_copy;
	struct btrfs_super_block *super_for_commit;
	struct super_block *sb;
	struct inode *btree_inode;
	struct mutex tree_log_mutex;
	struct mutex transaction_kthread_mutex;
	struct mutex cleaner_mutex;
	struct mutex chunk_mutex;

	/*
	 * This is taken to make sure we don't set block groups ro after the
	 * free space cache has been allocated on them.
	 */
	struct mutex ro_block_group_mutex;

	/*
	 * This is used during read/modify/write to make sure no two ios are
	 * trying to mod the same stripe at the same time.
	 */
	struct btrfs_stripe_hash_table *stripe_hash_table;

	/*
	 * This protects the ordered operations list only while we are
	 * processing all of the entries on it. This way we make sure the
	 * commit code doesn't find the list temporarily empty because another
	 * function happens to be doing non-waiting preflush before jumping
	 * into the main commit.
	 */
	struct mutex ordered_operations_mutex;

	struct rw_semaphore commit_root_sem;

	struct rw_semaphore cleanup_work_sem;

	struct rw_semaphore subvol_sem;

	spinlock_t trans_lock;
	/*
	 * The reloc mutex goes with the trans lock, it is taken during commit
	 * to protect us from the relocation code.
	 */
	struct mutex reloc_mutex;

	struct list_head trans_list;
	struct list_head dead_roots;
	struct list_head caching_block_groups;

	spinlock_t delayed_iput_lock;
	struct list_head delayed_iputs;
	atomic_t nr_delayed_iputs;
	wait_queue_head_t delayed_iputs_wait;

	atomic64_t tree_mod_seq;

	/* This protects tree_mod_log and tree_mod_seq_list */
	rwlock_t tree_mod_log_lock;
	struct rb_root tree_mod_log;
	struct list_head tree_mod_seq_list;

	atomic_t async_delalloc_pages;

	/* This is used to protect the following list -- ordered_roots. */
	spinlock_t ordered_root_lock;

	/*
	 * All fs/file tree roots in which there are data=ordered extents
	 * pending writeback are added into this list.
	 *
	 * These can span multiple transactions and basically include every
	 * dirty data page that isn't from nodatacow.
	 */
	struct list_head ordered_roots;

	struct mutex delalloc_root_mutex;
	spinlock_t delalloc_root_lock;
	/* All fs/file tree roots that have delalloc inodes. */
	struct list_head delalloc_roots;

	/*
	 * There is a pool of worker threads for checksumming during writes and
	 * a pool for checksumming after reads. This is because readers can
	 * run with FS locks held, and the writers may be waiting for those
	 * locks. We don't want ordering in the pending list to cause
	 * deadlocks, and so the two are serviced separately.
	 *
	 * A third pool does submit_bio to avoid deadlocking with the other two.
	 */
	struct btrfs_workqueue *workers;
	struct btrfs_workqueue *hipri_workers;
	struct btrfs_workqueue *delalloc_workers;
	struct btrfs_workqueue *flush_workers;
	struct workqueue_struct *endio_workers;
	struct workqueue_struct *endio_meta_workers;
	struct workqueue_struct *rmw_workers;
	struct workqueue_struct *compressed_write_workers;
	struct btrfs_workqueue *endio_write_workers;
	struct btrfs_workqueue *endio_freespace_worker;
	struct btrfs_workqueue *caching_workers;

	/*
	 * Fixup workers take dirty pages that didn't properly go through the
	 * cow mechanism and make them safe to write. It happens for the
	 * sys_munmap function call path.
	 */
	struct btrfs_workqueue *fixup_workers;
	struct btrfs_workqueue *delayed_workers;

	struct task_struct *transaction_kthread;
	struct task_struct *cleaner_kthread;
	u32 thread_pool_size;

	struct kobject *space_info_kobj;
	struct kobject *qgroups_kobj;
	struct kobject *discard_kobj;

	/* Used to keep from writing metadata until there is a nice batch */
	struct percpu_counter dirty_metadata_bytes;
	struct percpu_counter delalloc_bytes;
	struct percpu_counter ordered_bytes;
	s32 dirty_metadata_batch;
	s32 delalloc_batch;

	struct list_head dirty_cowonly_roots;

	struct btrfs_fs_devices *fs_devices;

	/*
	 * The space_info list is effectively read only after initial setup.
	 * It is populated at mount time and cleaned up after all block groups
	 * are removed. RCU is used to protect it.
	 */
	struct list_head space_info;

	struct btrfs_space_info *data_sinfo;

	struct reloc_control *reloc_ctl;

	/* data_alloc_cluster is only used in ssd_spread mode */
	struct btrfs_free_cluster data_alloc_cluster;

	/* All metadata allocations go through this cluster. */
	struct btrfs_free_cluster meta_alloc_cluster;

	/* Auto defrag inodes go here. */
	spinlock_t defrag_inodes_lock;
	struct rb_root defrag_inodes;
	atomic_t defrag_running;

	/* Used to protect avail_{data, metadata, system}_alloc_bits */
	seqlock_t profiles_lock;
	/*
	 * These three are in extended format (availability of single chunks is
	 * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted
	 * by corresponding BTRFS_BLOCK_GROUP_* bits)
	 */
	u64 avail_data_alloc_bits;
	u64 avail_metadata_alloc_bits;
	u64 avail_system_alloc_bits;

	/* Balance state */
	spinlock_t balance_lock;
	struct mutex balance_mutex;
	atomic_t balance_pause_req;
	atomic_t balance_cancel_req;
	struct btrfs_balance_control *balance_ctl;
	wait_queue_head_t balance_wait_q;

	/* Cancellation requests for chunk relocation */
	atomic_t reloc_cancel_req;

	u32 data_chunk_allocations;
	u32 metadata_ratio;

	void *bdev_holder;

	/* Private scrub information */
	struct mutex scrub_lock;
	atomic_t scrubs_running;
	atomic_t scrub_pause_req;
	atomic_t scrubs_paused;
	atomic_t scrub_cancel_req;
	wait_queue_head_t scrub_pause_wait;
	/*
	 * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
	 * running.
	 */
	refcount_t scrub_workers_refcnt;
	struct workqueue_struct *scrub_workers;
	struct workqueue_struct *scrub_wr_completion_workers;
	struct btrfs_subpage_info *subpage_info;

	struct btrfs_discard_ctl discard_ctl;

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	u32 check_integrity_print_mask;
#endif
	/* Is qgroup tracking in a consistent state? */
	u64 qgroup_flags;

	/* Holds configuration and tracking. Protected by qgroup_lock. */
	struct rb_root qgroup_tree;
	spinlock_t qgroup_lock;

	/*
	 * Used to avoid frequently calling ulist_alloc()/ulist_free()
	 * when doing qgroup accounting, it must be protected by qgroup_lock.
	 */
	struct ulist *qgroup_ulist;

	/*
	 * Protects user changes for quota operations. If a transaction is
	 * needed, it must be started before locking this lock.
	 */
	struct mutex qgroup_ioctl_lock;

	/* List of dirty qgroups to be written at next commit. */
	struct list_head dirty_qgroups;

	/* Used by qgroup for an efficient tree traversal. */
	u64 qgroup_seq;

	/* Qgroup rescan items. */
	/* Protects the progress item */
	struct mutex qgroup_rescan_lock;
	struct btrfs_key qgroup_rescan_progress;
	struct btrfs_workqueue *qgroup_rescan_workers;
	struct completion qgroup_rescan_completion;
	struct btrfs_work qgroup_rescan_work;
	/* Protected by qgroup_rescan_lock */
	bool qgroup_rescan_running;
	u8 qgroup_drop_subtree_thres;

	/* Filesystem state */
	unsigned long fs_state;

	struct btrfs_delayed_root *delayed_root;

	/* Extent buffer radix tree */
	spinlock_t buffer_lock;
	/* Entries are eb->start / sectorsize */
	struct radix_tree_root buffer_radix;

	/* Next backup root to be overwritten */
	int backup_root_index;

	/* Device replace state */
	struct btrfs_dev_replace dev_replace;

	struct semaphore uuid_tree_rescan_sem;

	/* Used to reclaim the metadata space in the background. */
	struct work_struct async_reclaim_work;
	struct work_struct async_data_reclaim_work;
	struct work_struct preempt_reclaim_work;

	/* Reclaim partially filled block groups in the background */
	struct work_struct reclaim_bgs_work;
	struct list_head reclaim_bgs;
	int bg_reclaim_threshold;

	spinlock_t unused_bgs_lock;
	struct list_head unused_bgs;
	struct mutex unused_bg_unpin_mutex;
	/* Protect block groups that are going to be deleted */
	struct mutex reclaim_bgs_lock;

	/* Cached block sizes */
	u32 nodesize;
	u32 sectorsize;
	/* ilog2 of sectorsize, use to avoid 64bit division */
	u32 sectorsize_bits;
	u32 csum_size;
	u32 csums_per_leaf;
	u32 stripesize;

	/*
	 * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
	 * filesystem, on zoned it depends on the device constraints.
	 */
	u64 max_extent_size;

	/* Block groups and devices containing active swapfiles. */
	spinlock_t swapfile_pins_lock;
	struct rb_root swapfile_pins;

	struct crypto_shash *csum_shash;

	/* Type of exclusive operation running, protected by super_lock */
	enum btrfs_exclusive_operation exclusive_operation;

	/*
	 * Zone size > 0 when in ZONED mode, otherwise it's used for a check
	 * if the mode is enabled
	 */
	u64 zone_size;

	/* Constraints for ZONE_APPEND commands: */
	struct queue_limits limits;
	u64 max_zone_append_size;

	struct mutex zoned_meta_io_lock;
	spinlock_t treelog_bg_lock;
	u64 treelog_bg;

	/*
	 * Start of the dedicated data relocation block group, protected by
	 * relocation_bg_lock.
	 */
	spinlock_t relocation_bg_lock;
	u64 data_reloc_bg;
	struct mutex zoned_data_reloc_io_lock;

	u64 nr_global_roots;

	spinlock_t zone_active_bgs_lock;
	struct list_head zone_active_bgs;

	/* Updates are not protected by any lock */
	struct btrfs_commit_stats commit_stats;

	/*
	 * Last generation where we dropped a non-relocation root.
	 * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
	 * to change it and to read it, respectively.
	 */
	u64 last_root_drop_gen;

	/*
	 * Annotations for transaction events (structures are empty when
	 * compiled without lockdep).
	 */
	struct lockdep_map btrfs_trans_num_writers_map;
	struct lockdep_map btrfs_trans_num_extwriters_map;
	struct lockdep_map btrfs_state_change_map[4];
	struct lockdep_map btrfs_trans_pending_ordered_map;
	struct lockdep_map btrfs_ordered_extent_map;

#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	spinlock_t ref_verify_lock;
	struct rb_root block_tree;
#endif

#ifdef CONFIG_BTRFS_DEBUG
	struct kobject *debug_kobj;
	struct list_head allocated_roots;

	spinlock_t eb_leak_lock;
	struct list_head allocated_ebs;
#endif
};

static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
						u64 gen)
{
	WRITE_ONCE(fs_info->last_root_drop_gen, gen);
}

static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
{
	return READ_ONCE(fs_info->last_root_drop_gen);
}

/*
 * Take the number of bytes to be checksummed and figure out how many leaves
 * it would require to store the csums for that many bytes.
 */
static inline u64 btrfs_csum_bytes_to_leaves(
			const struct btrfs_fs_info *fs_info, u64 csum_bytes)
{
	const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;

	return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
}
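
/*
 * Worked example (illustrative numbers): with 4KiB sectors, checksumming
 * 1MiB of data produces 256 csum entries; with, say, csums_per_leaf == 1024
 * that rounds up to a single leaf.
 */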

/*
 * Use this if we would be adding new items, as we could split nodes as we cow
 * down the tree.
 */
static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info,
						  unsigned num_items)
{
	return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}
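
/*
 * For example, assuming a 16KiB nodesize (BTRFS_MAX_LEVEL is 8), each item
 * reserves 16KiB * 8 * 2 = 256KiB: enough to COW one full path down the
 * tree plus room for the node splits an insertion may cause.
 */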

/*
 * Doing a truncate or a modification won't result in new nodes or leaves, just
 * what we need for COW.
 */
static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
					   unsigned num_items)
{
	return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}

#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
					sizeof(struct btrfs_item))

static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
	return fs_info->zone_size > 0;
}

/*
 * Count how many fs_info->max_extent_size cover the @size
 */
static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (!fs_info)
		return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
#endif

	return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
}
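
/*
 * For example, on a regular (non-zoned) filesystem max_extent_size is
 * BTRFS_MAX_EXTENT_SIZE (128MiB), so a 300MiB range counts as 3 extents.
 */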

bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
			enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
				 enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
			  enum btrfs_exclusive_operation op);

/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
			     const char *name);
void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
			       const char *name);
void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
			      const char *name);
void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
				const char *name);

#define __btrfs_fs_incompat(fs_info, flags) \
	(!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags)))

#define __btrfs_fs_compat_ro(fs_info, flags) \
	(!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags)))

#define btrfs_set_fs_incompat(__fs_info, opt) \
	__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)

#define btrfs_clear_fs_incompat(__fs_info, opt) \
	__btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)

#define btrfs_fs_incompat(fs_info, opt) \
	__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)

#define btrfs_set_fs_compat_ro(__fs_info, opt) \
	__btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)

#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
	__btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)

#define btrfs_fs_compat_ro(fs_info, opt) \
	__btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
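
/*
 * Usage sketch: these helpers take the short feature name, e.g.
 *
 *	if (!btrfs_fs_incompat(fs_info, RAID56))
 *		btrfs_set_fs_incompat(fs_info, RAID56);
 *
 * which tests and sets BTRFS_FEATURE_INCOMPAT_RAID56 and logs the change.
 */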

#define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(fs_info, opt)	((fs_info)->mount_opt & \
					 BTRFS_MOUNT_##opt)

#define btrfs_set_and_info(fs_info, opt, fmt, args...)			\
do {									\
	if (!btrfs_test_opt(fs_info, opt))				\
		btrfs_info(fs_info, fmt, ##args);			\
	btrfs_set_opt(fs_info->mount_opt, opt);				\
} while (0)

#define btrfs_clear_and_info(fs_info, opt, fmt, args...)		\
do {									\
	if (btrfs_test_opt(fs_info, opt))				\
		btrfs_info(fs_info, fmt, ##args);			\
	btrfs_clear_opt(fs_info->mount_opt, opt);			\
} while (0)
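
/*
 * An illustrative caller, e.g. from mount option parsing:
 *
 *	btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
 *
 * which logs the message only when the option was not already set.
 */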

static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
{
	/* Do it this way so we only ever do one test_bit in the normal case. */
	if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
		if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
			return 2;
		return 1;
	}
	return 0;
}
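
/*
 * Sketch of typical use: background work bails out while the fs is closing.
 *
 *	if (btrfs_fs_closing(fs_info))
 *		return;
 *
 * A return of 1 means closing has started, 2 means closing is done.
 */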

/*
 * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
 * anything except sleep. This function is used to check the status of
 * the fs.
 * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
 * since setting and checking for SB_RDONLY in the superblock's flags is not
 * atomic.
 */
static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
{
	return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
	       btrfs_fs_closing(fs_info);
}

static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
	clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
}

#define BTRFS_FS_ERROR(fs_info)	(unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
						   &(fs_info)->fs_state)))
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info)				\
	(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,		\
			   &(fs_info)->fs_state)))

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS

#define EXPORT_FOR_TESTS

static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
	return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}

void btrfs_test_destroy_inode(struct inode *inode);

#else

#define EXPORT_FOR_TESTS static

static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
	return 0;
}
#endif

#endif