// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
#include "accessors.h"

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other, so they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked, thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warnings in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs to be updated as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#if BTRFS_MAX_LEVEL != 8
#error "The lockdep keysets below assume BTRFS_MAX_LEVEL == 8"
#endif

#define DEFINE_LEVEL(stem, level) \
        .names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem) \
        DEFINE_LEVEL(stem, 0) \
        DEFINE_LEVEL(stem, 1) \
        DEFINE_LEVEL(stem, 2) \
        DEFINE_LEVEL(stem, 3) \
        DEFINE_LEVEL(stem, 4) \
        DEFINE_LEVEL(stem, 5) \
        DEFINE_LEVEL(stem, 6) \
        DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
        u64 id;                 /* root objectid */
        /* Longest entry: btrfs-block-group-00 */
        char names[BTRFS_MAX_LEVEL][24];
        struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
        { .id = BTRFS_ROOT_TREE_OBJECTID,        DEFINE_NAME("root")        },
        { .id = BTRFS_EXTENT_TREE_OBJECTID,      DEFINE_NAME("extent")      },
        { .id = BTRFS_CHUNK_TREE_OBJECTID,       DEFINE_NAME("chunk")       },
        { .id = BTRFS_DEV_TREE_OBJECTID,         DEFINE_NAME("dev")         },
        { .id = BTRFS_CSUM_TREE_OBJECTID,        DEFINE_NAME("csum")        },
        { .id = BTRFS_QUOTA_TREE_OBJECTID,       DEFINE_NAME("quota")       },
        { .id = BTRFS_TREE_LOG_OBJECTID,         DEFINE_NAME("log")         },
        { .id = BTRFS_TREE_RELOC_OBJECTID,       DEFINE_NAME("treloc")      },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID,  DEFINE_NAME("dreloc")      },
        { .id = BTRFS_UUID_TREE_OBJECTID,        DEFINE_NAME("uuid")        },
        { .id = BTRFS_FREE_SPACE_TREE_OBJECTID,  DEFINE_NAME("free-space")  },
        { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
        { .id = 0,                               DEFINE_NAME("tree")        },
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

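/*
 * For illustration: DEFINE_NAME("root") above expands to the eight
 * initializers
 *
 *      .names[0] = "btrfs-root-00",
 *      .names[1] = "btrfs-root-01",
 *      ...
 *      .names[7] = "btrfs-root-07",
 *
 * so every level of every special purpose root gets a distinct, readable
 * lockdep class name.
 */
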
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
        struct btrfs_lockdep_keyset *ks;

        BUG_ON(level >= ARRAY_SIZE(ks->keys));

        /* Find the matching keyset, id 0 is the default entry */
        for (ks = btrfs_lockdep_keysets; ks->id; ks++)
                if (ks->id == objectid)
                        break;

        lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}

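/*
 * For illustration: roots that do not appear in the table above, e.g.
 * subvolume trees, fall through to the terminating { .id = 0 } entry and
 * share the default "btrfs-tree-0<level>" classes.
 */
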
void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
        if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
                btrfs_set_buffer_lockdep_class(root->root_key.objectid,
                                               eb, btrfs_header_level(eb));
}

#endif

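/*
 * For illustration: with the keysets above, a level 2 node of the extent
 * tree appears in lockdep reports under the class name "btrfs-extent-02",
 * so a reported dependency chain maps directly back to a tree and a level.
 */
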
/*
 * Extent buffer locking
 * =====================
 *
 * We use a rw_semaphore for tree locking, and the semantics are exactly the
 * same:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - try-lock semantics for readers and writers
 *
 * The rwsem implementation does opportunistic spinning, which reduces the
 * number of times the locking task needs to sleep.
 */

/*
 * __btrfs_tree_read_lock - lock extent buffer for read
 * @eb:   the eb to be locked
 * @nest: the nesting level to be used for lockdep
 *
 * This takes the read lock on the extent buffer, using the specified nesting
 * level for lockdep purposes.
 */
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
        u64 start_ns = 0;

        if (trace_btrfs_tree_read_lock_enabled())
                start_ns = ktime_get_ns();

        down_read_nested(&eb->lock, nest);
        trace_btrfs_tree_read_lock(eb, start_ns);
}

void btrfs_tree_read_lock(struct extent_buffer *eb)
{
        __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Try-lock for read.
 *
 * Return 1 if the rwsem has been taken, 0 otherwise.
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
        if (down_read_trylock(&eb->lock)) {
                trace_btrfs_try_tree_read_lock(eb);
                return 1;
        }
        return 0;
}

/*
 * Try-lock for write.
 *
 * Return 1 if the rwsem has been taken, 0 otherwise.
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
        if (down_write_trylock(&eb->lock)) {
                eb->lock_owner = current->pid;
                trace_btrfs_try_tree_write_lock(eb);
                return 1;
        }
        return 0;
}

/*
 * Release the read lock.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
        trace_btrfs_tree_read_unlock(eb);
        up_read(&eb->lock);
}

/*
 * __btrfs_tree_lock - lock eb for write
 * @eb:   the eb to lock
 * @nest: the nesting to use for the lock
 *
 * Returns with the eb->lock write locked.
 */
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
        __acquires(&eb->lock)
{
        u64 start_ns = 0;

        if (trace_btrfs_tree_lock_enabled())
                start_ns = ktime_get_ns();

        down_write_nested(&eb->lock, nest);
        eb->lock_owner = current->pid;
        trace_btrfs_tree_lock(eb, start_ns);
}

void btrfs_tree_lock(struct extent_buffer *eb)
{
        __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Release the write lock.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
        trace_btrfs_tree_unlock(eb);
        eb->lock_owner = 0;
        up_write(&eb->lock);
}

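/*
 * Usage sketch, for illustration (the helpers below are hypothetical): the
 * common pattern is write lock -> modify -> unlock; readers that must not
 * block can use the try-lock variant and fall back when contended.
 */
#if 0
static void example_modify(struct extent_buffer *eb)
{
        btrfs_tree_lock(eb);            /* exclusive access to the eb */
        /* ... modify the buffer contents ... */
        btrfs_tree_unlock(eb);
}

static bool example_peek(struct extent_buffer *eb)
{
        if (!btrfs_try_tree_read_lock(eb))
                return false;           /* contended, caller retries later */
        /* ... read the buffer contents ... */
        btrfs_tree_read_unlock(eb);
        return true;
}
#endif
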
/*
 * This releases any locks held in the path starting at level and going all
 * the way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node.  This ignores
 * those rules, and it should only be called when there are no more updates
 * to be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
        int i;

        if (path->keep_locks)
                return;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                if (!path->nodes[i])
                        continue;
                if (!path->locks[i])
                        continue;
                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
                path->locks[i] = 0;
        }
}
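
/*
 * Usage sketch, for illustration (hypothetical helper): once only the leaf
 * needs further work, locks on all node levels above it can be dropped
 * early.
 */
#if 0
static void example_keep_leaf_only(struct btrfs_path *path)
{
        /* Drop locks at levels 1 and above, keep the leaf (level 0) locked */
        btrfs_unlock_up_safe(path, 1);
}
#endif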

/*
 * Loop around taking references on and locking the root node of the tree
 * until we end up with a lock on the root node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);

                btrfs_maybe_reset_lockdep_class(root, eb);
                btrfs_tree_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree
 * until we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);

                btrfs_maybe_reset_lockdep_class(root, eb);
                btrfs_tree_read_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree in
 * nowait mode until we end up with a lock on the root node, or return early
 * to avoid blocking.
 *
 * Return: root extent buffer with read lock held, or ERR_PTR(-EAGAIN) if
 * the lock could not be taken without blocking.
 */
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                if (!btrfs_try_tree_read_lock(eb)) {
                        free_extent_buffer(eb);
                        return ERR_PTR(-EAGAIN);
                }
                if (eb == root->node)
                        break;
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

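/*
 * Usage sketch, for illustration (hypothetical helper): nowait callers must
 * treat ERR_PTR(-EAGAIN) as "would block" and fall back to a blocking
 * context.
 */
#if 0
static struct extent_buffer *example_root_nowait(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        eb = btrfs_try_read_lock_root_node(root);
        if (IS_ERR(eb))
                return NULL;    /* contended, retry via btrfs_read_lock_root_node() */
        return eb;              /* read locked, caller unlocks and frees */
}
#endif
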
/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock.  It's used in
 * situations where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives priority to readers.  If a reader and a
 * writer both race to acquire their respective sides of the lock, the writer
 * yields its lock as soon as it detects a concurrent reader.  Additionally,
 * if there are pending readers, no new writers are allowed to come in and
 * acquire the lock.
 */

void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
        atomic_set(&lock->readers, 0);
        atomic_set(&lock->writers, 0);
        init_waitqueue_head(&lock->pending_readers);
        init_waitqueue_head(&lock->pending_writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
        if (atomic_read(&lock->readers))
                return false;

        atomic_inc(&lock->writers);

        /* Ensure writers count is updated before we check for pending readers */
        smp_mb__after_atomic();
        if (atomic_read(&lock->readers)) {
                btrfs_drew_write_unlock(lock);
                return false;
        }

        return true;
}

void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
        while (true) {
                if (btrfs_drew_try_write_lock(lock))
                        return;
                wait_event(lock->pending_writers, !atomic_read(&lock->readers));
        }
}

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
        atomic_dec(&lock->writers);
        cond_wake_up(&lock->pending_readers);
}

void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
        atomic_inc(&lock->readers);

        /*
         * Ensure the pending reader count is perceived BEFORE this reader
         * goes to sleep in case of active writers.  This guarantees new
         * writers won't be allowed and that the current reader will be
         * woken up when the last active writer finishes its job.
         */
        smp_mb__after_atomic();

        wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
        /*
         * atomic_dec_and_test() implies a full barrier, so woken up writers
         * are guaranteed to see the decrement.
         */
        if (atomic_dec_and_test(&lock->readers))
                wake_up(&lock->pending_writers);
}
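
/*
 * Usage sketch, for illustration (hypothetical helpers): two classes of
 * operation, A (the reader side) and B (the writer side), each excluding
 * the other but not themselves.
 */
#if 0
static void example_a_side(struct btrfs_drew_lock *lock)
{
        btrfs_drew_read_lock(lock);     /* waits for B-side holders to drain */
        /* ... A-side work, may run concurrently with other A-side work ... */
        btrfs_drew_read_unlock(lock);   /* last reader wakes pending writers */
}

static void example_b_side(struct btrfs_drew_lock *lock)
{
        btrfs_drew_write_lock(lock);    /* yields and waits while readers exist */
        /* ... B-side work, may run concurrently with other B-side work ... */
        btrfs_drew_write_unlock(lock);  /* wakes pending readers */
}
#endif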