// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
#include "accessors.h"

/*
 * Lockdep class keys for extent_buffer->lock's in this root. For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets. As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid. This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked, thus preventing deadlock. As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warnings in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked. It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
 * needs to be updated as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#if BTRFS_MAX_LEVEL != 8
#error
#endif

#define DEFINE_LEVEL(stem, level) \
        .names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem) \
        DEFINE_LEVEL(stem, 0) \
        DEFINE_LEVEL(stem, 1) \
        DEFINE_LEVEL(stem, 2) \
        DEFINE_LEVEL(stem, 3) \
        DEFINE_LEVEL(stem, 4) \
        DEFINE_LEVEL(stem, 5) \
        DEFINE_LEVEL(stem, 6) \
        DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
        u64 id;                         /* root objectid */
        /* Longest entry: btrfs-free-space-00 */
        char names[BTRFS_MAX_LEVEL][20];
        struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
        { .id = BTRFS_ROOT_TREE_OBJECTID,       DEFINE_NAME("root") },
        { .id = BTRFS_EXTENT_TREE_OBJECTID,     DEFINE_NAME("extent") },
        { .id = BTRFS_CHUNK_TREE_OBJECTID,      DEFINE_NAME("chunk") },
        { .id = BTRFS_DEV_TREE_OBJECTID,        DEFINE_NAME("dev") },
        { .id = BTRFS_CSUM_TREE_OBJECTID,       DEFINE_NAME("csum") },
        { .id = BTRFS_QUOTA_TREE_OBJECTID,      DEFINE_NAME("quota") },
        { .id = BTRFS_TREE_LOG_OBJECTID,        DEFINE_NAME("log") },
        { .id = BTRFS_TREE_RELOC_OBJECTID,      DEFINE_NAME("treloc") },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
        { .id = BTRFS_UUID_TREE_OBJECTID,       DEFINE_NAME("uuid") },
        { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
        { .id = 0,                              DEFINE_NAME("tree") },
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
        struct btrfs_lockdep_keyset *ks;

        BUG_ON(level >= ARRAY_SIZE(ks->keys));

        /* Find the matching keyset, id 0 is the default entry */
        for (ks = btrfs_lockdep_keysets; ks->id; ks++)
                if (ks->id == objectid)
                        break;

        lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}

void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
        if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
                btrfs_set_buffer_lockdep_class(root->root_key.objectid,
                                               eb, btrfs_header_level(eb));
}

#endif

/*
 * Extent buffer locking
 * =====================
 *
 * We use a rw_semaphore for tree locking, and the semantics are exactly the
 * same:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - try-lock semantics for readers and writers
 *
 * The rwsem implementation does opportunistic spinning, which reduces the
 * number of times the locking task needs to sleep.
 */

/*
 * __btrfs_tree_read_lock - lock extent buffer for read
 * @eb: the eb to be locked
 * @nest: the nesting level to be used for lockdep
 *
 * This takes the read lock on the extent buffer, using the specified nesting
 * level for lockdep purposes.
 */
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
        u64 start_ns = 0;

        if (trace_btrfs_tree_read_lock_enabled())
                start_ns = ktime_get_ns();

        down_read_nested(&eb->lock, nest);
        trace_btrfs_tree_read_lock(eb, start_ns);
}

void btrfs_tree_read_lock(struct extent_buffer *eb)
{
        __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Try-lock for read.
 *
 * Return 1 if the lock has been taken, 0 otherwise.
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
        if (down_read_trylock(&eb->lock)) {
                trace_btrfs_try_tree_read_lock(eb);
                return 1;
        }
        return 0;
}

/*
 * Try-lock for write.
 *
 * Return 1 if the lock has been taken, 0 otherwise.
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
        if (down_write_trylock(&eb->lock)) {
                eb->lock_owner = current->pid;
                trace_btrfs_try_tree_write_lock(eb);
                return 1;
        }
        return 0;
}

/*
 * Release the read lock.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
        trace_btrfs_tree_read_unlock(eb);
        up_read(&eb->lock);
}

/*
 * __btrfs_tree_lock - lock eb for write
 * @eb: the eb to lock
 * @nest: the nesting to use for the lock
 *
 * Returns with the eb->lock write locked.
 */
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
        __acquires(&eb->lock)
{
        u64 start_ns = 0;

        if (trace_btrfs_tree_lock_enabled())
                start_ns = ktime_get_ns();

        down_write_nested(&eb->lock, nest);
        eb->lock_owner = current->pid;
        trace_btrfs_tree_lock(eb, start_ns);
}

void btrfs_tree_lock(struct extent_buffer *eb)
{
        __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Release the write lock.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
        trace_btrfs_tree_unlock(eb);
        eb->lock_owner = 0;
        up_write(&eb->lock);
}

/*
 * This releases any locks held in the path starting at level and going all the
 * way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node. This ignores
 * those rules, and it should only be called when there are no more updates to
 * be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
        int i;

        if (path->keep_locks)
                return;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                if (!path->nodes[i])
                        continue;
                if (!path->locks[i])
                        continue;
                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
                path->locks[i] = 0;
        }
}

242 | |
243 | /* | |
244 | * Loop around taking references on and locking the root node of the tree until | |
245 | * we end up with a lock on the root node. | |
246 | * | |
247 | * Return: root extent buffer with write lock held | |
248 | */ | |
249 | struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) | |
250 | { | |
251 | struct extent_buffer *eb; | |
252 | ||
253 | while (1) { | |
254 | eb = btrfs_root_node(root); | |
b40130b2 JB |
255 | |
256 | btrfs_maybe_reset_lockdep_class(root, eb); | |
b908c334 DS |
257 | btrfs_tree_lock(eb); |
258 | if (eb == root->node) | |
259 | break; | |
260 | btrfs_tree_unlock(eb); | |
261 | free_extent_buffer(eb); | |
262 | } | |
263 | return eb; | |
264 | } | |
265 | ||
/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);

                btrfs_maybe_reset_lockdep_class(root, eb);
                btrfs_tree_read_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree in
 * nowait mode until we end up with a lock on the root node or returning to
 * avoid blocking.
 *
 * Return: root extent buffer with read lock held or -EAGAIN.
 */
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                if (!btrfs_try_tree_read_lock(eb)) {
                        free_extent_buffer(eb);
                        return ERR_PTR(-EAGAIN);
                }
                if (eb == root->node)
                        break;
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock. It's used in
 * situations where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives more priority to readers. If a reader and
 * a writer both race to acquire their respective sides of the lock, the
 * writer yields as soon as it detects a concurrent reader. Additionally, if
 * there are pending readers, no new writers are allowed to come in and
 * acquire the lock.
 */

void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
        atomic_set(&lock->readers, 0);
        atomic_set(&lock->writers, 0);
        init_waitqueue_head(&lock->pending_readers);
        init_waitqueue_head(&lock->pending_writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
        if (atomic_read(&lock->readers))
                return false;

        atomic_inc(&lock->writers);

        /* Ensure writers count is updated before we check for pending readers */
        smp_mb__after_atomic();
        if (atomic_read(&lock->readers)) {
                btrfs_drew_write_unlock(lock);
                return false;
        }

        return true;
}

353 | ||
354 | void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) | |
355 | { | |
356 | while (true) { | |
357 | if (btrfs_drew_try_write_lock(lock)) | |
358 | return; | |
359 | wait_event(lock->pending_writers, !atomic_read(&lock->readers)); | |
360 | } | |
361 | } | |
362 | ||
363 | void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) | |
364 | { | |
0b548539 | 365 | atomic_dec(&lock->writers); |
2992df73 NB |
366 | cond_wake_up(&lock->pending_readers); |
367 | } | |
368 | ||
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
        atomic_inc(&lock->readers);

        /*
         * Ensure the pending reader count is perceived BEFORE this reader
         * goes to sleep in case of active writers. This guarantees new
         * writers won't be allowed and that the current reader will be woken
         * up when the last active writer finishes its job.
         */
        smp_mb__after_atomic();

        wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
        /*
         * atomic_dec_and_test implies a full barrier, so woken up writers
         * are guaranteed to see the decrement.
         */
        if (atomic_dec_and_test(&lock->readers))
                wake_up(&lock->pending_writers);
}