// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other, so they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked, thus preventing deadlock.  As lockdep doesn't know this, use
 * a subclass to avoid triggering a lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs updating as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#if BTRFS_MAX_LEVEL != 8
#error "Update DEFINE_NAME and btrfs_lockdep_keysets when BTRFS_MAX_LEVEL changes"
#endif

#define DEFINE_LEVEL(stem, level)				\
	.names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem)					\
	DEFINE_LEVEL(stem, 0)					\
	DEFINE_LEVEL(stem, 1)					\
	DEFINE_LEVEL(stem, 2)					\
	DEFINE_LEVEL(stem, 3)					\
	DEFINE_LEVEL(stem, 4)					\
	DEFINE_LEVEL(stem, 5)					\
	DEFINE_LEVEL(stem, 6)					\
	DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
	u64 id;				/* root objectid */
	/* Longest entry: btrfs-free-space-00 */
	char names[BTRFS_MAX_LEVEL][20];
	struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	DEFINE_NAME("extent")	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	DEFINE_NAME("chunk")	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	DEFINE_NAME("dev")	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	DEFINE_NAME("csum")	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	DEFINE_NAME("quota")	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	DEFINE_NAME("log")	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	DEFINE_NAME("treloc")	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	DEFINE_NAME("dreloc")	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
	{ .id = 0,				DEFINE_NAME("tree")	},
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

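/*
 * For illustration: each DEFINE_NAME("root") entry above expands, via
 * DEFINE_LEVEL and the '#' stringizing operator, to eight designated
 * initializers of the form:
 *
 *	.names[0] = "btrfs-root-00",
 *	.names[1] = "btrfs-root-01",
 *	...
 *	.names[7] = "btrfs-root-07",
 */
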
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* Find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}

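/*
 * Usage sketch (illustrative; the surrounding caller code is assumed, not
 * part of this file): callers set the class once a buffer's owner root and
 * level are known, e.g. when initializing a freshly allocated block:
 *
 *	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 */
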
void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
	if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
		btrfs_set_buffer_lockdep_class(root->root_key.objectid,
					       eb, btrfs_header_level(eb));
}

#endif

/*
 * Extent buffer locking
 * =====================
 *
 * We use a rw_semaphore for tree locking, and the semantics are exactly the
 * same:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - try-lock semantics for readers and writers
 *
 * The rwsem implementation does opportunistic spinning, which reduces the
 * number of times the locking task needs to sleep.
 */

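/*
 * Minimal usage sketch (illustrative, assuming "eb" was obtained and
 * referenced elsewhere): the lock/unlock calls below pair up exactly like
 * down_read/up_read and down_write/up_write on a plain rw_semaphore:
 *
 *	btrfs_tree_read_lock(eb);	// shared, many readers allowed
 *	// ... read-only access to eb ...
 *	btrfs_tree_read_unlock(eb);
 *
 *	btrfs_tree_lock(eb);		// exclusive
 *	// ... modify eb ...
 *	btrfs_tree_unlock(eb);
 */
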
/*
 * __btrfs_tree_read_lock - lock extent buffer for read
 * @eb:   the eb to be locked
 * @nest: the nesting level to be used for lockdep
 *
 * This takes the read lock on the extent buffer, using the specified nesting
 * level for lockdep purposes.
 */
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_read_lock_enabled())
		start_ns = ktime_get_ns();

	down_read_nested(&eb->lock, nest);
	trace_btrfs_tree_read_lock(eb, start_ns);
}

void btrfs_tree_read_lock(struct extent_buffer *eb)
{
	__btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Try-lock for read.
 *
 * Return 1 if the read lock has been taken, 0 otherwise.
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
	if (down_read_trylock(&eb->lock)) {
		trace_btrfs_try_tree_read_lock(eb);
		return 1;
	}
	return 0;
}

/*
 * Try-lock for write.
 *
 * Return 1 if the write lock has been taken, 0 otherwise.
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
	if (down_write_trylock(&eb->lock)) {
		eb->lock_owner = current->pid;
		trace_btrfs_try_tree_write_lock(eb);
		return 1;
	}
	return 0;
}

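/*
 * Typical try-lock pattern (an illustrative sketch, not a caller in this
 * file): attempt the non-blocking variant first, and fall back to the
 * blocking lock only when the fast path fails:
 *
 *	if (!btrfs_try_tree_write_lock(eb))
 *		btrfs_tree_lock(eb);
 */
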
/*
 * Release the read lock.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock(eb);
	up_read(&eb->lock);
}

/*
 * __btrfs_tree_lock - lock eb for write
 * @eb:   the eb to lock
 * @nest: the nesting to use for the lock
 *
 * Returns with the eb->lock write locked.
 */
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
	__acquires(&eb->lock)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_lock_enabled())
		start_ns = ktime_get_ns();

	down_write_nested(&eb->lock, nest);
	eb->lock_owner = current->pid;
	trace_btrfs_tree_lock(eb, start_ns);
}

void btrfs_tree_lock(struct extent_buffer *eb)
{
	__btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Release the write lock.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_unlock(eb);
	eb->lock_owner = 0;
	up_write(&eb->lock);
}

/*
 * This releases any locks held in the path starting at level and going all
 * the way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node.  This ignores
 * those rules, and it should only be called when there are no more updates
 * to be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
		path->locks[i] = 0;
	}
}

/*
 * Loop around taking references on and locking the root node of the tree
 * until we end up with a lock on the root node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree
 * until we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

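/*
 * Usage sketch (illustrative, not a caller in this file): a tree walk
 * typically starts by taking the root lock in the desired mode; the caller
 * owns a reference on the returned eb and must drop both when done:
 *
 *	eb = btrfs_read_lock_root_node(root);
 *	// ... descend, locking children before unlocking parents ...
 *	btrfs_tree_read_unlock(eb);
 *	free_extent_buffer(eb);
 */
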
/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock.  It's used in
 * situations where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives more priority to readers.  If a reader
 * and a writer both race to acquire their respective sides of the lock, the
 * writer yields its lock as soon as it detects a concurrent reader.
 * Additionally, if there are pending readers, no new writers are allowed to
 * come in and acquire the lock.
 */

int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
	int ret;

	ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
	if (ret)
		return ret;

	atomic_set(&lock->readers, 0);
	init_waitqueue_head(&lock->pending_readers);
	init_waitqueue_head(&lock->pending_writers);

	return 0;
}

void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
{
	percpu_counter_destroy(&lock->writers);
}

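/*
 * Lifecycle sketch (illustrative; the embedding struct "my_object" and
 * variable "obj" are hypothetical, only the btrfs_drew_* calls are real):
 *
 *	struct my_object { struct btrfs_drew_lock lock; };
 *
 *	if (btrfs_drew_lock_init(&obj->lock))	// percpu alloc may fail
 *		return -ENOMEM;
 *	// ... use the read/write sides below ...
 *	btrfs_drew_lock_destroy(&obj->lock);
 */
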
/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
	if (atomic_read(&lock->readers))
		return false;

	percpu_counter_inc(&lock->writers);

	/* Ensure writers count is updated before we check for pending readers */
	smp_mb();
	if (atomic_read(&lock->readers)) {
		btrfs_drew_write_unlock(lock);
		return false;
	}

	return true;
}

void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
	while (true) {
		if (btrfs_drew_try_write_lock(lock))
			return;
		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
	}
}

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
	percpu_counter_dec(&lock->writers);
	cond_wake_up(&lock->pending_readers);
}

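/*
 * Writer-side sketch (illustrative, reusing the hypothetical "obj" from the
 * lifecycle sketch above): the write side blocks while any readers hold the
 * lock, and later readers block until the writer count drops back to zero:
 *
 *	btrfs_drew_write_lock(&obj->lock);
 *	// ... work that must exclude the reader side ...
 *	btrfs_drew_write_unlock(&obj->lock);
 */
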
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
	atomic_inc(&lock->readers);

	/*
	 * Ensure the pending reader count is perceived BEFORE this reader
	 * goes to sleep in case of active writers.  This guarantees new
	 * writers won't be allowed and that the current reader will be
	 * woken up when the last active writer finishes its job.
	 */
	smp_mb__after_atomic();

	wait_event(lock->pending_readers,
		   percpu_counter_sum(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
	/*
	 * atomic_dec_and_test() implies a full barrier, so woken up writers
	 * are guaranteed to see the decrement.
	 */
	if (atomic_dec_and_test(&lock->readers))
		wake_up(&lock->pending_writers);
}