btrfs: subpage: make btrfs_alloc_subpage() return btrfs_subpage directly
fs/btrfs/subpage.c

// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as a 64K page ensures all
 *   nodesize values fit inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and the kernel have ensured this for a while, thus only
 *   ancient filesystems could have such a problem. For such cases, do a
 *   graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning that reading one tree block only triggers the read for the
 *   needed range, while other unrelated ranges in the same page are not
 *   touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *	Page offset
 *	  0         16K         32K         48K         64K
 *	  |/////////|           |///////////|
 *	    \- Tree block A        \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page. This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we would have greatly reduced concurrency or even
 *   deadlocks (hold one tree lock while trying to lock another tree lock in
 *   the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */
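
/*
 * Illustrative sketch (not from the original file; assumes 4K sectorsize):
 * on a 64K page there are 16 sectors, so each per-sector state (uptodate,
 * error, dirty, writeback, ordered) fits in a u16 bitmap, one bit per
 * sector:
 *
 *	Page offset:  0    4K   8K   ...  60K  64K
 *	Bitmap bit:   0    1    2    ...  15
 *
 * E.g. a 16K range starting at page_offset() + 16K covers bits 4-7, giving
 * the bitmap value 0x00f0 (see btrfs_subpage_calc_bitmap() below).
 */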

int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));

	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	subpage = btrfs_alloc_subpage(fs_info, type);
	if (IS_ERR(subpage))
		return PTR_ERR(subpage);

	attach_page_private(page, subpage);
	return 0;
}
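
/*
 * Usage sketch (illustrative; the real call sites live elsewhere in btrfs):
 * a page being prepared for subpage metadata IO would typically do
 *
 *	ret = btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_METADATA);
 *	if (ret < 0)
 *		return ret;
 *
 * with btrfs_detach_subpage() below as the counterpart when the page
 * private is released.
 */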

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
					  enum btrfs_subpage_type type)
{
	struct btrfs_subpage *ret;

	ASSERT(fs_info->sectorsize < PAGE_SIZE);

	ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!ret)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&ret->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&ret->eb_refs, 0);
	} else {
		atomic_set(&ret->readers, 0);
		atomic_set(&ret->writers, 0);
	}
	return ret;
}
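
/*
 * Calling-convention note (matches the commit this page shows):
 * btrfs_alloc_subpage() returns the btrfs_subpage directly and signals
 * failure via ERR_PTR(), so callers check with IS_ERR()/PTR_ERR() instead
 * of testing for NULL, e.g.:
 *
 *	subpage = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_DATA);
 *	if (IS_ERR(subpage))
 *		return PTR_ERR(subpage);
 */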

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent racing with the freeing
 * of the last eb in the same page.
 * With the eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're
 * still allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}
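
/*
 * Usage sketch (illustrative, hypothetical caller in extent buffer code):
 * both helpers must run under the mapping's private_lock, which the
 * lockdep_assert_held() calls above enforce:
 *
 *	spin_lock(&page->mapping->private_lock);
 *	btrfs_page_inc_eb_refs(fs_info, page);
 *	spin_unlock(&page->mapping->private_lock);
 */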

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages, as we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * Please don't replace @last with an atomic_sub_and_test() call
	 * inside the if () condition, as we want the atomic_sub_and_test()
	 * to always be executed.
	 */
	if (is_data && last)
		unlock_page(page);
}
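
/*
 * Illustrative sketch of the reader accounting (hypothetical caller, not
 * code from this file): read submission bumps the counter by one bit per
 * sector, and the read endio drops it, so a data page is only unlocked
 * once the reads for all of its sectors have finished:
 *
 *	btrfs_subpage_start_reader(fs_info, page, cur, len);
 *	... submit the read bio for [cur, cur + len) ...
 *	// later, in the read endio:
 *	btrfs_subpage_end_reader(fs_info, page, cur, len);
 */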

static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}
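
/*
 * Worked example (illustrative): for a 64K page at file offset 128K, which
 * covers [128K, 192K), clamping the range [120K, 136K) gives
 * start = max(128K, 120K) = 128K and len = min(192K, 136K) - 128K = 8K,
 * i.e. only the part of the range that lands inside this page.
 */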

void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
				       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and writer counter updated.
 *
 * Even with 0 returned, the page still needs extra checks to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}

void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}
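
/*
 * Usage sketch (illustrative, hypothetical delalloc caller): the two
 * helpers pair up so the page lock is only dropped once every locked
 * subpage range has been released:
 *
 *	ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
 *	if (ret == -EAGAIN)
 *		... page was invalidated, skip or re-find it ...
 *	... do the delalloc writeback work ...
 *	btrfs_page_end_writer_lock(fs_info, page, start, len);
 */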

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond u16 range. We make the
	 * first left shift be calculated in unsigned long (at least u32),
	 * then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}
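
/*
 * Worked example (illustrative): for a full 64K page with 4K sectors,
 * bit_start = 0 and nbits = 16, so the intermediate value (1 << nbits) no
 * longer fits in 16 bits. Doing the shift in unsigned long,
 * (1UL << 16) - 1 = 0xffff, and then truncating to u16 yields the expected
 * all-ones bitmap.
 */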

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for subpage dirty bitmap.
 *
 * Return true if we cleared the last bits in the dirty_bitmap, i.e. the
 * whole bitmap is now zero.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag for the true
 * case, as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0) {
		ASSERT(PageWriteback(page));
		end_page_writeback(page);
	}
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear, which are dependent on each page status, for test all
 * bits are tested in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
			       struct page *page, u64 start, u32 len)	\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
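
/*
 * Expansion sketch (illustrative): IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty)
 * generates btrfs_subpage_test_dirty(), which returns true only when every
 * bit of the requested range is set in subpage->dirty_bitmap:
 *
 *	if (btrfs_subpage_test_dirty(fs_info, page, start, len))
 *		... the whole range is dirty ...
 */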

/*
 * Note that in selftests (extent-io-tests), we can have a NULL fs_info
 * passed in. We only test sectorsize == PAGE_SIZE cases so far, thus we
 * can fall back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
			   struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
			     struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
			    struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
				 struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
				   struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
				  struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);
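
/*
 * API sketch (illustrative): each IMPLEMENT_BTRFS_PAGE_OPS() line above
 * generates btrfs_page_set_*(), btrfs_page_clear_*(), btrfs_page_test_*()
 * and the clamp_ variants. Callers can use them regardless of sectorsize;
 * the helpers pick the full-page or subpage path internally:
 *
 *	btrfs_page_set_dirty(fs_info, page, start, len);
 *	if (btrfs_page_test_dirty(fs_info, page, start, len))
 *		...
 */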

/*
 * Make sure that not only the page dirty bit is cleared, but the subpage
 * dirty bit is cleared as well.
 */
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
				 struct page *page)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ASSERT(!PageDirty(page));
	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	ASSERT(subpage->dirty_bitmap == 0);
}