erofs: move zdata.h into zdata.c
[linux-block.git] / fs / erofs / zdata.c
29b24f6c 1// SPDX-License-Identifier: GPL-2.0-only
02827e17 2/*
02827e17 3 * Copyright (C) 2018 HUAWEI, Inc.
592e7cd0 4 * https://www.huawei.com/
06a304cd 5 * Copyright (C) 2022 Alibaba Cloud
02827e17 6 */
27481233 7#include "compress.h"
3883a79a 8#include <linux/prefetch.h>
99486c51 9#include <linux/psi.h>
3883a79a 10
284db12c
CG
11#include <trace/events/erofs.h>
12
a9a94d93
GX
13#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
14#define Z_EROFS_INLINE_BVECS 2
15
16/*
17 * let's leave a type here in case of introducing
18 * another tagged pointer later.
19 */
20typedef void *z_erofs_next_pcluster_t;
21
22struct z_erofs_bvec {
23 struct page *page;
24 int offset;
25 unsigned int end;
26};
27
28#define __Z_EROFS_BVSET(name, total) \
29struct name { \
30 /* point to the next page which contains the following bvecs */ \
31 struct page *nextpage; \
32 struct z_erofs_bvec bvec[total]; \
33}
34__Z_EROFS_BVSET(z_erofs_bvset,);
35__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
36
37/*
38 * Structure fields follow one of the following exclusion rules.
39 *
40 * I: Modifiable by initialization/destruction paths and read-only
41 * for everyone else;
42 *
43 * L: Field should be protected by the pcluster lock;
44 *
45 * A: Field should be accessed / updated atomically for parallelized code.
46 */
47struct z_erofs_pcluster {
48 struct erofs_workgroup obj;
49 struct mutex lock;
50
51 /* A: point to next chained pcluster or TAILs */
52 z_erofs_next_pcluster_t next;
53
54 /* L: the maximum decompression size of this round */
55 unsigned int length;
56
57 /* L: total number of bvecs */
58 unsigned int vcnt;
59
60 /* I: page offset of start position of decompression */
61 unsigned short pageofs_out;
62
63 /* I: page offset of inline compressed data */
64 unsigned short pageofs_in;
65
66 union {
67 /* L: inline a certain number of bvec for bootstrap */
68 struct z_erofs_bvset_inline bvset;
69
70 /* I: can be used to free the pcluster by RCU. */
71 struct rcu_head rcu;
72 };
73
74 union {
75 /* I: physical cluster size in pages */
76 unsigned short pclusterpages;
77
78 /* I: tailpacking inline compressed size */
79 unsigned short tailpacking_size;
80 };
81
82 /* I: compression algorithm format */
83 unsigned char algorithmformat;
84
85 /* L: whether partial decompression or not */
86 bool partial;
87
88 /* L: indicate several pageofs_outs or not */
89 bool multibases;
90
91 /* A: compressed bvecs (can be cached or inplaced pages) */
92 struct z_erofs_bvec compressed_bvecs[];
93};
94
95/* let's avoid the valid 32-bit kernel addresses */
96
97/* the chained workgroup hasn't submitted I/O (still open) */
98#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
99/* the chained workgroup has already submitted I/O */
100#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
101
102#define Z_EROFS_PCLUSTER_NIL (NULL)
103
104struct z_erofs_decompressqueue {
105 struct super_block *sb;
106 atomic_t pending_bios;
107 z_erofs_next_pcluster_t head;
108
109 union {
110 struct completion done;
111 struct work_struct work;
112 } u;
113 bool eio, sync;
114};
115
116static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
117{
118 return !pcl->obj.index;
119}
120
121static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
122{
123 if (z_erofs_is_inline_pcluster(pcl))
124 return 1;
125 return pcl->pclusterpages;
126}
127
128/*
129 * bit 30: I/O error occurred on this page
130 * bit 0 - 29: remaining parts to complete this page
131 */
132#define Z_EROFS_PAGE_EIO (1 << 30)
133
134static inline void z_erofs_onlinepage_init(struct page *page)
135{
136 union {
137 atomic_t o;
138 unsigned long v;
139 } u = { .o = ATOMIC_INIT(1) };
140
141 set_page_private(page, u.v);
142 smp_wmb();
143 SetPagePrivate(page);
144}
145
146static inline void z_erofs_onlinepage_split(struct page *page)
147{
148 atomic_inc((atomic_t *)&page->private);
149}
150
151static inline void z_erofs_page_mark_eio(struct page *page)
152{
153 int orig;
154
155 do {
156 orig = atomic_read((atomic_t *)&page->private);
157 } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
158 orig | Z_EROFS_PAGE_EIO) != orig);
159}
160
161static inline void z_erofs_onlinepage_endio(struct page *page)
162{
163 unsigned int v;
164
165 DBG_BUGON(!PagePrivate(page));
166 v = atomic_dec_return((atomic_t *)&page->private);
167 if (!(v & ~Z_EROFS_PAGE_EIO)) {
168 set_page_private(page, 0);
169 ClearPagePrivate(page);
170 if (!(v & Z_EROFS_PAGE_EIO))
171 SetPageUptodate(page);
172 unlock_page(page);
173 }
174}
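/*
 * Editor's illustrative sketch (not part of the original file; the example
 * function name is hypothetical): how the three helpers above are meant to
 * pair up.  A file page starts at a count of 1 from z_erofs_onlinepage_init();
 * each extra sub-extent attached to it adds 1 via z_erofs_onlinepage_split();
 * every completed part (plus the initial reference) drops 1 via
 * z_erofs_onlinepage_endio(), and the final drop sets PageUptodate (unless
 * Z_EROFS_PAGE_EIO was recorded) and unlocks the page.
 */
#if 0	/* compile-guarded example only */
static void z_erofs_onlinepage_lifecycle_sketch(struct page *page)
{
	z_erofs_onlinepage_init(page);		/* count = 1 */
	z_erofs_onlinepage_split(page);		/* count = 2: a second part */
	z_erofs_onlinepage_endio(page);		/* count = 1 */
	z_erofs_onlinepage_endio(page);		/* count = 0: uptodate + unlock */
}
#endif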
175
176#define Z_EROFS_ONSTACK_PAGES 32
177
9f6cc76e
GX
178/*
179 * since pclustersize is variable for the big pcluster feature, introduce
180 * slab pools for different pcluster sizes.
181 */
182struct z_erofs_pcluster_slab {
183 struct kmem_cache *slab;
184 unsigned int maxpages;
185 char name[48];
186};
187
188#define _PCLP(n) { .maxpages = n }
189
190static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
191 _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
192 _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
193};
194
06a304cd
GX
195struct z_erofs_bvec_iter {
196 struct page *bvpage;
197 struct z_erofs_bvset *bvset;
198 unsigned int nr, cur;
199};
200
201static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
202{
203 if (iter->bvpage)
204 kunmap_local(iter->bvset);
205 return iter->bvpage;
206}
207
208static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
209{
210 unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
211 /* have to access nextpage in advance, otherwise it will be unmapped */
212 struct page *nextpage = iter->bvset->nextpage;
213 struct page *oldpage;
214
215 DBG_BUGON(!nextpage);
216 oldpage = z_erofs_bvec_iter_end(iter);
217 iter->bvpage = nextpage;
218 iter->bvset = kmap_local_page(nextpage);
219 iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
220 iter->cur = 0;
221 return oldpage;
222}
223
224static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
225 struct z_erofs_bvset_inline *bvset,
226 unsigned int bootstrap_nr,
227 unsigned int cur)
228{
229 *iter = (struct z_erofs_bvec_iter) {
230 .nr = bootstrap_nr,
231 .bvset = (struct z_erofs_bvset *)bvset,
232 };
233
234 while (cur > iter->nr) {
235 cur -= iter->nr;
236 z_erofs_bvset_flip(iter);
237 }
238 iter->cur = cur;
239}
240
241static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
242 struct z_erofs_bvec *bvec,
243 struct page **candidate_bvpage)
244{
245 if (iter->cur == iter->nr) {
246 if (!*candidate_bvpage)
247 return -EAGAIN;
248
249 DBG_BUGON(iter->bvset->nextpage);
250 iter->bvset->nextpage = *candidate_bvpage;
251 z_erofs_bvset_flip(iter);
252
253 iter->bvset->nextpage = NULL;
254 *candidate_bvpage = NULL;
255 }
256 iter->bvset->bvec[iter->cur++] = *bvec;
257 return 0;
258}
259
260static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
261 struct z_erofs_bvec *bvec,
262 struct page **old_bvpage)
263{
264 if (iter->cur == iter->nr)
265 *old_bvpage = z_erofs_bvset_flip(iter);
266 else
267 *old_bvpage = NULL;
268 *bvec = iter->bvset->bvec[iter->cur++];
269}
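/*
 * Editor's illustrative sketch (not part of the original file; the example
 * function name is hypothetical): the intended way to drive the bvec iterator
 * above.  z_erofs_bvec_enqueue() appends records to the inline bvset and,
 * once it is full, returns -EAGAIN asking the caller for a spare short-lived
 * page, which is then chained through ->nextpage; reading back walks the same
 * chain via z_erofs_bvec_dequeue() and returns each exhausted bvset page so
 * the caller can recycle it.
 */
#if 0	/* compile-guarded example only */
static int z_erofs_bvec_iter_sketch(struct z_erofs_bvset_inline *bvset,
				    struct z_erofs_bvec *bvec)
{
	struct z_erofs_bvec_iter iter;
	struct page *candidate = NULL;	/* spare page handed over on demand */
	int err;

	z_erofs_bvec_iter_begin(&iter, bvset, Z_EROFS_INLINE_BVECS, 0);
	err = z_erofs_bvec_enqueue(&iter, bvec, &candidate);
	if (err == -EAGAIN) {		/* inline bvset full: supply a page */
		candidate = alloc_page(GFP_NOFS | __GFP_NOFAIL);
		err = z_erofs_bvec_enqueue(&iter, bvec, &candidate);
	}
	/* the returned page (if any) would be recycled by the real callers */
	(void)z_erofs_bvec_iter_end(&iter);
	return err;
}
#endif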
270
9f6cc76e
GX
271static void z_erofs_destroy_pcluster_pool(void)
272{
273 int i;
274
275 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
276 if (!pcluster_pool[i].slab)
277 continue;
278 kmem_cache_destroy(pcluster_pool[i].slab);
279 pcluster_pool[i].slab = NULL;
280 }
281}
282
283static int z_erofs_create_pcluster_pool(void)
284{
285 struct z_erofs_pcluster_slab *pcs;
286 struct z_erofs_pcluster *a;
287 unsigned int size;
288
289 for (pcs = pcluster_pool;
290 pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
ed722fbc 291 size = struct_size(a, compressed_bvecs, pcs->maxpages);
9f6cc76e
GX
292
293 sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
294 pcs->slab = kmem_cache_create(pcs->name, size, 0,
295 SLAB_RECLAIM_ACCOUNT, NULL);
296 if (pcs->slab)
297 continue;
298
299 z_erofs_destroy_pcluster_pool();
300 return -ENOMEM;
301 }
302 return 0;
303}
304
305static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
306{
307 int i;
308
309 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
310 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
311 struct z_erofs_pcluster *pcl;
312
313 if (nrpages > pcs->maxpages)
314 continue;
315
316 pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
317 if (!pcl)
318 return ERR_PTR(-ENOMEM);
319 pcl->pclusterpages = nrpages;
320 return pcl;
321 }
322 return ERR_PTR(-EINVAL);
323}
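/*
 * Editor's illustrative note (not part of the original file; the example
 * function name is hypothetical): z_erofs_alloc_pcluster() walks
 * pcluster_pool[] in ascending order and takes the first size class whose
 * ->maxpages covers the request, so a 3-page pcluster is carved out of the
 * "erofs_pcluster-4" cache while anything above Z_EROFS_PCLUSTER_MAX_PAGES
 * fails with -EINVAL.
 */
#if 0	/* compile-guarded example only */
static struct z_erofs_pcluster *z_erofs_alloc_pcluster_sketch(void)
{
	/* served by the 4-page slab; ->pclusterpages is set to 3 */
	return z_erofs_alloc_pcluster(3);
}
#endif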
324
325static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
326{
cecf864d 327 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
9f6cc76e
GX
328 int i;
329
330 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
331 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
332
cecf864d 333 if (pclusterpages > pcs->maxpages)
9f6cc76e
GX
334 continue;
335
336 kmem_cache_free(pcs->slab, pcl);
337 return;
338 }
339 DBG_BUGON(1);
340}
341
3883a79a 342static struct workqueue_struct *z_erofs_workqueue __read_mostly;
3883a79a
GX
343
344void z_erofs_exit_zip_subsystem(void)
345{
3883a79a 346 destroy_workqueue(z_erofs_workqueue);
9f6cc76e 347 z_erofs_destroy_pcluster_pool();
3883a79a
GX
348}
349
99634bf3 350static inline int z_erofs_init_workqueue(void)
3883a79a 351{
7dd68b14 352 const unsigned int onlinecpus = num_possible_cpus();
3883a79a
GX
353
354 /*
97e86a85
GX
355 * no need to spawn too many threads; limiting the thread count keeps
356 * scheduling overhead to a minimum. Perhaps per-CPU threads would be better?
3883a79a 357 */
0e62ea33
GX
358 z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
359 WQ_UNBOUND | WQ_HIGHPRI,
97e86a85 360 onlinecpus + onlinecpus / 4);
42d40b4a 361 return z_erofs_workqueue ? 0 : -ENOMEM;
3883a79a
GX
362}
363
0a0b7e62 364int __init z_erofs_init_zip_subsystem(void)
3883a79a 365{
9f6cc76e
GX
366 int err = z_erofs_create_pcluster_pool();
367
368 if (err)
369 return err;
370 err = z_erofs_init_workqueue();
371 if (err)
372 z_erofs_destroy_pcluster_pool();
373 return err;
3883a79a
GX
374}
375
db166fc2
GX
376enum z_erofs_pclustermode {
377 Z_EROFS_PCLUSTER_INFLIGHT,
3883a79a 378 /*
db166fc2
GX
379 * The current pcluster was the tail of an existing chain; in addition,
380 * the previously processed chained pclusters have all been decided to
97e86a85 381 * be hooked up to it.
db166fc2
GX
382 * A new chain will be created for the remaining pclusters which are
383 * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
384 * the next pcluster cannot reuse the whole page safely for inplace I/O
385 * in the following scenario:
a112152f
GX
386 * ________________________________________________________________
387 * | tail (partial) page | head (partial) page |
db166fc2
GX
388 * | (belongs to the next pcl) | (belongs to the current pcl) |
389 * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
a112152f 390 */
db166fc2 391 Z_EROFS_PCLUSTER_HOOKED,
0b964600 392 /*
db166fc2 393 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
0b964600
GX
394 * could be dispatched into the bypass queue later due to up-to-date managed
395 * pages. All related online pages cannot be reused for inplace I/O (or
387bab87 396 * bvpage) since it can be directly decoded without I/O submission.
0b964600 397 */
db166fc2 398 Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
a112152f 399 /*
97e86a85
GX
400 * The current collection has been linked with the owned chain, and
401 * could also be linked with the remaining collections, which means
402 * that if the processing page is the tail page of the collection,
403 * the current collection can safely use the whole page (since
404 * the previous collection is under control) for in-place I/O, as
405 * illustrated below:
a112152f 406 * ________________________________________________________________
97e86a85
GX
407 * | tail (partial) page | head (partial) page |
408 * | (of the current cl) | (of the previous collection) |
db166fc2
GX
409 * | PCLUSTER_FOLLOWED or | |
410 * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
a112152f 411 *
97e86a85 412 * [ (*) the above page can be used as inplace I/O. ]
3883a79a 413 */
db166fc2 414 Z_EROFS_PCLUSTER_FOLLOWED,
3883a79a
GX
415};
416
5c6dcc57
GX
417struct z_erofs_decompress_frontend {
418 struct inode *const inode;
419 struct erofs_map_blocks map;
06a304cd 420 struct z_erofs_bvec_iter biter;
3883a79a 421
06a304cd 422 struct page *candidate_bvpage;
bfc4ccb1 423 struct z_erofs_pcluster *pcl, *tailpcl;
97e86a85 424 z_erofs_next_pcluster_t owned_head;
db166fc2 425 enum z_erofs_pclustermode mode;
97e86a85 426
6ea5aad3 427 bool readahead;
97e86a85
GX
428 /* used for applying cache strategy on the fly */
429 bool backmost;
430 erofs_off_t headoffset;
ed722fbc
GX
431
432 /* a cursor used to pick up inplace I/O pages */
433 unsigned int icur;
97e86a85
GX
434};
435
97e86a85 436#define DECOMPRESS_FRONTEND_INIT(__i) { \
5c6dcc57 437 .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
db166fc2 438 .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
97e86a85 439
1282dea3
GX
440static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
441{
442 unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
443
444 if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
445 return false;
446
447 if (fe->backmost)
448 return true;
449
450 if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
451 fe->map.m_la < fe->headoffset)
452 return true;
453
454 return false;
455}
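/*
 * Editor's illustrative sketch (not part of the original file; the example
 * function name and values are hypothetical): with the "readaround" strategy,
 * compressed pages are cached for the backmost (currently requested) extent
 * and for any extent that starts before the original request offset, while
 * "readahead" only caches the backmost extent and "disabled" never caches.
 */
#if 0	/* compile-guarded example only */
static bool z_erofs_cache_strategy_sketch(struct z_erofs_decompress_frontend *fe)
{
	fe->backmost = false;			/* not the requested extent */
	fe->headoffset = 1024 * 1024;		/* request started at 1 MiB */
	fe->map.m_la = 512 * 1024;		/* extent lies before it */
	return z_erofs_should_alloc_cache(fe);	/* true for readaround */
}
#endif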
456
6f39d1e1 457static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
6f39d1e1 458 struct page **pagepool)
105d4ad8 459{
6f39d1e1 460 struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
5c6dcc57 461 struct z_erofs_pcluster *pcl = fe->pcl;
1282dea3 462 bool shouldalloc = z_erofs_should_alloc_cache(fe);
92e6efd5 463 bool standalone = true;
6f39d1e1
GX
464 /*
465 * optimistic allocation without direct reclaim since inplace I/O
466 * can be used as a fallback if memory is low.
467 */
1825c8d7
GX
468 gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
469 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
ed722fbc 470 unsigned int i;
92e6efd5 471
db166fc2 472 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
92e6efd5
GX
473 return;
474
ed722fbc 475 for (i = 0; i < pcl->pclusterpages; ++i) {
92e6efd5 476 struct page *page;
b1ed220c 477 void *t; /* mark pages just found for debugging */
1825c8d7 478 struct page *newpage = NULL;
92e6efd5
GX
479
480 /* the compressed page was loaded before */
ed722fbc 481 if (READ_ONCE(pcl->compressed_bvecs[i].page))
105d4ad8
GX
482 continue;
483
ed722fbc 484 page = find_get_page(mc, pcl->obj.index + i);
92e6efd5
GX
485
486 if (page) {
b1ed220c 487 t = (void *)((unsigned long)page | 1);
0b964600
GX
488 } else {
489 /* I/O is needed, not possible to decompress directly */
92e6efd5 490 standalone = false;
1282dea3
GX
491 if (!shouldalloc)
492 continue;
493
494 /*
495 * try to use cached I/O if page allocation
496 * succeeds or fallback to in-place I/O instead
497 * to avoid any direct reclaim.
498 */
499 newpage = erofs_allocpage(pagepool, gfp);
500 if (!newpage)
0b964600 501 continue;
1282dea3 502 set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
b1ed220c 503 t = (void *)((unsigned long)newpage | 1);
105d4ad8
GX
504 }
505
b1ed220c 506 if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
105d4ad8
GX
507 continue;
508
eaa9172a 509 if (page)
92e6efd5 510 put_page(page);
eaa9172a
GX
511 else if (newpage)
512 erofs_pagepool_add(pagepool, newpage);
105d4ad8 513 }
92e6efd5 514
0b964600
GX
515 /*
516 * don't do inplace I/O if all compressed pages are available in
517 * managed cache since it can be moved to the bypass queue instead.
518 */
519 if (standalone)
db166fc2 520 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
105d4ad8
GX
521}
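/*
 * Editor's illustrative note (not part of the original file; the example
 * function name is hypothetical): pages installed in compressed_bvecs[] by
 * z_erofs_bind_cache() carry bit 0 set ("just found" / just preallocated);
 * pickup_page_for_submission() later strips that tag and uses it to
 * sanity-check the page state before reuse.
 */
#if 0	/* compile-guarded example only */
static struct page *z_erofs_untag_bvec_page_sketch(struct z_erofs_bvec *bvec)
{
	struct page *page = READ_ONCE(bvec->page);

	return (struct page *)((unsigned long)page & ~1UL);
}
#endif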
522
523/* called by erofs_shrinker to get rid of all compressed_pages */
47e541a1 524int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
97e86a85 525 struct erofs_workgroup *grp)
105d4ad8 526{
97e86a85
GX
527 struct z_erofs_pcluster *const pcl =
528 container_of(grp, struct z_erofs_pcluster, obj);
105d4ad8
GX
529 int i;
530
cecf864d 531 DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
105d4ad8
GX
532 /*
533 * refcount of the workgroup is now frozen as 1,
534 * therefore no need to worry about available decompression users.
535 */
9f6cc76e 536 for (i = 0; i < pcl->pclusterpages; ++i) {
ed722fbc 537 struct page *page = pcl->compressed_bvecs[i].page;
105d4ad8 538
97e86a85 539 if (!page)
105d4ad8
GX
540 continue;
541
542 /* block other users from reclaiming or migrating the page */
543 if (!trylock_page(page))
544 return -EBUSY;
545
f4d4e5fc 546 if (!erofs_page_is_managed(sbi, page))
97e86a85 547 continue;
105d4ad8 548
97e86a85 549 /* barrier is implied in the following 'unlock_page' */
ed722fbc 550 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
6aaa7b06 551 detach_page_private(page);
105d4ad8 552 unlock_page(page);
105d4ad8
GX
553 }
554 return 0;
555}
556
d252ff3d 557int erofs_try_to_free_cached_page(struct page *page)
105d4ad8 558{
97e86a85 559 struct z_erofs_pcluster *const pcl = (void *)page_private(page);
ed722fbc 560 int ret, i;
105d4ad8 561
ed722fbc
GX
562 if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
563 return 0;
105d4ad8 564
ed722fbc
GX
565 ret = 0;
566 DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
567 for (i = 0; i < pcl->pclusterpages; ++i) {
568 if (pcl->compressed_bvecs[i].page == page) {
569 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
570 ret = 1;
571 break;
105d4ad8 572 }
105d4ad8 573 }
ed722fbc
GX
574 erofs_workgroup_unfreeze(&pcl->obj, 1);
575 if (ret)
576 detach_page_private(page);
105d4ad8
GX
577 return ret;
578}
105d4ad8 579
5c6dcc57 580static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
ed722fbc 581 struct z_erofs_bvec *bvec)
3883a79a 582{
5c6dcc57 583 struct z_erofs_pcluster *const pcl = fe->pcl;
97e86a85 584
ed722fbc
GX
585 while (fe->icur > 0) {
586 if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
587 NULL, bvec->page)) {
588 pcl->compressed_bvecs[fe->icur] = *bvec;
3883a79a 589 return true;
ed722fbc
GX
590 }
591 }
3883a79a
GX
592 return false;
593}
594
87ca34a7 595/* callers must hold the pcluster lock */
5c6dcc57 596static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
5b220b20 597 struct z_erofs_bvec *bvec, bool exclusive)
3883a79a
GX
598{
599 int ret;
3883a79a 600
db166fc2 601 if (exclusive) {
06a304cd 602 /* give priority for inplaceio to use file pages first */
ed722fbc 603 if (z_erofs_try_inplace_io(fe, bvec))
06a304cd
GX
604 return 0;
605 /* otherwise, check if it can be used as a bvpage */
db166fc2 606 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
06a304cd
GX
607 !fe->candidate_bvpage)
608 fe->candidate_bvpage = bvec->page;
609 }
610 ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
611 fe->pcl->vcnt += (ret >= 0);
612 return ret;
3883a79a
GX
613}
614
5c6dcc57 615static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
3883a79a 616{
5c6dcc57
GX
617 struct z_erofs_pcluster *pcl = f->pcl;
618 z_erofs_next_pcluster_t *owned_head = &f->owned_head;
3883a79a 619
473e15b0
GX
620 /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
621 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
622 *owned_head) == Z_EROFS_PCLUSTER_NIL) {
97e86a85 623 *owned_head = &pcl->next;
473e15b0 624 /* so we can attach this pcluster to our submission chain. */
db166fc2 625 f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
473e15b0
GX
626 return;
627 }
628
629 /*
630 * type 2, link to the end of an existing open chain, be careful
631 * that its submission is controlled by the original attached chain.
632 */
267f2492
GX
633 if (*owned_head != &pcl->next && pcl != f->tailpcl &&
634 cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
473e15b0 635 *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
97e86a85 636 *owned_head = Z_EROFS_PCLUSTER_TAIL;
db166fc2 637 f->mode = Z_EROFS_PCLUSTER_HOOKED;
5c6dcc57 638 f->tailpcl = NULL;
473e15b0 639 return;
a112152f 640 }
473e15b0 641 /* type 3, it belongs to a chain, but it isn't the end of the chain */
db166fc2 642 f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
3883a79a
GX
643}
644
83a386c0 645static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
3883a79a 646{
83a386c0 647 struct erofs_map_blocks *map = &fe->map;
cecf864d 648 bool ztailpacking = map->m_flags & EROFS_MAP_META;
97e86a85 649 struct z_erofs_pcluster *pcl;
64094a04 650 struct erofs_workgroup *grp;
97e86a85 651 int err;
e5e3abba 652
c42c0ffe
CZ
653 if (!(map->m_flags & EROFS_MAP_ENCODED) ||
654 (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
8f899262
GX
655 DBG_BUGON(1);
656 return -EFSCORRUPTED;
657 }
658
9f6cc76e 659 /* no available pcluster, let's allocate one */
cecf864d
YH
660 pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
661 map->m_plen >> PAGE_SHIFT);
9f6cc76e
GX
662 if (IS_ERR(pcl))
663 return PTR_ERR(pcl);
3883a79a 664
64094a04 665 atomic_set(&pcl->obj.refcount, 1);
8f899262 666 pcl->algorithmformat = map->m_algorithmformat;
2bfab9c0
GX
667 pcl->length = 0;
668 pcl->partial = true;
3883a79a 669
97e86a85 670 /* new pclusters should be claimed as type 1, primary and followed */
5c6dcc57 671 pcl->next = fe->owned_head;
87ca34a7 672 pcl->pageofs_out = map->m_la & ~PAGE_MASK;
db166fc2 673 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
3883a79a 674
23edf3ab
GX
675 /*
676 * lock all primary followed pclusters before they become visible to others
97e86a85 677 * and mutex_trylock *never* fails for a new pcluster.
23edf3ab 678 */
87ca34a7
GX
679 mutex_init(&pcl->lock);
680 DBG_BUGON(!mutex_trylock(&pcl->lock));
64094a04 681
cecf864d
YH
682 if (ztailpacking) {
683 pcl->obj.index = 0; /* which indicates ztailpacking */
684 pcl->pageofs_in = erofs_blkoff(map->m_pa);
685 pcl->tailpacking_size = map->m_plen;
686 } else {
687 pcl->obj.index = map->m_pa >> PAGE_SHIFT;
23edf3ab 688
83a386c0 689 grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
cecf864d
YH
690 if (IS_ERR(grp)) {
691 err = PTR_ERR(grp);
692 goto err_out;
693 }
694
695 if (grp != &pcl->obj) {
5c6dcc57 696 fe->pcl = container_of(grp,
cecf864d
YH
697 struct z_erofs_pcluster, obj);
698 err = -EEXIST;
699 goto err_out;
700 }
3883a79a 701 }
bfc4ccb1 702 /* used to check tail merging loop due to corrupted images */
5c6dcc57
GX
703 if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
704 fe->tailpcl = pcl;
705 fe->owned_head = &pcl->next;
706 fe->pcl = pcl;
9e579fc1 707 return 0;
64094a04
GX
708
709err_out:
87ca34a7 710 mutex_unlock(&pcl->lock);
9f6cc76e 711 z_erofs_free_pcluster(pcl);
64094a04 712 return err;
3883a79a
GX
713}
714
83a386c0 715static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
97e86a85 716{
83a386c0 717 struct erofs_map_blocks *map = &fe->map;
0d823b42 718 struct erofs_workgroup *grp = NULL;
9e579fc1 719 int ret;
a112152f 720
87ca34a7 721 DBG_BUGON(fe->pcl);
3883a79a 722
87ca34a7 723 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
5c6dcc57
GX
724 DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
725 DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
3883a79a 726
0d823b42
GX
727 if (!(map->m_flags & EROFS_MAP_META)) {
728 grp = erofs_find_workgroup(fe->inode->i_sb,
729 map->m_pa >> PAGE_SHIFT);
730 } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
731 DBG_BUGON(1);
732 return -EFSCORRUPTED;
3883a79a
GX
733 }
734
64094a04 735 if (grp) {
5c6dcc57 736 fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
0d823b42 737 ret = -EEXIST;
64094a04 738 } else {
83a386c0 739 ret = z_erofs_register_pcluster(fe);
3883a79a
GX
740 }
741
0d823b42 742 if (ret == -EEXIST) {
267f2492
GX
743 mutex_lock(&fe->pcl->lock);
744 /* used to check tail merging loop due to corrupted images */
745 if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
746 fe->tailpcl = fe->pcl;
747
748 z_erofs_try_to_claim_pcluster(fe);
0d823b42 749 } else if (ret) {
9e579fc1 750 return ret;
64094a04 751 }
06a304cd 752 z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
387bab87 753 Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
81382f5f 754 /* since file-backed online pages are traversed in reverse order */
ed722fbc 755 fe->icur = z_erofs_pclusterpages(fe->pcl);
3883a79a
GX
756 return 0;
757}
758
759/*
97e86a85
GX
760 * keep in mind that pclusters are only freed when unreferenced, and
761 * even then only after an RCU grace period.
3883a79a
GX
762 */
763static void z_erofs_rcu_callback(struct rcu_head *head)
764{
87ca34a7
GX
765 z_erofs_free_pcluster(container_of(head,
766 struct z_erofs_pcluster, rcu));
3883a79a
GX
767}
768
769void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
770{
97e86a85
GX
771 struct z_erofs_pcluster *const pcl =
772 container_of(grp, struct z_erofs_pcluster, obj);
3883a79a 773
87ca34a7 774 call_rcu(&pcl->rcu, z_erofs_rcu_callback);
3883a79a
GX
775}
776
5c6dcc57 777static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
3883a79a 778{
87ca34a7 779 struct z_erofs_pcluster *pcl = fe->pcl;
3883a79a 780
87ca34a7 781 if (!pcl)
3883a79a
GX
782 return false;
783
06a304cd 784 z_erofs_bvec_iter_end(&fe->biter);
87ca34a7 785 mutex_unlock(&pcl->lock);
3883a79a 786
06a304cd
GX
787 if (fe->candidate_bvpage) {
788 DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
789 fe->candidate_bvpage = NULL;
790 }
791
3883a79a 792 /*
97e86a85
GX
793 * if all pending pages are added, don't hold its reference
794 * any longer if the pcluster isn't hosted by ourselves.
3883a79a 795 */
db166fc2 796 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
87ca34a7 797 erofs_workgroup_put(&pcl->obj);
3883a79a 798
87ca34a7 799 fe->pcl = NULL;
3883a79a
GX
800 return true;
801}
802
b15b2e30
YH
803static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
804 struct page *page, unsigned int pageofs,
805 unsigned int len)
806{
807 struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
808 struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
809 u8 *src, *dst;
810 unsigned int i, cnt;
811
e5126de1
YH
812 if (!packed_inode)
813 return -EFSCORRUPTED;
814
b15b2e30
YH
815 pos += EROFS_I(inode)->z_fragmentoff;
816 for (i = 0; i < len; i += cnt) {
817 cnt = min_t(unsigned int, len - i,
818 EROFS_BLKSIZ - erofs_blkoff(pos));
819 src = erofs_bread(&buf, packed_inode,
820 erofs_blknr(pos), EROFS_KMAP);
821 if (IS_ERR(src)) {
822 erofs_put_metabuf(&buf);
823 return PTR_ERR(src);
824 }
825
826 dst = kmap_local_page(page);
827 memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
828 kunmap_local(dst);
829 pos += cnt;
830 }
831 erofs_put_metabuf(&buf);
832 return 0;
833}
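/*
 * Editor's illustrative note (not part of the original file; the example
 * function name and length are hypothetical): the fragment reader above takes
 * a position relative to the inode's fragment (z_fragmentoff is added
 * internally) and copies block by block from the packed inode, splitting
 * requests at EROFS_BLKSIZ boundaries via erofs_blknr()/erofs_blkoff().
 */
#if 0	/* compile-guarded example only */
static int z_erofs_read_fragment_sketch(struct inode *inode, struct page *page)
{
	/* copy the first 100 bytes of the inode's fragment to page offset 0 */
	return z_erofs_read_fragment(inode, 0, page, 0, 100);
}
#endif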
834
97e86a85 835static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
eaa9172a 836 struct page *page, struct page **pagepool)
3883a79a 837{
97e86a85 838 struct inode *const inode = fe->inode;
3b423417 839 struct erofs_map_blocks *const map = &fe->map;
3883a79a 840 const loff_t offset = page_offset(page);
5b220b20 841 bool tight = true, exclusive;
2bfab9c0 842 unsigned int cur, end, spiltted;
1e05ff36 843 int err = 0;
3883a79a
GX
844
845 /* register locked file pages as online pages in pack */
846 z_erofs_onlinepage_init(page);
847
848 spiltted = 0;
849 end = PAGE_SIZE;
850repeat:
851 cur = end - 1;
852
39397a46
GX
853 if (offset + cur < map->m_la ||
854 offset + cur >= map->m_la + map->m_llen) {
855 erofs_dbg("out-of-range map @ pos %llu", offset + cur);
3883a79a 856
39397a46
GX
857 if (z_erofs_collector_end(fe))
858 fe->backmost = false;
859 map->m_la = offset + cur;
860 map->m_llen = 0;
861 err = z_erofs_map_blocks_iter(inode, map, 0);
862 if (err)
67148551 863 goto out;
39397a46
GX
864 } else {
865 if (fe->pcl)
866 goto hitted;
87ca34a7 867 /* didn't get a valid pcluster previously (very rare) */
1e5ceeab 868 }
3883a79a 869
b15b2e30
YH
870 if (!(map->m_flags & EROFS_MAP_MAPPED) ||
871 map->m_flags & EROFS_MAP_FRAGMENT)
3883a79a
GX
872 goto hitted;
873
83a386c0 874 err = z_erofs_collector_begin(fe);
8d8a09b0 875 if (err)
67148551 876 goto out;
3883a79a 877
5c6dcc57 878 if (z_erofs_is_inline_pcluster(fe->pcl)) {
09c54379 879 void *mp;
cecf864d 880
09c54379
GX
881 mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
882 erofs_blknr(map->m_pa), EROFS_NO_KMAP);
883 if (IS_ERR(mp)) {
884 err = PTR_ERR(mp);
cecf864d
YH
885 erofs_err(inode->i_sb,
886 "failed to get inline page, err %d", err);
67148551 887 goto out;
cecf864d 888 }
09c54379 889 get_page(fe->map.buf.page);
ed722fbc
GX
890 WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
891 fe->map.buf.page);
db166fc2 892 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
cecf864d 893 } else {
6f39d1e1 894 /* bind cache first when cached decompression is preferred */
1282dea3 895 z_erofs_bind_cache(fe, pagepool);
cecf864d 896 }
3883a79a 897hitted:
dc76ea8c
GX
898 /*
899 * Ensure the current partial page belongs to this submit chain rather
900 * than other concurrent submit chains or the noio(bypass) chain since
901 * those chains are handled asynchronously thus the page cannot be used
387bab87 902 * for inplace I/O or bvpage (should be processed in a strict order.)
dc76ea8c 903 */
db166fc2
GX
904 tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
905 fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
dc76ea8c 906
7dd68b14 907 cur = end - min_t(unsigned int, offset + end - map->m_la, end);
8d8a09b0 908 if (!(map->m_flags & EROFS_MAP_MAPPED)) {
3883a79a
GX
909 zero_user_segment(page, cur, end);
910 goto next_part;
911 }
b15b2e30
YH
912 if (map->m_flags & EROFS_MAP_FRAGMENT) {
913 unsigned int pageofs, skip, len;
914
915 if (offset > map->m_la) {
916 pageofs = 0;
917 skip = offset - map->m_la;
918 } else {
919 pageofs = map->m_la & ~PAGE_MASK;
920 skip = 0;
921 }
922 len = min_t(unsigned int, map->m_llen - skip, end - cur);
923 err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
924 if (err)
925 goto out;
926 ++spiltted;
927 tight = false;
928 goto next_part;
929 }
3883a79a 930
5b220b20 931 exclusive = (!cur && (!spiltted || tight));
a112152f 932 if (cur)
db166fc2 933 tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
a112152f 934
3883a79a 935retry:
06a304cd
GX
936 err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
937 .page = page,
938 .offset = offset - map->m_la,
939 .end = end,
5b220b20 940 }), exclusive);
06a304cd
GX
941 /* should allocate an additional short-lived page for bvset */
942 if (err == -EAGAIN && !fe->candidate_bvpage) {
943 fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
944 set_page_private(fe->candidate_bvpage,
945 Z_EROFS_SHORTLIVED_PAGE);
946 goto retry;
3883a79a
GX
947 }
948
06a304cd
GX
949 if (err) {
950 DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
67148551 951 goto out;
06a304cd 952 }
3883a79a 953
67148551 954 z_erofs_onlinepage_split(page);
1e05ff36
GX
955 /* bump up the number of spiltted parts of a page */
956 ++spiltted;
267f2492
GX
957 if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
958 fe->pcl->multibases = true;
2bfab9c0
GX
959 if (fe->pcl->length < offset + end - map->m_la) {
960 fe->pcl->length = offset + end - map->m_la;
961 fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
962 }
e7933278
GX
963 if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
964 !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
965 fe->pcl->length == map->m_llen)
966 fe->pcl->partial = false;
3883a79a 967next_part:
2bfab9c0 968 /* shorten the remaining extent to update progress */
3883a79a 969 map->m_llen = offset + cur - map->m_la;
2bfab9c0 970 map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
3883a79a 971
2bc75964
KČ
972 end = cur;
973 if (end > 0)
3883a79a
GX
974 goto repeat;
975
1e05ff36 976out:
67148551
GX
977 if (err)
978 z_erofs_page_mark_eio(page);
3883a79a
GX
979 z_erofs_onlinepage_endio(page);
980
4f761fa2
GX
981 erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
982 __func__, page, spiltted, map->m_llen);
1e05ff36 983 return err;
3883a79a
GX
984}
985
40452ffc
HJ
986static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
987 unsigned int readahead_pages)
988{
a2e20a25 989 /* auto: enable for read_folio, disable for readahead */
40452ffc
HJ
990 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
991 !readahead_pages)
992 return true;
993
994 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
995 (readahead_pages <= sbi->opt.max_sync_decompress_pages))
996 return true;
997
998 return false;
999}
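/*
 * Editor's illustrative note (not part of the original file; the example
 * function name is hypothetical): in "auto" mode a plain read_folio
 * (readahead_pages == 0) decompresses synchronously in the caller's context
 * while readahead defers to the workqueue; "force_on" also decompresses
 * readahead windows of up to max_sync_decompress_pages synchronously.
 */
#if 0	/* compile-guarded example only */
static bool z_erofs_sync_policy_sketch(struct erofs_sb_info *sbi)
{
	/* read_folio path: no readahead pages */
	return z_erofs_get_sync_decompress_policy(sbi, 0);
}
#endif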
1000
6aaa7b06
GX
1001static bool z_erofs_page_is_invalidated(struct page *page)
1002{
1003 return !page->mapping && !z_erofs_is_shortlived_page(page);
1004}
1005
4f05687f
GX
1006struct z_erofs_decompress_backend {
1007 struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
1008 struct super_block *sb;
1009 struct z_erofs_pcluster *pcl;
1010
1011 /* pages with the longest decompressed length for deduplication */
1012 struct page **decompressed_pages;
1013 /* pages to keep the compressed data */
1014 struct page **compressed_pages;
1015
267f2492 1016 struct list_head decompressed_secondary_bvecs;
4f05687f 1017 struct page **pagepool;
2bfab9c0 1018 unsigned int onstack_used, nr_pages;
4f05687f
GX
1019};
1020
267f2492
GX
1021struct z_erofs_bvec_item {
1022 struct z_erofs_bvec bvec;
1023 struct list_head list;
1024};
1025
1026static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
1027 struct z_erofs_bvec *bvec)
3fe96ee0 1028{
267f2492 1029 struct z_erofs_bvec_item *item;
3fe96ee0 1030
267f2492
GX
1031 if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
1032 unsigned int pgnr;
3fe96ee0 1033
267f2492
GX
1034 pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
1035 DBG_BUGON(pgnr >= be->nr_pages);
63bbb856
GX
1036 if (!be->decompressed_pages[pgnr]) {
1037 be->decompressed_pages[pgnr] = bvec->page;
267f2492 1038 return;
63bbb856 1039 }
267f2492
GX
1040 }
1041
1042 /* (cold path) one pcluster is requested multiple times */
1043 item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
1044 item->bvec = *bvec;
1045 list_add(&item->list, &be->decompressed_secondary_bvecs);
1046}
1047
1048static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
1049 int err)
1050{
1051 unsigned int off0 = be->pcl->pageofs_out;
1052 struct list_head *p, *n;
1053
1054 list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
1055 struct z_erofs_bvec_item *bvi;
1056 unsigned int end, cur;
1057 void *dst, *src;
1058
1059 bvi = container_of(p, struct z_erofs_bvec_item, list);
1060 cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
1061 end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
1062 bvi->bvec.end);
1063 dst = kmap_local_page(bvi->bvec.page);
1064 while (cur < end) {
1065 unsigned int pgnr, scur, len;
1066
1067 pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
1068 DBG_BUGON(pgnr >= be->nr_pages);
1069
1070 scur = bvi->bvec.offset + cur -
1071 ((pgnr << PAGE_SHIFT) - off0);
1072 len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
1073 if (!be->decompressed_pages[pgnr]) {
1074 err = -EFSCORRUPTED;
1075 cur += len;
1076 continue;
1077 }
1078 src = kmap_local_page(be->decompressed_pages[pgnr]);
1079 memcpy(dst + cur, src + scur, len);
1080 kunmap_local(src);
1081 cur += len;
1082 }
1083 kunmap_local(dst);
1084 if (err)
1085 z_erofs_page_mark_eio(bvi->bvec.page);
1086 z_erofs_onlinepage_endio(bvi->bvec.page);
1087 list_del(p);
1088 kfree(bvi);
3fe96ee0 1089 }
3fe96ee0
GX
1090}
1091
267f2492 1092static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
42fec235 1093{
4f05687f 1094 struct z_erofs_pcluster *pcl = be->pcl;
06a304cd
GX
1095 struct z_erofs_bvec_iter biter;
1096 struct page *old_bvpage;
267f2492 1097 int i;
42fec235 1098
387bab87 1099 z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
42fec235 1100 for (i = 0; i < pcl->vcnt; ++i) {
06a304cd 1101 struct z_erofs_bvec bvec;
42fec235 1102
06a304cd 1103 z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
42fec235 1104
06a304cd 1105 if (old_bvpage)
4f05687f 1106 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
42fec235 1107
06a304cd 1108 DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
267f2492 1109 z_erofs_do_decompressed_bvec(be, &bvec);
42fec235 1110 }
06a304cd
GX
1111
1112 old_bvpage = z_erofs_bvec_iter_end(&biter);
1113 if (old_bvpage)
4f05687f 1114 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
42fec235
GX
1115}
1116
4f05687f
GX
1117static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
1118 bool *overlapped)
67139e36 1119{
4f05687f 1120 struct z_erofs_pcluster *pcl = be->pcl;
67139e36 1121 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
67139e36
GX
1122 int i, err = 0;
1123
67139e36 1124 *overlapped = false;
67139e36 1125 for (i = 0; i < pclusterpages; ++i) {
ed722fbc
GX
1126 struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
1127 struct page *page = bvec->page;
67139e36
GX
1128
1129 /* compressed pages ought to be present before decompressing */
1130 if (!page) {
1131 DBG_BUGON(1);
1132 continue;
1133 }
fe3e5914 1134 be->compressed_pages[i] = page;
67139e36
GX
1135
1136 if (z_erofs_is_inline_pcluster(pcl)) {
1137 if (!PageUptodate(page))
1138 err = -EIO;
1139 continue;
1140 }
1141
1142 DBG_BUGON(z_erofs_page_is_invalidated(page));
1143 if (!z_erofs_is_shortlived_page(page)) {
4f05687f 1144 if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
67139e36
GX
1145 if (!PageUptodate(page))
1146 err = -EIO;
1147 continue;
1148 }
267f2492 1149 z_erofs_do_decompressed_bvec(be, bvec);
67139e36
GX
1150 *overlapped = true;
1151 }
67139e36
GX
1152 }
1153
fe3e5914 1154 if (err)
4f05687f 1155 return err;
4f05687f 1156 return 0;
67139e36
GX
1157}
1158
4f05687f
GX
1159static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
1160 int err)
3883a79a 1161{
4f05687f
GX
1162 struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
1163 struct z_erofs_pcluster *pcl = be->pcl;
cecf864d 1164 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
2bfab9c0 1165 unsigned int i, inputsize;
67148551 1166 int err2;
2bfab9c0
GX
1167 struct page *page;
1168 bool overlapped;
3883a79a 1169
87ca34a7 1170 mutex_lock(&pcl->lock);
2bfab9c0 1171 be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
3883a79a 1172
fe3e5914
GX
1173 /* allocate (de)compressed page arrays if cannot be kept on stack */
1174 be->decompressed_pages = NULL;
1175 be->compressed_pages = NULL;
1176 be->onstack_used = 0;
2bfab9c0 1177 if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
4f05687f 1178 be->decompressed_pages = be->onstack_pages;
2bfab9c0 1179 be->onstack_used = be->nr_pages;
4f05687f 1180 memset(be->decompressed_pages, 0,
2bfab9c0 1181 sizeof(struct page *) * be->nr_pages);
fe3e5914
GX
1182 }
1183
1184 if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
1185 be->compressed_pages = be->onstack_pages + be->onstack_used;
1186
1187 if (!be->decompressed_pages)
4f05687f 1188 be->decompressed_pages =
12724ba3
GX
1189 kcalloc(be->nr_pages, sizeof(struct page *),
1190 GFP_KERNEL | __GFP_NOFAIL);
fe3e5914
GX
1191 if (!be->compressed_pages)
1192 be->compressed_pages =
12724ba3
GX
1193 kcalloc(pclusterpages, sizeof(struct page *),
1194 GFP_KERNEL | __GFP_NOFAIL);
3883a79a 1195
267f2492 1196 z_erofs_parse_out_bvecs(be);
4f05687f 1197 err2 = z_erofs_parse_in_bvecs(be, &overlapped);
67148551
GX
1198 if (err2)
1199 err = err2;
8d8a09b0 1200 if (err)
11152496
GX
1201 goto out;
1202
cecf864d
YH
1203 if (z_erofs_is_inline_pcluster(pcl))
1204 inputsize = pcl->tailpacking_size;
1205 else
1206 inputsize = pclusterpages * PAGE_SIZE;
1207
88aaf5a7 1208 err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
4f05687f
GX
1209 .sb = be->sb,
1210 .in = be->compressed_pages,
1211 .out = be->decompressed_pages,
cecf864d 1212 .pageofs_in = pcl->pageofs_in,
87ca34a7 1213 .pageofs_out = pcl->pageofs_out,
9f6cc76e 1214 .inputsize = inputsize,
2bfab9c0 1215 .outputsize = pcl->length,
97e86a85 1216 .alg = pcl->algorithmformat,
88aaf5a7 1217 .inplace_io = overlapped,
2bfab9c0 1218 .partial_decoding = pcl->partial,
267f2492 1219 .fillgaps = pcl->multibases,
4f05687f 1220 }, be->pagepool);
3883a79a
GX
1221
1222out:
cecf864d
YH
1223 /* must handle all compressed pages before actual file pages */
1224 if (z_erofs_is_inline_pcluster(pcl)) {
ed722fbc
GX
1225 page = pcl->compressed_bvecs[0].page;
1226 WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
cecf864d
YH
1227 put_page(page);
1228 } else {
1229 for (i = 0; i < pclusterpages; ++i) {
ed722fbc 1230 page = pcl->compressed_bvecs[i].page;
d61fbb6b 1231
cecf864d
YH
1232 if (erofs_page_is_managed(sbi, page))
1233 continue;
af692e11 1234
cecf864d 1235 /* recycle all individual short-lived pages */
4f05687f 1236 (void)z_erofs_put_shortlivedpage(be->pagepool, page);
ed722fbc 1237 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
cecf864d 1238 }
af692e11 1239 }
fe3e5914
GX
1240 if (be->compressed_pages < be->onstack_pages ||
1241 be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
12724ba3 1242 kfree(be->compressed_pages);
267f2492 1243 z_erofs_fill_other_copies(be, err);
af692e11 1244
2bfab9c0 1245 for (i = 0; i < be->nr_pages; ++i) {
4f05687f 1246 page = be->decompressed_pages[i];
af692e11
GX
1247 if (!page)
1248 continue;
1249
6aaa7b06 1250 DBG_BUGON(z_erofs_page_is_invalidated(page));
3883a79a 1251
6aaa7b06 1252 /* recycle all individual short-lived pages */
4f05687f 1253 if (z_erofs_put_shortlivedpage(be->pagepool, page))
3883a79a 1254 continue;
67148551
GX
1255 if (err)
1256 z_erofs_page_mark_eio(page);
3883a79a
GX
1257 z_erofs_onlinepage_endio(page);
1258 }
1259
4f05687f 1260 if (be->decompressed_pages != be->onstack_pages)
12724ba3 1261 kfree(be->decompressed_pages);
3883a79a 1262
2bfab9c0
GX
1263 pcl->length = 0;
1264 pcl->partial = true;
267f2492 1265 pcl->multibases = false;
06a304cd 1266 pcl->bvset.nextpage = NULL;
87ca34a7 1267 pcl->vcnt = 0;
3883a79a 1268
87ca34a7 1269 /* pcluster lock MUST be taken before the following line */
97e86a85 1270 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
87ca34a7 1271 mutex_unlock(&pcl->lock);
3883a79a
GX
1272 return err;
1273}
1274
0c638f70 1275static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
eaa9172a 1276 struct page **pagepool)
3883a79a 1277{
4f05687f
GX
1278 struct z_erofs_decompress_backend be = {
1279 .sb = io->sb,
1280 .pagepool = pagepool,
267f2492
GX
1281 .decompressed_secondary_bvecs =
1282 LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
4f05687f 1283 };
97e86a85 1284 z_erofs_next_pcluster_t owned = io->head;
3883a79a 1285
97e86a85 1286 while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
4f05687f 1287 /* impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
97e86a85 1288 DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
4f05687f 1289 /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
97e86a85 1290 DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
3883a79a 1291
4f05687f
GX
1292 be.pcl = container_of(owned, struct z_erofs_pcluster, next);
1293 owned = READ_ONCE(be.pcl->next);
3883a79a 1294
4f05687f
GX
1295 z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
1296 erofs_workgroup_put(&be.pcl->obj);
3978c8e3 1297 }
3883a79a
GX
1298}
1299
0c638f70 1300static void z_erofs_decompressqueue_work(struct work_struct *work)
3883a79a 1301{
a4b1fab1
GX
1302 struct z_erofs_decompressqueue *bgq =
1303 container_of(work, struct z_erofs_decompressqueue, u.work);
eaa9172a 1304 struct page *pagepool = NULL;
3883a79a 1305
a4b1fab1 1306 DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
0c638f70 1307 z_erofs_decompress_queue(bgq, &pagepool);
3883a79a 1308
eaa9172a 1309 erofs_release_pages(&pagepool);
a4b1fab1 1310 kvfree(bgq);
3883a79a
GX
1311}
1312
7865827c 1313static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
cdba5506 1314 int bios)
7865827c
GX
1315{
1316 struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
1317
1318 /* wake up the caller thread for sync decompression */
cdba5506 1319 if (io->sync) {
7865827c 1320 if (!atomic_add_return(bios, &io->pending_bios))
60b30050 1321 complete(&io->u.done);
7865827c
GX
1322 return;
1323 }
1324
1325 if (atomic_add_return(bios, &io->pending_bios))
1326 return;
1327 /* Use workqueue and sync decompression for atomic contexts only */
1328 if (in_atomic() || irqs_disabled()) {
1329 queue_work(z_erofs_workqueue, &io->u.work);
1330 /* enable sync decompression for readahead */
1331 if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
1332 sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
1333 return;
1334 }
1335 z_erofs_decompressqueue_work(&io->u.work);
1336}
1337
97e86a85
GX
1338static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
1339 unsigned int nr,
eaa9172a 1340 struct page **pagepool,
9f2731d6 1341 struct address_space *mc)
9248fce7 1342{
97e86a85 1343 const pgoff_t index = pcl->obj.index;
9f2731d6 1344 gfp_t gfp = mapping_gfp_mask(mc);
9248fce7
GX
1345 bool tocache = false;
1346
1347 struct address_space *mapping;
1348 struct page *oldpage, *page;
92e6efd5
GX
1349 int justfound;
1350
9248fce7 1351repeat:
ed722fbc 1352 page = READ_ONCE(pcl->compressed_bvecs[nr].page);
9248fce7
GX
1353 oldpage = page;
1354
1355 if (!page)
1356 goto out_allocpage;
1357
b1ed220c
GX
1358 justfound = (unsigned long)page & 1UL;
1359 page = (struct page *)((unsigned long)page & ~1UL);
92e6efd5 1360
1825c8d7
GX
1361 /*
1362 * preallocated cached pages, which is used to avoid direct reclaim
1363 * otherwise, it will go inplace I/O path instead.
1364 */
1365 if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
ed722fbc 1366 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
1825c8d7
GX
1367 set_page_private(page, 0);
1368 tocache = true;
1369 goto out_tocache;
1370 }
9248fce7
GX
1371 mapping = READ_ONCE(page->mapping);
1372
9248fce7 1373 /*
6aaa7b06 1374 * file-backed online pages in the pcluster are all locked steadily,
9248fce7
GX
1375 * therefore it is impossible for `mapping' to be NULL.
1376 */
1377 if (mapping && mapping != mc)
1378 /* ought to be unmanaged pages */
1379 goto out;
1380
6aaa7b06
GX
1381 /* directly return for shortlived page as well */
1382 if (z_erofs_is_shortlived_page(page))
1383 goto out;
1384
9248fce7
GX
1385 lock_page(page);
1386
92e6efd5
GX
1387 /* only true if page reclaim goes wrong, should never happen */
1388 DBG_BUGON(justfound && PagePrivate(page));
1389
9248fce7
GX
1390 /* the page is still in manage cache */
1391 if (page->mapping == mc) {
ed722fbc 1392 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
9248fce7
GX
1393
1394 if (!PagePrivate(page)) {
92e6efd5
GX
1395 /*
1396 * also impossible to be !PagePrivate(page) under
1397 * the current restriction if
ed722fbc 1398 * the page is already in compressed_bvecs[].
92e6efd5
GX
1399 */
1400 DBG_BUGON(!justfound);
1401
1402 justfound = 0;
97e86a85 1403 set_page_private(page, (unsigned long)pcl);
9248fce7
GX
1404 SetPagePrivate(page);
1405 }
1406
1407 /* no need to submit io if it is already up-to-date */
1408 if (PageUptodate(page)) {
1409 unlock_page(page);
1410 page = NULL;
1411 }
1412 goto out;
1413 }
1414
1415 /*
1416 * the managed page has been truncated, it's unsafe to
1417 * reuse this one, let's allocate a new cache-managed page.
1418 */
1419 DBG_BUGON(page->mapping);
92e6efd5 1420 DBG_BUGON(!justfound);
9248fce7
GX
1421
1422 tocache = true;
1423 unlock_page(page);
1424 put_page(page);
1425out_allocpage:
5ddcee1f 1426 page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
ed722fbc
GX
1427 if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
1428 oldpage, page)) {
eaa9172a 1429 erofs_pagepool_add(pagepool, page);
5ddcee1f
GX
1430 cond_resched();
1431 goto repeat;
1432 }
1825c8d7 1433out_tocache:
bf225074
GX
1434 if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
1435 /* turn into temporary page if fails (1 ref) */
1436 set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
1437 goto out;
a30573b3 1438 }
bf225074
GX
1439 attach_page_private(page, pcl);
1440 /* drop a refcount added by allocpage (then we have 2 refs here) */
1441 put_page(page);
1442
9248fce7
GX
1443out: /* the only exit (for tracing and debugging) */
1444 return page;
1445}
1446
cdba5506
GX
1447static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
1448 struct z_erofs_decompressqueue *fgq, bool *fg)
3883a79a 1449{
a4b1fab1 1450 struct z_erofs_decompressqueue *q;
3883a79a 1451
a4b1fab1
GX
1452 if (fg && !*fg) {
1453 q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
1454 if (!q) {
1455 *fg = true;
1456 goto fg_out;
1457 }
0c638f70 1458 INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
a4b1fab1
GX
1459 } else {
1460fg_out:
1461 q = fgq;
60b30050 1462 init_completion(&fgq->u.done);
a4b1fab1 1463 atomic_set(&fgq->pending_bios, 0);
67148551 1464 q->eio = false;
cdba5506 1465 q->sync = true;
3883a79a 1466 }
a4b1fab1
GX
1467 q->sb = sb;
1468 q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1469 return q;
3883a79a
GX
1470}
1471
97e86a85 1472/* define decompression jobqueue types */
7146a4f0 1473enum {
7146a4f0 1474 JQ_BYPASS,
7146a4f0
GX
1475 JQ_SUBMIT,
1476 NR_JOBQUEUES,
1477};
1478
97e86a85
GX
1479static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
1480 z_erofs_next_pcluster_t qtail[],
1481 z_erofs_next_pcluster_t owned_head)
7146a4f0 1482{
97e86a85
GX
1483 z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
1484 z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
7146a4f0 1485
97e86a85
GX
1486 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1487 if (owned_head == Z_EROFS_PCLUSTER_TAIL)
1488 owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
7146a4f0 1489
97e86a85 1490 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
7146a4f0
GX
1491
1492 WRITE_ONCE(*submit_qtail, owned_head);
97e86a85 1493 WRITE_ONCE(*bypass_qtail, &pcl->next);
7146a4f0 1494
97e86a85 1495 qtail[JQ_BYPASS] = &pcl->next;
7146a4f0
GX
1496}
1497
7865827c
GX
1498static void z_erofs_decompressqueue_endio(struct bio *bio)
1499{
cdba5506 1500 struct z_erofs_decompressqueue *q = bio->bi_private;
7865827c
GX
1501 blk_status_t err = bio->bi_status;
1502 struct bio_vec *bvec;
1503 struct bvec_iter_all iter_all;
1504
1505 bio_for_each_segment_all(bvec, bio, iter_all) {
1506 struct page *page = bvec->bv_page;
1507
1508 DBG_BUGON(PageUptodate(page));
1509 DBG_BUGON(z_erofs_page_is_invalidated(page));
1510
7865827c
GX
1511 if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
1512 if (!err)
1513 SetPageUptodate(page);
1514 unlock_page(page);
1515 }
1516 }
67148551
GX
1517 if (err)
1518 q->eio = true;
cdba5506 1519 z_erofs_decompress_kickoff(q, -1);
7865827c
GX
1520 bio_put(bio);
1521}
1522
83a386c0 1523static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
eaa9172a 1524 struct page **pagepool,
0c638f70
GX
1525 struct z_erofs_decompressqueue *fgq,
1526 bool *force_fg)
3883a79a 1527{
83a386c0
GX
1528 struct super_block *sb = f->inode->i_sb;
1529 struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
97e86a85 1530 z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
a4b1fab1 1531 struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
5c6dcc57 1532 z_erofs_next_pcluster_t owned_head = f->owned_head;
dfeab2e9 1533 /* bio is NULL initially, so no need to initialize last_{index,bdev} */
3f649ab7 1534 pgoff_t last_index;
dfeab2e9 1535 struct block_device *last_bdev;
1e4a2955
GX
1536 unsigned int nr_bios = 0;
1537 struct bio *bio = NULL;
82e60d00
JW
1538 unsigned long pflags;
1539 int memstall = 0;
3883a79a 1540
cdba5506
GX
1541 /*
1542 * if managed cache is enabled, a bypass jobqueue is needed:
1543 * pclusters in that queue never need to be read from the device.
1544 */
1545 q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
1546 q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);
1547
a4b1fab1
GX
1548 qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
1549 qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
3883a79a
GX
1550
1551 /* by default, all need io submission */
7146a4f0 1552 q[JQ_SUBMIT]->head = owned_head;
3883a79a
GX
1553
1554 do {
dfeab2e9 1555 struct erofs_map_dev mdev;
97e86a85 1556 struct z_erofs_pcluster *pcl;
1e4a2955
GX
1557 pgoff_t cur, end;
1558 unsigned int i = 0;
1559 bool bypass = true;
3883a79a
GX
1560
1561 /* it is impossible that 'owned_head' equals the following */
97e86a85
GX
1562 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1563 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
1564
1565 pcl = container_of(owned_head, struct z_erofs_pcluster, next);
3883a79a 1566
cecf864d
YH
1567 /* close the main owned chain at first */
1568 owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
1569 Z_EROFS_PCLUSTER_TAIL_CLOSED);
1570 if (z_erofs_is_inline_pcluster(pcl)) {
1571 move_to_bypass_jobqueue(pcl, qtail, owned_head);
1572 continue;
1573 }
1574
dfeab2e9
GX
1575 /* no device id here, thus it will always succeed */
1576 mdev = (struct erofs_map_dev) {
1577 .m_pa = blknr_to_addr(pcl->obj.index),
1578 };
1579 (void)erofs_map_dev(sb, &mdev);
1580
1581 cur = erofs_blknr(mdev.m_pa);
9f6cc76e 1582 end = cur + pcl->pclusterpages;
3883a79a 1583
1e4a2955
GX
1584 do {
1585 struct page *page;
3883a79a 1586
1e4a2955 1587 page = pickup_page_for_submission(pcl, i++, pagepool,
83a386c0 1588 mc);
1e4a2955
GX
1589 if (!page)
1590 continue;
3883a79a 1591
dfeab2e9
GX
1592 if (bio && (cur != last_index + 1 ||
1593 last_bdev != mdev.m_bdev)) {
3883a79a 1594submit_bio_retry:
1e4a2955 1595 submit_bio(bio);
82e60d00
JW
1596 if (memstall) {
1597 psi_memstall_leave(&pflags);
1598 memstall = 0;
1599 }
1e4a2955
GX
1600 bio = NULL;
1601 }
a5c0b780 1602
82e60d00 1603 if (unlikely(PageWorkingset(page)) && !memstall) {
99486c51 1604 psi_memstall_enter(&pflags);
82e60d00
JW
1605 memstall = 1;
1606 }
99486c51 1607
1e4a2955 1608 if (!bio) {
07888c66
CH
1609 bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
1610 REQ_OP_READ, GFP_NOIO);
1e4a2955 1611 bio->bi_end_io = z_erofs_decompressqueue_endio;
dfeab2e9 1612
dfeab2e9 1613 last_bdev = mdev.m_bdev;
1e4a2955
GX
1614 bio->bi_iter.bi_sector = (sector_t)cur <<
1615 LOG_SECTORS_PER_BLOCK;
cdba5506 1616 bio->bi_private = q[JQ_SUBMIT];
6ea5aad3
GX
1617 if (f->readahead)
1618 bio->bi_opf |= REQ_RAHEAD;
1e4a2955
GX
1619 ++nr_bios;
1620 }
3883a79a 1621
6c3e485e 1622 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
1e4a2955 1623 goto submit_bio_retry;
3883a79a 1624
1e4a2955
GX
1625 last_index = cur;
1626 bypass = false;
1627 } while (++cur < end);
105d4ad8 1628
1e4a2955 1629 if (!bypass)
97e86a85 1630 qtail[JQ_SUBMIT] = &pcl->next;
7146a4f0 1631 else
97e86a85
GX
1632 move_to_bypass_jobqueue(pcl, qtail, owned_head);
1633 } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
3883a79a 1634
99486c51 1635 if (bio) {
94e4e153 1636 submit_bio(bio);
82e60d00
JW
1637 if (memstall)
1638 psi_memstall_leave(&pflags);
99486c51 1639 }
3883a79a 1640
587a67b7
GX
1641 /*
1642 * although background is preferred, nothing is pending for submission;
1643 * don't issue the workqueue for decompression but drop it directly instead.
1644 */
1645 if (!*force_fg && !nr_bios) {
1646 kvfree(q[JQ_SUBMIT]);
1e4a2955 1647 return;
587a67b7 1648 }
cdba5506 1649 z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
3883a79a
GX
1650}
1651
83a386c0 1652static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
eaa9172a 1653 struct page **pagepool, bool force_fg)
3883a79a 1654{
a4b1fab1 1655 struct z_erofs_decompressqueue io[NR_JOBQUEUES];
3883a79a 1656
5c6dcc57 1657 if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
3883a79a 1658 return;
83a386c0 1659 z_erofs_submit_queue(f, pagepool, io, &force_fg);
3883a79a 1660
0c638f70
GX
1661 /* handle bypass queue (no i/o pclusters) immediately */
1662 z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
4279f3f9 1663
3883a79a
GX
1664 if (!force_fg)
1665 return;
1666
1667 /* wait until all bios are completed */
60b30050 1668 wait_for_completion_io(&io[JQ_SUBMIT].u.done);
3883a79a 1669
0c638f70
GX
1670 /* handle synchronous decompress queue in the caller context */
1671 z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
3883a79a
GX
1672}
1673
38629291
GX
1674/*
1675 * Since partial uptodate is still unimplemented, we have to use
1676 * approximate readmore strategies as a start.
1677 */
1678static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
1679 struct readahead_control *rac,
1680 erofs_off_t end,
eaa9172a 1681 struct page **pagepool,
38629291
GX
1682 bool backmost)
1683{
1684 struct inode *inode = f->inode;
1685 struct erofs_map_blocks *map = &f->map;
1686 erofs_off_t cur;
1687 int err;
1688
1689 if (backmost) {
1690 map->m_la = end;
622ceadd
GX
1691 err = z_erofs_map_blocks_iter(inode, map,
1692 EROFS_GET_BLOCKS_READMORE);
38629291
GX
1693 if (err)
1694 return;
1695
1696 /* expand ra for the trailing edge if readahead */
1697 if (rac) {
1698 loff_t newstart = readahead_pos(rac);
1699
1700 cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
1701 readahead_expand(rac, newstart, cur - newstart);
1702 return;
1703 }
1704 end = round_up(end, PAGE_SIZE);
1705 } else {
1706 end = round_up(map->m_la, PAGE_SIZE);
1707
1708 if (!map->m_llen)
1709 return;
1710 }
1711
1712 cur = map->m_la + map->m_llen - 1;
1713 while (cur >= end) {
1714 pgoff_t index = cur >> PAGE_SHIFT;
1715 struct page *page;
1716
1717 page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
aa793b46
GX
1718 if (page) {
1719 if (PageUptodate(page)) {
1720 unlock_page(page);
1721 } else {
1722 err = z_erofs_do_read_page(f, page, pagepool);
1723 if (err)
1724 erofs_err(inode->i_sb,
1725 "readmore error at page %lu @ nid %llu",
1726 index, EROFS_I(inode)->nid);
1727 }
38629291 1728 put_page(page);
38629291
GX
1729 }
1730
38629291
GX
1731 if (cur < PAGE_SIZE)
1732 break;
1733 cur = (index << PAGE_SHIFT) - 1;
1734 }
1735}
1736
a2e20a25 1737static int z_erofs_read_folio(struct file *file, struct folio *folio)
3883a79a 1738{
a2e20a25 1739 struct page *page = &folio->page;
3883a79a 1740 struct inode *const inode = page->mapping->host;
40452ffc 1741 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
97e86a85 1742 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
eaa9172a 1743 struct page *pagepool = NULL;
3883a79a 1744 int err;
3883a79a 1745
ba9ce771 1746 trace_erofs_readpage(page, false);
f0c519fc
GX
1747 f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
1748
38629291
GX
1749 z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
1750 &pagepool, true);
1825c8d7 1751 err = z_erofs_do_read_page(&f, page, &pagepool);
38629291
GX
1752 z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
1753
5c6dcc57 1754 (void)z_erofs_collector_end(&f);
3883a79a 1755
ee45197c 1756 /* if some compressed cluster ready, need submit them anyway */
83a386c0 1757 z_erofs_runqueue(&f, &pagepool,
40452ffc 1758 z_erofs_get_sync_decompress_policy(sbi, 0));
ee45197c
GX
1759
1760 if (err)
4f761fa2 1761 erofs_err(inode->i_sb, "failed to read, err [%d]", err);
3883a79a 1762
09c54379 1763 erofs_put_metabuf(&f.map.buf);
eaa9172a 1764 erofs_release_pages(&pagepool);
ee45197c 1765 return err;
3883a79a
GX
1766}
1767
0615090c 1768static void z_erofs_readahead(struct readahead_control *rac)
3883a79a 1769{
0615090c 1770 struct inode *const inode = rac->mapping->host;
5fb76bb0 1771 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
97e86a85 1772 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
eaa9172a 1773 struct page *pagepool = NULL, *head = NULL, *page;
38629291 1774 unsigned int nr_pages;
3883a79a 1775
6ea5aad3 1776 f.readahead = true;
0615090c 1777 f.headoffset = readahead_pos(rac);
3883a79a 1778
38629291
GX
1779 z_erofs_pcluster_readmore(&f, rac, f.headoffset +
1780 readahead_length(rac) - 1, &pagepool, true);
1781 nr_pages = readahead_count(rac);
1782 trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
2d9b5dcd 1783
38629291 1784 while ((page = readahead_page(rac))) {
3883a79a
GX
1785 set_page_private(page, (unsigned long)head);
1786 head = page;
1787 }
1788
42d40b4a 1789 while (head) {
3883a79a
GX
1790 struct page *page = head;
1791 int err;
1792
1793 /* traversal in reverse order */
1794 head = (void *)page_private(page);
1795
1825c8d7 1796 err = z_erofs_do_read_page(&f, page, &pagepool);
a5876e24 1797 if (err)
4f761fa2
GX
1798 erofs_err(inode->i_sb,
1799 "readahead error at page %lu @ nid %llu",
1800 page->index, EROFS_I(inode)->nid);
3883a79a
GX
1801 put_page(page);
1802 }
38629291 1803 z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
5c6dcc57 1804 (void)z_erofs_collector_end(&f);
3883a79a 1805
83a386c0 1806 z_erofs_runqueue(&f, &pagepool,
40452ffc 1807 z_erofs_get_sync_decompress_policy(sbi, nr_pages));
09c54379 1808 erofs_put_metabuf(&f.map.buf);
eaa9172a 1809 erofs_release_pages(&pagepool);
3883a79a
GX
1810}
1811
0c638f70 1812const struct address_space_operations z_erofs_aops = {
a2e20a25 1813 .read_folio = z_erofs_read_folio,
0615090c 1814 .readahead = z_erofs_readahead,
3883a79a 1815};