// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Copyright (C) 2022 Alibaba Cloud
 */
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>

#include <trace/events/erofs.h>

/*
 * Since pclustersize is variable with the big pcluster feature, introduce
 * slab pools for each of the supported pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;
	unsigned int maxpages;
	char name[48];
};

#define _PCLP(n) { .maxpages = n }

static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};

/* (obsoleted) page type for online pages */
enum z_erofs_page_type {
	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
	Z_EROFS_PAGE_TYPE_EXCLUSIVE,

	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,

	Z_EROFS_VLE_PAGE_TYPE_HEAD,
	Z_EROFS_VLE_PAGE_TYPE_MAX
};

struct z_erofs_bvec_iter {
	struct page *bvpage;
	struct z_erofs_bvset *bvset;
	unsigned int nr, cur;
};

static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
{
	if (iter->bvpage)
		kunmap_local(iter->bvset);
	return iter->bvpage;
}

static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
{
	unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
	/* have to access nextpage in advance, otherwise it will be unmapped */
	struct page *nextpage = iter->bvset->nextpage;
	struct page *oldpage;

	DBG_BUGON(!nextpage);
	oldpage = z_erofs_bvec_iter_end(iter);
	iter->bvpage = nextpage;
	iter->bvset = kmap_local_page(nextpage);
	iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
	iter->cur = 0;
	return oldpage;
}

static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
				    struct z_erofs_bvset_inline *bvset,
				    unsigned int bootstrap_nr,
				    unsigned int cur)
{
	*iter = (struct z_erofs_bvec_iter) {
		.nr = bootstrap_nr,
		.bvset = (struct z_erofs_bvset *)bvset,
	};

	while (cur > iter->nr) {
		cur -= iter->nr;
		z_erofs_bvset_flip(iter);
	}
	iter->cur = cur;
}

static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage)
{
	if (iter->cur == iter->nr) {
		if (!*candidate_bvpage)
			return -EAGAIN;

		DBG_BUGON(iter->bvset->nextpage);
		iter->bvset->nextpage = *candidate_bvpage;
		z_erofs_bvset_flip(iter);

		iter->bvset->nextpage = NULL;
		*candidate_bvpage = NULL;
	}
	iter->bvset->bvec[iter->cur++] = *bvec;
	return 0;
}

static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
				 struct z_erofs_bvec *bvec,
				 struct page **old_bvpage)
{
	if (iter->cur == iter->nr)
		*old_bvpage = z_erofs_bvset_flip(iter);
	else
		*old_bvpage = NULL;
	*bvec = iter->bvset->bvec[iter->cur++];
}

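/*
 * Illustrative sketch (not part of the original file): the intended
 * enqueue pattern, mirroring the retry loop used later in
 * z_erofs_do_read_page().  -EAGAIN means the current bvset page is full,
 * so a fresh short-lived page has to be handed in as the next bvpage
 * before retrying; the function name below is hypothetical.
 */
#if 0
static int example_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage)
{
	int err = z_erofs_bvec_enqueue(iter, bvec, candidate_bvpage);

	if (err == -EAGAIN && !*candidate_bvpage) {
		/* supply one short-lived page to back the next bvset */
		*candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
		set_page_private(*candidate_bvpage, Z_EROFS_SHORTLIVED_PAGE);
		err = z_erofs_bvec_enqueue(iter, bvec, candidate_bvpage);
	}
	return err;
}
#endif
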
static void z_erofs_destroy_pcluster_pool(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		if (!pcluster_pool[i].slab)
			continue;
		kmem_cache_destroy(pcluster_pool[i].slab);
		pcluster_pool[i].slab = NULL;
	}
}

static int z_erofs_create_pcluster_pool(void)
{
	struct z_erofs_pcluster_slab *pcs;
	struct z_erofs_pcluster *a;
	unsigned int size;

	for (pcs = pcluster_pool;
	     pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		size = struct_size(a, compressed_pages, pcs->maxpages);

		sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
		pcs->slab = kmem_cache_create(pcs->name, size, 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
		if (pcs->slab)
			continue;

		z_erofs_destroy_pcluster_pool();
		return -ENOMEM;
	}
	return 0;
}

static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
		struct z_erofs_pcluster *pcl;

		if (nrpages > pcs->maxpages)
			continue;

		pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
		if (!pcl)
			return ERR_PTR(-ENOMEM);
		pcl->pclusterpages = nrpages;
		return pcl;
	}
	return ERR_PTR(-EINVAL);
}

static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;

		if (pclusterpages > pcs->maxpages)
			continue;

		kmem_cache_free(pcs->slab, pcl);
		return;
	}
	DBG_BUGON(1);
}

/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
	DONTALLOC,	/* don't allocate any cached pages */
	/*
	 * try to use cached I/O if page allocation succeeds or fallback
	 * to in-place I/O instead to avoid any direct reclaim.
	 */
	TRYALLOC,
};

/*
 * tagged pointer with 1-bit tag for all compressed pages
 * tag 1 - the page is just found with an extra page reference
 */
typedef tagptr1_t compressed_page_t;

#define tag_compressed_page_justfound(page) \
	tagptr_fold(compressed_page_t, page, 1)

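/*
 * Illustrative sketch (not part of the original file): a fold/unfold
 * round trip of the tagged pointer above, using the same tagptr helpers
 * that the submission path relies on; the function name is hypothetical.
 */
#if 0
static void example_tagptr_roundtrip(struct page *page)
{
	compressed_page_t t = tag_compressed_page_justfound(page);

	/* the tag travels in the low pointer bit; both parts are recoverable */
	DBG_BUGON(tagptr_unfold_ptr(t) != page);
	DBG_BUGON(tagptr_unfold_tags(t) != 1);	/* 1 == justfound */
}
#endif
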
static struct workqueue_struct *z_erofs_workqueue __read_mostly;

void z_erofs_exit_zip_subsystem(void)
{
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
}

static inline int z_erofs_init_workqueue(void)
{
	const unsigned int onlinecpus = num_possible_cpus();

	/*
	 * no need to spawn too many threads: capping the thread count
	 * reduces scheduling overhead; perhaps per-CPU threads would be
	 * even better?
	 */
	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
					    WQ_UNBOUND | WQ_HIGHPRI,
					    onlinecpus + onlinecpus / 4);
	return z_erofs_workqueue ? 0 : -ENOMEM;
}

int __init z_erofs_init_zip_subsystem(void)
{
	int err = z_erofs_create_pcluster_pool();

	if (err)
		return err;
	err = z_erofs_init_workqueue();
	if (err)
		z_erofs_destroy_pcluster_pool();
	return err;
}

enum z_erofs_collectmode {
	COLLECT_SECONDARY,
	COLLECT_PRIMARY,
	/*
	 * The current collection is the tail of an existing chain, and the
	 * previously processed chained collections have all been decided to
	 * be hooked up to it.
	 * A new chain will be created for the remaining collections which
	 * aren't processed yet; therefore, unlike COLLECT_PRIMARY_FOLLOWED,
	 * the next collection cannot reuse the whole page safely in
	 * the following scenario:
	 *  ________________________________________________________________
	 * |      tail (partial) page     |      head (partial) page        |
	 * |   (belongs to the next cl)   |  (belongs to the current cl)    |
	 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
	 */
	COLLECT_PRIMARY_HOOKED,
	/*
	 * a weak form of COLLECT_PRIMARY_FOLLOWED; the difference is that it
	 * could be dispatched into the bypass queue later due to up-to-date
	 * managed pages. All related online pages cannot be reused for
	 * inplace I/O (or bvpage) since they can be directly decoded without
	 * I/O submission.
	 */
	COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
	/*
	 * The current collection has been linked with the owned chain, and
	 * could also be linked with the remaining collections, which means
	 * if the processing page is the tail page of the collection, the
	 * current collection can safely use the whole page (since
	 * the previous collection is under control) for in-place I/O, as
	 * illustrated below:
	 *  ________________________________________________________________
	 * |  tail (partial) page |          head (partial) page           |
	 * |  (of the current cl) |      (of the previous collection)      |
	 * |  PRIMARY_FOLLOWED or |                                        |
	 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
	 *
	 * [  (*) the above page can be used for inplace I/O.              ]
	 */
	COLLECT_PRIMARY_FOLLOWED,
};

struct z_erofs_decompress_frontend {
	struct inode *const inode;
	struct erofs_map_blocks map;
	struct z_erofs_bvec_iter biter;

	struct page *candidate_bvpage;
	struct z_erofs_pcluster *pcl, *tailpcl;
	/* a pointer used to pick up inplace I/O pages */
	struct page **icpage_ptr;
	z_erofs_next_pcluster_t owned_head;

	enum z_erofs_collectmode mode;

	bool readahead;
	/* used for applying cache strategy on the fly */
	bool backmost;
	erofs_off_t headoffset;
};

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true }

static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);

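/*
 * Illustrative sketch (not part of the original file): a condensed view
 * of how a read path is expected to drive the frontend above for one
 * locked page, assuming synchronous decompression; tracing and error
 * handling are omitted and the function name is hypothetical.
 */
#if 0
static int example_read_one_page(struct inode *inode, struct page *page)
{
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct page *pagepool = NULL;
	int err;

	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
	err = z_erofs_do_read_page(&f, page, &pagepool);
	(void)z_erofs_collector_end(&f);

	/* submit any chained pclusters even if an error occurred */
	z_erofs_runqueue(&f, &pagepool, true);
	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&pagepool);
	return err;
}
#endif
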
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
			       enum z_erofs_cache_alloctype type,
			       struct page **pagepool)
{
	struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
	struct z_erofs_pcluster *pcl = fe->pcl;
	bool standalone = true;
	/*
	 * optimistic allocation without direct reclaim since in-place I/O
	 * can be used as a fallback under low memory conditions.
	 */
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
	struct page **pages;
	pgoff_t index;

	if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
		return;

	pages = pcl->compressed_pages;
	index = pcl->obj.index;
	for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
		struct page *page;
		compressed_page_t t;
		struct page *newpage = NULL;

		/* the compressed page was loaded before */
		if (READ_ONCE(*pages))
			continue;

		page = find_get_page(mc, index);

		if (page) {
			t = tag_compressed_page_justfound(page);
		} else {
			/* I/O is needed, not possible to decompress directly */
			standalone = false;
			switch (type) {
			case TRYALLOC:
				newpage = erofs_allocpage(pagepool, gfp);
				if (!newpage)
					continue;
				set_page_private(newpage,
						 Z_EROFS_PREALLOCATED_PAGE);
				t = tag_compressed_page_justfound(newpage);
				break;
			default:	/* DONTALLOC */
				continue;
			}
		}

		if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
			continue;

		if (page)
			put_page(page);
		else if (newpage)
			erofs_pagepool_add(pagepool, newpage);
	}

	/*
	 * don't do inplace I/O if all compressed pages are available in
	 * managed cache since it can be moved to the bypass queue instead.
	 */
	if (standalone)
		fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}

/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
				       struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	int i;

	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	/*
	 * refcount of workgroup is now frozen at 1,
	 * therefore no need to worry about available decompression users.
	 */
	for (i = 0; i < pcl->pclusterpages; ++i) {
		struct page *page = pcl->compressed_pages[i];

		if (!page)
			continue;

		/* block other users from reclaiming or migrating the page */
		if (!trylock_page(page))
			return -EBUSY;

		if (!erofs_page_is_managed(sbi, page))
			continue;

		/* barrier is implied in the following 'unlock_page' */
		WRITE_ONCE(pcl->compressed_pages[i], NULL);
		detach_page_private(page);
		unlock_page(page);
	}
	return 0;
}

int erofs_try_to_free_cached_page(struct page *page)
{
	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
	int ret = 0;	/* 0 - busy */

	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
		unsigned int i;

		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
		for (i = 0; i < pcl->pclusterpages; ++i) {
			if (pcl->compressed_pages[i] == page) {
				WRITE_ONCE(pcl->compressed_pages[i], NULL);
				ret = 1;
				break;
			}
		}
		erofs_workgroup_unfreeze(&pcl->obj, 1);

		if (ret)
			detach_page_private(page);
	}
	return ret;
}

/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
				   struct page *page)
{
	struct z_erofs_pcluster *const pcl = fe->pcl;

	while (fe->icpage_ptr > pcl->compressed_pages)
		if (!cmpxchg(--fe->icpage_ptr, NULL, page))
			return true;
	return false;
}

/* callers must hold the pcluster lock */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
			       struct z_erofs_bvec *bvec,
			       enum z_erofs_page_type type)
{
	int ret;

	if (fe->mode >= COLLECT_PRIMARY &&
	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) {
		/* give priority to in-place I/O using file pages first */
		if (z_erofs_try_inplace_io(fe, bvec->page))
			return 0;
		/* otherwise, check if it can be used as a bvpage */
		if (fe->mode >= COLLECT_PRIMARY_FOLLOWED &&
		    !fe->candidate_bvpage)
			fe->candidate_bvpage = bvec->page;
	}
	ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
	fe->pcl->vcnt += (ret >= 0);
	return ret;
}

static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
	struct z_erofs_pcluster *pcl = f->pcl;
	z_erofs_next_pcluster_t *owned_head = &f->owned_head;

	/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
		    *owned_head) == Z_EROFS_PCLUSTER_NIL) {
		*owned_head = &pcl->next;
		/* so we can attach this pcluster to our submission chain. */
		f->mode = COLLECT_PRIMARY_FOLLOWED;
		return;
	}

	/*
	 * type 2, link to the end of an existing open chain, be careful
	 * that its submission is controlled by the original attached chain.
	 */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
		    *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
		*owned_head = Z_EROFS_PCLUSTER_TAIL;
		f->mode = COLLECT_PRIMARY_HOOKED;
		f->tailpcl = NULL;
		return;
	}
	/* type 3, it belongs to a chain, but it isn't the end of the chain */
	f->mode = COLLECT_PRIMARY;
}

static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	struct z_erofs_pcluster *pcl = fe->pcl;
	unsigned int length;

	/* to avoid unexpected loop formed by corrupted images */
	if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	length = READ_ONCE(pcl->length);
	if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
		if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
			DBG_BUGON(1);
			return -EFSCORRUPTED;
		}
	} else {
		unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;

		if (map->m_flags & EROFS_MAP_FULL_MAPPED)
			llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;

		while (llen > length &&
		       length != cmpxchg_relaxed(&pcl->length, length, llen)) {
			cpu_relax();
			length = READ_ONCE(pcl->length);
		}
	}
	mutex_lock(&pcl->lock);
	/* used to check tail merging loop due to corrupted images */
	if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
		fe->tailpcl = pcl;

	z_erofs_try_to_claim_pcluster(fe);
	return 0;
}

static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	bool ztailpacking = map->m_flags & EROFS_MAP_META;
	struct z_erofs_pcluster *pcl;
	struct erofs_workgroup *grp;
	int err;

	if (!(map->m_flags & EROFS_MAP_ENCODED)) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	/* no available pcluster, let's allocate one */
	pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
				     map->m_plen >> PAGE_SHIFT);
	if (IS_ERR(pcl))
		return PTR_ERR(pcl);

	atomic_set(&pcl->obj.refcount, 1);
	pcl->algorithmformat = map->m_algorithmformat;
	pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
		(map->m_flags & EROFS_MAP_FULL_MAPPED ?
			Z_EROFS_PCLUSTER_FULL_LENGTH : 0);

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = fe->owned_head;
	pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	fe->mode = COLLECT_PRIMARY_FOLLOWED;

	/*
	 * lock all primary followed works before they become visible to
	 * others; mutex_trylock *never* fails for a new pcluster.
	 */
	mutex_init(&pcl->lock);
	DBG_BUGON(!mutex_trylock(&pcl->lock));

	if (ztailpacking) {
		pcl->obj.index = 0;	/* which indicates ztailpacking */
		pcl->pageofs_in = erofs_blkoff(map->m_pa);
		pcl->tailpacking_size = map->m_plen;
	} else {
		pcl->obj.index = map->m_pa >> PAGE_SHIFT;

		grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
		if (IS_ERR(grp)) {
			err = PTR_ERR(grp);
			goto err_out;
		}

		if (grp != &pcl->obj) {
			fe->pcl = container_of(grp,
					struct z_erofs_pcluster, obj);
			err = -EEXIST;
			goto err_out;
		}
	}
	/* used to check tail merging loop due to corrupted images */
	if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
		fe->tailpcl = pcl;
	fe->owned_head = &pcl->next;
	fe->pcl = pcl;
	return 0;

err_out:
	mutex_unlock(&pcl->lock);
	z_erofs_free_pcluster(pcl);
	return err;
}

static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	struct erofs_workgroup *grp = NULL;
	int ret;

	DBG_BUGON(fe->pcl);

	/* must be Z_EROFS_PCLUSTER_TAIL or point to a previous pcluster */
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

	if (!(map->m_flags & EROFS_MAP_META)) {
		grp = erofs_find_workgroup(fe->inode->i_sb,
					   map->m_pa >> PAGE_SHIFT);
	} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	if (grp) {
		fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
		ret = -EEXIST;
	} else {
		ret = z_erofs_register_pcluster(fe);
	}

	if (ret == -EEXIST) {
		ret = z_erofs_lookup_pcluster(fe);
		if (ret) {
			erofs_workgroup_put(&fe->pcl->obj);
			return ret;
		}
	} else if (ret) {
		return ret;
	}
	z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
				Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
	/* since file-backed online pages are traversed in reverse order */
	fe->icpage_ptr = fe->pcl->compressed_pages +
			z_erofs_pclusterpages(fe->pcl);
	return 0;
}

/*
 * keep in mind that referenced pclusters will only be freed
 * after an RCU grace period.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
	z_erofs_free_pcluster(container_of(head,
			struct z_erofs_pcluster, rcu));
}

void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);

	call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}

static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
	struct z_erofs_pcluster *pcl = fe->pcl;

	if (!pcl)
		return false;

	z_erofs_bvec_iter_end(&fe->biter);
	mutex_unlock(&pcl->lock);

	if (fe->candidate_bvpage) {
		DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
		fe->candidate_bvpage = NULL;
	}

	/*
	 * once all pending pages are added, don't hold the pcluster
	 * reference any longer if it isn't hosted by ourselves.
	 */
	if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
		erofs_workgroup_put(&pcl->obj);

	fe->pcl = NULL;
	return true;
}

static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
				       unsigned int cachestrategy,
				       erofs_off_t la)
{
	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (fe->backmost)
		return true;

	return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
		la < fe->headoffset;
}

static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page, struct page **pagepool)
{
	struct inode *const inode = fe->inode;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct erofs_map_blocks *const map = &fe->map;
	const loff_t offset = page_offset(page);
	bool tight = true;

	enum z_erofs_cache_alloctype cache_strategy;
	enum z_erofs_page_type page_type;
	unsigned int cur, end, split, index;
	int err = 0;

	/* register locked file pages as online pages in pack */
	z_erofs_onlinepage_init(page);

	split = 0;
	end = PAGE_SIZE;
repeat:
	cur = end - 1;

	if (offset + cur < map->m_la ||
	    offset + cur >= map->m_la + map->m_llen) {
		erofs_dbg("out-of-range map @ pos %llu", offset + cur);

		if (z_erofs_collector_end(fe))
			fe->backmost = false;
		map->m_la = offset + cur;
		map->m_llen = 0;
		err = z_erofs_map_blocks_iter(inode, map, 0);
		if (err)
			goto err_out;
	} else {
		if (fe->pcl)
			goto hitted;
		/* didn't get a valid pcluster previously (very rare) */
	}

	if (!(map->m_flags & EROFS_MAP_MAPPED))
		goto hitted;

	err = z_erofs_collector_begin(fe);
	if (err)
		goto err_out;

	if (z_erofs_is_inline_pcluster(fe->pcl)) {
		void *mp;

		mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
					erofs_blknr(map->m_pa), EROFS_NO_KMAP);
		if (IS_ERR(mp)) {
			err = PTR_ERR(mp);
			erofs_err(inode->i_sb,
				  "failed to get inline page, err %d", err);
			goto err_out;
		}
		get_page(fe->map.buf.page);
		WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
		fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
	} else {
		/* bind cache first when cached decompression is preferred */
		if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
					       map->m_la))
			cache_strategy = TRYALLOC;
		else
			cache_strategy = DONTALLOC;

		z_erofs_bind_cache(fe, cache_strategy, pagepool);
	}
hitted:
	/*
	 * Ensure the current partial page belongs to this submit chain rather
	 * than other concurrent submit chains or the noio(bypass) chain since
	 * those chains are handled asynchronously, thus the page cannot be
	 * used for inplace I/O or bvpage (should be processed in a strict
	 * order.)
	 */
	tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
		  fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);

	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		zero_user_segment(page, cur, end);
		goto next_part;
	}

	/* let's derive page type */
	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
		(!split ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));

	if (cur)
		tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);

retry:
	err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
					.page = page,
					.offset = offset - map->m_la,
					.end = end,
				  }), page_type);
	/* should allocate an additional short-lived page for bvset */
	if (err == -EAGAIN && !fe->candidate_bvpage) {
		fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
		set_page_private(fe->candidate_bvpage,
				 Z_EROFS_SHORTLIVED_PAGE);
		goto retry;
	}

	if (err) {
		DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
		goto err_out;
	}

	index = page->index - (map->m_la >> PAGE_SHIFT);

	z_erofs_onlinepage_fixup(page, index, true);

	/* bump up the number of split parts of a page */
	++split;
	/* also update nr_pages */
	fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1);
next_part:
	/* can be used for verification */
	map->m_llen = offset + cur - map->m_la;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	z_erofs_onlinepage_endio(page);

	erofs_dbg("%s, finish page: %pK split: %u map->m_llen %llu",
		  __func__, page, split, map->m_llen);
	return err;

	/* if some error occurred while processing this page */
err_out:
	SetPageError(page);
	goto out;
}

static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
					       unsigned int readahead_pages)
{
	/* auto: enable for read_folio, disable for readahead */
	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
	    !readahead_pages)
		return true;

	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
	    (readahead_pages <= sbi->opt.max_sync_decompress_pages))
		return true;

	return false;
}

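/*
 * Illustrative sketch (not part of the original file): how the policy
 * above maps onto the two entry points; readahead_count() is the generic
 * readahead helper.
 */
#if 0
	/* read_folio: no readahead pages, so AUTO decompresses in-context */
	sync = z_erofs_get_sync_decompress_policy(sbi, 0);
	/* readahead: under FORCE_ON, only windows up to
	 * max_sync_decompress_pages decompress synchronously */
	sync = z_erofs_get_sync_decompress_policy(sbi, readahead_count(rac));
#endif
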
static bool z_erofs_page_is_invalidated(struct page *page)
{
	return !page->mapping && !z_erofs_is_shortlived_page(page);
}

static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl,
				   struct page **pages, struct page **pagepool)
{
	struct z_erofs_bvec_iter biter;
	struct page *old_bvpage;
	int i, err = 0;

	z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
	for (i = 0; i < pcl->vcnt; ++i) {
		struct z_erofs_bvec bvec;
		unsigned int pagenr;

		z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);

		if (old_bvpage)
			z_erofs_put_shortlivedpage(pagepool, old_bvpage);

		pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT;
		DBG_BUGON(pagenr >= pcl->nr_pages);
		DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
		/*
		 * currently EROFS doesn't support multiref(dedup),
		 * so we error out on any multiref page here.
		 */
		if (pages[pagenr]) {
			DBG_BUGON(1);
			SetPageError(pages[pagenr]);
			z_erofs_onlinepage_endio(pages[pagenr]);
			err = -EFSCORRUPTED;
		}
		pages[pagenr] = bvec.page;
	}

	old_bvpage = z_erofs_bvec_iter_end(&biter);
	if (old_bvpage)
		z_erofs_put_shortlivedpage(pagepool, old_bvpage);
	return err;
}

static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi,
			struct z_erofs_pcluster *pcl, struct page **pages,
			struct page **pagepool, bool *overlapped)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	struct page **compressed_pages;
	int i, err = 0;

	/* XXX: will have a better approach in the following commits */
	compressed_pages = kmalloc_array(pclusterpages, sizeof(struct page *),
					 GFP_KERNEL | __GFP_NOFAIL);
	*overlapped = false;

	for (i = 0; i < pclusterpages; ++i) {
		unsigned int pagenr;
		struct page *page = pcl->compressed_pages[i];

		/* compressed pages ought to be present before decompressing */
		if (!page) {
			DBG_BUGON(1);
			continue;
		}
		compressed_pages[i] = page;

		if (z_erofs_is_inline_pcluster(pcl)) {
			if (!PageUptodate(page))
				err = -EIO;
			continue;
		}

		DBG_BUGON(z_erofs_page_is_invalidated(page));
		if (!z_erofs_is_shortlived_page(page)) {
			if (erofs_page_is_managed(sbi, page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}

			/*
			 * only non-head pages can be selected
			 * for inplace decompression
			 */
			pagenr = z_erofs_onlinepage_index(page);

			DBG_BUGON(pagenr >= pcl->nr_pages);
			if (pages[pagenr]) {
				DBG_BUGON(1);
				SetPageError(pages[pagenr]);
				z_erofs_onlinepage_endio(pages[pagenr]);
				err = -EFSCORRUPTED;
			}
			pages[pagenr] = page;

			*overlapped = true;
		}

		/* PG_error needs checking for all non-managed pages */
		if (PageError(page)) {
			DBG_BUGON(PageUptodate(page));
			err = -EIO;
		}
	}

	if (err) {
		kfree(compressed_pages);
		return ERR_PTR(err);
	}
	return compressed_pages;
}

static int z_erofs_decompress_pcluster(struct super_block *sb,
				       struct z_erofs_pcluster *pcl,
				       struct page **pagepool)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	unsigned int i, inputsize, outputsize, llen, nr_pages;
	struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
	struct page **pages, **compressed_pages, *page;

	bool overlapped, partial;
	int err;

	might_sleep();
	DBG_BUGON(!READ_ONCE(pcl->nr_pages));

	mutex_lock(&pcl->lock);
	nr_pages = pcl->nr_pages;

	if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
		pages = pages_onstack;
	} else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
		   mutex_trylock(&z_pagemap_global_lock)) {
		pages = z_pagemap_global;
	} else {
		gfp_t gfp_flags = GFP_KERNEL;

		if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
			gfp_flags |= __GFP_NOFAIL;

		pages = kvmalloc_array(nr_pages, sizeof(struct page *),
				       gfp_flags);

		/* fallback to global pagemap for the lowmem scenario */
		if (!pages) {
			mutex_lock(&z_pagemap_global_lock);
			pages = z_pagemap_global;
		}
	}

	for (i = 0; i < nr_pages; ++i)
		pages[i] = NULL;

	err = z_erofs_parse_out_bvecs(pcl, pages, pagepool);
	compressed_pages = z_erofs_parse_in_bvecs(sbi, pcl, pages,
						pagepool, &overlapped);
	if (IS_ERR(compressed_pages)) {
		err = PTR_ERR(compressed_pages);
		compressed_pages = NULL;
	}

	if (err)
		goto out;

	llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
	if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) {
		outputsize = llen;
		partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
	} else {
		outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out;
		partial = true;
	}

	if (z_erofs_is_inline_pcluster(pcl))
		inputsize = pcl->tailpacking_size;
	else
		inputsize = pclusterpages * PAGE_SIZE;

	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
					.sb = sb,
					.in = compressed_pages,
					.out = pages,
					.pageofs_in = pcl->pageofs_in,
					.pageofs_out = pcl->pageofs_out,
					.inputsize = inputsize,
					.outputsize = outputsize,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = partial
				 }, pagepool);

out:
	/* must handle all compressed pages before actual file pages */
	if (z_erofs_is_inline_pcluster(pcl)) {
		page = pcl->compressed_pages[0];
		WRITE_ONCE(pcl->compressed_pages[0], NULL);
		put_page(page);
	} else {
		for (i = 0; i < pclusterpages; ++i) {
			page = pcl->compressed_pages[i];

			if (erofs_page_is_managed(sbi, page))
				continue;

			/* recycle all individual short-lived pages */
			(void)z_erofs_put_shortlivedpage(pagepool, page);
			WRITE_ONCE(pcl->compressed_pages[i], NULL);
		}
	}
	kfree(compressed_pages);

	for (i = 0; i < nr_pages; ++i) {
		page = pages[i];
		if (!page)
			continue;

		DBG_BUGON(z_erofs_page_is_invalidated(page));

		/* recycle all individual short-lived pages */
		if (z_erofs_put_shortlivedpage(pagepool, page))
			continue;

		if (err < 0)
			SetPageError(page);

		z_erofs_onlinepage_endio(page);
	}

	if (pages == z_pagemap_global)
		mutex_unlock(&z_pagemap_global_lock);
	else if (pages != pages_onstack)
		kvfree(pages);

	pcl->nr_pages = 0;
	pcl->bvset.nextpage = NULL;
	pcl->vcnt = 0;

	/* pcluster lock MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
	mutex_unlock(&pcl->lock);
	return err;
}

static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
				     struct page **pagepool)
{
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
		struct z_erofs_pcluster *pcl;

		/* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);

		/* impossible that 'owned' equals NULL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned, struct z_erofs_pcluster, next);
		owned = READ_ONCE(pcl->next);

		z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
		erofs_workgroup_put(&pcl->obj);
	}
}

static void z_erofs_decompressqueue_work(struct work_struct *work)
{
	struct z_erofs_decompressqueue *bgq =
		container_of(work, struct z_erofs_decompressqueue, u.work);
	struct page *pagepool = NULL;

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	z_erofs_decompress_queue(bgq, &pagepool);

	erofs_release_pages(&pagepool);
	kvfree(bgq);
}

static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       bool sync, int bios)
{
	struct erofs_sb_info *const sbi = EROFS_SB(io->sb);

	/* wake up the caller thread for sync decompression */
	if (sync) {
		if (!atomic_add_return(bios, &io->pending_bios))
			complete(&io->u.done);

		return;
	}

	if (atomic_add_return(bios, &io->pending_bios))
		return;
	/* Use workqueue and sync decompression for atomic contexts only */
	if (in_atomic() || irqs_disabled()) {
		queue_work(z_erofs_workqueue, &io->u.work);
		/* enable sync decompression for readahead */
		if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
			sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
		return;
	}
	z_erofs_decompressqueue_work(&io->u.work);
}

static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
					       unsigned int nr,
					       struct page **pagepool,
					       struct address_space *mc)
{
	const pgoff_t index = pcl->obj.index;
	gfp_t gfp = mapping_gfp_mask(mc);
	bool tocache = false;

	struct address_space *mapping;
	struct page *oldpage, *page;

	compressed_page_t t;
	int justfound;

repeat:
	page = READ_ONCE(pcl->compressed_pages[nr]);
	oldpage = page;

	if (!page)
		goto out_allocpage;

	/* process the target tagged pointer */
	t = tagptr_init(compressed_page_t, page);
	justfound = tagptr_unfold_tags(t);
	page = tagptr_unfold_ptr(t);

	/*
	 * preallocated cached pages, which are used to avoid direct reclaim;
	 * otherwise, it would go down the inplace I/O path instead.
	 */
	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);
		set_page_private(page, 0);
		tocache = true;
		goto out_tocache;
	}
	mapping = READ_ONCE(page->mapping);

	/*
	 * file-backed online pages in the pcluster are all locked steady,
	 * therefore it is impossible for `mapping' to be NULL.
	 */
	if (mapping && mapping != mc)
		/* ought to be unmanaged pages */
		goto out;

	/* directly return for shortlived page as well */
	if (z_erofs_is_shortlived_page(page))
		goto out;

	lock_page(page);

	/* only true if page reclaim goes wrong, should never happen */
	DBG_BUGON(justfound && PagePrivate(page));

	/* the page is still in managed cache */
	if (page->mapping == mc) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);

		ClearPageError(page);
		if (!PagePrivate(page)) {
			/*
			 * impossible to be !PagePrivate(page) for
			 * the current restriction as well if
			 * the page is already in compressed_pages[].
			 */
			DBG_BUGON(!justfound);

			justfound = 0;
			set_page_private(page, (unsigned long)pcl);
			SetPagePrivate(page);
		}

		/* no need to submit io if it is already up-to-date */
		if (PageUptodate(page)) {
			unlock_page(page);
			page = NULL;
		}
		goto out;
	}

	/*
	 * the managed page has been truncated, it's unsafe to
	 * reuse this one, let's allocate a new cache-managed page.
	 */
	DBG_BUGON(page->mapping);
	DBG_BUGON(!justfound);

	tocache = true;
	unlock_page(page);
	put_page(page);
out_allocpage:
	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
		erofs_pagepool_add(pagepool, page);
		cond_resched();
		goto repeat;
	}
out_tocache:
	if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* turn into temporary page if fails (1 ref) */
		set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
		goto out;
	}
	attach_page_private(page, pcl);
	/* drop a refcount added by allocpage (then we have 2 refs here) */
	put_page(page);

out:	/* the only exit (for tracing and debugging) */
	return page;
}

static struct z_erofs_decompressqueue *
jobqueue_init(struct super_block *sb,
	      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	struct z_erofs_decompressqueue *q;

	if (fg && !*fg) {
		q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
		if (!q) {
			*fg = true;
			goto fg_out;
		}
		INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
	} else {
fg_out:
		q = fgq;
		init_completion(&fgq->u.done);
		atomic_set(&fgq->pending_bios, 0);
	}
	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
	return q;
}

/* define decompression jobqueue types */
enum {
	JQ_BYPASS,
	JQ_SUBMIT,
	NR_JOBQUEUES,
};

static void *jobqueueset_init(struct super_block *sb,
			      struct z_erofs_decompressqueue *q[],
			      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	/*
	 * if managed cache is enabled, a bypass jobqueue is needed:
	 * no device reads are needed for the pclusters in that queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);

	return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}

static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);

	qtail[JQ_BYPASS] = &pcl->next;
}

static void z_erofs_decompressqueue_endio(struct bio *bio)
{
	tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
	struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
	blk_status_t err = bio->bi_status;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		DBG_BUGON(PageUptodate(page));
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (err)
			SetPageError(page);

		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
			if (!err)
				SetPageUptodate(page);
			unlock_page(page);
		}
	}
	z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
	bio_put(bio);
}

83a386c0 | 1384 | static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, |
eaa9172a | 1385 | struct page **pagepool, |
0c638f70 GX |
1386 | struct z_erofs_decompressqueue *fgq, |
1387 | bool *force_fg) | |
3883a79a | 1388 | { |
83a386c0 GX |
1389 | struct super_block *sb = f->inode->i_sb; |
1390 | struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); | |
97e86a85 | 1391 | z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; |
a4b1fab1 | 1392 | struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; |
7146a4f0 | 1393 | void *bi_private; |
5c6dcc57 | 1394 | z_erofs_next_pcluster_t owned_head = f->owned_head; |
dfeab2e9 | 1395 | /* bio is NULL initially, so no need to initialize last_{index,bdev} */ |
3f649ab7 | 1396 | pgoff_t last_index; |
dfeab2e9 | 1397 | struct block_device *last_bdev; |
1e4a2955 GX |
1398 | unsigned int nr_bios = 0; |
1399 | struct bio *bio = NULL; | |
3883a79a | 1400 | |
a4b1fab1 GX |
1401 | bi_private = jobqueueset_init(sb, q, fgq, force_fg); |
1402 | qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; | |
1403 | qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; | |
3883a79a GX |
1404 | |
1405 | /* by default, all need io submission */ | |
7146a4f0 | 1406 | q[JQ_SUBMIT]->head = owned_head; |
3883a79a GX |
1407 | |
1408 | do { | |
dfeab2e9 | 1409 | struct erofs_map_dev mdev; |
97e86a85 | 1410 | struct z_erofs_pcluster *pcl; |
1e4a2955 GX |
1411 | pgoff_t cur, end; |
1412 | unsigned int i = 0; | |
1413 | bool bypass = true; | |
3883a79a GX |
1414 | |
1415 | /* no possible 'owned_head' equals the following */ | |
97e86a85 GX |
1416 | DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); |
1417 | DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); | |
1418 | ||
1419 | pcl = container_of(owned_head, struct z_erofs_pcluster, next); | |
3883a79a | 1420 | |
cecf864d YH |
1421 | /* close the main owned chain at first */ |
1422 | owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, | |
1423 | Z_EROFS_PCLUSTER_TAIL_CLOSED); | |
1424 | if (z_erofs_is_inline_pcluster(pcl)) { | |
1425 | move_to_bypass_jobqueue(pcl, qtail, owned_head); | |
1426 | continue; | |
1427 | } | |
1428 | ||
dfeab2e9 GX |
1429 | /* no device id here, thus it will always succeed */ |
1430 | mdev = (struct erofs_map_dev) { | |
1431 | .m_pa = blknr_to_addr(pcl->obj.index), | |
1432 | }; | |
1433 | (void)erofs_map_dev(sb, &mdev); | |
1434 | ||
1435 | cur = erofs_blknr(mdev.m_pa); | |
9f6cc76e | 1436 | end = cur + pcl->pclusterpages; |
3883a79a | 1437 | |
1e4a2955 GX |
1438 | do { |
1439 | struct page *page; | |
3883a79a | 1440 | |
1e4a2955 | 1441 | page = pickup_page_for_submission(pcl, i++, pagepool, |
83a386c0 | 1442 | mc); |
1e4a2955 GX |
1443 | if (!page) |
1444 | continue; | |
3883a79a | 1445 | |
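| /* a bio can only be extended with physically contiguous blocks on |
| * the same device; otherwise submit it and start a new one */ |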
dfeab2e9 GX |
1446 | if (bio && (cur != last_index + 1 || |
1447 | last_bdev != mdev.m_bdev)) { | |
3883a79a | 1448 | submit_bio_retry: |
1e4a2955 GX |
1449 | submit_bio(bio); |
1450 | bio = NULL; | |
1451 | } | |
a5c0b780 | 1452 | |
1e4a2955 | 1453 | if (!bio) { |
07888c66 CH |
1454 | bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, |
1455 | REQ_OP_READ, GFP_NOIO); | |
1e4a2955 | 1456 | bio->bi_end_io = z_erofs_decompressqueue_endio; |
dfeab2e9 | 1457 | |
dfeab2e9 | 1458 | last_bdev = mdev.m_bdev; |
1e4a2955 GX |
1459 | bio->bi_iter.bi_sector = (sector_t)cur << |
1460 | LOG_SECTORS_PER_BLOCK; | |
1461 | bio->bi_private = bi_private; | |
6ea5aad3 GX |
1462 | if (f->readahead) |
1463 | bio->bi_opf |= REQ_RAHEAD; | |
1e4a2955 GX |
1464 | ++nr_bios; |
1465 | } | |
3883a79a | 1466 | |
6c3e485e | 1467 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) |
1e4a2955 | 1468 | goto submit_bio_retry; |
3883a79a | 1469 | |
1e4a2955 GX |
1470 | last_index = cur; |
1471 | bypass = false; | |
1472 | } while (++cur < end); | |
105d4ad8 | 1473 | |
1e4a2955 | 1474 | if (!bypass) |
97e86a85 | 1475 | qtail[JQ_SUBMIT] = &pcl->next; |
7146a4f0 | 1476 | else |
97e86a85 GX |
1477 | move_to_bypass_jobqueue(pcl, qtail, owned_head); |
1478 | } while (owned_head != Z_EROFS_PCLUSTER_TAIL); | |
3883a79a | 1479 | |
42d40b4a | 1480 | if (bio) |
94e4e153 | 1481 | submit_bio(bio); |
3883a79a | 1482 | |
587a67b7 GX |
1483 | /* |
1484 | * although background decompression is preferred, nothing is pending |
1485 | * for submission; don't schedule the workqueue, drop the queue instead. |
1486 | */ |
1487 | if (!*force_fg && !nr_bios) { | |
1488 | kvfree(q[JQ_SUBMIT]); | |
1e4a2955 | 1489 | return; |
587a67b7 | 1490 | } |
a4b1fab1 | 1491 | z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); |
3883a79a GX |
1492 | } |
1493 | ||
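| /* |
| * submit all queued pclusters for I/O and then decompress: the bypass |
| * queue is always handled inline, while the submit queue is only |
| * decompressed in the caller context for foreground (sync) requests. |
| */ |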
83a386c0 | 1494 | static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, |
eaa9172a | 1495 | struct page **pagepool, bool force_fg) |
3883a79a | 1496 | { |
a4b1fab1 | 1497 | struct z_erofs_decompressqueue io[NR_JOBQUEUES]; |
3883a79a | 1498 | |
5c6dcc57 | 1499 | if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) |
3883a79a | 1500 | return; |
83a386c0 | 1501 | z_erofs_submit_queue(f, pagepool, io, &force_fg); |
3883a79a | 1502 | |
0c638f70 GX |
1503 | /* handle bypass queue (no-I/O pclusters) immediately */ |
1504 | z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); | |
4279f3f9 | 1505 | |
3883a79a GX |
1506 | if (!force_fg) |
1507 | return; | |
1508 | ||
1509 | /* wait until all bios are completed */ | |
60b30050 | 1510 | wait_for_completion_io(&io[JQ_SUBMIT].u.done); |
3883a79a | 1511 | |
0c638f70 GX |
1512 | /* handle synchronous decompress queue in the caller context */ |
1513 | z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); | |
3883a79a GX |
1514 | } |
1515 | ||
38629291 GX |
1516 | /* |
1517 | * Since partial uptodate is still unimplemented, we have to use |
1518 | * approximate readmore strategies as a start. |
1519 | */ | |
1520 | static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, | |
1521 | struct readahead_control *rac, | |
1522 | erofs_off_t end, | |
eaa9172a | 1523 | struct page **pagepool, |
38629291 GX |
1524 | bool backmost) |
1525 | { | |
1526 | struct inode *inode = f->inode; | |
1527 | struct erofs_map_blocks *map = &f->map; | |
1528 | erofs_off_t cur; | |
1529 | int err; | |
1530 | ||
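| /* for the backmost request, map the extent at 'end' first; otherwise |
| * reuse the previous lookup result kept in f->map */ |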
1531 | if (backmost) { | |
1532 | map->m_la = end; | |
622ceadd GX |
1533 | err = z_erofs_map_blocks_iter(inode, map, |
1534 | EROFS_GET_BLOCKS_READMORE); | |
38629291 GX |
1535 | if (err) |
1536 | return; | |
1537 | ||
1538 | /* expand ra for the trailing edge if readahead */ |
1539 | if (rac) { | |
1540 | loff_t newstart = readahead_pos(rac); | |
1541 | ||
1542 | cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); | |
1543 | readahead_expand(rac, newstart, cur - newstart); | |
1544 | return; | |
1545 | } | |
1546 | end = round_up(end, PAGE_SIZE); | |
1547 | } else { | |
1548 | end = round_up(map->m_la, PAGE_SIZE); | |
1549 | ||
1550 | if (!map->m_llen) | |
1551 | return; | |
1552 | } | |
1553 | ||
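| /* walk backwards from the last byte of the extent down to 'end' and |
| * read in (best-effort, nowait) each page not already uptodate */ |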
1554 | cur = map->m_la + map->m_llen - 1; | |
1555 | while (cur >= end) { | |
1556 | pgoff_t index = cur >> PAGE_SHIFT; | |
1557 | struct page *page; | |
1558 | ||
1559 | page = erofs_grab_cache_page_nowait(inode->i_mapping, index); | |
aa793b46 GX |
1560 | if (page) { |
1561 | if (PageUptodate(page)) { | |
1562 | unlock_page(page); | |
1563 | } else { | |
1564 | err = z_erofs_do_read_page(f, page, pagepool); | |
1565 | if (err) | |
1566 | erofs_err(inode->i_sb, | |
1567 | "readmore error at page %lu @ nid %llu", | |
1568 | index, EROFS_I(inode)->nid); | |
1569 | } | |
38629291 | 1570 | put_page(page); |
38629291 GX |
1571 | } |
1572 | ||
38629291 GX |
1573 | if (cur < PAGE_SIZE) |
1574 | break; | |
1575 | cur = (index << PAGE_SHIFT) - 1; | |
1576 | } | |
1577 | } | |
1578 | ||
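| /* .read_folio() entry point: read one page and use approximate |
| * readmore to expand decompression over both edges of its extent */ |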
a2e20a25 | 1579 | static int z_erofs_read_folio(struct file *file, struct folio *folio) |
3883a79a | 1580 | { |
a2e20a25 | 1581 | struct page *page = &folio->page; |
3883a79a | 1582 | struct inode *const inode = page->mapping->host; |
40452ffc | 1583 | struct erofs_sb_info *const sbi = EROFS_I_SB(inode); |
97e86a85 | 1584 | struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); |
eaa9172a | 1585 | struct page *pagepool = NULL; |
3883a79a | 1586 | int err; |
3883a79a | 1587 | |
ba9ce771 | 1588 | trace_erofs_readpage(page, false); |
f0c519fc GX |
1589 | f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; |
1590 | ||
38629291 GX |
1591 | z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, |
1592 | &pagepool, true); | |
1825c8d7 | 1593 | err = z_erofs_do_read_page(&f, page, &pagepool); |
38629291 GX |
1594 | z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); |
1595 | ||
5c6dcc57 | 1596 | (void)z_erofs_collector_end(&f); |
3883a79a | 1597 | |
ee45197c | 1598 | /* if some compressed cluster ready, need submit them anyway */ |
83a386c0 | 1599 | z_erofs_runqueue(&f, &pagepool, |
40452ffc | 1600 | z_erofs_get_sync_decompress_policy(sbi, 0)); |
ee45197c GX |
1601 | |
1602 | if (err) | |
4f761fa2 | 1603 | erofs_err(inode->i_sb, "failed to read, err [%d]", err); |
3883a79a | 1604 | |
09c54379 | 1605 | erofs_put_metabuf(&f.map.buf); |
eaa9172a | 1606 | erofs_release_pages(&pagepool); |
ee45197c | 1607 | return err; |
3883a79a GX |
1608 | } |
1609 | ||
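| /* .readahead() entry point: feed every page of the request into the |
| * frontend first, so that I/O is submitted once for the whole batch */ |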
0615090c | 1610 | static void z_erofs_readahead(struct readahead_control *rac) |
3883a79a | 1611 | { |
0615090c | 1612 | struct inode *const inode = rac->mapping->host; |
5fb76bb0 | 1613 | struct erofs_sb_info *const sbi = EROFS_I_SB(inode); |
97e86a85 | 1614 | struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); |
eaa9172a | 1615 | struct page *pagepool = NULL, *head = NULL, *page; |
38629291 | 1616 | unsigned int nr_pages; |
3883a79a | 1617 | |
6ea5aad3 | 1618 | f.readahead = true; |
0615090c | 1619 | f.headoffset = readahead_pos(rac); |
3883a79a | 1620 | |
38629291 GX |
1621 | z_erofs_pcluster_readmore(&f, rac, f.headoffset + |
1622 | readahead_length(rac) - 1, &pagepool, true); | |
1623 | nr_pages = readahead_count(rac); | |
1624 | trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); | |
2d9b5dcd | 1625 | |
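| /* chain pages LIFO via ->private, hence the reverse traversal below */ |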
38629291 | 1626 | while ((page = readahead_page(rac))) { |
3883a79a GX |
1627 | set_page_private(page, (unsigned long)head); |
1628 | head = page; | |
1629 | } | |
1630 | ||
42d40b4a | 1631 | while (head) { |
3883a79a GX |
1632 | struct page *page = head; |
1633 | int err; | |
1634 | ||
1635 | /* traversal in reverse order */ | |
1636 | head = (void *)page_private(page); | |
1637 | ||
1825c8d7 | 1638 | err = z_erofs_do_read_page(&f, page, &pagepool); |
a5876e24 | 1639 | if (err) |
4f761fa2 GX |
1640 | erofs_err(inode->i_sb, |
1641 | "readahead error at page %lu @ nid %llu", | |
1642 | page->index, EROFS_I(inode)->nid); | |
3883a79a GX |
1643 | put_page(page); |
1644 | } | |
38629291 | 1645 | z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); |
5c6dcc57 | 1646 | (void)z_erofs_collector_end(&f); |
3883a79a | 1647 | |
83a386c0 | 1648 | z_erofs_runqueue(&f, &pagepool, |
40452ffc | 1649 | z_erofs_get_sync_decompress_policy(sbi, nr_pages)); |
09c54379 | 1650 | erofs_put_metabuf(&f.map.buf); |
eaa9172a | 1651 | erofs_release_pages(&pagepool); |
3883a79a GX |
1652 | } |
1653 | ||
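| /* address_space operations used for compressed (z_erofs) inodes */ |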
0c638f70 | 1654 | const struct address_space_operations z_erofs_aops = { |
a2e20a25 | 1655 | .read_folio = z_erofs_read_folio, |
0615090c | 1656 | .readahead = z_erofs_readahead, |
3883a79a | 1657 | }; |