Commit | Line | Data |
---|---|---|
29b24f6c | 1 | // SPDX-License-Identifier: GPL-2.0-only |
02827e17 | 2 | /* |
02827e17 | 3 | * Copyright (C) 2018 HUAWEI, Inc. |
592e7cd0 | 4 | * https://www.huawei.com/ |
06a304cd | 5 | * Copyright (C) 2022 Alibaba Cloud |
02827e17 | 6 | */ |
27481233 | 7 | #include "compress.h" |
3883a79a | 8 | #include <linux/prefetch.h> |
99486c51 | 9 | #include <linux/psi.h> |
3883a79a | 10 | |
284db12c CG |
11 | #include <trace/events/erofs.h> |
12 | ||
a9a94d93 GX |
/* maximum number of PAGE_SIZE pages a single pcluster may span */
#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
/* number of bvecs embedded inline in each pcluster for bootstrapping */
#define Z_EROFS_INLINE_BVECS 2

/*
 * let's leave a type here in case of introducing
 * another tagged pointer later.
 */
typedef void *z_erofs_next_pcluster_t;
21 | ||
/* one page fragment used during (de)compression */
struct z_erofs_bvec {
	struct page *page;	/* target page; NULL while the slot is free */
	int offset;		/* signed start offset within the page */
	unsigned int end;	/* end offset (exclusive) within the page */
};

/*
 * Declare a bvec set chained to a follow-up page; an empty @total yields
 * a flexible-array set sized by its containing page.
 */
#define __Z_EROFS_BVSET(name, total) \
struct name { \
	/* point to the next page which contains the following bvecs */ \
	struct page *nextpage; \
	struct z_erofs_bvec bvec[total]; \
}
__Z_EROFS_BVSET(z_erofs_bvset,);
__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
36 | ||
/*
 * In-memory state of one physical cluster (compressed extent).
 *
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only
 *    for everyone else;
 *
 * L: Field should be protected by the pcluster lock;
 *
 * A: Field should be accessed / updated in atomic for parallelized code.
 */
struct z_erofs_pcluster {
	struct erofs_workgroup obj;
	struct mutex lock;

	/* A: point to next chained pcluster or TAILs */
	z_erofs_next_pcluster_t next;

	/* L: the maximum decompression size of this round */
	unsigned int length;

	/* L: total number of bvecs */
	unsigned int vcnt;

	/* I: page offset of start position of decompression */
	unsigned short pageofs_out;

	/* I: page offset of inline compressed data */
	unsigned short pageofs_in;

	union {
		/* L: inline a certain number of bvec for bootstrap */
		struct z_erofs_bvset_inline bvset;

		/* I: can be used to free the pcluster by RCU. */
		struct rcu_head rcu;
	};

	union {
		/* I: physical cluster size in pages */
		unsigned short pclusterpages;

		/* I: tailpacking inline compressed size */
		unsigned short tailpacking_size;
	};

	/* I: compression algorithm format */
	unsigned char algorithmformat;

	/* L: whether partial decompression or not */
	bool partial;

	/* L: indicate several pageofs_outs or not */
	bool multibases;

	/* A: compressed bvecs (can be cached or inplaced pages) */
	struct z_erofs_bvec compressed_bvecs[];
};
94 | ||
/* let's avoid the valid 32-bit kernel addresses */

/* the chained workgroup hasn't submitted io (still open) */
#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
/* the chained workgroup has already submitted io */
#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)

#define Z_EROFS_PCLUSTER_NIL (NULL)

/* a unit of decompression work: a chain of pclusters plus completion state */
struct z_erofs_decompressqueue {
	struct super_block *sb;
	atomic_t pending_bios;
	z_erofs_next_pcluster_t head;	/* head of the chained pclusters */

	union {
		struct completion done;		/* waited on when sync */
		struct work_struct work;	/* queued when async */
	} u;
	bool eio, sync;		/* I/O error seen; synchronous completion */
};
115 | ||
/* an inline (ztailpacking) pcluster is marked by a zero workgroup index */
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
	return !pcl->obj.index;
}
120 | ||
121 | static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) | |
122 | { | |
123 | if (z_erofs_is_inline_pcluster(pcl)) | |
124 | return 1; | |
125 | return pcl->pclusterpages; | |
126 | } | |
127 | ||
/*
 * bit 30: I/O error occurred on this page
 * bit 0 - 29: remaining parts to complete this page
 */
#define Z_EROFS_PAGE_EIO (1 << 30)

/*
 * Install the split-parts counter (initially 1) in page->private and set
 * PG_private so the page is recognized as an "online" file-backed page.
 */
static inline void z_erofs_onlinepage_init(struct page *page)
{
	union {
		atomic_t o;
		unsigned long v;
	} u = { .o = ATOMIC_INIT(1) };	/* view the atomic as private bits */

	set_page_private(page, u.v);
	smp_wmb();	/* publish the counter before PG_private is visible */
	SetPagePrivate(page);
}
145 | ||
/* account one more pending part (split) for an online page */
static inline void z_erofs_onlinepage_split(struct page *page)
{
	atomic_inc((atomic_t *)&page->private);
}
150 | ||
/* set the I/O error bit without disturbing the remaining-parts count */
static inline void z_erofs_page_mark_eio(struct page *page)
{
	int orig;

	/* open-coded atomic OR via a cmpxchg retry loop */
	do {
		orig = atomic_read((atomic_t *)&page->private);
	} while (atomic_cmpxchg((atomic_t *)&page->private, orig,
				orig | Z_EROFS_PAGE_EIO) != orig);
}
160 | ||
/*
 * Drop one split-part reference; the last dropper tears down the online
 * state, marks the page uptodate (unless EIO was recorded) and unlocks it.
 */
static inline void z_erofs_onlinepage_endio(struct page *page)
{
	unsigned int v;

	DBG_BUGON(!PagePrivate(page));
	v = atomic_dec_return((atomic_t *)&page->private);
	if (!(v & ~Z_EROFS_PAGE_EIO)) {	/* no parts left; only EIO may remain */
		set_page_private(page, 0);
		ClearPagePrivate(page);
		if (!(v & Z_EROFS_PAGE_EIO))
			SetPageUptodate(page);
		unlock_page(page);
	}
}
175 | ||
/* number of on-stack pages used before falling back to dynamic arrays */
#define Z_EROFS_ONSTACK_PAGES 32

/*
 * since pclustersize is variable for big pcluster feature, introduce slab
 * pools implementation for different pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;	/* NULL until created */
	unsigned int maxpages;		/* largest pcluster this pool serves */
	char name[48];			/* generated slab cache name */
};

#define _PCLP(n) { .maxpages = n }

/* size-tiered pools; allocation picks the smallest pool that fits */
static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};
194 | ||
/* cursor over a chain of bvset pages, starting at the inline bootstrap set */
struct z_erofs_bvec_iter {
	struct page *bvpage;		/* mapped bvset page (NULL = inline) */
	struct z_erofs_bvset *bvset;	/* kmapped view of the current set */
	unsigned int nr, cur;		/* capacity of / index in current set */
};
200 | ||
201 | static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) | |
202 | { | |
203 | if (iter->bvpage) | |
204 | kunmap_local(iter->bvset); | |
205 | return iter->bvpage; | |
206 | } | |
207 | ||
208 | static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) | |
209 | { | |
210 | unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; | |
211 | /* have to access nextpage in advance, otherwise it will be unmapped */ | |
212 | struct page *nextpage = iter->bvset->nextpage; | |
213 | struct page *oldpage; | |
214 | ||
215 | DBG_BUGON(!nextpage); | |
216 | oldpage = z_erofs_bvec_iter_end(iter); | |
217 | iter->bvpage = nextpage; | |
218 | iter->bvset = kmap_local_page(nextpage); | |
219 | iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); | |
220 | iter->cur = 0; | |
221 | return oldpage; | |
222 | } | |
223 | ||
/*
 * Initialize @iter at the inline bootstrap set (capacity @bootstrap_nr)
 * and fast-forward to logical position @cur, flipping through chained
 * bvset pages as needed.
 */
static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
				    struct z_erofs_bvset_inline *bvset,
				    unsigned int bootstrap_nr,
				    unsigned int cur)
{
	*iter = (struct z_erofs_bvec_iter) {
		.nr = bootstrap_nr,
		.bvset = (struct z_erofs_bvset *)bvset,
	};

	/* '>' (not '>=') keeps cur == nr inside the current, full set */
	while (cur > iter->nr) {
		cur -= iter->nr;
		z_erofs_bvset_flip(iter);
	}
	iter->cur = cur;
}
240 | ||
/*
 * Append *@bvec at the iterator position.  When the current set is full,
 * *@candidate_bvpage (a spare page supplied by the caller) is chained in
 * as the next bvset page and consumed; -EAGAIN asks the caller to provide
 * one and retry.
 */
static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage)
{
	if (iter->cur == iter->nr) {
		if (!*candidate_bvpage)
			return -EAGAIN;

		DBG_BUGON(iter->bvset->nextpage);
		iter->bvset->nextpage = *candidate_bvpage;
		z_erofs_bvset_flip(iter);

		/* the freshly chained set terminates the chain for now */
		iter->bvset->nextpage = NULL;
		*candidate_bvpage = NULL;
	}
	iter->bvset->bvec[iter->cur++] = *bvec;
	return 0;
}
259 | ||
/*
 * Pop the next bvec into *@bvec.  When a set boundary is crossed, the
 * drained bvset page is handed back via *@old_bvpage for recycling;
 * otherwise *@old_bvpage is NULL.
 */
static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
				 struct z_erofs_bvec *bvec,
				 struct page **old_bvpage)
{
	if (iter->cur == iter->nr)
		*old_bvpage = z_erofs_bvset_flip(iter);
	else
		*old_bvpage = NULL;
	*bvec = iter->bvset->bvec[iter->cur++];
}
270 | ||
9f6cc76e GX |
271 | static void z_erofs_destroy_pcluster_pool(void) |
272 | { | |
273 | int i; | |
274 | ||
275 | for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { | |
276 | if (!pcluster_pool[i].slab) | |
277 | continue; | |
278 | kmem_cache_destroy(pcluster_pool[i].slab); | |
279 | pcluster_pool[i].slab = NULL; | |
280 | } | |
281 | } | |
282 | ||
283 | static int z_erofs_create_pcluster_pool(void) | |
284 | { | |
285 | struct z_erofs_pcluster_slab *pcs; | |
286 | struct z_erofs_pcluster *a; | |
287 | unsigned int size; | |
288 | ||
289 | for (pcs = pcluster_pool; | |
290 | pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { | |
ed722fbc | 291 | size = struct_size(a, compressed_bvecs, pcs->maxpages); |
9f6cc76e GX |
292 | |
293 | sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); | |
294 | pcs->slab = kmem_cache_create(pcs->name, size, 0, | |
295 | SLAB_RECLAIM_ACCOUNT, NULL); | |
296 | if (pcs->slab) | |
297 | continue; | |
298 | ||
299 | z_erofs_destroy_pcluster_pool(); | |
300 | return -ENOMEM; | |
301 | } | |
302 | return 0; | |
303 | } | |
304 | ||
305 | static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) | |
306 | { | |
307 | int i; | |
308 | ||
309 | for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { | |
310 | struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; | |
311 | struct z_erofs_pcluster *pcl; | |
312 | ||
313 | if (nrpages > pcs->maxpages) | |
314 | continue; | |
315 | ||
316 | pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); | |
317 | if (!pcl) | |
318 | return ERR_PTR(-ENOMEM); | |
319 | pcl->pclusterpages = nrpages; | |
320 | return pcl; | |
321 | } | |
322 | return ERR_PTR(-EINVAL); | |
323 | } | |
324 | ||
325 | static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) | |
326 | { | |
cecf864d | 327 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
9f6cc76e GX |
328 | int i; |
329 | ||
330 | for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { | |
331 | struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; | |
332 | ||
cecf864d | 333 | if (pclusterpages > pcs->maxpages) |
9f6cc76e GX |
334 | continue; |
335 | ||
336 | kmem_cache_free(pcs->slab, pcl); | |
337 | return; | |
338 | } | |
339 | DBG_BUGON(1); | |
340 | } | |
341 | ||
/* shared workqueue for all asynchronous decompression work items */
static struct workqueue_struct *z_erofs_workqueue __read_mostly;

/* tear down the decompression workqueue and all pcluster slab pools */
void z_erofs_exit_zip_subsystem(void)
{
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
}
349 | ||
static inline int z_erofs_init_workqueue(void)
{
	const unsigned int onlinecpus = num_possible_cpus();

	/*
	 * no need to spawn too many threads; limiting the thread count
	 * minimizes scheduling overhead — perhaps per-CPU threads would
	 * be even better?
	 */
	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
					    WQ_UNBOUND | WQ_HIGHPRI,
					    onlinecpus + onlinecpus / 4);
	return z_erofs_workqueue ? 0 : -ENOMEM;
}
363 | ||
0a0b7e62 | 364 | int __init z_erofs_init_zip_subsystem(void) |
3883a79a | 365 | { |
9f6cc76e GX |
366 | int err = z_erofs_create_pcluster_pool(); |
367 | ||
368 | if (err) | |
369 | return err; | |
370 | err = z_erofs_init_workqueue(); | |
371 | if (err) | |
372 | z_erofs_destroy_pcluster_pool(); | |
373 | return err; | |
3883a79a GX |
374 | } |
375 | ||
enum z_erofs_pclustermode {
	/* it belongs to a chain, but it isn't the end of the chain */
	Z_EROFS_PCLUSTER_INFLIGHT,
	/*
	 * The current pclusters was the tail of an exist chain, in addition
	 * that the previous processed chained pclusters are all decided to
	 * be hooked up to it.
	 * A new chain will be created for the remaining pclusters which are
	 * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
	 * the next pcluster cannot reuse the whole page safely for inplace I/O
	 * in the following scenario:
	 *  ________________________________________________________________
	 * |      tail (partial) page     |      head (partial) page        |
	 * |  (belongs to the next pcl)   |  (belongs to the current pcl)   |
	 * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
	 */
	Z_EROFS_PCLUSTER_HOOKED,
	/*
	 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
	 * could be dispatched into bypass queue later due to uptodated managed
	 * pages. All related online pages cannot be reused for inplace I/O (or
	 * bvpage) since it can be directly decoded without I/O submission.
	 */
	Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
	/*
	 * The current collection has been linked with the owned chain, and
	 * could also be linked with the remaining collections, which means
	 * if the processing page is the tail page of the collection, thus
	 * the current collection can safely use the whole page (since
	 * the previous collection is under control) for in-place I/O, as
	 * illustrated below:
	 *  ________________________________________________________________
	 * |  tail (partial) page |          head (partial) page           |
	 * |  (of the current cl) |      (of the previous collection)      |
	 * | PCLUSTER_FOLLOWED or |                                        |
	 * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
	 *
	 * [  (*) the above page can be used as inplace I/O.               ]
	 */
	Z_EROFS_PCLUSTER_FOLLOWED,
};
416 | ||
/* per-call (readpage/readahead) decompression frontend state */
struct z_erofs_decompress_frontend {
	struct inode *const inode;
	struct erofs_map_blocks map;	/* current extent mapping result */
	struct z_erofs_bvec_iter biter;	/* iterator over the pcluster bvecs */

	/* spare page handed to z_erofs_bvec_enqueue() as the next bvset */
	struct page *candidate_bvpage;
	/* current pcluster; tailpcl detects tail-merging loops (bad images) */
	struct z_erofs_pcluster *pcl, *tailpcl;
	z_erofs_next_pcluster_t owned_head;	/* head of the owned chain */
	enum z_erofs_pclustermode mode;

	bool readahead;
	/* used for applying cache strategy on the fly */
	bool backmost;
	erofs_off_t headoffset;

	/* cursor (slot index) used to pick up inplace I/O pages */
	unsigned int icur;
};

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
97e86a85 | 439 | |
1282dea3 GX |
440 | static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) |
441 | { | |
442 | unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; | |
443 | ||
444 | if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) | |
445 | return false; | |
446 | ||
447 | if (fe->backmost) | |
448 | return true; | |
449 | ||
450 | if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && | |
451 | fe->map.m_la < fe->headoffset) | |
452 | return true; | |
453 | ||
454 | return false; | |
455 | } | |
456 | ||
/*
 * Bind managed-cache pages to the pcluster's compressed slots: reuse pages
 * already present in the managed mapping, or (per cache strategy) try to
 * preallocate new ones.  Slots left NULL fall back to in-place I/O.  The
 * low tag bit on the stored pointer marks entries installed by this pass
 * (debugging aid).
 */
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
			       struct page **pagepool)
{
	struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
	struct z_erofs_pcluster *pcl = fe->pcl;
	bool shouldalloc = z_erofs_should_alloc_cache(fe);
	bool standalone = true;	/* cleared once any slot still needs I/O */
	/*
	 * optimistic allocation without direct reclaim since inplace I/O
	 * can be used if low memory otherwise.
	 */
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
	unsigned int i;

	/* only an exclusively followed pcluster may take cache pages */
	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
		return;

	for (i = 0; i < pcl->pclusterpages; ++i) {
		struct page *page;
		void *t;	/* mark pages just found for debugging */
		struct page *newpage = NULL;

		/* the compressed page was loaded before */
		if (READ_ONCE(pcl->compressed_bvecs[i].page))
			continue;

		page = find_get_page(mc, pcl->obj.index + i);

		if (page) {
			t = (void *)((unsigned long)page | 1);
		} else {
			/* I/O is needed, no possible to decompress directly */
			standalone = false;
			if (!shouldalloc)
				continue;

			/*
			 * try to use cached I/O if page allocation
			 * succeeds or fallback to in-place I/O instead
			 * to avoid any direct reclaim.
			 */
			newpage = erofs_allocpage(pagepool, gfp);
			if (!newpage)
				continue;
			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
			t = (void *)((unsigned long)newpage | 1);
		}

		/* racing installer won; give back our page reference below */
		if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
			continue;

		if (page)
			put_page(page);
		else if (newpage)
			erofs_pagepool_add(pagepool, newpage);
	}

	/*
	 * don't do inplace I/O if all compressed pages are available in
	 * managed cache since it can be moved to the bypass queue instead.
	 */
	if (standalone)
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
522 | ||
/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
				       struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	int i;

	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	/*
	 * refcount of workgroup is now freezed as 1,
	 * therefore no need to worry about available decompression users.
	 */
	for (i = 0; i < pcl->pclusterpages; ++i) {
		struct page *page = pcl->compressed_bvecs[i].page;

		if (!page)
			continue;

		/* block other users from reclaiming or migrating the page */
		if (!trylock_page(page))
			return -EBUSY;

		/*
		 * NOTE(review): when the page is not managed, this 'continue'
		 * skips the unlock_page() below and appears to leave the page
		 * locked — confirm whether unmanaged pages can actually occur
		 * in this slot.
		 */
		if (!erofs_page_is_managed(sbi, page))
			continue;

		/* barrier is implied in the following 'unlock_page' */
		WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
		detach_page_private(page);
		unlock_page(page);
	}
	return 0;
}
556 | ||
/*
 * Try to strip @page (a managed-cache page) out of the pcluster recorded
 * in its page_private; returns 1 when successful so the caller may release
 * the page, 0 when the pcluster is busy or the page wasn't found.
 */
int erofs_try_to_free_cached_page(struct page *page)
{
	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
	int ret, i;

	/* freeze the pcluster to fence off concurrent users first */
	if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
		return 0;

	ret = 0;
	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	for (i = 0; i < pcl->pclusterpages; ++i) {
		if (pcl->compressed_bvecs[i].page == page) {
			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
			ret = 1;
			break;
		}
	}
	erofs_workgroup_unfreeze(&pcl->obj, 1);
	if (ret)
		detach_page_private(page);
	return ret;
}
105d4ad8 | 579 | |
/*
 * Scan downwards from fe->icur for a free compressed slot and claim it
 * for in-place I/O with @bvec's page; false when every slot is taken.
 */
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
				   struct z_erofs_bvec *bvec)
{
	struct z_erofs_pcluster *const pcl = fe->pcl;

	while (fe->icur > 0) {
		/* cmpxchg claims the slot atomically against cache binding */
		if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
			     NULL, bvec->page)) {
			pcl->compressed_bvecs[fe->icur] = *bvec;
			return true;
		}
	}
	return false;
}
594 | ||
/*
 * Record *@bvec for the current pcluster; callers must be with pcluster
 * lock held.  An @exclusive page is first offered for in-place I/O, then
 * as a candidate bvset page, before being enqueued as an ordinary bvec.
 */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
			       struct z_erofs_bvec *bvec, bool exclusive)
{
	int ret;

	if (exclusive) {
		/* give priority for inplaceio to use file pages first */
		if (z_erofs_try_inplace_io(fe, bvec))
			return 0;
		/* otherwise, check if it can be used as a bvpage */
		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
		    !fe->candidate_bvpage)
			fe->candidate_bvpage = bvec->page;
	}
	ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
	/* only count the bvec when it was actually stored */
	fe->pcl->vcnt += (ret >= 0);
	return ret;
}
614 | ||
/*
 * Atomically attach f->pcl to the caller's submission chain via pcl->next
 * and set the resulting pcluster mode (FOLLOWED / HOOKED / INFLIGHT).
 */
static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
	struct z_erofs_pcluster *pcl = f->pcl;
	z_erofs_next_pcluster_t *owned_head = &f->owned_head;

	/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
		    *owned_head) == Z_EROFS_PCLUSTER_NIL) {
		*owned_head = &pcl->next;
		/* so we can attach this pcluster to our submission chain. */
		f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
		return;
	}

	/*
	 * type 2, link to the end of an existing open chain, be careful
	 * that its submission is controlled by the original attached chain.
	 */
	if (*owned_head != &pcl->next && pcl != f->tailpcl &&
	    cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
		    *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
		*owned_head = Z_EROFS_PCLUSTER_TAIL;
		f->mode = Z_EROFS_PCLUSTER_HOOKED;
		f->tailpcl = NULL;
		return;
	}
	/* type 3, it belongs to a chain, but it isn't the end of the chain */
	f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}
644 | ||
/*
 * Allocate and set up a brand-new pcluster for the current map extent and
 * insert it into the managed workgroup tree (skipped for ztailpacking
 * inline pclusters).  Returns -EEXIST with fe->pcl pointing at the winner
 * when another thread registered the same pcluster first.
 */
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	bool ztailpacking = map->m_flags & EROFS_MAP_META;
	struct z_erofs_pcluster *pcl;
	struct erofs_workgroup *grp;
	int err;

	/* the extent must be encoded; non-inline pclusters need a nonzero
	 * physical address */
	if (!(map->m_flags & EROFS_MAP_ENCODED) ||
	    (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	/* no available pcluster, let's allocate one */
	pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
				     map->m_plen >> PAGE_SHIFT);
	if (IS_ERR(pcl))
		return PTR_ERR(pcl);

	atomic_set(&pcl->obj.refcount, 1);
	pcl->algorithmformat = map->m_algorithmformat;
	pcl->length = 0;
	pcl->partial = true;

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = fe->owned_head;
	pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;

	/*
	 * lock all primary followed works before visible to others
	 * and mutex_trylock *never* fails for a new pcluster.
	 */
	mutex_init(&pcl->lock);
	DBG_BUGON(!mutex_trylock(&pcl->lock));

	if (ztailpacking) {
		pcl->obj.index = 0;	/* which indicates ztailpacking */
		pcl->pageofs_in = erofs_blkoff(map->m_pa);
		pcl->tailpacking_size = map->m_plen;
	} else {
		pcl->obj.index = map->m_pa >> PAGE_SHIFT;

		grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
		if (IS_ERR(grp)) {
			err = PTR_ERR(grp);
			goto err_out;
		}

		/* somebody else registered this pcluster concurrently */
		if (grp != &pcl->obj) {
			fe->pcl = container_of(grp,
					struct z_erofs_pcluster, obj);
			err = -EEXIST;
			goto err_out;
		}
	}
	/* used to check tail merging loop due to corrupted images */
	if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
		fe->tailpcl = pcl;
	fe->owned_head = &pcl->next;
	fe->pcl = pcl;
	return 0;

err_out:
	mutex_unlock(&pcl->lock);
	z_erofs_free_pcluster(pcl);
	return err;
}
714 | ||
/*
 * Find (or register) the pcluster covering the current map extent, claim
 * it for this chain, and prime the bvec iterator plus the in-place cursor.
 */
static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	struct erofs_workgroup *grp = NULL;
	int ret;

	DBG_BUGON(fe->pcl);

	/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

	if (!(map->m_flags & EROFS_MAP_META)) {
		grp = erofs_find_workgroup(fe->inode->i_sb,
					   map->m_pa >> PAGE_SHIFT);
	} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
		/* an inline (META) extent must not cross a page boundary */
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	if (grp) {
		fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
		ret = -EEXIST;
	} else {
		ret = z_erofs_register_pcluster(fe);
	}

	/* -EEXIST: an existing pcluster was found; lock and claim it */
	if (ret == -EEXIST) {
		mutex_lock(&fe->pcl->lock);
		/* used to check tail merging loop due to corrupted images */
		if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
			fe->tailpcl = fe->pcl;

		z_erofs_try_to_claim_pcluster(fe);
	} else if (ret) {
		return ret;
	}
	z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
				Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
	/* since file-backed online pages are traversed in reverse order */
	fe->icur = z_erofs_pclusterpages(fe->pcl);
	return 0;
}
758 | ||
/*
 * keep in mind that referenced pclusters are only freed
 * after an RCU grace period.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
	z_erofs_free_pcluster(container_of(head,
			struct z_erofs_pcluster, rcu));
}

/* defer the actual pcluster release to RCU to protect lockless lookups */
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);

	call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
776 | ||
/*
 * Finish processing the current pcluster: close the bvec iterator, drop
 * the pcluster lock and (unless this chain owns it) its reference.
 * Returns false when there was no pcluster in flight.
 */
static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
	struct z_erofs_pcluster *pcl = fe->pcl;

	if (!pcl)
		return false;

	z_erofs_bvec_iter_end(&fe->biter);
	mutex_unlock(&pcl->lock);

	/* the spare bvset page was never consumed; forget it */
	if (fe->candidate_bvpage) {
		DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
		fe->candidate_bvpage = NULL;
	}

	/*
	 * if all pending pages are added, don't hold its reference
	 * any longer if the pcluster isn't hosted by ourselves.
	 */
	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
		erofs_workgroup_put(&pcl->obj);

	fe->pcl = NULL;
	return true;
}
802 | ||
/*
 * Copy @len bytes of the inode's packed fragment (starting at @pos within
 * its fragment area) into @page at offset @pageofs, one block at a time.
 */
static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
				 struct page *page, unsigned int pageofs,
				 unsigned int len)
{
	struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	u8 *src, *dst;
	unsigned int i, cnt;

	/* a fragment extent without a packed inode is corruption */
	if (!packed_inode)
		return -EFSCORRUPTED;

	pos += EROFS_I(inode)->z_fragmentoff;
	for (i = 0; i < len; i += cnt) {
		/* copy up to the end of the current block */
		cnt = min_t(unsigned int, len - i,
			    EROFS_BLKSIZ - erofs_blkoff(pos));
		src = erofs_bread(&buf, packed_inode,
				  erofs_blknr(pos), EROFS_KMAP);
		if (IS_ERR(src)) {
			erofs_put_metabuf(&buf);
			return PTR_ERR(src);
		}

		dst = kmap_local_page(page);
		memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
		kunmap_local(dst);
		pos += cnt;
	}
	erofs_put_metabuf(&buf);
	return 0;
}
834 | ||
/*
 * Read one locked file page: walk it backwards from its end, peeling off
 * the tail part covered by the current extent each round, and attach every
 * in-range part to its pcluster's decompression chain.
 *
 * Returns 0 or a negative errno; in all cases the page is finalized via
 * z_erofs_onlinepage_endio() before returning (marked EIO on error).
 */
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page, struct page **pagepool)
{
	struct inode *const inode = fe->inode;
	struct erofs_map_blocks *const map = &fe->map;
	const loff_t offset = page_offset(page);
	bool tight = true, exclusive;
	unsigned int cur, end, spiltted;
	int err = 0;

	/* register locked file pages as online pages in pack */
	z_erofs_onlinepage_init(page);

	spiltted = 0;
	end = PAGE_SIZE;
repeat:
	/* probe the last byte of the remaining [0, end) range */
	cur = end - 1;

	if (offset + cur < map->m_la ||
	    offset + cur >= map->m_la + map->m_llen) {
		erofs_dbg("out-of-range map @ pos %llu", offset + cur);

		/* current extent exhausted: close it and map a new one */
		if (z_erofs_collector_end(fe))
			fe->backmost = false;
		map->m_la = offset + cur;
		map->m_llen = 0;
		err = z_erofs_map_blocks_iter(inode, map, 0);
		if (err)
			goto out;
	} else {
		if (fe->pcl)
			goto hitted;
		/* didn't get a valid pcluster previously (very rare) */
	}

	/* holes and fragments have no pcluster to collect into */
	if (!(map->m_flags & EROFS_MAP_MAPPED) ||
	    map->m_flags & EROFS_MAP_FRAGMENT)
		goto hitted;

	err = z_erofs_collector_begin(fe);
	if (err)
		goto out;

	if (z_erofs_is_inline_pcluster(fe->pcl)) {
		void *mp;

		/* pin the metadata page holding the tail-packed data */
		mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
					erofs_blknr(map->m_pa), EROFS_NO_KMAP);
		if (IS_ERR(mp)) {
			err = PTR_ERR(mp);
			erofs_err(inode->i_sb,
				  "failed to get inline page, err %d", err);
			goto out;
		}
		get_page(fe->map.buf.page);
		WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
			   fe->map.buf.page);
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
	} else {
		/* bind cache first when cached decompression is preferred */
		z_erofs_bind_cache(fe, pagepool);
	}
hitted:
	/*
	 * Ensure the current partial page belongs to this submit chain rather
	 * than other concurrent submit chains or the noio(bypass) chain since
	 * those chains are handled asynchronously thus the page cannot be used
	 * for inplace I/O or bvpage (should be processed in a strict order.)
	 */
	tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
		  fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);

	/* cur = start of the part of this page covered by the extent */
	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		/* hole: just zero-fill this part */
		zero_user_segment(page, cur, end);
		goto next_part;
	}
	if (map->m_flags & EROFS_MAP_FRAGMENT) {
		unsigned int pageofs, skip, len;

		if (offset > map->m_la) {
			pageofs = 0;
			skip = offset - map->m_la;
		} else {
			pageofs = map->m_la & ~PAGE_MASK;
			skip = 0;
		}
		len = min_t(unsigned int, map->m_llen - skip, end - cur);
		err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
		if (err)
			goto out;
		++spiltted;
		tight = false;
		goto next_part;
	}

	/* whether this page can be used exclusively (e.g. inplace I/O) */
	exclusive = (!cur && (!spiltted || tight));
	if (cur)
		tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);

retry:
	err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
					.page = page,
					.offset = offset - map->m_la,
					.end = end,
				  }), exclusive);
	/* should allocate an additional short-lived page for bvset */
	if (err == -EAGAIN && !fe->candidate_bvpage) {
		fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
		set_page_private(fe->candidate_bvpage,
				 Z_EROFS_SHORTLIVED_PAGE);
		goto retry;
	}

	if (err) {
		DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
		goto out;
	}

	z_erofs_onlinepage_split(page);
	/* bump up the number of spiltted parts of a page */
	++spiltted;
	/* parts of this page land at different output offsets */
	if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
		fe->pcl->multibases = true;
	if (fe->pcl->length < offset + end - map->m_la) {
		fe->pcl->length = offset + end - map->m_la;
		fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	}
	if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
	    !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
	    fe->pcl->length == map->m_llen)
		fe->pcl->partial = false;
next_part:
	/* shorten the remaining extent to update progress */
	map->m_llen = offset + cur - map->m_la;
	map->m_flags &= ~EROFS_MAP_FULL_MAPPED;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	if (err)
		z_erofs_page_mark_eio(page);
	z_erofs_onlinepage_endio(page);

	erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
		  __func__, page, spiltted, map->m_llen);
	return err;
}
985 | ||
40452ffc HJ |
986 | static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, |
987 | unsigned int readahead_pages) | |
988 | { | |
a2e20a25 | 989 | /* auto: enable for read_folio, disable for readahead */ |
40452ffc HJ |
990 | if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && |
991 | !readahead_pages) | |
992 | return true; | |
993 | ||
994 | if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && | |
995 | (readahead_pages <= sbi->opt.max_sync_decompress_pages)) | |
996 | return true; | |
997 | ||
998 | return false; | |
999 | } | |
1000 | ||
6aaa7b06 GX |
1001 | static bool z_erofs_page_is_invalidated(struct page *page) |
1002 | { | |
1003 | return !page->mapping && !z_erofs_is_shortlived_page(page); | |
1004 | } | |
1005 | ||
4f05687f GX |
1006 | struct z_erofs_decompress_backend { |
1007 | struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; | |
1008 | struct super_block *sb; | |
1009 | struct z_erofs_pcluster *pcl; | |
1010 | ||
1011 | /* pages with the longest decompressed length for deduplication */ | |
1012 | struct page **decompressed_pages; | |
1013 | /* pages to keep the compressed data */ | |
1014 | struct page **compressed_pages; | |
1015 | ||
267f2492 | 1016 | struct list_head decompressed_secondary_bvecs; |
4f05687f | 1017 | struct page **pagepool; |
2bfab9c0 | 1018 | unsigned int onstack_used, nr_pages; |
4f05687f GX |
1019 | }; |
1020 | ||
267f2492 GX |
1021 | struct z_erofs_bvec_item { |
1022 | struct z_erofs_bvec bvec; | |
1023 | struct list_head list; | |
1024 | }; | |
1025 | ||
1026 | static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, | |
1027 | struct z_erofs_bvec *bvec) | |
3fe96ee0 | 1028 | { |
267f2492 | 1029 | struct z_erofs_bvec_item *item; |
3fe96ee0 | 1030 | |
267f2492 GX |
1031 | if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { |
1032 | unsigned int pgnr; | |
3fe96ee0 | 1033 | |
267f2492 GX |
1034 | pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; |
1035 | DBG_BUGON(pgnr >= be->nr_pages); | |
63bbb856 GX |
1036 | if (!be->decompressed_pages[pgnr]) { |
1037 | be->decompressed_pages[pgnr] = bvec->page; | |
267f2492 | 1038 | return; |
63bbb856 | 1039 | } |
267f2492 GX |
1040 | } |
1041 | ||
1042 | /* (cold path) one pcluster is requested multiple times */ | |
1043 | item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); | |
1044 | item->bvec = *bvec; | |
1045 | list_add(&item->list, &be->decompressed_secondary_bvecs); | |
1046 | } | |
1047 | ||
/*
 * Copy decompressed data from the primary per-page slots into every
 * secondary bvec recorded by z_erofs_do_decompressed_bvec(), then finalize
 * and free each secondary bvec.  @err marks all secondary pages EIO.
 */
static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
				      int err)
{
	unsigned int off0 = be->pcl->pageofs_out;
	struct list_head *p, *n;

	list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
		struct z_erofs_bvec_item *bvi;
		unsigned int end, cur;
		void *dst, *src;

		bvi = container_of(p, struct z_erofs_bvec_item, list);
		/* a negative offset means the bvec starts before the output */
		cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
		end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
			    bvi->bvec.end);
		dst = kmap_local_page(bvi->bvec.page);
		while (cur < end) {
			unsigned int pgnr, scur, len;

			/* locate the primary page covering this position */
			pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
			DBG_BUGON(pgnr >= be->nr_pages);

			/* scur = offset of the data inside that primary page */
			scur = bvi->bvec.offset + cur -
					((pgnr << PAGE_SHIFT) - off0);
			len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
			if (!be->decompressed_pages[pgnr]) {
				/* primary slot missing: corrupted image */
				err = -EFSCORRUPTED;
				cur += len;
				continue;
			}
			src = kmap_local_page(be->decompressed_pages[pgnr]);
			memcpy(dst + cur, src + scur, len);
			kunmap_local(src);
			cur += len;
		}
		kunmap_local(dst);
		if (err)
			z_erofs_page_mark_eio(bvi->bvec.page);
		z_erofs_onlinepage_endio(bvi->bvec.page);
		list_del(p);
		kfree(bvi);
	}
}
1091 | ||
267f2492 | 1092 | static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) |
42fec235 | 1093 | { |
4f05687f | 1094 | struct z_erofs_pcluster *pcl = be->pcl; |
06a304cd GX |
1095 | struct z_erofs_bvec_iter biter; |
1096 | struct page *old_bvpage; | |
267f2492 | 1097 | int i; |
42fec235 | 1098 | |
387bab87 | 1099 | z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); |
42fec235 | 1100 | for (i = 0; i < pcl->vcnt; ++i) { |
06a304cd | 1101 | struct z_erofs_bvec bvec; |
42fec235 | 1102 | |
06a304cd | 1103 | z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); |
42fec235 | 1104 | |
06a304cd | 1105 | if (old_bvpage) |
4f05687f | 1106 | z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); |
42fec235 | 1107 | |
06a304cd | 1108 | DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); |
267f2492 | 1109 | z_erofs_do_decompressed_bvec(be, &bvec); |
42fec235 | 1110 | } |
06a304cd GX |
1111 | |
1112 | old_bvpage = z_erofs_bvec_iter_end(&biter); | |
1113 | if (old_bvpage) | |
4f05687f | 1114 | z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); |
42fec235 GX |
1115 | } |
1116 | ||
4f05687f GX |
1117 | static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, |
1118 | bool *overlapped) | |
67139e36 | 1119 | { |
4f05687f | 1120 | struct z_erofs_pcluster *pcl = be->pcl; |
67139e36 | 1121 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
67139e36 GX |
1122 | int i, err = 0; |
1123 | ||
67139e36 | 1124 | *overlapped = false; |
67139e36 | 1125 | for (i = 0; i < pclusterpages; ++i) { |
ed722fbc GX |
1126 | struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; |
1127 | struct page *page = bvec->page; | |
67139e36 GX |
1128 | |
1129 | /* compressed pages ought to be present before decompressing */ | |
1130 | if (!page) { | |
1131 | DBG_BUGON(1); | |
1132 | continue; | |
1133 | } | |
fe3e5914 | 1134 | be->compressed_pages[i] = page; |
67139e36 GX |
1135 | |
1136 | if (z_erofs_is_inline_pcluster(pcl)) { | |
1137 | if (!PageUptodate(page)) | |
1138 | err = -EIO; | |
1139 | continue; | |
1140 | } | |
1141 | ||
1142 | DBG_BUGON(z_erofs_page_is_invalidated(page)); | |
1143 | if (!z_erofs_is_shortlived_page(page)) { | |
4f05687f | 1144 | if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { |
67139e36 GX |
1145 | if (!PageUptodate(page)) |
1146 | err = -EIO; | |
1147 | continue; | |
1148 | } | |
267f2492 | 1149 | z_erofs_do_decompressed_bvec(be, bvec); |
67139e36 GX |
1150 | *overlapped = true; |
1151 | } | |
67139e36 GX |
1152 | } |
1153 | ||
fe3e5914 | 1154 | if (err) |
4f05687f | 1155 | return err; |
4f05687f | 1156 | return 0; |
67139e36 GX |
1157 | } |
1158 | ||
/*
 * Decompress a single pcluster under its mutex: set up the input/output
 * page arrays (on-stack when they fit), run the decompressor, then release
 * compressed pages, fill secondary copies, finalize output pages and reset
 * the pcluster for reuse.  @err carries a pre-existing I/O error in.
 */
static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
				       int err)
{
	struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
	struct z_erofs_pcluster *pcl = be->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	unsigned int i, inputsize;
	int err2;
	struct page *page;
	bool overlapped;

	mutex_lock(&pcl->lock);
	be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;

	/* allocate (de)compressed page arrays if cannot be kept on stack */
	be->decompressed_pages = NULL;
	be->compressed_pages = NULL;
	be->onstack_used = 0;
	if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
		be->decompressed_pages = be->onstack_pages;
		be->onstack_used = be->nr_pages;
		memset(be->decompressed_pages, 0,
		       sizeof(struct page *) * be->nr_pages);
	}

	/* the compressed array may share the remainder of the on-stack array */
	if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
		be->compressed_pages = be->onstack_pages + be->onstack_used;

	if (!be->decompressed_pages)
		be->decompressed_pages =
			kcalloc(be->nr_pages, sizeof(struct page *),
				GFP_KERNEL | __GFP_NOFAIL);
	if (!be->compressed_pages)
		be->compressed_pages =
			kcalloc(pclusterpages, sizeof(struct page *),
				GFP_KERNEL | __GFP_NOFAIL);

	z_erofs_parse_out_bvecs(be);
	err2 = z_erofs_parse_in_bvecs(be, &overlapped);
	if (err2)
		err = err2;
	if (err)
		goto out;

	if (z_erofs_is_inline_pcluster(pcl))
		inputsize = pcl->tailpacking_size;
	else
		inputsize = pclusterpages * PAGE_SIZE;

	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
					.sb = be->sb,
					.in = be->compressed_pages,
					.out = be->decompressed_pages,
					.pageofs_in = pcl->pageofs_in,
					.pageofs_out = pcl->pageofs_out,
					.inputsize = inputsize,
					.outputsize = pcl->length,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = pcl->partial,
					.fillgaps = pcl->multibases,
				 }, be->pagepool);

out:
	/* must handle all compressed pages before actual file pages */
	if (z_erofs_is_inline_pcluster(pcl)) {
		/* drop the reference taken on the inline metadata page */
		page = pcl->compressed_bvecs[0].page;
		WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
		put_page(page);
	} else {
		for (i = 0; i < pclusterpages; ++i) {
			page = pcl->compressed_bvecs[i].page;

			if (erofs_page_is_managed(sbi, page))
				continue;

			/* recycle all individual short-lived pages */
			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
		}
	}
	/* free the compressed array only if it was heap-allocated */
	if (be->compressed_pages < be->onstack_pages ||
	    be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
		kfree(be->compressed_pages);
	z_erofs_fill_other_copies(be, err);

	for (i = 0; i < be->nr_pages; ++i) {
		page = be->decompressed_pages[i];
		if (!page)
			continue;

		DBG_BUGON(z_erofs_page_is_invalidated(page));

		/* recycle all individual short-lived pages */
		if (z_erofs_put_shortlivedpage(be->pagepool, page))
			continue;
		if (err)
			z_erofs_page_mark_eio(page);
		z_erofs_onlinepage_endio(page);
	}

	if (be->decompressed_pages != be->onstack_pages)
		kfree(be->decompressed_pages);

	/* reset the pcluster so it can be collected again */
	pcl->length = 0;
	pcl->partial = true;
	pcl->multibases = false;
	pcl->bvset.nextpage = NULL;
	pcl->vcnt = 0;

	/* pcluster lock MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
	mutex_unlock(&pcl->lock);
	return err;
}
1274 | ||
0c638f70 | 1275 | static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, |
eaa9172a | 1276 | struct page **pagepool) |
3883a79a | 1277 | { |
4f05687f GX |
1278 | struct z_erofs_decompress_backend be = { |
1279 | .sb = io->sb, | |
1280 | .pagepool = pagepool, | |
267f2492 GX |
1281 | .decompressed_secondary_bvecs = |
1282 | LIST_HEAD_INIT(be.decompressed_secondary_bvecs), | |
4f05687f | 1283 | }; |
97e86a85 | 1284 | z_erofs_next_pcluster_t owned = io->head; |
3883a79a | 1285 | |
97e86a85 | 1286 | while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { |
4f05687f | 1287 | /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ |
97e86a85 | 1288 | DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); |
4f05687f | 1289 | /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ |
97e86a85 | 1290 | DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); |
3883a79a | 1291 | |
4f05687f GX |
1292 | be.pcl = container_of(owned, struct z_erofs_pcluster, next); |
1293 | owned = READ_ONCE(be.pcl->next); | |
3883a79a | 1294 | |
4f05687f GX |
1295 | z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); |
1296 | erofs_workgroup_put(&be.pcl->obj); | |
3978c8e3 | 1297 | } |
3883a79a GX |
1298 | } |
1299 | ||
0c638f70 | 1300 | static void z_erofs_decompressqueue_work(struct work_struct *work) |
3883a79a | 1301 | { |
a4b1fab1 GX |
1302 | struct z_erofs_decompressqueue *bgq = |
1303 | container_of(work, struct z_erofs_decompressqueue, u.work); | |
eaa9172a | 1304 | struct page *pagepool = NULL; |
3883a79a | 1305 | |
a4b1fab1 | 1306 | DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); |
0c638f70 | 1307 | z_erofs_decompress_queue(bgq, &pagepool); |
3883a79a | 1308 | |
eaa9172a | 1309 | erofs_release_pages(&pagepool); |
a4b1fab1 | 1310 | kvfree(bgq); |
3883a79a GX |
1311 | } |
1312 | ||
/*
 * Account @bios completed/submitted bios on @io and, once the count drops
 * to zero, kick off decompression: complete the waiter for sync queues, or
 * run the work (inline, or via workqueue in atomic context) otherwise.
 */
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       int bios)
{
	struct erofs_sb_info *const sbi = EROFS_SB(io->sb);

	/* wake up the caller thread for sync decompression */
	if (io->sync) {
		if (!atomic_add_return(bios, &io->pending_bios))
			complete(&io->u.done);
		return;
	}

	if (atomic_add_return(bios, &io->pending_bios))
		return;
	/* Use workqueue and sync decompression for atomic contexts only */
	if (in_atomic() || irqs_disabled()) {
		queue_work(z_erofs_workqueue, &io->u.work);
		/* enable sync decompression for readahead */
		if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
			sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
		return;
	}
	/* process context: decompress right here */
	z_erofs_decompressqueue_work(&io->u.work);
}
1337 | ||
/*
 * Pick the @nr'th compressed page of @pcl for I/O submission: reuse the
 * recorded page (preallocated, short-lived, or managed-cache) when
 * possible, otherwise allocate a fresh one and race-install it with
 * cmpxchg.  Returns NULL when no I/O is needed (already up-to-date).
 * NOTE(review): bit 0 of the recorded pointer appears to tag a freshly
 * found cache page ("justfound") — set elsewhere in this file; verify.
 */
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
					       unsigned int nr,
					       struct page **pagepool,
					       struct address_space *mc)
{
	const pgoff_t index = pcl->obj.index;
	gfp_t gfp = mapping_gfp_mask(mc);
	bool tocache = false;

	struct address_space *mapping;
	struct page *oldpage, *page;
	int justfound;

repeat:
	page = READ_ONCE(pcl->compressed_bvecs[nr].page);
	oldpage = page;

	if (!page)
		goto out_allocpage;

	/* strip the "justfound" tag from the low pointer bit */
	justfound = (unsigned long)page & 1UL;
	page = (struct page *)((unsigned long)page & ~1UL);

	/*
	 * preallocated cached pages, which is used to avoid direct reclaim
	 * otherwise, it will go inplace I/O path instead.
	 */
	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
		WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
		set_page_private(page, 0);
		tocache = true;
		goto out_tocache;
	}
	mapping = READ_ONCE(page->mapping);

	/*
	 * file-backed online pages in pcluster are all locked steady,
	 * therefore it is impossible for `mapping' to be NULL.
	 */
	if (mapping && mapping != mc)
		/* ought to be unmanaged pages */
		goto out;

	/* directly return for shortlived page as well */
	if (z_erofs_is_shortlived_page(page))
		goto out;

	lock_page(page);

	/* only true if page reclaim goes wrong, should never happen */
	DBG_BUGON(justfound && PagePrivate(page));

	/* the page is still in manage cache */
	if (page->mapping == mc) {
		WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);

		if (!PagePrivate(page)) {
			/*
			 * impossible to be !PagePrivate(page) for
			 * the current restriction as well if
			 * the page is already in compressed_bvecs[].
			 */
			DBG_BUGON(!justfound);

			justfound = 0;
			set_page_private(page, (unsigned long)pcl);
			SetPagePrivate(page);
		}

		/* no need to submit io if it is already up-to-date */
		if (PageUptodate(page)) {
			unlock_page(page);
			page = NULL;
		}
		goto out;
	}

	/*
	 * the managed page has been truncated, it's unsafe to
	 * reuse this one, let's allocate a new cache-managed page.
	 */
	DBG_BUGON(page->mapping);
	DBG_BUGON(!justfound);

	tocache = true;
	unlock_page(page);
	put_page(page);
out_allocpage:
	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	/* install the new page; if we lost the race, recycle it and retry */
	if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
			       oldpage, page)) {
		erofs_pagepool_add(pagepool, page);
		cond_resched();
		goto repeat;
	}
out_tocache:
	if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* turn into temporary page if fails (1 ref) */
		set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
		goto out;
	}
	attach_page_private(page, pcl);
	/* drop a refcount added by allocpage (then we have 2 refs here) */
	put_page(page);

out:	/* the only exit (for tracing and debugging) */
	return page;
}
1446 | ||
cdba5506 GX |
1447 | static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, |
1448 | struct z_erofs_decompressqueue *fgq, bool *fg) | |
3883a79a | 1449 | { |
a4b1fab1 | 1450 | struct z_erofs_decompressqueue *q; |
3883a79a | 1451 | |
a4b1fab1 GX |
1452 | if (fg && !*fg) { |
1453 | q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); | |
1454 | if (!q) { | |
1455 | *fg = true; | |
1456 | goto fg_out; | |
1457 | } | |
0c638f70 | 1458 | INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); |
a4b1fab1 GX |
1459 | } else { |
1460 | fg_out: | |
1461 | q = fgq; | |
60b30050 | 1462 | init_completion(&fgq->u.done); |
a4b1fab1 | 1463 | atomic_set(&fgq->pending_bios, 0); |
67148551 | 1464 | q->eio = false; |
cdba5506 | 1465 | q->sync = true; |
3883a79a | 1466 | } |
a4b1fab1 GX |
1467 | q->sb = sb; |
1468 | q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; | |
1469 | return q; | |
3883a79a GX |
1470 | } |
1471 | ||
/* define decompression jobqueue types */
enum {
	JQ_BYPASS,	/* pclusters needing no device I/O */
	JQ_SUBMIT,	/* pclusters whose bios get submitted */
	NR_JOBQUEUES,
};
1478 | ||
/*
 * Unhook @pcl from the submission chain and append it to the bypass
 * jobqueue: the submit tail is relinked to @owned_head (closing an open
 * tail first) and the bypass tail advanced to this pcluster's next field.
 */
static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

	/* terminate the bypass chain at this pcluster */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);

	qtail[JQ_BYPASS] = &pcl->next;
}
1497 | ||
7865827c GX |
1498 | static void z_erofs_decompressqueue_endio(struct bio *bio) |
1499 | { | |
cdba5506 | 1500 | struct z_erofs_decompressqueue *q = bio->bi_private; |
7865827c GX |
1501 | blk_status_t err = bio->bi_status; |
1502 | struct bio_vec *bvec; | |
1503 | struct bvec_iter_all iter_all; | |
1504 | ||
1505 | bio_for_each_segment_all(bvec, bio, iter_all) { | |
1506 | struct page *page = bvec->bv_page; | |
1507 | ||
1508 | DBG_BUGON(PageUptodate(page)); | |
1509 | DBG_BUGON(z_erofs_page_is_invalidated(page)); | |
1510 | ||
7865827c GX |
1511 | if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { |
1512 | if (!err) | |
1513 | SetPageUptodate(page); | |
1514 | unlock_page(page); | |
1515 | } | |
1516 | } | |
67148551 GX |
1517 | if (err) |
1518 | q->eio = true; | |
cdba5506 | 1519 | z_erofs_decompress_kickoff(q, -1); |
7865827c GX |
1520 | bio_put(bio); |
1521 | } | |
1522 | ||
/*
 * Walk the pcluster chain starting at f->owned_head, build and submit read
 * bios for the compressed pages of each pcluster (merging physically
 * contiguous blocks on the same bdev into one bio), and divert pclusters
 * needing no device I/O (inline, or fully cached) to the bypass jobqueue.
 */
static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
				 struct page **pagepool,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg)
{
	struct super_block *sb = f->inode->i_sb;
	struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	z_erofs_next_pcluster_t owned_head = f->owned_head;
	/* bio is NULL initially, so no need to initialize last_{index,bdev} */
	pgoff_t last_index;
	struct block_device *last_bdev;
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;
	unsigned long pflags;
	int memstall = 0;

	/*
	 * if managed cache is enabled, bypass jobqueue is needed,
	 * no need to read from device for all pclusters in this queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);

	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need io submission */
	q[JQ_SUBMIT]->head = owned_head;

	do {
		struct erofs_map_dev mdev;
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		/* no possible 'owned_head' equals the following */
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned_head, struct z_erofs_pcluster, next);

		/* close the main owned chain at first */
		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
				     Z_EROFS_PCLUSTER_TAIL_CLOSED);
		/* inline pclusters never touch the device */
		if (z_erofs_is_inline_pcluster(pcl)) {
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
			continue;
		}

		/* no device id here, thus it will always succeed */
		mdev = (struct erofs_map_dev) {
			.m_pa = blknr_to_addr(pcl->obj.index),
		};
		(void)erofs_map_dev(sb, &mdev);

		cur = erofs_blknr(mdev.m_pa);
		end = cur + pcl->pclusterpages;

		do {
			struct page *page;

			page = pickup_page_for_submission(pcl, i++, pagepool,
							  mc);
			if (!page)
				continue;

			/* flush the pending bio if this block can't merge */
			if (bio && (cur != last_index + 1 ||
				    last_bdev != mdev.m_bdev)) {
submit_bio_retry:
				submit_bio(bio);
				if (memstall) {
					psi_memstall_leave(&pflags);
					memstall = 0;
				}
				bio = NULL;
			}

			/* account memory pressure while refaulting pages */
			if (unlikely(PageWorkingset(page)) && !memstall) {
				psi_memstall_enter(&pflags);
				memstall = 1;
			}

			if (!bio) {
				bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
						REQ_OP_READ, GFP_NOIO);
				bio->bi_end_io = z_erofs_decompressqueue_endio;

				last_bdev = mdev.m_bdev;
				bio->bi_iter.bi_sector = (sector_t)cur <<
					LOG_SECTORS_PER_BLOCK;
				bio->bi_private = q[JQ_SUBMIT];
				if (f->readahead)
					bio->bi_opf |= REQ_RAHEAD;
				++nr_bios;
			}

			/* bio full: submit it and start a fresh one */
			if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
				goto submit_bio_retry;

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio) {
		submit_bio(bio);
		if (memstall)
			psi_memstall_leave(&pflags);
	}

	/*
	 * although background is preferred, no one is pending for submission.
	 * don't issue workqueue for decompression but drop it directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
1651 | ||
83a386c0 | 1652 | static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, |
eaa9172a | 1653 | struct page **pagepool, bool force_fg) |
3883a79a | 1654 | { |
a4b1fab1 | 1655 | struct z_erofs_decompressqueue io[NR_JOBQUEUES]; |
3883a79a | 1656 | |
5c6dcc57 | 1657 | if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) |
3883a79a | 1658 | return; |
83a386c0 | 1659 | z_erofs_submit_queue(f, pagepool, io, &force_fg); |
3883a79a | 1660 | |
0c638f70 GX |
1661 | /* handle bypass queue (no i/o pclusters) immediately */ |
1662 | z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); | |
4279f3f9 | 1663 | |
3883a79a GX |
1664 | if (!force_fg) |
1665 | return; | |
1666 | ||
1667 | /* wait until all bios are completed */ | |
60b30050 | 1668 | wait_for_completion_io(&io[JQ_SUBMIT].u.done); |
3883a79a | 1669 | |
0c638f70 GX |
1670 | /* handle synchronous decompress queue in the caller context */ |
1671 | z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); | |
3883a79a GX |
1672 | } |
1673 | ||
/*
 * Since partial uptodate is still unimplemented for now, we have to use
 * approximate readmore strategies as a start.
 *
 * Extend the requested range so whole pclusters are read in one go:
 * @backmost == true handles the trailing edge (maps the extent at @end
 * first), @backmost == false handles the leading edge using the map
 * state left by the previous call.  Extra pages are grabbed and fed
 * through z_erofs_do_read_page() on a best-effort basis.
 */
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
				      struct readahead_control *rac,
				      erofs_off_t end,
				      struct page **pagepool,
				      bool backmost)
{
	struct inode *inode = f->inode;
	struct erofs_map_blocks *map = &f->map;
	erofs_off_t cur;
	int err;

	if (backmost) {
		/* map the (compressed) extent covering the last byte */
		map->m_la = end;
		err = z_erofs_map_blocks_iter(inode, map,
					      EROFS_GET_BLOCKS_READMORE);
		if (err)
			return;

		/* expend ra for the trailing edge if readahead */
		if (rac) {
			loff_t newstart = readahead_pos(rac);

			/*
			 * grow the readahead window up to the extent end;
			 * the core readahead code then grabs the pages.
			 */
			cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
			readahead_expand(rac, newstart, cur - newstart);
			return;
		}
		end = round_up(end, PAGE_SIZE);
	} else {
		end = round_up(map->m_la, PAGE_SIZE);

		/* no mapped extent left from the prior pass */
		if (!map->m_llen)
			return;
	}

	/* walk pages backwards from the extent end down to @end */
	cur = map->m_la + map->m_llen - 1;
	while (cur >= end) {
		pgoff_t index = cur >> PAGE_SHIFT;
		struct page *page;

		/* best-effort, non-blocking grab; skip on failure */
		page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
		if (page) {
			if (PageUptodate(page)) {
				unlock_page(page);
			} else {
				err = z_erofs_do_read_page(f, page, pagepool);
				if (err)
					erofs_err(inode->i_sb,
						  "readmore error at page %lu @ nid %llu",
						  index, EROFS_I(inode)->nid);
			}
			put_page(page);
		}

		/* cur is unsigned: stop before it would wrap below zero */
		if (cur < PAGE_SIZE)
			break;
		cur = (index << PAGE_SHIFT) - 1;
	}
}
1736 | ||
a2e20a25 | 1737 | static int z_erofs_read_folio(struct file *file, struct folio *folio) |
3883a79a | 1738 | { |
a2e20a25 | 1739 | struct page *page = &folio->page; |
3883a79a | 1740 | struct inode *const inode = page->mapping->host; |
40452ffc | 1741 | struct erofs_sb_info *const sbi = EROFS_I_SB(inode); |
97e86a85 | 1742 | struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); |
eaa9172a | 1743 | struct page *pagepool = NULL; |
3883a79a | 1744 | int err; |
3883a79a | 1745 | |
ba9ce771 | 1746 | trace_erofs_readpage(page, false); |
f0c519fc GX |
1747 | f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; |
1748 | ||
38629291 GX |
1749 | z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, |
1750 | &pagepool, true); | |
1825c8d7 | 1751 | err = z_erofs_do_read_page(&f, page, &pagepool); |
38629291 GX |
1752 | z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); |
1753 | ||
5c6dcc57 | 1754 | (void)z_erofs_collector_end(&f); |
3883a79a | 1755 | |
ee45197c | 1756 | /* if some compressed cluster ready, need submit them anyway */ |
83a386c0 | 1757 | z_erofs_runqueue(&f, &pagepool, |
40452ffc | 1758 | z_erofs_get_sync_decompress_policy(sbi, 0)); |
ee45197c GX |
1759 | |
1760 | if (err) | |
4f761fa2 | 1761 | erofs_err(inode->i_sb, "failed to read, err [%d]", err); |
3883a79a | 1762 | |
09c54379 | 1763 | erofs_put_metabuf(&f.map.buf); |
eaa9172a | 1764 | erofs_release_pages(&pagepool); |
ee45197c | 1765 | return err; |
3883a79a GX |
1766 | } |
1767 | ||
0615090c | 1768 | static void z_erofs_readahead(struct readahead_control *rac) |
3883a79a | 1769 | { |
0615090c | 1770 | struct inode *const inode = rac->mapping->host; |
5fb76bb0 | 1771 | struct erofs_sb_info *const sbi = EROFS_I_SB(inode); |
97e86a85 | 1772 | struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); |
eaa9172a | 1773 | struct page *pagepool = NULL, *head = NULL, *page; |
38629291 | 1774 | unsigned int nr_pages; |
3883a79a | 1775 | |
6ea5aad3 | 1776 | f.readahead = true; |
0615090c | 1777 | f.headoffset = readahead_pos(rac); |
3883a79a | 1778 | |
38629291 GX |
1779 | z_erofs_pcluster_readmore(&f, rac, f.headoffset + |
1780 | readahead_length(rac) - 1, &pagepool, true); | |
1781 | nr_pages = readahead_count(rac); | |
1782 | trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); | |
2d9b5dcd | 1783 | |
38629291 | 1784 | while ((page = readahead_page(rac))) { |
3883a79a GX |
1785 | set_page_private(page, (unsigned long)head); |
1786 | head = page; | |
1787 | } | |
1788 | ||
42d40b4a | 1789 | while (head) { |
3883a79a GX |
1790 | struct page *page = head; |
1791 | int err; | |
1792 | ||
1793 | /* traversal in reverse order */ | |
1794 | head = (void *)page_private(page); | |
1795 | ||
1825c8d7 | 1796 | err = z_erofs_do_read_page(&f, page, &pagepool); |
a5876e24 | 1797 | if (err) |
4f761fa2 GX |
1798 | erofs_err(inode->i_sb, |
1799 | "readahead error at page %lu @ nid %llu", | |
1800 | page->index, EROFS_I(inode)->nid); | |
3883a79a GX |
1801 | put_page(page); |
1802 | } | |
38629291 | 1803 | z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); |
5c6dcc57 | 1804 | (void)z_erofs_collector_end(&f); |
3883a79a | 1805 | |
83a386c0 | 1806 | z_erofs_runqueue(&f, &pagepool, |
40452ffc | 1807 | z_erofs_get_sync_decompress_policy(sbi, nr_pages)); |
09c54379 | 1808 | erofs_put_metabuf(&f.map.buf); |
eaa9172a | 1809 | erofs_release_pages(&pagepool); |
3883a79a GX |
1810 | } |
1811 | ||
/* address_space operations for z_erofs (compressed) data mappings */
const struct address_space_operations z_erofs_aops = {
	.read_folio = z_erofs_read_folio,
	.readahead = z_erofs_readahead,
};