Commit | Line | Data |
---|---|---|
8a1d0f9c EB |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
704528d8 | 3 | * Data verification functions, i.e. hooks for ->readahead() |
8a1d0f9c EB |
4 | * |
5 | * Copyright 2019 Google LLC | |
6 | */ | |
7 | ||
8 | #include "fsverity_private.h" | |
9 | ||
10 | #include <crypto/hash.h> | |
11 | #include <linux/bio.h> | |
8a1d0f9c EB |
12 | |
/*
 * Workqueue used for read verification work.  It is allocated by
 * fsverity_init_workqueue(), fed by fsverity_enqueue_verify_work(), and torn
 * down by fsverity_exit_workqueue().
 */
static struct workqueue_struct *fsverity_read_workqueue;
14 | ||
8a1d0f9c EB |
15 | static inline int cmp_hashes(const struct fsverity_info *vi, |
16 | const u8 *want_hash, const u8 *real_hash, | |
5306892a | 17 | u64 data_pos, int level) |
8a1d0f9c EB |
18 | { |
19 | const unsigned int hsize = vi->tree_params.digest_size; | |
20 | ||
21 | if (memcmp(want_hash, real_hash, hsize) == 0) | |
22 | return 0; | |
23 | ||
24 | fsverity_err(vi->inode, | |
5306892a EB |
25 | "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", |
26 | data_pos, level, | |
8a1d0f9c EB |
27 | vi->tree_params.hash_alg->name, hsize, want_hash, |
28 | vi->tree_params.hash_alg->name, hsize, real_hash); | |
29 | return -EBADMSG; | |
30 | } | |
31 | ||
5306892a EB |
32 | static bool data_is_zeroed(struct inode *inode, struct page *page, |
33 | unsigned int len, unsigned int offset) | |
34 | { | |
35 | void *virt = kmap_local_page(page); | |
36 | ||
37 | if (memchr_inv(virt + offset, 0, len)) { | |
38 | kunmap_local(virt); | |
39 | fsverity_err(inode, | |
40 | "FILE CORRUPTED! Data past EOF is not zeroed"); | |
41 | return false; | |
42 | } | |
43 | kunmap_local(virt); | |
44 | return true; | |
45 | } | |
46 | ||
47 | /* | |
48 | * Returns true if the hash block with index @hblock_idx in the tree, located in | |
49 | * @hpage, has already been verified. | |
50 | */ | |
51 | static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, | |
52 | unsigned long hblock_idx) | |
53 | { | |
54 | bool verified; | |
55 | unsigned int blocks_per_page; | |
56 | unsigned int i; | |
57 | ||
58 | /* | |
59 | * When the Merkle tree block size and page size are the same, then the | |
60 | * ->hash_block_verified bitmap isn't allocated, and we use PG_checked | |
61 | * to directly indicate whether the page's block has been verified. | |
62 | * | |
63 | * Using PG_checked also guarantees that we re-verify hash pages that | |
64 | * get evicted and re-instantiated from the backing storage, as new | |
65 | * pages always start out with PG_checked cleared. | |
66 | */ | |
67 | if (!vi->hash_block_verified) | |
68 | return PageChecked(hpage); | |
69 | ||
70 | /* | |
71 | * When the Merkle tree block size and page size differ, we use a bitmap | |
72 | * to indicate whether each hash block has been verified. | |
73 | * | |
74 | * However, we still need to ensure that hash pages that get evicted and | |
75 | * re-instantiated from the backing storage are re-verified. To do | |
76 | * this, we use PG_checked again, but now it doesn't really mean | |
77 | * "checked". Instead, now it just serves as an indicator for whether | |
78 | * the hash page is newly instantiated or not. | |
79 | * | |
80 | * The first thread that sees PG_checked=0 must clear the corresponding | |
81 | * bitmap bits, then set PG_checked=1. This requires a spinlock. To | |
82 | * avoid having to take this spinlock in the common case of | |
83 | * PG_checked=1, we start with an opportunistic lockless read. | |
84 | */ | |
85 | if (PageChecked(hpage)) { | |
86 | /* | |
87 | * A read memory barrier is needed here to give ACQUIRE | |
88 | * semantics to the above PageChecked() test. | |
89 | */ | |
90 | smp_rmb(); | |
91 | return test_bit(hblock_idx, vi->hash_block_verified); | |
92 | } | |
93 | spin_lock(&vi->hash_page_init_lock); | |
94 | if (PageChecked(hpage)) { | |
95 | verified = test_bit(hblock_idx, vi->hash_block_verified); | |
96 | } else { | |
97 | blocks_per_page = vi->tree_params.blocks_per_page; | |
98 | hblock_idx = round_down(hblock_idx, blocks_per_page); | |
99 | for (i = 0; i < blocks_per_page; i++) | |
100 | clear_bit(hblock_idx + i, vi->hash_block_verified); | |
101 | /* | |
102 | * A write memory barrier is needed here to give RELEASE | |
103 | * semantics to the below SetPageChecked() operation. | |
104 | */ | |
105 | smp_wmb(); | |
106 | SetPageChecked(hpage); | |
107 | verified = false; | |
108 | } | |
109 | spin_unlock(&vi->hash_page_init_lock); | |
110 | return verified; | |
111 | } | |
112 | ||
8a1d0f9c | 113 | /* |
5306892a | 114 | * Verify a single data block against the file's Merkle tree. |
8a1d0f9c EB |
115 | * |
116 | * In principle, we need to verify the entire path to the root node. However, | |
5306892a EB |
117 | * for efficiency the filesystem may cache the hash blocks. Therefore we need |
118 | * only ascend the tree until an already-verified hash block is seen, and then | |
119 | * verify the path to that block. | |
8a1d0f9c | 120 | * |
5306892a | 121 | * Return: %true if the data block is valid, else %false. |
8a1d0f9c | 122 | */ |
5306892a EB |
123 | static bool |
124 | verify_data_block(struct inode *inode, struct fsverity_info *vi, | |
125 | struct ahash_request *req, struct page *data_page, | |
126 | u64 data_pos, unsigned int dblock_offset_in_page, | |
127 | unsigned long max_ra_pages) | |
8a1d0f9c EB |
128 | { |
129 | const struct merkle_tree_params *params = &vi->tree_params; | |
130 | const unsigned int hsize = params->digest_size; | |
8a1d0f9c EB |
131 | int level; |
132 | u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; | |
133 | const u8 *want_hash; | |
134 | u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; | |
5306892a EB |
135 | /* The hash blocks that are traversed, indexed by level */ |
136 | struct { | |
137 | /* Page containing the hash block */ | |
138 | struct page *page; | |
139 | /* Index of the hash block in the tree overall */ | |
140 | unsigned long index; | |
141 | /* Byte offset of the hash block within @page */ | |
142 | unsigned int offset_in_page; | |
143 | /* Byte offset of the wanted hash within @page */ | |
144 | unsigned int hoffset; | |
145 | } hblocks[FS_VERITY_MAX_LEVELS]; | |
146 | /* | |
147 | * The index of the previous level's block within that level; also the | |
148 | * index of that block's hash within the current level. | |
149 | */ | |
150 | u64 hidx = data_pos >> params->log_blocksize; | |
8a1d0f9c EB |
151 | int err; |
152 | ||
5306892a EB |
153 | if (unlikely(data_pos >= inode->i_size)) { |
154 | /* | |
155 | * This can happen in the data page spanning EOF when the Merkle | |
156 | * tree block size is less than the page size. The Merkle tree | |
157 | * doesn't cover data blocks fully past EOF. But the entire | |
158 | * page spanning EOF can be visible to userspace via a mmap, and | |
159 | * any part past EOF should be all zeroes. Therefore, we need | |
160 | * to verify that any data blocks fully past EOF are all zeroes. | |
161 | */ | |
162 | return data_is_zeroed(inode, data_page, params->block_size, | |
163 | dblock_offset_in_page); | |
164 | } | |
8a1d0f9c | 165 | |
8a1d0f9c | 166 | /* |
5306892a EB |
167 | * Starting at the leaf level, ascend the tree saving hash blocks along |
168 | * the way until we find a hash block that has already been verified, or | |
169 | * until we reach the root. | |
8a1d0f9c EB |
170 | */ |
171 | for (level = 0; level < params->num_levels; level++) { | |
5306892a EB |
172 | unsigned long next_hidx; |
173 | unsigned long hblock_idx; | |
174 | pgoff_t hpage_idx; | |
175 | unsigned int hblock_offset_in_page; | |
8a1d0f9c EB |
176 | unsigned int hoffset; |
177 | struct page *hpage; | |
178 | ||
5306892a EB |
179 | /* |
180 | * The index of the block in the current level; also the index | |
181 | * of that block's hash within the next level. | |
182 | */ | |
183 | next_hidx = hidx >> params->log_arity; | |
184 | ||
185 | /* Index of the hash block in the tree overall */ | |
186 | hblock_idx = params->level_start[level] + next_hidx; | |
187 | ||
188 | /* Index of the hash page in the tree overall */ | |
189 | hpage_idx = hblock_idx >> params->log_blocks_per_page; | |
190 | ||
191 | /* Byte offset of the hash block within the page */ | |
192 | hblock_offset_in_page = | |
193 | (hblock_idx << params->log_blocksize) & ~PAGE_MASK; | |
194 | ||
195 | /* Byte offset of the hash within the page */ | |
196 | hoffset = hblock_offset_in_page + | |
197 | ((hidx << params->log_digestsize) & | |
198 | (params->block_size - 1)); | |
8a1d0f9c | 199 | |
5306892a EB |
200 | hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, |
201 | hpage_idx, level == 0 ? min(max_ra_pages, | |
202 | params->tree_pages - hpage_idx) : 0); | |
8a1d0f9c EB |
203 | if (IS_ERR(hpage)) { |
204 | err = PTR_ERR(hpage); | |
205 | fsverity_err(inode, | |
206 | "Error %d reading Merkle tree page %lu", | |
5306892a | 207 | err, hpage_idx); |
8a1d0f9c EB |
208 | goto out; |
209 | } | |
5306892a | 210 | if (is_hash_block_verified(vi, hpage, hblock_idx)) { |
c987918a | 211 | memcpy_from_page(_want_hash, hpage, hoffset, hsize); |
8a1d0f9c EB |
212 | want_hash = _want_hash; |
213 | put_page(hpage); | |
8a1d0f9c EB |
214 | goto descend; |
215 | } | |
5306892a EB |
216 | hblocks[level].page = hpage; |
217 | hblocks[level].index = hblock_idx; | |
218 | hblocks[level].offset_in_page = hblock_offset_in_page; | |
219 | hblocks[level].hoffset = hoffset; | |
220 | hidx = next_hidx; | |
8a1d0f9c EB |
221 | } |
222 | ||
223 | want_hash = vi->root_hash; | |
8a1d0f9c | 224 | descend: |
f45555bf | 225 | /* Descend the tree verifying hash blocks. */ |
8a1d0f9c | 226 | for (; level > 0; level--) { |
5306892a EB |
227 | struct page *hpage = hblocks[level - 1].page; |
228 | unsigned long hblock_idx = hblocks[level - 1].index; | |
229 | unsigned int hblock_offset_in_page = | |
230 | hblocks[level - 1].offset_in_page; | |
231 | unsigned int hoffset = hblocks[level - 1].hoffset; | |
8a1d0f9c | 232 | |
5306892a EB |
233 | err = fsverity_hash_block(params, inode, req, hpage, |
234 | hblock_offset_in_page, real_hash); | |
8a1d0f9c EB |
235 | if (err) |
236 | goto out; | |
5306892a | 237 | err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1); |
8a1d0f9c EB |
238 | if (err) |
239 | goto out; | |
5306892a EB |
240 | /* |
241 | * Mark the hash block as verified. This must be atomic and | |
242 | * idempotent, as the same hash block might be verified by | |
243 | * multiple threads concurrently. | |
244 | */ | |
245 | if (vi->hash_block_verified) | |
246 | set_bit(hblock_idx, vi->hash_block_verified); | |
247 | else | |
248 | SetPageChecked(hpage); | |
c987918a | 249 | memcpy_from_page(_want_hash, hpage, hoffset, hsize); |
8a1d0f9c EB |
250 | want_hash = _want_hash; |
251 | put_page(hpage); | |
8a1d0f9c EB |
252 | } |
253 | ||
f45555bf | 254 | /* Finally, verify the data block. */ |
5306892a EB |
255 | err = fsverity_hash_block(params, inode, req, data_page, |
256 | dblock_offset_in_page, real_hash); | |
8a1d0f9c EB |
257 | if (err) |
258 | goto out; | |
5306892a | 259 | err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1); |
8a1d0f9c EB |
260 | out: |
261 | for (; level > 0; level--) | |
5306892a | 262 | put_page(hblocks[level - 1].page); |
8a1d0f9c EB |
263 | |
264 | return err == 0; | |
265 | } | |
266 | ||
5306892a EB |
267 | static bool |
268 | verify_data_blocks(struct inode *inode, struct fsverity_info *vi, | |
5d0f0e57 EB |
269 | struct ahash_request *req, struct folio *data_folio, |
270 | size_t len, size_t offset, unsigned long max_ra_pages) | |
5306892a EB |
271 | { |
272 | const unsigned int block_size = vi->tree_params.block_size; | |
5d0f0e57 | 273 | u64 pos = (u64)data_folio->index << PAGE_SHIFT; |
5306892a EB |
274 | |
275 | if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) | |
276 | return false; | |
5d0f0e57 EB |
277 | if (WARN_ON_ONCE(!folio_test_locked(data_folio) || |
278 | folio_test_uptodate(data_folio))) | |
5306892a EB |
279 | return false; |
280 | do { | |
5d0f0e57 EB |
281 | struct page *data_page = |
282 | folio_page(data_folio, offset >> PAGE_SHIFT); | |
283 | ||
284 | if (!verify_data_block(inode, vi, req, data_page, pos + offset, | |
285 | offset & ~PAGE_MASK, max_ra_pages)) | |
5306892a EB |
286 | return false; |
287 | offset += block_size; | |
288 | len -= block_size; | |
289 | } while (len); | |
290 | return true; | |
291 | } | |
292 | ||
8a1d0f9c | 293 | /** |
5d0f0e57 EB |
294 | * fsverity_verify_blocks() - verify data in a folio |
295 | * @folio: the folio containing the data to verify | |
296 | * @len: the length of the data to verify in the folio | |
297 | * @offset: the offset of the data to verify in the folio | |
8a1d0f9c | 298 | * |
5306892a | 299 | * Verify data that has just been read from a verity file. The data must be |
5d0f0e57 | 300 | * located in a pagecache folio that is still locked and not yet uptodate. The |
5306892a | 301 | * length and offset of the data must be Merkle tree block size aligned. |
8a1d0f9c | 302 | * |
5306892a | 303 | * Return: %true if the data is valid, else %false. |
8a1d0f9c | 304 | */ |
5d0f0e57 | 305 | bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) |
8a1d0f9c | 306 | { |
5d0f0e57 | 307 | struct inode *inode = folio->mapping->host; |
5306892a | 308 | struct fsverity_info *vi = inode->i_verity_info; |
8a1d0f9c EB |
309 | struct ahash_request *req; |
310 | bool valid; | |
311 | ||
439bea10 EB |
312 | /* This allocation never fails, since it's mempool-backed. */ |
313 | req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); | |
8a1d0f9c | 314 | |
5d0f0e57 | 315 | valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0); |
8a1d0f9c | 316 | |
439bea10 | 317 | fsverity_free_hash_request(vi->tree_params.hash_alg, req); |
8a1d0f9c EB |
318 | |
319 | return valid; | |
320 | } | |
5306892a | 321 | EXPORT_SYMBOL_GPL(fsverity_verify_blocks); |
8a1d0f9c EB |
322 | |
323 | #ifdef CONFIG_BLOCK | |
324 | /** | |
325 | * fsverity_verify_bio() - verify a 'read' bio that has just completed | |
6377a38b | 326 | * @bio: the bio to verify |
8a1d0f9c | 327 | * |
5306892a EB |
328 | * Verify the bio's data against the file's Merkle tree. All bio data segments |
329 | * must be aligned to the file's Merkle tree block size. If any data fails | |
330 | * verification, then bio->bi_status is set to an error status. | |
8a1d0f9c | 331 | * |
704528d8 | 332 | * This is a helper function for use by the ->readahead() method of filesystems |
8a1d0f9c EB |
333 | * that issue bios to read data directly into the page cache. Filesystems that |
334 | * populate the page cache without issuing bios (e.g. non block-based | |
335 | * filesystems) must instead call fsverity_verify_page() directly on each page. | |
336 | * All filesystems must also call fsverity_verify_page() on holes. | |
337 | */ | |
338 | void fsverity_verify_bio(struct bio *bio) | |
339 | { | |
340 | struct inode *inode = bio_first_page_all(bio)->mapping->host; | |
5306892a | 341 | struct fsverity_info *vi = inode->i_verity_info; |
8a1d0f9c | 342 | struct ahash_request *req; |
5d0f0e57 | 343 | struct folio_iter fi; |
fd39073d | 344 | unsigned long max_ra_pages = 0; |
8a1d0f9c | 345 | |
439bea10 | 346 | /* This allocation never fails, since it's mempool-backed. */ |
9098f36b | 347 | req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); |
8a1d0f9c | 348 | |
fd39073d EB |
349 | if (bio->bi_opf & REQ_RAHEAD) { |
350 | /* | |
351 | * If this bio is for data readahead, then we also do readahead | |
352 | * of the first (largest) level of the Merkle tree. Namely, | |
353 | * when a Merkle tree page is read, we also try to piggy-back on | |
354 | * some additional pages -- up to 1/4 the number of data pages. | |
355 | * | |
356 | * This improves sequential read performance, as it greatly | |
357 | * reduces the number of I/O requests made to the Merkle tree. | |
358 | */ | |
9098f36b | 359 | max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); |
fd39073d EB |
360 | } |
361 | ||
5d0f0e57 EB |
362 | bio_for_each_folio_all(fi, bio) { |
363 | if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length, | |
364 | fi.offset, max_ra_pages)) { | |
98dc08ba EB |
365 | bio->bi_status = BLK_STS_IOERR; |
366 | break; | |
367 | } | |
8a1d0f9c EB |
368 | } |
369 | ||
9098f36b | 370 | fsverity_free_hash_request(vi->tree_params.hash_alg, req); |
8a1d0f9c EB |
371 | } |
372 | EXPORT_SYMBOL_GPL(fsverity_verify_bio); | |
373 | #endif /* CONFIG_BLOCK */ | |
374 | ||
/**
 * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue
 * @work: the work to enqueue
 *
 * Enqueue verification work for asynchronous processing.  The work is queued
 * on fsverity_read_workqueue; the return value of queue_work() is not
 * reported to the caller.
 */
void fsverity_enqueue_verify_work(struct work_struct *work)
{
	queue_work(fsverity_read_workqueue, work);
}
EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work);
386 | ||
387 | int __init fsverity_init_workqueue(void) | |
388 | { | |
389 | /* | |
f959325e NH |
390 | * Use a high-priority workqueue to prioritize verification work, which |
391 | * blocks reads from completing, over regular application tasks. | |
8a1d0f9c | 392 | * |
f959325e NH |
393 | * For performance reasons, don't use an unbound workqueue. Using an |
394 | * unbound workqueue for crypto operations causes excessive scheduler | |
395 | * latency on ARM64. | |
8a1d0f9c EB |
396 | */ |
397 | fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue", | |
f959325e | 398 | WQ_HIGHPRI, |
8a1d0f9c EB |
399 | num_online_cpus()); |
400 | if (!fsverity_read_workqueue) | |
401 | return -ENOMEM; | |
402 | return 0; | |
403 | } | |
432434c9 EB |
404 | |
/*
 * Destroy the read verification workqueue and reset the global pointer to
 * NULL so that no stale reference to the destroyed queue remains.
 */
void __init fsverity_exit_workqueue(void)
{
	destroy_workqueue(fsverity_read_workqueue);
	fsverity_read_workqueue = NULL;
}