Commit | Line | Data |
---|---|---|
02e83f46 DW |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | #include <linux/slab.h> | |
3 | #include <linux/stat.h> | |
4 | #include <linux/sched/xacct.h> | |
5 | #include <linux/fcntl.h> | |
6 | #include <linux/file.h> | |
7 | #include <linux/uio.h> | |
8 | #include <linux/fsnotify.h> | |
9 | #include <linux/security.h> | |
10 | #include <linux/export.h> | |
11 | #include <linux/syscalls.h> | |
12 | #include <linux/pagemap.h> | |
13 | #include <linux/splice.h> | |
14 | #include <linux/compat.h> | |
15 | #include <linux/mount.h> | |
16 | #include <linux/fs.h> | |
17 | #include "internal.h" | |
18 | ||
19 | #include <linux/uaccess.h> | |
20 | #include <asm/unistd.h> | |
21 | ||
22 | /* | |
23 | * Performs necessary checks before doing a clone. | |
24 | * | |
25 | * Can adjust amount of bytes to clone via @req_count argument. | |
26 | * Returns appropriate error code that caller should return or | |
27 | * zero in case the clone should be allowed. | |
28 | */ | |
1b2c54d6 DW |
29 | static int generic_remap_checks(struct file *file_in, loff_t pos_in, |
30 | struct file *file_out, loff_t pos_out, | |
31 | loff_t *req_count, unsigned int remap_flags) | |
02e83f46 DW |
32 | { |
33 | struct inode *inode_in = file_in->f_mapping->host; | |
34 | struct inode *inode_out = file_out->f_mapping->host; | |
35 | uint64_t count = *req_count; | |
36 | uint64_t bcount; | |
37 | loff_t size_in, size_out; | |
38 | loff_t bs = inode_out->i_sb->s_blocksize; | |
39 | int ret; | |
40 | ||
41 | /* The start of both ranges must be aligned to an fs block. */ | |
42 | if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) | |
43 | return -EINVAL; | |
44 | ||
45 | /* Ensure offsets don't wrap. */ | |
46 | if (pos_in + count < pos_in || pos_out + count < pos_out) | |
47 | return -EINVAL; | |
48 | ||
49 | size_in = i_size_read(inode_in); | |
50 | size_out = i_size_read(inode_out); | |
51 | ||
52 | /* Dedupe requires both ranges to be within EOF. */ | |
53 | if ((remap_flags & REMAP_FILE_DEDUP) && | |
54 | (pos_in >= size_in || pos_in + count > size_in || | |
55 | pos_out >= size_out || pos_out + count > size_out)) | |
56 | return -EINVAL; | |
57 | ||
58 | /* Ensure the infile range is within the infile. */ | |
59 | if (pos_in >= size_in) | |
60 | return -EINVAL; | |
61 | count = min(count, size_in - (uint64_t)pos_in); | |
62 | ||
63 | ret = generic_write_check_limits(file_out, pos_out, &count); | |
64 | if (ret) | |
65 | return ret; | |
66 | ||
67 | /* | |
68 | * If the user wanted us to link to the infile's EOF, round up to the | |
69 | * next block boundary for this check. | |
70 | * | |
71 | * Otherwise, make sure the count is also block-aligned, having | |
72 | * already confirmed the starting offsets' block alignment. | |
73 | */ | |
74 | if (pos_in + count == size_in) { | |
75 | bcount = ALIGN(size_in, bs) - pos_in; | |
76 | } else { | |
77 | if (!IS_ALIGNED(count, bs)) | |
78 | count = ALIGN_DOWN(count, bs); | |
79 | bcount = count; | |
80 | } | |
81 | ||
82 | /* Don't allow overlapped cloning within the same file. */ | |
83 | if (inode_in == inode_out && | |
84 | pos_out + bcount > pos_in && | |
85 | pos_out < pos_in + bcount) | |
86 | return -EINVAL; | |
87 | ||
88 | /* | |
89 | * We shortened the request but the caller can't deal with that, so | |
90 | * bounce the request back to userspace. | |
91 | */ | |
92 | if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) | |
93 | return -EINVAL; | |
94 | ||
95 | *req_count = count; | |
96 | return 0; | |
97 | } | |
1b2c54d6 DW |
98 | |
99 | static int remap_verify_area(struct file *file, loff_t pos, loff_t len, | |
100 | bool write) | |
101 | { | |
1b2c54d6 DW |
102 | if (unlikely(pos < 0 || len < 0)) |
103 | return -EINVAL; | |
104 | ||
105 | if (unlikely((loff_t) (pos + len) < 0)) | |
106 | return -EINVAL; | |
107 | ||
1b2c54d6 DW |
108 | return security_file_permission(file, write ? MAY_WRITE : MAY_READ); |
109 | } | |
110 | ||
111 | /* | |
112 | * Ensure that we don't remap a partial EOF block in the middle of something | |
113 | * else. Assume that the offsets have already been checked for block | |
114 | * alignment. | |
115 | * | |
116 | * For clone we only link a partial EOF block above or at the destination file's | |
117 | * EOF. For deduplication we accept a partial EOF block only if it ends at the | |
118 | * destination file's EOF (can not link it into the middle of a file). | |
119 | * | |
120 | * Shorten the request if possible. | |
121 | */ | |
122 | static int generic_remap_check_len(struct inode *inode_in, | |
123 | struct inode *inode_out, | |
124 | loff_t pos_out, | |
125 | loff_t *len, | |
126 | unsigned int remap_flags) | |
127 | { | |
128 | u64 blkmask = i_blocksize(inode_in) - 1; | |
129 | loff_t new_len = *len; | |
130 | ||
131 | if ((*len & blkmask) == 0) | |
132 | return 0; | |
133 | ||
134 | if (pos_out + *len < i_size_read(inode_out)) | |
135 | new_len &= ~blkmask; | |
136 | ||
137 | if (new_len == *len) | |
138 | return 0; | |
139 | ||
140 | if (remap_flags & REMAP_FILE_CAN_SHORTEN) { | |
141 | *len = new_len; | |
142 | return 0; | |
143 | } | |
144 | ||
145 | return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; | |
146 | } | |
147 | ||
148 | /* Read a page's worth of file data into the page cache. */ | |
338f379c | 149 | static struct folio *vfs_dedupe_get_folio(struct inode *inode, loff_t pos) |
1b2c54d6 | 150 | { |
338f379c | 151 | struct folio *folio; |
1b2c54d6 | 152 | |
338f379c MWO |
153 | folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); |
154 | if (IS_ERR(folio)) | |
155 | return folio; | |
156 | if (!folio_test_uptodate(folio)) { | |
157 | folio_put(folio); | |
1b2c54d6 DW |
158 | return ERR_PTR(-EIO); |
159 | } | |
338f379c | 160 | return folio; |
1b2c54d6 DW |
161 | } |
162 | ||
163 | /* | |
338f379c MWO |
164 | * Lock two folios, ensuring that we lock in offset order if the folios |
165 | * are from the same file. | |
1b2c54d6 | 166 | */ |
338f379c | 167 | static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2) |
1b2c54d6 DW |
168 | { |
169 | /* Always lock in order of increasing index. */ | |
338f379c MWO |
170 | if (folio1->index > folio2->index) |
171 | swap(folio1, folio2); | |
1b2c54d6 | 172 | |
338f379c MWO |
173 | folio_lock(folio1); |
174 | if (folio1 != folio2) | |
175 | folio_lock(folio2); | |
1b2c54d6 DW |
176 | } |
177 | ||
338f379c MWO |
178 | /* Unlock two folios, being careful not to unlock the same folio twice. */ |
179 | static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2) | |
1b2c54d6 | 180 | { |
338f379c MWO |
181 | folio_unlock(folio1); |
182 | if (folio1 != folio2) | |
183 | folio_unlock(folio2); | |
1b2c54d6 DW |
184 | } |
185 | ||
186 | /* | |
187 | * Compare extents of two files to see if they are the same. | |
188 | * Caller must have locked both inodes to prevent write races. | |
189 | */ | |
190 | static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, | |
338f379c | 191 | struct inode *dest, loff_t dstoff, |
1b2c54d6 DW |
192 | loff_t len, bool *is_same) |
193 | { | |
338f379c MWO |
194 | bool same = true; |
195 | int error = -EINVAL; | |
196 | ||
1b2c54d6 | 197 | while (len) { |
338f379c MWO |
198 | struct folio *src_folio, *dst_folio; |
199 | void *src_addr, *dst_addr; | |
200 | loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff), | |
201 | PAGE_SIZE - offset_in_page(dstoff)); | |
202 | ||
1b2c54d6 DW |
203 | cmp_len = min(cmp_len, len); |
204 | if (cmp_len <= 0) | |
205 | goto out_error; | |
206 | ||
338f379c MWO |
207 | src_folio = vfs_dedupe_get_folio(src, srcoff); |
208 | if (IS_ERR(src_folio)) { | |
209 | error = PTR_ERR(src_folio); | |
1b2c54d6 DW |
210 | goto out_error; |
211 | } | |
338f379c MWO |
212 | dst_folio = vfs_dedupe_get_folio(dest, dstoff); |
213 | if (IS_ERR(dst_folio)) { | |
214 | error = PTR_ERR(dst_folio); | |
215 | folio_put(src_folio); | |
1b2c54d6 DW |
216 | goto out_error; |
217 | } | |
218 | ||
338f379c | 219 | vfs_lock_two_folios(src_folio, dst_folio); |
1b2c54d6 DW |
220 | |
221 | /* | |
338f379c | 222 | * Now that we've locked both folios, make sure they're still |
1b2c54d6 DW |
223 | * mapped to the file data we're interested in. If not, |
224 | * someone is invalidating pages on us and we lose. | |
225 | */ | |
338f379c MWO |
226 | if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) || |
227 | src_folio->mapping != src->i_mapping || | |
228 | dst_folio->mapping != dest->i_mapping) { | |
1b2c54d6 DW |
229 | same = false; |
230 | goto unlock; | |
231 | } | |
232 | ||
338f379c MWO |
233 | src_addr = kmap_local_folio(src_folio, |
234 | offset_in_folio(src_folio, srcoff)); | |
235 | dst_addr = kmap_local_folio(dst_folio, | |
236 | offset_in_folio(dst_folio, dstoff)); | |
1b2c54d6 | 237 | |
338f379c MWO |
238 | flush_dcache_folio(src_folio); |
239 | flush_dcache_folio(dst_folio); | |
1b2c54d6 | 240 | |
338f379c | 241 | if (memcmp(src_addr, dst_addr, cmp_len)) |
1b2c54d6 DW |
242 | same = false; |
243 | ||
338f379c MWO |
244 | kunmap_local(dst_addr); |
245 | kunmap_local(src_addr); | |
1b2c54d6 | 246 | unlock: |
338f379c MWO |
247 | vfs_unlock_two_folios(src_folio, dst_folio); |
248 | folio_put(dst_folio); | |
249 | folio_put(src_folio); | |
1b2c54d6 DW |
250 | |
251 | if (!same) | |
252 | break; | |
253 | ||
254 | srcoff += cmp_len; | |
338f379c | 255 | dstoff += cmp_len; |
1b2c54d6 DW |
256 | len -= cmp_len; |
257 | } | |
258 | ||
259 | *is_same = same; | |
260 | return 0; | |
261 | ||
262 | out_error: | |
263 | return error; | |
264 | } | |
265 | ||
266 | /* | |
267 | * Check that the two inodes are eligible for cloning, the ranges make | |
268 | * sense, and then flush all dirty data. Caller must ensure that the | |
269 | * inodes have been locked against any other modifications. | |
270 | * | |
271 | * If there's an error, then the usual negative error code is returned. | |
272 | * Otherwise returns 0 with *len set to the request length. | |
273 | */ | |
274 | int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, | |
275 | struct file *file_out, loff_t pos_out, | |
276 | loff_t *len, unsigned int remap_flags) | |
277 | { | |
278 | struct inode *inode_in = file_inode(file_in); | |
279 | struct inode *inode_out = file_inode(file_out); | |
280 | bool same_inode = (inode_in == inode_out); | |
281 | int ret; | |
282 | ||
283 | /* Don't touch certain kinds of inodes */ | |
284 | if (IS_IMMUTABLE(inode_out)) | |
285 | return -EPERM; | |
286 | ||
287 | if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) | |
288 | return -ETXTBSY; | |
289 | ||
290 | /* Don't reflink dirs, pipes, sockets... */ | |
291 | if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) | |
292 | return -EISDIR; | |
293 | if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) | |
294 | return -EINVAL; | |
295 | ||
296 | /* Zero length dedupe exits immediately; reflink goes to EOF. */ | |
297 | if (*len == 0) { | |
298 | loff_t isize = i_size_read(inode_in); | |
299 | ||
300 | if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) | |
301 | return 0; | |
302 | if (pos_in > isize) | |
303 | return -EINVAL; | |
304 | *len = isize - pos_in; | |
305 | if (*len == 0) | |
306 | return 0; | |
307 | } | |
308 | ||
309 | /* Check that we don't violate system file offset limits. */ | |
310 | ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, | |
311 | remap_flags); | |
312 | if (ret) | |
313 | return ret; | |
314 | ||
315 | /* Wait for the completion of any pending IOs on both files */ | |
316 | inode_dio_wait(inode_in); | |
317 | if (!same_inode) | |
318 | inode_dio_wait(inode_out); | |
319 | ||
320 | ret = filemap_write_and_wait_range(inode_in->i_mapping, | |
321 | pos_in, pos_in + *len - 1); | |
322 | if (ret) | |
323 | return ret; | |
324 | ||
325 | ret = filemap_write_and_wait_range(inode_out->i_mapping, | |
326 | pos_out, pos_out + *len - 1); | |
327 | if (ret) | |
328 | return ret; | |
329 | ||
330 | /* | |
331 | * Check that the extents are the same. | |
332 | */ | |
333 | if (remap_flags & REMAP_FILE_DEDUP) { | |
334 | bool is_same = false; | |
335 | ||
336 | ret = vfs_dedupe_file_range_compare(inode_in, pos_in, | |
337 | inode_out, pos_out, *len, &is_same); | |
338 | if (ret) | |
339 | return ret; | |
340 | if (!is_same) | |
341 | return -EBADE; | |
342 | } | |
343 | ||
344 | ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, | |
345 | remap_flags); | |
346 | if (ret) | |
347 | return ret; | |
348 | ||
349 | /* If can't alter the file contents, we're done. */ | |
350 | if (!(remap_flags & REMAP_FILE_DEDUP)) | |
351 | ret = file_modified(file_out); | |
352 | ||
353 | return ret; | |
354 | } | |
355 | EXPORT_SYMBOL(generic_remap_file_range_prep); | |
356 | ||
357 | loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, | |
358 | struct file *file_out, loff_t pos_out, | |
359 | loff_t len, unsigned int remap_flags) | |
360 | { | |
361 | loff_t ret; | |
362 | ||
363 | WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); | |
364 | ||
365 | /* | |
366 | * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on | |
367 | * the same mount. Practically, they only need to be on the same file | |
368 | * system. | |
369 | */ | |
370 | if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) | |
371 | return -EXDEV; | |
372 | ||
373 | ret = generic_file_rw_checks(file_in, file_out); | |
374 | if (ret < 0) | |
375 | return ret; | |
376 | ||
377 | if (!file_in->f_op->remap_file_range) | |
378 | return -EOPNOTSUPP; | |
379 | ||
380 | ret = remap_verify_area(file_in, pos_in, len, false); | |
381 | if (ret) | |
382 | return ret; | |
383 | ||
384 | ret = remap_verify_area(file_out, pos_out, len, true); | |
385 | if (ret) | |
386 | return ret; | |
387 | ||
388 | ret = file_in->f_op->remap_file_range(file_in, pos_in, | |
389 | file_out, pos_out, len, remap_flags); | |
390 | if (ret < 0) | |
391 | return ret; | |
392 | ||
393 | fsnotify_access(file_in); | |
394 | fsnotify_modify(file_out); | |
395 | return ret; | |
396 | } | |
397 | EXPORT_SYMBOL(do_clone_file_range); | |
398 | ||
399 | loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, | |
400 | struct file *file_out, loff_t pos_out, | |
401 | loff_t len, unsigned int remap_flags) | |
402 | { | |
403 | loff_t ret; | |
404 | ||
405 | file_start_write(file_out); | |
406 | ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, | |
407 | remap_flags); | |
408 | file_end_write(file_out); | |
409 | ||
410 | return ret; | |
411 | } | |
412 | EXPORT_SYMBOL(vfs_clone_file_range); | |
413 | ||
414 | /* Check whether we are allowed to dedupe the destination file */ | |
415 | static bool allow_file_dedupe(struct file *file) | |
416 | { | |
0f5d220b CB |
417 | struct user_namespace *mnt_userns = file_mnt_user_ns(file); |
418 | struct inode *inode = file_inode(file); | |
419 | ||
1b2c54d6 DW |
420 | if (capable(CAP_SYS_ADMIN)) |
421 | return true; | |
422 | if (file->f_mode & FMODE_WRITE) | |
423 | return true; | |
0f5d220b | 424 | if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) |
1b2c54d6 | 425 | return true; |
0f5d220b | 426 | if (!inode_permission(mnt_userns, inode, MAY_WRITE)) |
1b2c54d6 DW |
427 | return true; |
428 | return false; | |
429 | } | |
430 | ||
431 | loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, | |
432 | struct file *dst_file, loff_t dst_pos, | |
433 | loff_t len, unsigned int remap_flags) | |
434 | { | |
435 | loff_t ret; | |
436 | ||
437 | WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | | |
438 | REMAP_FILE_CAN_SHORTEN)); | |
439 | ||
440 | ret = mnt_want_write_file(dst_file); | |
441 | if (ret) | |
442 | return ret; | |
443 | ||
3078d85c MS |
444 | /* |
445 | * This is redundant if called from vfs_dedupe_file_range(), but other | |
446 | * callers need it and it's not performance sesitive... | |
447 | */ | |
448 | ret = remap_verify_area(src_file, src_pos, len, false); | |
449 | if (ret) | |
450 | goto out_drop_write; | |
451 | ||
1b2c54d6 | 452 | ret = remap_verify_area(dst_file, dst_pos, len, true); |
3078d85c | 453 | if (ret) |
1b2c54d6 DW |
454 | goto out_drop_write; |
455 | ||
456 | ret = -EPERM; | |
457 | if (!allow_file_dedupe(dst_file)) | |
458 | goto out_drop_write; | |
459 | ||
460 | ret = -EXDEV; | |
461 | if (src_file->f_path.mnt != dst_file->f_path.mnt) | |
462 | goto out_drop_write; | |
463 | ||
464 | ret = -EISDIR; | |
465 | if (S_ISDIR(file_inode(dst_file)->i_mode)) | |
466 | goto out_drop_write; | |
467 | ||
468 | ret = -EINVAL; | |
469 | if (!dst_file->f_op->remap_file_range) | |
470 | goto out_drop_write; | |
471 | ||
472 | if (len == 0) { | |
473 | ret = 0; | |
474 | goto out_drop_write; | |
475 | } | |
476 | ||
477 | ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, | |
478 | dst_pos, len, remap_flags | REMAP_FILE_DEDUP); | |
479 | out_drop_write: | |
480 | mnt_drop_write_file(dst_file); | |
481 | ||
482 | return ret; | |
483 | } | |
484 | EXPORT_SYMBOL(vfs_dedupe_file_range_one); | |
485 | ||
486 | int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) | |
487 | { | |
488 | struct file_dedupe_range_info *info; | |
489 | struct inode *src = file_inode(file); | |
490 | u64 off; | |
491 | u64 len; | |
492 | int i; | |
493 | int ret; | |
494 | u16 count = same->dest_count; | |
495 | loff_t deduped; | |
496 | ||
497 | if (!(file->f_mode & FMODE_READ)) | |
498 | return -EINVAL; | |
499 | ||
500 | if (same->reserved1 || same->reserved2) | |
501 | return -EINVAL; | |
502 | ||
503 | off = same->src_offset; | |
504 | len = same->src_length; | |
505 | ||
506 | if (S_ISDIR(src->i_mode)) | |
507 | return -EISDIR; | |
508 | ||
509 | if (!S_ISREG(src->i_mode)) | |
510 | return -EINVAL; | |
511 | ||
512 | if (!file->f_op->remap_file_range) | |
513 | return -EOPNOTSUPP; | |
514 | ||
515 | ret = remap_verify_area(file, off, len, false); | |
516 | if (ret < 0) | |
517 | return ret; | |
518 | ret = 0; | |
519 | ||
520 | if (off + len > i_size_read(src)) | |
521 | return -EINVAL; | |
522 | ||
523 | /* Arbitrary 1G limit on a single dedupe request, can be raised. */ | |
524 | len = min_t(u64, len, 1 << 30); | |
525 | ||
526 | /* pre-format output fields to sane values */ | |
527 | for (i = 0; i < count; i++) { | |
528 | same->info[i].bytes_deduped = 0ULL; | |
529 | same->info[i].status = FILE_DEDUPE_RANGE_SAME; | |
530 | } | |
531 | ||
532 | for (i = 0, info = same->info; i < count; i++, info++) { | |
533 | struct fd dst_fd = fdget(info->dest_fd); | |
534 | struct file *dst_file = dst_fd.file; | |
535 | ||
536 | if (!dst_file) { | |
537 | info->status = -EBADF; | |
538 | goto next_loop; | |
539 | } | |
540 | ||
541 | if (info->reserved) { | |
542 | info->status = -EINVAL; | |
543 | goto next_fdput; | |
544 | } | |
545 | ||
546 | deduped = vfs_dedupe_file_range_one(file, off, dst_file, | |
547 | info->dest_offset, len, | |
548 | REMAP_FILE_CAN_SHORTEN); | |
549 | if (deduped == -EBADE) | |
550 | info->status = FILE_DEDUPE_RANGE_DIFFERS; | |
551 | else if (deduped < 0) | |
552 | info->status = deduped; | |
553 | else | |
554 | info->bytes_deduped = len; | |
555 | ||
556 | next_fdput: | |
557 | fdput(dst_fd); | |
558 | next_loop: | |
559 | if (fatal_signal_pending(current)) | |
560 | break; | |
561 | } | |
562 | return ret; | |
563 | } | |
564 | EXPORT_SYMBOL(vfs_dedupe_file_range); |