Commit | Line | Data |
---|---|---|
3993baeb DW |
1 | /* |
2 | * Copyright (C) 2016 Oracle. All Rights Reserved. | |
3 | * | |
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License | |
8 | * as published by the Free Software Foundation; either version 2 | |
9 | * of the License, or (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it would be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write the Free Software Foundation, | |
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | |
19 | */ | |
20 | #include "xfs.h" | |
21 | #include "xfs_fs.h" | |
22 | #include "xfs_shared.h" | |
23 | #include "xfs_format.h" | |
24 | #include "xfs_log_format.h" | |
25 | #include "xfs_trans_resv.h" | |
26 | #include "xfs_mount.h" | |
27 | #include "xfs_defer.h" | |
28 | #include "xfs_da_format.h" | |
29 | #include "xfs_da_btree.h" | |
30 | #include "xfs_inode.h" | |
31 | #include "xfs_trans.h" | |
32 | #include "xfs_inode_item.h" | |
33 | #include "xfs_bmap.h" | |
34 | #include "xfs_bmap_util.h" | |
35 | #include "xfs_error.h" | |
36 | #include "xfs_dir2.h" | |
37 | #include "xfs_dir2_priv.h" | |
38 | #include "xfs_ioctl.h" | |
39 | #include "xfs_trace.h" | |
40 | #include "xfs_log.h" | |
41 | #include "xfs_icache.h" | |
42 | #include "xfs_pnfs.h" | |
43 | #include "xfs_refcount_btree.h" | |
44 | #include "xfs_refcount.h" | |
45 | #include "xfs_bmap_btree.h" | |
46 | #include "xfs_trans_space.h" | |
47 | #include "xfs_bit.h" | |
48 | #include "xfs_alloc.h" | |
49 | #include "xfs_quota_defs.h" | |
50 | #include "xfs_quota.h" | |
51 | #include "xfs_btree.h" | |
52 | #include "xfs_bmap_btree.h" | |
53 | #include "xfs_reflink.h" | |
2a06705c | 54 | #include "xfs_iomap.h" |
3993baeb DW |
55 | |
56 | /* | |
57 | * Copy on Write of Shared Blocks | |
58 | * | |
59 | * XFS must preserve "the usual" file semantics even when two files share | |
60 | * the same physical blocks. This means that a write to one file must not | |
61 | * alter the blocks in a different file; the way that we'll do that is | |
62 | * through the use of a copy-on-write mechanism. At a high level, that | |
63 | * means that when we want to write to a shared block, we allocate a new | |
64 | * block, write the data to the new block, and if that succeeds we map the | |
65 | * new block into the file. | |
66 | * | |
67 | * XFS provides a "delayed allocation" mechanism that defers the allocation | |
68 | * of disk blocks to dirty-but-not-yet-mapped file blocks as long as | |
69 | * possible. This reduces fragmentation by enabling the filesystem to ask | |
70 | * for bigger chunks less often, which is exactly what we want for CoW. | |
71 | * | |
72 | * The delalloc mechanism begins when the kernel wants to make a block | |
73 | * writable (write_begin or page_mkwrite). If the offset is not mapped, we | |
74 | * create a delalloc mapping, which is a regular in-core extent, but without | |
75 | * a real startblock. (For delalloc mappings, the startblock encodes both | |
76 | * a flag that this is a delalloc mapping, and a worst-case estimate of how | |
77 | * many blocks might be required to put the mapping into the BMBT.) delalloc | |
78 | * mappings are a reservation against the free space in the filesystem; | |
79 | * adjacent mappings can also be combined into fewer larger mappings. | |
80 | * | |
81 | * When dirty pages are being written out (typically in writepage), the | |
82 | * delalloc reservations are converted into real mappings by allocating | |
83 | * blocks and replacing the delalloc mapping with real ones. A delalloc | |
84 | * mapping can be replaced by several real ones if the free space is | |
85 | * fragmented. | |
86 | * | |
87 | * We want to adapt the delalloc mechanism for copy-on-write, since the | |
88 | * write paths are similar. The first two steps (creating the reservation | |
89 | * and allocating the blocks) are exactly the same as delalloc except that | |
90 | * the mappings must be stored in a separate CoW fork because we do not want | |
91 | * to disturb the mapping in the data fork until we're sure that the write | |
92 | * succeeded. IO completion in this case is the process of removing the old | |
93 | * mapping from the data fork and moving the new mapping from the CoW fork to | |
94 | * the data fork. This will be discussed shortly. | |
95 | * | |
96 | * For now, unaligned directio writes will be bounced back to the page cache. | |
97 | * Block-aligned directio writes will use the same mechanism as buffered | |
98 | * writes. | |
99 | * | |
100 | * CoW remapping must be done after the data block write completes, | |
101 | * because we don't want to destroy the old data fork map until we're sure | |
102 | * the new block has been written. Since the new mappings are kept in a | |
103 | * separate fork, we can simply iterate these mappings to find the ones | |
104 | * that cover the file blocks that we just CoW'd. For each extent, simply | |
105 | * unmap the corresponding range in the data fork, map the new range into | |
106 | * the data fork, and remove the extent from the CoW fork. | |
107 | * | |
108 | * Since the remapping operation can be applied to an arbitrary file | |
109 | * range, we record the need for the remap step as a flag in the ioend | |
110 | * instead of declaring a new IO type. This is required for direct io | |
111 | * because we only have ioend for the whole dio, and we have to be able to | |
112 | * remember the presence of unwritten blocks and CoW blocks with a single | |
113 | * ioend structure. Better yet, the more ground we can cover with one | |
114 | * ioend, the better. | |
115 | */ | |
2a06705c DW |
116 | |
117 | /* | |
118 | * Given an AG extent, find the lowest-numbered run of shared blocks | |
119 | * within that range and return the range in fbno/flen. If | |
120 | * find_end_of_shared is true, return the longest contiguous extent of | |
121 | * shared blocks. If there are no shared extents, fbno and flen will | |
122 | * be set to NULLAGBLOCK and 0, respectively. | |
123 | */ | |
124 | int | |
125 | xfs_reflink_find_shared( | |
126 | struct xfs_mount *mp, | |
127 | xfs_agnumber_t agno, | |
128 | xfs_agblock_t agbno, | |
129 | xfs_extlen_t aglen, | |
130 | xfs_agblock_t *fbno, | |
131 | xfs_extlen_t *flen, | |
132 | bool find_end_of_shared) | |
133 | { | |
134 | struct xfs_buf *agbp; | |
135 | struct xfs_btree_cur *cur; | |
136 | int error; | |
137 | ||
138 | error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); | |
139 | if (error) | |
140 | return error; | |
141 | ||
142 | cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); | |
143 | ||
144 | error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, | |
145 | find_end_of_shared); | |
146 | ||
147 | xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); | |
148 | ||
149 | xfs_buf_relse(agbp); | |
150 | return error; | |
151 | } | |
152 | ||
153 | /* | |
154 | * Trim the mapping to the next block where there's a change in the | |
155 | * shared/unshared status. More specifically, this means that we | |
156 | * find the lowest-numbered extent of shared blocks that coincides with | |
157 | * the given block mapping. If the shared extent overlaps the start of | |
158 | * the mapping, trim the mapping to the end of the shared extent. If | |
159 | * the shared region intersects the mapping, trim the mapping to the | |
160 | * start of the shared extent. If there are no shared regions that | |
161 | * overlap, just return the original extent. | |
162 | */ | |
163 | int | |
164 | xfs_reflink_trim_around_shared( | |
165 | struct xfs_inode *ip, | |
166 | struct xfs_bmbt_irec *irec, | |
167 | bool *shared, | |
168 | bool *trimmed) | |
169 | { | |
170 | xfs_agnumber_t agno; | |
171 | xfs_agblock_t agbno; | |
172 | xfs_extlen_t aglen; | |
173 | xfs_agblock_t fbno; | |
174 | xfs_extlen_t flen; | |
175 | int error = 0; | |
176 | ||
177 | /* Holes, unwritten, and delalloc extents cannot be shared */ | |
178 | if (!xfs_is_reflink_inode(ip) || | |
179 | ISUNWRITTEN(irec) || | |
180 | irec->br_startblock == HOLESTARTBLOCK || | |
181 | irec->br_startblock == DELAYSTARTBLOCK) { | |
182 | *shared = false; | |
183 | return 0; | |
184 | } | |
185 | ||
186 | trace_xfs_reflink_trim_around_shared(ip, irec); | |
187 | ||
188 | agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); | |
189 | agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); | |
190 | aglen = irec->br_blockcount; | |
191 | ||
192 | error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, | |
193 | aglen, &fbno, &flen, true); | |
194 | if (error) | |
195 | return error; | |
196 | ||
197 | *shared = *trimmed = false; | |
198 | if (fbno == NULLAGBLOCK) { | |
199 | /* No shared blocks at all. */ | |
200 | return 0; | |
201 | } else if (fbno == agbno) { | |
202 | /* | |
203 | * The start of this extent is shared. Truncate the | |
204 | * mapping at the end of the shared region so that a | |
205 | * subsequent iteration starts at the start of the | |
206 | * unshared region. | |
207 | */ | |
208 | irec->br_blockcount = flen; | |
209 | *shared = true; | |
210 | if (flen != aglen) | |
211 | *trimmed = true; | |
212 | return 0; | |
213 | } else { | |
214 | /* | |
215 | * There's a shared extent midway through this extent. | |
216 | * Truncate the mapping at the start of the shared | |
217 | * extent so that a subsequent iteration starts at the | |
218 | * start of the shared region. | |
219 | */ | |
220 | irec->br_blockcount = fbno - agbno; | |
221 | *trimmed = true; | |
222 | return 0; | |
223 | } | |
224 | } | |
225 | ||
/*
 * Create a CoW reservation for a range of blocks within a file.
 *
 * Reserves delalloc blocks in the CoW fork for the shared part of the data
 * fork extent containing *offset_fsb.  On success, *offset_fsb is advanced
 * to the first block past the range handled (the end of an existing CoW
 * reservation, or the end of the data fork extent examined), so callers can
 * loop until the whole byte range is covered.  Caller holds the ILOCK.
 *
 * Returns 0 on success or a negative errno.
 */
static int
__xfs_reflink_reserve_cow(
	struct xfs_inode	*ip,
	xfs_fileoff_t		*offset_fsb,	/* in/out: advanced past handled range */
	xfs_fileoff_t		end_fsb)
{
	struct xfs_bmbt_irec	got, prev, imap;
	xfs_fileoff_t		orig_end_fsb;
	int			nimaps, eof = 0, error = 0;
	bool			shared = false, trimmed = false;
	xfs_extnum_t		idx;

	/* Already reserved?  Skip the refcount btree access. */
	xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
			&got, &prev);
	if (!eof && got.br_startoff <= *offset_fsb) {
		/* Existing CoW reservation covers us; just skip past it. */
		end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
		trace_xfs_reflink_cow_found(ip, &got);
		goto done;
	}

	/* Read extent from the source file. */
	nimaps = 1;
	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
			&imap, &nimaps, 0);
	if (error)
		goto out_unlock;
	ASSERT(nimaps == 1);

	/* Trim the mapping to the nearest shared extent boundary. */
	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
	if (error)
		goto out_unlock;

	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;

	/* Not shared?  Just report the (potentially capped) extent. */
	if (!shared)
		goto done;

	/*
	 * Fork all the shared blocks from our write offset until the end of
	 * the extent.
	 */
	error = xfs_qm_dqattach_locked(ip, 0);
	if (error)
		goto out_unlock;

retry:
	/* eof/idx/got/prev carry state from the search above. */
	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
			end_fsb - *offset_fsb, &got,
			&prev, &idx, eof);
	switch (error) {
	case 0:
		break;
	case -ENOSPC:
	case -EDQUOT:
		/* retry without any preallocation */
		trace_xfs_reflink_cow_enospc(ip, &imap);
		/*
		 * NOTE(review): in all paths visible here end_fsb always
		 * equals orig_end_fsb, so this retry looks vestigial unless
		 * speculative preallocation extends end_fsb elsewhere —
		 * confirm against the full file.
		 */
		if (end_fsb != orig_end_fsb) {
			end_fsb = orig_end_fsb;
			goto retry;
		}
		/*FALLTHRU*/
	default:
		goto out_unlock;
	}

	trace_xfs_reflink_cow_alloc(ip, &got);
done:
	*offset_fsb = end_fsb;
out_unlock:
	return error;
}
301 | ||
302 | /* Create a CoW reservation for part of a file. */ | |
303 | int | |
304 | xfs_reflink_reserve_cow_range( | |
305 | struct xfs_inode *ip, | |
306 | xfs_off_t offset, | |
307 | xfs_off_t count) | |
308 | { | |
309 | struct xfs_mount *mp = ip->i_mount; | |
310 | xfs_fileoff_t offset_fsb, end_fsb; | |
311 | int error; | |
312 | ||
313 | trace_xfs_reflink_reserve_cow_range(ip, offset, count); | |
314 | ||
315 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | |
316 | end_fsb = XFS_B_TO_FSB(mp, offset + count); | |
317 | ||
318 | xfs_ilock(ip, XFS_ILOCK_EXCL); | |
319 | while (offset_fsb < end_fsb) { | |
320 | error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb); | |
321 | if (error) { | |
322 | trace_xfs_reflink_reserve_cow_range_error(ip, error, | |
323 | _RET_IP_); | |
324 | break; | |
325 | } | |
326 | } | |
327 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | |
328 | ||
329 | return error; | |
330 | } | |
ef473667 DW |
331 | |
332 | /* | |
333 | * Find the CoW reservation (and whether or not it needs block allocation) | |
334 | * for a given byte offset of a file. | |
335 | */ | |
336 | bool | |
337 | xfs_reflink_find_cow_mapping( | |
338 | struct xfs_inode *ip, | |
339 | xfs_off_t offset, | |
340 | struct xfs_bmbt_irec *imap, | |
341 | bool *need_alloc) | |
342 | { | |
343 | struct xfs_bmbt_irec irec; | |
344 | struct xfs_ifork *ifp; | |
345 | struct xfs_bmbt_rec_host *gotp; | |
346 | xfs_fileoff_t bno; | |
347 | xfs_extnum_t idx; | |
348 | ||
349 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); | |
350 | ASSERT(xfs_is_reflink_inode(ip)); | |
351 | ||
352 | /* Find the extent in the CoW fork. */ | |
353 | ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); | |
354 | bno = XFS_B_TO_FSBT(ip->i_mount, offset); | |
355 | gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); | |
356 | if (!gotp) | |
357 | return false; | |
358 | ||
359 | xfs_bmbt_get_all(gotp, &irec); | |
360 | if (bno >= irec.br_startoff + irec.br_blockcount || | |
361 | bno < irec.br_startoff) | |
362 | return false; | |
363 | ||
364 | trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, | |
365 | &irec); | |
366 | ||
367 | /* If it's still delalloc, we must allocate later. */ | |
368 | *imap = irec; | |
369 | *need_alloc = !!(isnullstartblock(irec.br_startblock)); | |
370 | ||
371 | return true; | |
372 | } | |
373 | ||
/*
 * Trim an extent to end at the next CoW reservation past offset_fsb.
 *
 * If a CoW fork extent starts inside *imap at or after offset_fsb, shorten
 * imap->br_blockcount so the mapping stops where that reservation begins.
 * Otherwise imap is left untouched.  Always returns 0.
 */
int
xfs_reflink_trim_irec_to_next_cow(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_bmbt_irec	irec;
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_rec_host	*gotp;
	xfs_extnum_t		idx;

	/* Non-reflink inodes have no CoW fork to trim against. */
	if (!xfs_is_reflink_inode(ip))
		return 0;

	/* Find the extent in the CoW fork. */
	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
	if (!gotp)
		return 0;
	xfs_bmbt_get_all(gotp, &irec);

	/* This is the extent before; try sliding up one. */
	if (irec.br_startoff < offset_fsb) {
		idx++;
		/* No further extents in the in-core extent list. */
		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
			return 0;
		gotp = xfs_iext_get_ext(ifp, idx);
		xfs_bmbt_get_all(gotp, &irec);
	}

	/* CoW extent starts at or past the end of imap: nothing to trim. */
	if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
		return 0;

	/* Stop the mapping where the next CoW reservation begins. */
	imap->br_blockcount = irec.br_startoff - imap->br_startoff;
	trace_xfs_reflink_trim_irec(ip, imap);

	return 0;
}