Commit | Line | Data |
---|---|---|
1802d0be | 1 | // SPDX-License-Identifier: GPL-2.0-only |
fa60ce2c | 2 | /* |
028ba5df TY |
3 | * move_extents.c |
4 | * | |
5 | * Copyright (C) 2011 Oracle. All rights reserved. | |
028ba5df TY |
6 | */ |
7 | #include <linux/fs.h> | |
8 | #include <linux/types.h> | |
9 | #include <linux/mount.h> | |
10 | #include <linux/swap.h> | |
11 | ||
12 | #include <cluster/masklog.h> | |
13 | ||
14 | #include "ocfs2.h" | |
15 | #include "ocfs2_ioctl.h" | |
16 | ||
17 | #include "alloc.h" | |
6194ae42 | 18 | #include "localalloc.h" |
028ba5df TY |
19 | #include "aops.h" |
20 | #include "dlmglue.h" | |
21 | #include "extent_map.h" | |
22 | #include "inode.h" | |
23 | #include "journal.h" | |
24 | #include "suballoc.h" | |
25 | #include "uptodate.h" | |
26 | #include "super.h" | |
27 | #include "dir.h" | |
28 | #include "buffer_head_io.h" | |
29 | #include "sysfile.h" | |
028ba5df TY |
30 | #include "refcounttree.h" |
31 | #include "move_extents.h" | |
32 | ||
33 | struct ocfs2_move_extents_context { | |
34 | struct inode *inode; | |
35 | struct file *file; | |
36 | int auto_defrag; | |
4dfa66bd | 37 | int partial; |
028ba5df TY |
38 | int credits; |
39 | u32 new_phys_cpos; | |
40 | u32 clusters_moved; | |
41 | u64 refcount_loc; | |
42 | struct ocfs2_move_extents *range; | |
43 | struct ocfs2_extent_tree et; | |
44 | struct ocfs2_alloc_context *meta_ac; | |
45 | struct ocfs2_alloc_context *data_ac; | |
46 | struct ocfs2_cached_dealloc_ctxt dealloc; | |
47 | }; | |
de474ee8 | 48 | |
8f603e56 TY |
49 | static int __ocfs2_move_extent(handle_t *handle, |
50 | struct ocfs2_move_extents_context *context, | |
51 | u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, | |
52 | int ext_flags) | |
53 | { | |
54 | int ret = 0, index; | |
55 | struct inode *inode = context->inode; | |
56 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
57 | struct ocfs2_extent_rec *rec, replace_rec; | |
58 | struct ocfs2_path *path = NULL; | |
59 | struct ocfs2_extent_list *el; | |
60 | u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); | |
61 | u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); | |
62 | ||
c7dd3392 | 63 | ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, |
8f603e56 TY |
64 | p_cpos, new_p_cpos, len); |
65 | if (ret) { | |
66 | mlog_errno(ret); | |
67 | goto out; | |
68 | } | |
69 | ||
70 | memset(&replace_rec, 0, sizeof(replace_rec)); | |
71 | replace_rec.e_cpos = cpu_to_le32(cpos); | |
72 | replace_rec.e_leaf_clusters = cpu_to_le16(len); | |
73 | replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, | |
74 | new_p_cpos)); | |
75 | ||
76 | path = ocfs2_new_path_from_et(&context->et); | |
77 | if (!path) { | |
78 | ret = -ENOMEM; | |
79 | mlog_errno(ret); | |
80 | goto out; | |
81 | } | |
82 | ||
83 | ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); | |
84 | if (ret) { | |
85 | mlog_errno(ret); | |
86 | goto out; | |
87 | } | |
88 | ||
89 | el = path_leaf_el(path); | |
90 | ||
91 | index = ocfs2_search_extent_list(el, cpos); | |
981035b4 | 92 | if (index == -1) { |
17a5b9ab | 93 | ret = ocfs2_error(inode->i_sb, |
7ecef14a JP |
94 | "Inode %llu has an extent at cpos %u which can no longer be found\n", |
95 | (unsigned long long)ino, cpos); | |
8f603e56 TY |
96 | goto out; |
97 | } | |
98 | ||
99 | rec = &el->l_recs[index]; | |
100 | ||
101 | BUG_ON(ext_flags != rec->e_flags); | |
102 | /* | |
103 | * after moving/defraging to new location, the extent is not going | |
104 | * to be refcounted anymore. | |
105 | */ | |
106 | replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; | |
107 | ||
8f603e56 TY |
108 | ret = ocfs2_split_extent(handle, &context->et, path, index, |
109 | &replace_rec, context->meta_ac, | |
110 | &context->dealloc); | |
111 | if (ret) { | |
112 | mlog_errno(ret); | |
113 | goto out; | |
114 | } | |
115 | ||
8f603e56 TY |
116 | context->new_phys_cpos = new_p_cpos; |
117 | ||
118 | /* | |
119 | * need I to append truncate log for old clusters? | |
120 | */ | |
121 | if (old_blkno) { | |
122 | if (ext_flags & OCFS2_EXT_REFCOUNTED) | |
123 | ret = ocfs2_decrease_refcount(inode, handle, | |
124 | ocfs2_blocks_to_clusters(osb->sb, | |
125 | old_blkno), | |
126 | len, context->meta_ac, | |
127 | &context->dealloc, 1); | |
128 | else | |
129 | ret = ocfs2_truncate_log_append(osb, handle, | |
130 | old_blkno, len); | |
131 | } | |
132 | ||
6fdb702d | 133 | ocfs2_update_inode_fsync_trans(handle, inode, 0); |
8f603e56 | 134 | out: |
4704aa30 | 135 | ocfs2_free_path(path); |
8f603e56 TY |
136 | return ret; |
137 | } | |
138 | ||
de474ee8 | 139 | /* |
e21e5744 LC |
140 | * lock allocator, and reserve appropriate number of bits for |
141 | * meta blocks. | |
de474ee8 | 142 | */ |
e21e5744 | 143 | static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode, |
de474ee8 TY |
144 | struct ocfs2_extent_tree *et, |
145 | u32 clusters_to_move, | |
146 | u32 extents_to_split, | |
147 | struct ocfs2_alloc_context **meta_ac, | |
de474ee8 TY |
148 | int extra_blocks, |
149 | int *credits) | |
150 | { | |
151 | int ret, num_free_extents; | |
152 | unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; | |
153 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
154 | ||
964f14a0 | 155 | num_free_extents = ocfs2_num_free_extents(et); |
de474ee8 TY |
156 | if (num_free_extents < 0) { |
157 | ret = num_free_extents; | |
158 | mlog_errno(ret); | |
159 | goto out; | |
160 | } | |
161 | ||
162 | if (!num_free_extents || | |
163 | (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) | |
164 | extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); | |
165 | ||
166 | ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); | |
167 | if (ret) { | |
168 | mlog_errno(ret); | |
169 | goto out; | |
170 | } | |
171 | ||
de474ee8 | 172 | |
06f9da6e | 173 | *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); |
de474ee8 TY |
174 | |
175 | mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", | |
176 | extra_blocks, clusters_to_move, *credits); | |
177 | out: | |
178 | if (ret) { | |
179 | if (*meta_ac) { | |
180 | ocfs2_free_alloc_context(*meta_ac); | |
181 | *meta_ac = NULL; | |
182 | } | |
183 | } | |
184 | ||
185 | return ret; | |
186 | } | |
202ee5fa TY |
187 | |
188 | /* | |
189 | * Using one journal handle to guarantee the data consistency in case | |
190 | * crash happens anywhere. | |
dda54e76 TY |
191 | * |
192 | * XXX: defrag can end up with finishing partial extent as requested, | |
193 | * due to not enough contiguous clusters can be found in allocator. | |
202ee5fa TY |
194 | */ |
195 | static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, | |
4dfa66bd | 196 | u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) |
202ee5fa | 197 | { |
4dfa66bd | 198 | int ret, credits = 0, extra_blocks = 0, partial = context->partial; |
202ee5fa TY |
199 | handle_t *handle; |
200 | struct inode *inode = context->inode; | |
201 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
202 | struct inode *tl_inode = osb->osb_tl_inode; | |
203 | struct ocfs2_refcount_tree *ref_tree = NULL; | |
204 | u32 new_phys_cpos, new_len; | |
205 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | |
6194ae42 | 206 | int need_free = 0; |
202ee5fa | 207 | |
4dfa66bd | 208 | if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { |
84e40080 | 209 | BUG_ON(!ocfs2_is_refcount_inode(inode)); |
202ee5fa TY |
210 | BUG_ON(!context->refcount_loc); |
211 | ||
212 | ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, | |
213 | &ref_tree, NULL); | |
214 | if (ret) { | |
215 | mlog_errno(ret); | |
216 | return ret; | |
217 | } | |
218 | ||
219 | ret = ocfs2_prepare_refcount_change_for_del(inode, | |
220 | context->refcount_loc, | |
221 | phys_blkno, | |
4dfa66bd | 222 | *len, |
202ee5fa TY |
223 | &credits, |
224 | &extra_blocks); | |
225 | if (ret) { | |
226 | mlog_errno(ret); | |
227 | goto out; | |
228 | } | |
229 | } | |
230 | ||
e21e5744 LC |
231 | ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, |
232 | *len, 1, | |
233 | &context->meta_ac, | |
234 | extra_blocks, &credits); | |
202ee5fa TY |
235 | if (ret) { |
236 | mlog_errno(ret); | |
237 | goto out; | |
238 | } | |
239 | ||
240 | /* | |
241 | * should be using allocation reservation strategy there? | |
242 | * | |
243 | * if (context->data_ac) | |
244 | * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; | |
245 | */ | |
246 | ||
5955102c | 247 | inode_lock(tl_inode); |
202ee5fa TY |
248 | |
249 | if (ocfs2_truncate_log_needs_flush(osb)) { | |
250 | ret = __ocfs2_flush_truncate_log(osb); | |
251 | if (ret < 0) { | |
252 | mlog_errno(ret); | |
253 | goto out_unlock_mutex; | |
254 | } | |
255 | } | |
256 | ||
e21e5744 LC |
257 | /* |
258 | * Make sure ocfs2_reserve_cluster is called after | |
259 | * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. | |
260 | * | |
261 | * If ocfs2_reserve_cluster is called | |
262 | * before __ocfs2_flush_truncate_log, dead lock on global bitmap | |
263 | * may happen. | |
264 | * | |
265 | */ | |
266 | ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); | |
267 | if (ret) { | |
268 | mlog_errno(ret); | |
269 | goto out_unlock_mutex; | |
270 | } | |
271 | ||
202ee5fa TY |
272 | handle = ocfs2_start_trans(osb, credits); |
273 | if (IS_ERR(handle)) { | |
274 | ret = PTR_ERR(handle); | |
275 | mlog_errno(ret); | |
276 | goto out_unlock_mutex; | |
277 | } | |
278 | ||
4dfa66bd | 279 | ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, |
202ee5fa TY |
280 | &new_phys_cpos, &new_len); |
281 | if (ret) { | |
282 | mlog_errno(ret); | |
283 | goto out_commit; | |
284 | } | |
285 | ||
286 | /* | |
4dfa66bd TY |
287 | * allowing partial extent moving is kind of 'pros and cons', it makes |
288 | * whole defragmentation less likely to fail, on the contrary, the bad | |
289 | * thing is it may make the fs even more fragmented after moving, let | |
290 | * userspace make a good decision here. | |
202ee5fa | 291 | */ |
4dfa66bd TY |
292 | if (new_len != *len) { |
293 | mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); | |
294 | if (!partial) { | |
295 | context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; | |
296 | ret = -ENOSPC; | |
6194ae42 | 297 | need_free = 1; |
4dfa66bd TY |
298 | goto out_commit; |
299 | } | |
202ee5fa TY |
300 | } |
301 | ||
302 | mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, | |
303 | phys_cpos, new_phys_cpos); | |
304 | ||
4dfa66bd | 305 | ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, |
202ee5fa TY |
306 | new_phys_cpos, ext_flags); |
307 | if (ret) | |
308 | mlog_errno(ret); | |
309 | ||
4dfa66bd TY |
310 | if (partial && (new_len != *len)) |
311 | *len = new_len; | |
312 | ||
202ee5fa TY |
313 | /* |
314 | * Here we should write the new page out first if we are | |
315 | * in write-back mode. | |
316 | */ | |
4dfa66bd | 317 | ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); |
202ee5fa TY |
318 | if (ret) |
319 | mlog_errno(ret); | |
320 | ||
321 | out_commit: | |
6194ae42 LC |
322 | if (need_free && context->data_ac) { |
323 | struct ocfs2_alloc_context *data_ac = context->data_ac; | |
324 | ||
325 | if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL) | |
326 | ocfs2_free_local_alloc_bits(osb, handle, data_ac, | |
327 | new_phys_cpos, new_len); | |
328 | else | |
329 | ocfs2_free_clusters(handle, | |
330 | data_ac->ac_inode, | |
331 | data_ac->ac_bh, | |
332 | ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos), | |
333 | new_len); | |
334 | } | |
335 | ||
202ee5fa TY |
336 | ocfs2_commit_trans(osb, handle); |
337 | ||
338 | out_unlock_mutex: | |
5955102c | 339 | inode_unlock(tl_inode); |
202ee5fa TY |
340 | |
341 | if (context->data_ac) { | |
342 | ocfs2_free_alloc_context(context->data_ac); | |
343 | context->data_ac = NULL; | |
344 | } | |
345 | ||
346 | if (context->meta_ac) { | |
347 | ocfs2_free_alloc_context(context->meta_ac); | |
348 | context->meta_ac = NULL; | |
349 | } | |
350 | ||
351 | out: | |
352 | if (ref_tree) | |
353 | ocfs2_unlock_refcount_tree(osb, ref_tree, 1); | |
354 | ||
355 | return ret; | |
356 | } | |
1c06b912 TY |
357 | |
358 | /* | |
359 | * find the victim alloc group, where #blkno fits. | |
360 | */ | |
361 | static int ocfs2_find_victim_alloc_group(struct inode *inode, | |
362 | u64 vict_blkno, | |
363 | int type, int slot, | |
364 | int *vict_bit, | |
365 | struct buffer_head **ret_bh) | |
366 | { | |
6aea6f50 | 367 | int ret, i, bits_per_unit = 0; |
1c06b912 TY |
368 | u64 blkno; |
369 | char namebuf[40]; | |
370 | ||
371 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
372 | struct buffer_head *ac_bh = NULL, *gd_bh = NULL; | |
373 | struct ocfs2_chain_list *cl; | |
374 | struct ocfs2_chain_rec *rec; | |
375 | struct ocfs2_dinode *ac_dinode; | |
376 | struct ocfs2_group_desc *bg; | |
377 | ||
378 | ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); | |
379 | ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, | |
380 | strlen(namebuf), &blkno); | |
381 | if (ret) { | |
382 | ret = -ENOENT; | |
383 | goto out; | |
384 | } | |
385 | ||
386 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); | |
387 | if (ret) { | |
388 | mlog_errno(ret); | |
389 | goto out; | |
390 | } | |
391 | ||
392 | ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; | |
393 | cl = &(ac_dinode->id2.i_chain); | |
394 | rec = &(cl->cl_recs[0]); | |
395 | ||
396 | if (type == GLOBAL_BITMAP_SYSTEM_INODE) | |
6aea6f50 TY |
397 | bits_per_unit = osb->s_clustersize_bits - |
398 | inode->i_sb->s_blocksize_bits; | |
1c06b912 TY |
399 | /* |
400 | * 'vict_blkno' was out of the valid range. | |
401 | */ | |
402 | if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || | |
7fa05c6e | 403 | (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << |
6aea6f50 | 404 | bits_per_unit))) { |
1c06b912 TY |
405 | ret = -EINVAL; |
406 | goto out; | |
407 | } | |
408 | ||
409 | for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { | |
410 | ||
411 | rec = &(cl->cl_recs[i]); | |
412 | if (!rec) | |
413 | continue; | |
414 | ||
415 | bg = NULL; | |
416 | ||
417 | do { | |
418 | if (!bg) | |
419 | blkno = le64_to_cpu(rec->c_blkno); | |
420 | else | |
421 | blkno = le64_to_cpu(bg->bg_next_group); | |
422 | ||
423 | if (gd_bh) { | |
424 | brelse(gd_bh); | |
425 | gd_bh = NULL; | |
426 | } | |
427 | ||
428 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); | |
429 | if (ret) { | |
430 | mlog_errno(ret); | |
431 | goto out; | |
432 | } | |
433 | ||
434 | bg = (struct ocfs2_group_desc *)gd_bh->b_data; | |
435 | ||
436 | if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + | |
236b9254 | 437 | (le16_to_cpu(bg->bg_bits) << bits_per_unit))) { |
1c06b912 TY |
438 | |
439 | *ret_bh = gd_bh; | |
6aea6f50 TY |
440 | *vict_bit = (vict_blkno - blkno) >> |
441 | bits_per_unit; | |
1c06b912 TY |
442 | mlog(0, "find the victim group: #%llu, " |
443 | "total_bits: %u, vict_bit: %u\n", | |
444 | blkno, le16_to_cpu(bg->bg_bits), | |
445 | *vict_bit); | |
446 | goto out; | |
447 | } | |
448 | ||
449 | } while (le64_to_cpu(bg->bg_next_group)); | |
450 | } | |
451 | ||
452 | ret = -EINVAL; | |
453 | out: | |
454 | brelse(ac_bh); | |
455 | ||
456 | /* | |
457 | * caller has to release the gd_bh properly. | |
458 | */ | |
459 | return ret; | |
460 | } | |
99e4c750 TY |
461 | |
462 | /* | |
463 | * XXX: helper to validate and adjust moving goal. | |
464 | */ | |
465 | static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, | |
466 | struct ocfs2_move_extents *range) | |
467 | { | |
468 | int ret, goal_bit = 0; | |
469 | ||
470 | struct buffer_head *gd_bh = NULL; | |
7f4804d4 | 471 | struct ocfs2_group_desc *bg; |
99e4c750 TY |
472 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
473 | int c_to_b = 1 << (osb->s_clustersize_bits - | |
474 | inode->i_sb->s_blocksize_bits); | |
475 | ||
ea5e1675 TY |
476 | /* |
477 | * make goal become cluster aligned. | |
478 | */ | |
479 | range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, | |
480 | range->me_goal); | |
99e4c750 TY |
481 | /* |
482 | * validate goal sits within global_bitmap, and return the victim | |
483 | * group desc | |
484 | */ | |
485 | ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, | |
486 | GLOBAL_BITMAP_SYSTEM_INODE, | |
487 | OCFS2_INVALID_SLOT, | |
488 | &goal_bit, &gd_bh); | |
489 | if (ret) | |
490 | goto out; | |
491 | ||
492 | bg = (struct ocfs2_group_desc *)gd_bh->b_data; | |
493 | ||
7f4804d4 DC |
494 | /* |
495 | * moving goal is not allowd to start with a group desc blok(#0 blk) | |
496 | * let's compromise to the latter cluster. | |
497 | */ | |
498 | if (range->me_goal == le64_to_cpu(bg->bg_blkno)) | |
499 | range->me_goal += c_to_b; | |
500 | ||
99e4c750 TY |
501 | /* |
502 | * movement is not gonna cross two groups. | |
503 | */ | |
504 | if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < | |
505 | range->me_len) { | |
506 | ret = -EINVAL; | |
507 | goto out; | |
508 | } | |
509 | /* | |
510 | * more exact validations/adjustments will be performed later during | |
511 | * moving operation for each extent range. | |
512 | */ | |
513 | mlog(0, "extents get ready to be moved to #%llu block\n", | |
514 | range->me_goal); | |
515 | ||
516 | out: | |
517 | brelse(gd_bh); | |
518 | ||
519 | return ret; | |
520 | } | |
e6b5859c TY |
521 | |
522 | static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, | |
523 | int *goal_bit, u32 move_len, u32 max_hop, | |
524 | u32 *phys_cpos) | |
525 | { | |
526 | int i, used, last_free_bits = 0, base_bit = *goal_bit; | |
527 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | |
528 | u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | |
529 | le64_to_cpu(gd->bg_blkno)); | |
530 | ||
531 | for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { | |
532 | ||
533 | used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); | |
534 | if (used) { | |
535 | /* | |
536 | * we even tried searching the free chunk by jumping | |
537 | * a 'max_hop' distance, but still failed. | |
538 | */ | |
539 | if ((i - base_bit) > max_hop) { | |
540 | *phys_cpos = 0; | |
541 | break; | |
542 | } | |
543 | ||
544 | if (last_free_bits) | |
545 | last_free_bits = 0; | |
546 | ||
547 | continue; | |
548 | } else | |
549 | last_free_bits++; | |
550 | ||
551 | if (last_free_bits == move_len) { | |
236b9254 | 552 | i -= move_len; |
e6b5859c TY |
553 | *goal_bit = i; |
554 | *phys_cpos = base_cpos + i; | |
555 | break; | |
556 | } | |
557 | } | |
558 | ||
559 | mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); | |
560 | } | |
8473aa8a | 561 | |
e0847717 TY |
562 | static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, |
563 | u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, | |
564 | u32 len, int ext_flags) | |
565 | { | |
566 | int ret, credits = 0, extra_blocks = 0, goal_bit = 0; | |
567 | handle_t *handle; | |
568 | struct inode *inode = context->inode; | |
569 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
570 | struct inode *tl_inode = osb->osb_tl_inode; | |
571 | struct inode *gb_inode = NULL; | |
572 | struct buffer_head *gb_bh = NULL; | |
573 | struct buffer_head *gd_bh = NULL; | |
574 | struct ocfs2_group_desc *gd; | |
575 | struct ocfs2_refcount_tree *ref_tree = NULL; | |
576 | u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, | |
577 | context->range->me_threshold); | |
578 | u64 phys_blkno, new_phys_blkno; | |
579 | ||
580 | phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | |
581 | ||
582 | if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { | |
84e40080 | 583 | BUG_ON(!ocfs2_is_refcount_inode(inode)); |
e0847717 TY |
584 | BUG_ON(!context->refcount_loc); |
585 | ||
586 | ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, | |
587 | &ref_tree, NULL); | |
588 | if (ret) { | |
589 | mlog_errno(ret); | |
590 | return ret; | |
591 | } | |
592 | ||
593 | ret = ocfs2_prepare_refcount_change_for_del(inode, | |
594 | context->refcount_loc, | |
595 | phys_blkno, | |
596 | len, | |
597 | &credits, | |
598 | &extra_blocks); | |
599 | if (ret) { | |
600 | mlog_errno(ret); | |
601 | goto out; | |
602 | } | |
603 | } | |
604 | ||
e21e5744 LC |
605 | ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, |
606 | len, 1, | |
607 | &context->meta_ac, | |
608 | extra_blocks, &credits); | |
e0847717 TY |
609 | if (ret) { |
610 | mlog_errno(ret); | |
611 | goto out; | |
612 | } | |
613 | ||
614 | /* | |
615 | * need to count 2 extra credits for global_bitmap inode and | |
616 | * group descriptor. | |
617 | */ | |
618 | credits += OCFS2_INODE_UPDATE_CREDITS + 1; | |
619 | ||
620 | /* | |
621 | * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() | |
622 | * logic, while we still need to lock the global_bitmap. | |
623 | */ | |
624 | gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, | |
625 | OCFS2_INVALID_SLOT); | |
626 | if (!gb_inode) { | |
627 | mlog(ML_ERROR, "unable to get global_bitmap inode\n"); | |
628 | ret = -EIO; | |
629 | goto out; | |
630 | } | |
631 | ||
5955102c | 632 | inode_lock(gb_inode); |
e0847717 TY |
633 | |
634 | ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); | |
635 | if (ret) { | |
636 | mlog_errno(ret); | |
637 | goto out_unlock_gb_mutex; | |
638 | } | |
639 | ||
5955102c | 640 | inode_lock(tl_inode); |
e0847717 TY |
641 | |
642 | handle = ocfs2_start_trans(osb, credits); | |
643 | if (IS_ERR(handle)) { | |
644 | ret = PTR_ERR(handle); | |
645 | mlog_errno(ret); | |
646 | goto out_unlock_tl_inode; | |
647 | } | |
648 | ||
649 | new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); | |
650 | ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, | |
651 | GLOBAL_BITMAP_SYSTEM_INODE, | |
652 | OCFS2_INVALID_SLOT, | |
653 | &goal_bit, &gd_bh); | |
654 | if (ret) { | |
655 | mlog_errno(ret); | |
656 | goto out_commit; | |
657 | } | |
658 | ||
659 | /* | |
660 | * probe the victim cluster group to find a proper | |
661 | * region to fit wanted movement, it even will perfrom | |
662 | * a best-effort attempt by compromising to a threshold | |
663 | * around the goal. | |
664 | */ | |
665 | ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, | |
666 | new_phys_cpos); | |
3d75be7c | 667 | if (!*new_phys_cpos) { |
e0847717 TY |
668 | ret = -ENOSPC; |
669 | goto out_commit; | |
670 | } | |
671 | ||
672 | ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, | |
673 | *new_phys_cpos, ext_flags); | |
674 | if (ret) { | |
675 | mlog_errno(ret); | |
676 | goto out_commit; | |
677 | } | |
678 | ||
679 | gd = (struct ocfs2_group_desc *)gd_bh->b_data; | |
680 | ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, | |
681 | le16_to_cpu(gd->bg_chain)); | |
682 | if (ret) { | |
683 | mlog_errno(ret); | |
684 | goto out_commit; | |
685 | } | |
686 | ||
687 | ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, | |
4eb7b93e | 688 | goal_bit, len, 0, 0); |
db66c715 YL |
689 | if (ret) { |
690 | ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, | |
691 | le16_to_cpu(gd->bg_chain)); | |
e0847717 | 692 | mlog_errno(ret); |
db66c715 | 693 | } |
e0847717 TY |
694 | |
695 | /* | |
696 | * Here we should write the new page out first if we are | |
697 | * in write-back mode. | |
698 | */ | |
699 | ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); | |
700 | if (ret) | |
701 | mlog_errno(ret); | |
702 | ||
703 | out_commit: | |
704 | ocfs2_commit_trans(osb, handle); | |
705 | brelse(gd_bh); | |
706 | ||
707 | out_unlock_tl_inode: | |
5955102c | 708 | inode_unlock(tl_inode); |
e0847717 TY |
709 | |
710 | ocfs2_inode_unlock(gb_inode, 1); | |
711 | out_unlock_gb_mutex: | |
5955102c | 712 | inode_unlock(gb_inode); |
e0847717 TY |
713 | brelse(gb_bh); |
714 | iput(gb_inode); | |
715 | ||
716 | out: | |
717 | if (context->meta_ac) { | |
718 | ocfs2_free_alloc_context(context->meta_ac); | |
719 | context->meta_ac = NULL; | |
720 | } | |
721 | ||
722 | if (ref_tree) | |
723 | ocfs2_unlock_refcount_tree(osb, ref_tree, 1); | |
724 | ||
725 | return ret; | |
726 | } | |
ee16cc03 TY |
727 | |
/*
 * Work out how much of the current extent one defrag step should take,
 * given the threshold.
 *
 * @alloc_size:   in: clusters in the current extent; out: clusters to move
 * @len_defraged: clusters gathered so far in the current defrag cycle
 * @threshold:    size of one defrag cycle, in clusters
 * @skip:         set to 1 when the extent should be left alone
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 gathered = *len_defraged;

	/* Still below the threshold: take the whole extent and keep going. */
	if (*alloc_size + gathered < threshold) {
		*len_defraged = gathered + *alloc_size;
		return;
	}

	/* XXX: a large extent at the start of a cycle is skipped. */
	if (gathered == 0) {
		*skip = 1;
		return;
	}

	/*
	 * Split this extent so that, together with the pieces gathered
	 * before it, exactly 'threshold' clusters are reached.  That ends
	 * the cycle; resetting the counter starts a new one.
	 */
	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
53069d4e TY |
757 | |
758 | static int __ocfs2_move_extents_range(struct buffer_head *di_bh, | |
759 | struct ocfs2_move_extents_context *context) | |
760 | { | |
761 | int ret = 0, flags, do_defrag, skip = 0; | |
762 | u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; | |
763 | u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; | |
764 | ||
765 | struct inode *inode = context->inode; | |
766 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | |
767 | struct ocfs2_move_extents *range = context->range; | |
768 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
769 | ||
f17c20dd | 770 | if ((i_size_read(inode) == 0) || (range->me_len == 0)) |
53069d4e TY |
771 | return 0; |
772 | ||
773 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | |
774 | return 0; | |
775 | ||
776 | context->refcount_loc = le64_to_cpu(di->i_refcount_loc); | |
777 | ||
778 | ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); | |
779 | ocfs2_init_dealloc_ctxt(&context->dealloc); | |
780 | ||
781 | /* | |
782 | * TO-DO XXX: | |
783 | * | |
784 | * - xattr extents. | |
785 | */ | |
786 | ||
787 | do_defrag = context->auto_defrag; | |
788 | ||
789 | /* | |
790 | * extents moving happens in unit of clusters, for the sake | |
791 | * of simplicity, we may ignore two clusters where 'byte_start' | |
792 | * and 'byte_start + len' were within. | |
793 | */ | |
794 | move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); | |
795 | len_to_move = (range->me_start + range->me_len) >> | |
796 | osb->s_clustersize_bits; | |
797 | if (len_to_move >= move_start) | |
798 | len_to_move -= move_start; | |
799 | else | |
800 | len_to_move = 0; | |
801 | ||
dda54e76 | 802 | if (do_defrag) { |
53069d4e | 803 | defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; |
dda54e76 TY |
804 | if (defrag_thresh <= 1) |
805 | goto done; | |
806 | } else | |
53069d4e TY |
807 | new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, |
808 | range->me_goal); | |
809 | ||
810 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " | |
811 | "thresh: %u\n", | |
812 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | |
813 | (unsigned long long)range->me_start, | |
814 | (unsigned long long)range->me_len, | |
815 | move_start, len_to_move, defrag_thresh); | |
816 | ||
817 | cpos = move_start; | |
818 | while (len_to_move) { | |
819 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, | |
820 | &flags); | |
821 | if (ret) { | |
822 | mlog_errno(ret); | |
823 | goto out; | |
824 | } | |
825 | ||
826 | if (alloc_size > len_to_move) | |
827 | alloc_size = len_to_move; | |
828 | ||
829 | /* | |
830 | * XXX: how to deal with a hole: | |
831 | * | |
832 | * - skip the hole of course | |
833 | * - force a new defragmentation | |
834 | */ | |
835 | if (!phys_cpos) { | |
836 | if (do_defrag) | |
837 | len_defraged = 0; | |
838 | ||
839 | goto next; | |
840 | } | |
841 | ||
842 | if (do_defrag) { | |
843 | ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, | |
844 | defrag_thresh, &skip); | |
845 | /* | |
846 | * skip large extents | |
847 | */ | |
848 | if (skip) { | |
849 | skip = 0; | |
850 | goto next; | |
851 | } | |
852 | ||
853 | mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " | |
854 | "alloc_size: %u, len_defraged: %u\n", | |
855 | cpos, phys_cpos, alloc_size, len_defraged); | |
856 | ||
857 | ret = ocfs2_defrag_extent(context, cpos, phys_cpos, | |
4dfa66bd | 858 | &alloc_size, flags); |
53069d4e TY |
859 | } else { |
860 | ret = ocfs2_move_extent(context, cpos, phys_cpos, | |
861 | &new_phys_cpos, alloc_size, | |
862 | flags); | |
863 | ||
864 | new_phys_cpos += alloc_size; | |
865 | } | |
866 | ||
867 | if (ret < 0) { | |
868 | mlog_errno(ret); | |
869 | goto out; | |
870 | } | |
871 | ||
872 | context->clusters_moved += alloc_size; | |
873 | next: | |
874 | cpos += alloc_size; | |
875 | len_to_move -= alloc_size; | |
876 | } | |
877 | ||
dda54e76 | 878 | done: |
53069d4e TY |
879 | range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; |
880 | ||
881 | out: | |
882 | range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, | |
883 | context->clusters_moved); | |
884 | range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, | |
885 | context->new_phys_cpos); | |
886 | ||
887 | ocfs2_schedule_truncate_log_flush(osb, 1); | |
888 | ocfs2_run_deallocs(osb, &context->dealloc); | |
889 | ||
890 | return ret; | |
891 | } | |
892 | ||
893 | static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) | |
894 | { | |
895 | int status; | |
896 | handle_t *handle; | |
897 | struct inode *inode = context->inode; | |
898 | struct ocfs2_dinode *di; | |
899 | struct buffer_head *di_bh = NULL; | |
900 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
901 | ||
53069d4e TY |
902 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) |
903 | return -EROFS; | |
904 | ||
5955102c | 905 | inode_lock(inode); |
53069d4e TY |
906 | |
907 | /* | |
908 | * This prevents concurrent writes from other nodes | |
909 | */ | |
910 | status = ocfs2_rw_lock(inode, 1); | |
911 | if (status) { | |
912 | mlog_errno(status); | |
913 | goto out; | |
914 | } | |
915 | ||
916 | status = ocfs2_inode_lock(inode, &di_bh, 1); | |
917 | if (status) { | |
918 | mlog_errno(status); | |
919 | goto out_rw_unlock; | |
920 | } | |
921 | ||
922 | /* | |
923 | * rememer ip_xattr_sem also needs to be held if necessary | |
924 | */ | |
925 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | |
926 | ||
927 | status = __ocfs2_move_extents_range(di_bh, context); | |
928 | ||
929 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | |
930 | if (status) { | |
931 | mlog_errno(status); | |
932 | goto out_inode_unlock; | |
933 | } | |
934 | ||
935 | /* | |
936 | * We update ctime for these changes | |
937 | */ | |
938 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | |
939 | if (IS_ERR(handle)) { | |
940 | status = PTR_ERR(handle); | |
941 | mlog_errno(status); | |
942 | goto out_inode_unlock; | |
943 | } | |
944 | ||
945 | status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | |
946 | OCFS2_JOURNAL_ACCESS_WRITE); | |
947 | if (status) { | |
948 | mlog_errno(status); | |
949 | goto out_commit; | |
950 | } | |
951 | ||
952 | di = (struct ocfs2_dinode *)di_bh->b_data; | |
6861de97 | 953 | inode_set_ctime_current(inode); |
fd6acbbc JL |
954 | di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); |
955 | di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); | |
6fdb702d | 956 | ocfs2_update_inode_fsync_trans(handle, inode, 0); |
53069d4e TY |
957 | |
958 | ocfs2_journal_dirty(handle, di_bh); | |
959 | ||
960 | out_commit: | |
961 | ocfs2_commit_trans(osb, handle); | |
962 | ||
963 | out_inode_unlock: | |
964 | brelse(di_bh); | |
965 | ocfs2_inode_unlock(inode, 1); | |
966 | out_rw_unlock: | |
967 | ocfs2_rw_unlock(inode, 1); | |
968 | out: | |
5955102c | 969 | inode_unlock(inode); |
53069d4e TY |
970 | |
971 | return status; | |
972 | } | |
973 | ||
974 | int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) | |
975 | { | |
976 | int status; | |
977 | ||
496ad9aa | 978 | struct inode *inode = file_inode(filp); |
53069d4e | 979 | struct ocfs2_move_extents range; |
85a258b7 DC |
980 | struct ocfs2_move_extents_context *context; |
981 | ||
982 | if (!argp) | |
983 | return -EINVAL; | |
53069d4e | 984 | |
a561be71 | 985 | status = mnt_want_write_file(filp); |
53069d4e TY |
986 | if (status) |
987 | return status; | |
988 | ||
bfbca926 YL |
989 | if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) { |
990 | status = -EPERM; | |
85a258b7 | 991 | goto out_drop; |
bfbca926 | 992 | } |
53069d4e TY |
993 | |
994 | if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { | |
995 | status = -EPERM; | |
85a258b7 | 996 | goto out_drop; |
53069d4e TY |
997 | } |
998 | ||
999 | context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); | |
1000 | if (!context) { | |
1001 | status = -ENOMEM; | |
1002 | mlog_errno(status); | |
85a258b7 | 1003 | goto out_drop; |
53069d4e TY |
1004 | } |
1005 | ||
1006 | context->inode = inode; | |
1007 | context->file = filp; | |
1008 | ||
85a258b7 DC |
1009 | if (copy_from_user(&range, argp, sizeof(range))) { |
1010 | status = -EFAULT; | |
1011 | goto out_free; | |
53069d4e TY |
1012 | } |
1013 | ||
bfbca926 YL |
1014 | if (range.me_start > i_size_read(inode)) { |
1015 | status = -EINVAL; | |
85a258b7 | 1016 | goto out_free; |
bfbca926 | 1017 | } |
53069d4e TY |
1018 | |
1019 | if (range.me_start + range.me_len > i_size_read(inode)) | |
1020 | range.me_len = i_size_read(inode) - range.me_start; | |
1021 | ||
1022 | context->range = ⦥ | |
1023 | ||
236b9254 HZO |
1024 | /* |
1025 | * ok, the default theshold for the defragmentation | |
1026 | * is 1M, since our maximum clustersize was 1M also. | |
1027 | * any thought? | |
1028 | */ | |
1029 | if (!range.me_threshold) | |
1030 | range.me_threshold = 1024 * 1024; | |
1031 | ||
1032 | if (range.me_threshold > i_size_read(inode)) | |
1033 | range.me_threshold = i_size_read(inode); | |
1034 | ||
53069d4e TY |
1035 | if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { |
1036 | context->auto_defrag = 1; | |
dda54e76 | 1037 | |
4dfa66bd TY |
1038 | if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) |
1039 | context->partial = 1; | |
53069d4e TY |
1040 | } else { |
1041 | /* | |
1042 | * first best-effort attempt to validate and adjust the goal | |
1043 | * (physical address in block), while it can't guarantee later | |
1044 | * operation can succeed all the time since global_bitmap may | |
1045 | * change a bit over time. | |
1046 | */ | |
1047 | ||
1048 | status = ocfs2_validate_and_adjust_move_goal(inode, &range); | |
1049 | if (status) | |
85a258b7 | 1050 | goto out_copy; |
53069d4e TY |
1051 | } |
1052 | ||
1053 | status = ocfs2_move_extents(context); | |
1054 | if (status) | |
1055 | mlog_errno(status); | |
85a258b7 | 1056 | out_copy: |
53069d4e TY |
1057 | /* |
1058 | * movement/defragmentation may end up being partially completed, | |
1059 | * that's the reason why we need to return userspace the finished | |
1060 | * length and new_offset even if failure happens somewhere. | |
1061 | */ | |
85a258b7 DC |
1062 | if (copy_to_user(argp, &range, sizeof(range))) |
1063 | status = -EFAULT; | |
53069d4e | 1064 | |
85a258b7 | 1065 | out_free: |
53069d4e | 1066 | kfree(context); |
85a258b7 | 1067 | out_drop: |
2a79f17e | 1068 | mnt_drop_write_file(filp); |
53069d4e TY |
1069 | |
1070 | return status; | |
1071 | } |